Commit 4c983898 authored by Rémi Denis-Courmont

deinterlace: split MMXEXT acceleration for IVTC to separate functions

That causes some duplication, but that is required for clobber lists.
parent b3683d44
...@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field ) ...@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
p_dst->p_pixels += p_src->i_pitch; p_dst->p_pixels += p_src->i_pitch;
} }
#define T 10
/** /**
* Internal helper function for EstimateNumBlocksWithMotion(): * Internal helper function for EstimateNumBlocksWithMotion():
* estimates whether there is motion in the given 8x8 block on one plane * estimates whether there is motion in the given 8x8 block on one plane
...@@ -113,28 +114,75 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field ) ...@@ -113,28 +114,75 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
* @param[in] p_pix_c Base pointer to the same block in current picture * @param[in] p_pix_c Base pointer to the same block in current picture
* @param i_pitch_prev i_pitch of previous picture * @param i_pitch_prev i_pitch of previous picture
* @param i_pitch_curr i_pitch of current picture * @param i_pitch_curr i_pitch of current picture
* @param b_mmx (vlc_CPU() & VLC_CPU_MMXEXT) or false.
* @param[out] pi_top 1 if top field of the block had motion, 0 if no * @param[out] pi_top 1 if top field of the block had motion, 0 if no
* @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
* @return 1 if the block had motion, 0 if no * @return 1 if the block had motion, 0 if no
* @see EstimateNumBlocksWithMotion() * @see EstimateNumBlocksWithMotion()
*/ */
static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                 int i_pitch_prev, int i_pitch_curr,
                                 int* pi_top, int* pi_bot )
{
    /* Plain C implementation: for one 8x8 block, counts the pixels whose
       absolute difference between the previous and the current picture
       exceeds T. Even rows are accumulated as the top field, odd rows as
       the bottom field. */

    /* Pixel luma/chroma difference threshold to detect motion.
       Guarded so that an identical definition made earlier in the file
       is used as-is. */
#ifndef T
# define T 10
#endif

    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

    for( int y = 0; y < 8; ++y )
    {
        const uint8_t *pc = p_pix_c;
        const uint8_t *pp = p_pix_p;
        int score = 0;

        for( int x = 0; x < 8; ++x )
        {
            /* Operands promote to int, so the subtraction cannot wrap. */
            if( abs( pc[x] - pp[x] ) > T )
                ++score;
        }

        i_motion += score;
        if( y % 2 == 0 )
            i_top_motion += score;
        else
            i_bot_motion += score;

        /* Each picture advances by its own pitch. */
        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }

    /* Field motion thresholds.

       Empirical value - works better in practice than the "4" that
       would be consistent with the full-block threshold.

       Especially the opening scene of The Third ep. 1 (just after the OP)
       works better with this. It also fixes some talking scenes in
       Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
       leading to more interlacing artifacts than by just using the emergency
       mode frame composer.
    */
    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );

    /* Full-block threshold = (8*8)/8: motion is detected if 1/8 of the block
       changes "enough". */
    return (i_motion >= 8);
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
int i_pitch_prev, int i_pitch_curr,
int* pi_top, int* pi_bot )
{
int32_t i_motion = 0;
int32_t i_top_motion = 0;
int32_t i_bot_motion = 0;
static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
movq_m2r( bT, mm5 ); movq_m2r( bT, mm5 );
...@@ -191,54 +239,12 @@ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c, ...@@ -191,54 +239,12 @@ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
i_motion /= 255; i_motion /= 255;
emms(); emms();
}
else
#endif
{
for( int y = 0; y < 8; ++y )
{
uint8_t *pc = p_pix_c;
uint8_t *pp = p_pix_p;
int score = 0;
for( int x = 0; x < 8; ++x )
{
int_fast16_t C = abs((*pc) - (*pp));
if( C > T )
++score;
++pc;
++pp;
}
i_motion += score;
if( y % 2 == 0 )
i_top_motion += score;
else
i_bot_motion += score;
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
}
/* Field motion thresholds.
Empirical value - works better in practice than the "4" that
would be consistent with the full-block threshold.
Especially the opening scene of The Third ep. 1 (just after the OP)
works better with this. It also fixes some talking scenes in
Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
leading to more interlacing artifacts than by just using the emergency
mode frame composer.
*/
(*pi_top) = ( i_top_motion >= 8 ); (*pi_top) = ( i_top_motion >= 8 );
(*pi_bot) = ( i_bot_motion >= 8 ); (*pi_bot) = ( i_bot_motion >= 8 );
/* Full-block threshold = (8*8)/8: motion is detected if 1/8 of the block
changes "enough". */
return (i_motion >= 8); return (i_motion >= 8);
} }
#endif
#undef T #undef T
/***************************************************************************** /*****************************************************************************
...@@ -386,11 +392,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev, ...@@ -386,11 +392,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
if( p_prev->i_planes != p_curr->i_planes ) if( p_prev->i_planes != p_curr->i_planes )
return -1; return -1;
int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
TestForMotionInBlock;
/* Switch to the MMXEXT-accelerated helper when the CPU supports it. */
#ifdef CAN_COMPILE_MMXEXT #ifdef CAN_COMPILE_MMXEXT
const bool b_mmx = vlc_CPU_MMXEXT(); if (vlc_CPU_MMXEXT())
#else motion_in_block = TestForMotionInBlockMMX;
const bool b_mmx = false;
#endif #endif
int i_score = 0; int i_score = 0;
...@@ -419,9 +426,8 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev, ...@@ -419,9 +426,8 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
for( int bx = 0; bx < i_mbx; ++bx ) for( int bx = 0; bx < i_mbx; ++bx )
{ {
int i_top_temp, i_bot_temp; int i_top_temp, i_bot_temp;
i_score += TestForMotionInBlock( p_pix_p, p_pix_c, i_score += motion_in_block( p_pix_p, p_pix_c,
i_pitch_prev, i_pitch_curr, i_pitch_prev, i_pitch_curr,
b_mmx,
&i_top_temp, &i_bot_temp ); &i_top_temp, &i_bot_temp );
i_score_top += i_top_temp; i_score_top += i_top_temp;
i_score_bot += i_bot_temp; i_score_bot += i_bot_temp;
...@@ -440,39 +446,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev, ...@@ -440,39 +446,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
return i_score; return i_score;
} }
/* See header for function doc. */ /* Threshold (value from Transcode 1.1.5) */
int CalculateInterlaceScore( const picture_t* p_pic_top, #define T 100
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
const picture_t* p_pic_bot ) const picture_t* p_pic_bot )
{ {
/* assert( p_pic_top->i_planes == p_pic_bot->i_planes );
We use the comb metric from the IVTC filter of Transcode 1.1.5.
This was found to work better for the particular purpose of IVTC
than RenderX()'s comb metric.
Note that we *must not* subsample at all in order to catch interlacing
in telecined frames with localized motion (e.g. anime with characters
talking, where only mouths move and everything else stays still.)
*/
assert( p_pic_top != NULL );
assert( p_pic_bot != NULL );
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
/* Amount of bits must be known for MMX, thus int32_t. /* Amount of bits must be known for MMX, thus int32_t.
Doesn't hurt the C implementation. */ Doesn't hurt the C implementation. */
int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */ int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */ int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
#ifdef CAN_COMPILE_MMXEXT
# ifndef __SSE__
const unsigned u_cpu = vlc_CPU();
if( u_cpu & VLC_CPU_MMXEXT )
# endif
pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */ pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
#endif
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
{ {
...@@ -505,19 +494,12 @@ int CalculateInterlaceScore( const picture_t* p_pic_top, ...@@ -505,19 +494,12 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
int x = 0; int x = 0;
/* Threshold (value from Transcode 1.1.5) */
#define T 100
#ifdef CAN_COMPILE_MMXEXT
/* Easy-to-read C version further below. /* Easy-to-read C version further below.
Assumptions: 0 < T < 127 Assumptions: 0 < T < 127
# of pixels < (2^32)/255 # of pixels < (2^32)/255
Note: calculates score * 255 Note: calculates score * 255
*/ */
# ifndef __SSE__
if( u_cpu & VLC_CPU_MMXEXT )
# endif
{
static const mmx_t b0 = { .uq = 0x0000000000000000ULL }; static const mmx_t b0 = { .uq = 0x0000000000000000ULL };
static const mmx_t b128 = { .uq = 0x8080808080808080ULL }; static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
...@@ -557,8 +539,7 @@ int CalculateInterlaceScore( const picture_t* p_pic_top, ...@@ -557,8 +539,7 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
p_p += 8; p_p += 8;
p_n += 8; p_n += 8;
} }
}
#endif
for( ; x < w; ++x ) for( ; x < w; ++x )
{ {
/* Worst case: need 17 bits for "comb". */ /* Worst case: need 17 bits for "comb". */
...@@ -594,17 +575,102 @@ int CalculateInterlaceScore( const picture_t* p_pic_top, ...@@ -594,17 +575,102 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
} }
} }
#ifdef CAN_COMPILE_MMXEXT
# ifndef __SSE__
if( u_cpu & VLC_CPU_MMXEXT )
# endif
{
movd_r2m( mm7, i_score_mmx ); movd_r2m( mm7, i_score_mmx );
emms(); emms();
i_score_mmx /= 255;
} return i_score_mmx/255 + i_score_c;
}
#endif #endif
return i_score_mmx + i_score_c; /* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
                             const picture_t* p_pic_bot )
{
    /*
        Counts the pixels for which the comb metric exceeds T, over all
        planes. Returns -1 if the two pictures differ in plane count or in
        the number of visible lines of a plane. When MMXEXT support is
        compiled in and reported by vlc_CPU_MMXEXT(), the work is delegated
        to CalculateInterlaceScoreMMX().

        We use the comb metric from the IVTC filter of Transcode 1.1.5.
        This was found to work better for the particular purpose of IVTC
        than RenderX()'s comb metric.

        Note that we *must not* subsample at all in order to catch interlacing
        in telecined frames with localized motion (e.g. anime with characters
        talking, where only mouths move and everything else stays still.)
    */

    assert( p_pic_top != NULL );
    assert( p_pic_bot != NULL );

    if( p_pic_top->i_planes != p_pic_bot->i_planes )
        return -1;

#ifdef CAN_COMPILE_MMXEXT
    if (vlc_CPU_MMXEXT())
        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
#endif

    int32_t i_score = 0;

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
        /* Sanity check */
        if( p_pic_top->p[i_plane].i_visible_lines !=
            p_pic_bot->p[i_plane].i_visible_lines )
            return -1;

        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                             p_pic_bot->p[i_plane].i_visible_pitch );

        /* Transcode 1.1.5 only checks every other line. Checking every line
           works better for anime, which may contain horizontal,
           one pixel thick cartoon outlines.
        */
        for( int y = 1; y < i_lasty; ++y )
        {
            /* Current line / neighbouring lines picture pointers.
               Odd lines are read from the bottom picture with neighbours
               from the top picture; even lines the other way around. */
            const picture_t *cur = ( y % 2 ) ? p_pic_bot : p_pic_top;
            const picture_t *ngh = ( y % 2 ) ? p_pic_top : p_pic_bot;
            const int wc = cur->p[i_plane].i_pitch;
            const int wn = ngh->p[i_plane].i_pitch;

            const uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
            const uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
            const uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */

            for( int x = 0; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
                int_fast32_t C = p_c[x];
                int_fast32_t P = p_p[x];
                int_fast32_t N = p_n[x];

                /* Comments in Transcode's filter_ivtc.c attribute this
                   combing metric to Gunnar Thalin.

                   The idea is that if the picture is interlaced, both
                   expressions will have the same sign, and this comes
                   up positive. The value T = 100 has been chosen such
                   that a pixel difference of 10 (on average) will
                   trigger the detector.
                */
                int_fast32_t comb = (P - C) * (N - C);
                if( comb > T )
                    ++i_score;
            }
        }
    }

    return i_score;
}
#undef T #undef T
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment