Commit 4c983898 authored by Rémi Denis-Courmont

deinterlace: split MMXEXT acceleration for IVTC into separate functions

This causes some code duplication, but it is required for clobber lists.
parent b3683d44
......@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
p_dst->p_pixels += p_src->i_pitch;
}
#define T 10
/**
* Internal helper function for EstimateNumBlocksWithMotion():
* estimates whether there is motion in the given 8x8 block on one plane
......@@ -113,112 +114,44 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
* @param[in] p_pix_c Base pointer to the same block in current picture
* @param i_pitch_prev i_pitch of previous picture
* @param i_pitch_curr i_pitch of current picture
* @param b_mmx (vlc_CPU() & VLC_CPU_MMXEXT) or false.
* @param[out] pi_top 1 if top field of the block had motion, 0 if no
* @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
* @return 1 if the block had motion, 0 if no
* @see EstimateNumBlocksWithMotion()
*/
static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
int i_pitch_prev, int i_pitch_curr,
bool b_mmx,
int* pi_top, int* pi_bot )
static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
int i_pitch_prev, int i_pitch_curr,
int* pi_top, int* pi_bot )
{
/* Pixel luma/chroma difference threshold to detect motion. */
#define T 10
int32_t i_motion = 0;
int32_t i_top_motion = 0;
int32_t i_bot_motion = 0;
/* See below for the C version to see more quickly what this does. */
#ifdef CAN_COMPILE_MMXEXT
if( b_mmx )
{
static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
movq_m2r( bT, mm5 );
pxor_r2r( mm3, mm3 ); /* score (top field) */
pxor_r2r( mm4, mm4 ); /* score (bottom field) */
for( int y = 0; y < 8; y+=2 )
{
/* top field */
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
movq_r2r( mm0, mm2 );
psubusb_r2r( mm1, mm2 );
psubusb_r2r( mm0, mm1 );
pcmpgtb_r2r( mm5, mm2 );
pcmpgtb_r2r( mm5, mm1 );
psadbw_r2r( mm6, mm2 );
psadbw_r2r( mm6, mm1 );
paddd_r2r( mm2, mm1 );
paddd_r2r( mm1, mm3 ); /* add to top field score */
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
/* bottom field - handling identical to top field, except... */
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
movq_r2r( mm0, mm2 );
psubusb_r2r( mm1, mm2 );
psubusb_r2r( mm0, mm1 );
pcmpgtb_r2r( mm5, mm2 );
pcmpgtb_r2r( mm5, mm1 );
psadbw_r2r( mm6, mm2 );
psadbw_r2r( mm6, mm1 );
paddd_r2r( mm2, mm1 );
paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
movq_r2r( mm3, mm7 ); /* score (total) */
paddd_r2r( mm4, mm7 );
movd_r2m( mm3, i_top_motion );
movd_r2m( mm4, i_bot_motion );
movd_r2m( mm7, i_motion );
/* The loop counts actual score * 255. */
i_top_motion /= 255;
i_bot_motion /= 255;
i_motion /= 255;
emms();
}
else
#endif
for( int y = 0; y < 8; ++y )
{
for( int y = 0; y < 8; ++y )
uint8_t *pc = p_pix_c;
uint8_t *pp = p_pix_p;
int score = 0;
for( int x = 0; x < 8; ++x )
{
uint8_t *pc = p_pix_c;
uint8_t *pp = p_pix_p;
int score = 0;
for( int x = 0; x < 8; ++x )
{
int_fast16_t C = abs((*pc) - (*pp));
if( C > T )
++score;
int_fast16_t C = abs((*pc) - (*pp));
if( C > T )
++score;
++pc;
++pp;
}
++pc;
++pp;
}
i_motion += score;
if( y % 2 == 0 )
i_top_motion += score;
else
i_bot_motion += score;
i_motion += score;
if( y % 2 == 0 )
i_top_motion += score;
else
i_bot_motion += score;
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
/* Field motion thresholds.
......@@ -239,6 +172,79 @@ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
changes "enough". */
return (i_motion >= 8);
}
#ifdef CAN_COMPILE_MMXEXT
/**
 * MMXEXT variant of TestForMotionInBlock(): estimates whether there is
 * motion in one 8x8 block of a plane by counting the pixels whose absolute
 * difference between the previous and the current picture exceeds T.
 *
 * Rows are processed in pairs: the first row of each pair is accumulated
 * into the top field score, the second into the bottom field score.
 * Each row is read as a single 8-byte load from each picture.
 *
 * @param[in] p_pix_p Base pointer to the block in the previous picture
 * @param[in] p_pix_c Base pointer to the same block in the current picture
 * @param i_pitch_prev Byte offset added to p_pix_p to reach the next row
 * @param i_pitch_curr Byte offset added to p_pix_c to reach the next row
 * @param[out] pi_top 1 if the top field score reached 8, 0 otherwise
 * @param[out] pi_bot 1 if the bottom field score reached 8, 0 otherwise
 * @return 1 if the total score of the block reached 8, 0 otherwise
 *
 * NOTE(review): VLC_MMX presumably enables MMX code generation for this
 * function only - confirm against its definition.
 */
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
int i_pitch_prev, int i_pitch_curr,
int* pi_top, int* pi_bot )
{
/* 32-bit on purpose: these receive the result of 32-bit movd stores. */
int32_t i_motion = 0;
int32_t i_top_motion = 0;
int32_t i_bot_motion = 0;
/* Threshold T replicated into all 8 byte lanes. */
static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
/* Register usage for the whole function:
   mm0-mm2 scratch, mm3 top field score, mm4 bottom field score,
   mm5 threshold, mm6 zero, mm7 total score. */
pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
movq_m2r( bT, mm5 ); /* per-byte threshold */
pxor_r2r( mm3, mm3 ); /* score (top field) */
pxor_r2r( mm4, mm4 ); /* score (bottom field) */
/* Four iterations, each consuming one top field row and one bottom
   field row, i.e. 8 rows in total. */
for( int y = 0; y < 8; y+=2 )
{
/* top field */
movq_m2r( *((uint64_t*)p_pix_c), mm0 ); /* 8 pixels, current picture */
movq_m2r( *((uint64_t*)p_pix_p), mm1 ); /* 8 pixels, previous picture */
movq_r2r( mm0, mm2 );
/* Two saturating subtractions, one per direction: for every byte at
   most one of them is non-zero, and it holds |current - previous|. */
psubusb_r2r( mm1, mm2 ); /* mm2 = current - previous */
psubusb_r2r( mm0, mm1 ); /* mm1 = previous - current */
/* Turn each byte that exceeds the threshold into 0xFF, others into 0.
   NOTE(review): pcmpgtb is a signed byte compare, so differences above
   127 may not register as exceeding T - confirm whether this divergence
   from the C version matters. */
pcmpgtb_r2r( mm5, mm2 );
pcmpgtb_r2r( mm5, mm1 );
/* Sum the 8 mask bytes of each register (SAD against zero), giving
   255 times the number of pixels over the threshold. */
psadbw_r2r( mm6, mm2 );
psadbw_r2r( mm6, mm1 );
paddd_r2r( mm2, mm1 ); /* combine both directions */
paddd_r2r( mm1, mm3 ); /* add to top field score */
/* Step both pictures to the next row. */
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
/* bottom field - handling identical to top field, except... */
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
movq_r2r( mm0, mm2 );
psubusb_r2r( mm1, mm2 );
psubusb_r2r( mm0, mm1 );
pcmpgtb_r2r( mm5, mm2 );
pcmpgtb_r2r( mm5, mm1 );
psadbw_r2r( mm6, mm2 );
psadbw_r2r( mm6, mm1 );
paddd_r2r( mm2, mm1 );
paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
movq_r2r( mm3, mm7 ); /* score (total) */
paddd_r2r( mm4, mm7 ); /* total = top field + bottom field */
/* Copy the three scores out of the MMX registers. */
movd_r2m( mm3, i_top_motion );
movd_r2m( mm4, i_bot_motion );
movd_r2m( mm7, i_motion );
/* The loop counts actual score * 255. */
i_top_motion /= 255;
i_bot_motion /= 255;
i_motion /= 255;
/* Done with the MMX registers.
   NOTE(review): emms() presumably resets the MMX state so that later
   floating point code works - confirm against its definition. */
emms();
/* A field counts as moving when at least 8 of its 32 pixels changed;
   the block counts as moving when at least 8 of its 64 pixels changed. */
(*pi_top) = ( i_top_motion >= 8 );
(*pi_bot) = ( i_bot_motion >= 8 );
return (i_motion >= 8);
}
#endif
#undef T
/*****************************************************************************
......@@ -386,11 +392,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
if( p_prev->i_planes != p_curr->i_planes )
return -1;
int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
TestForMotionInBlock;
/* We must tell our inline helper whether to use MMX acceleration. */
#ifdef CAN_COMPILE_MMXEXT
const bool b_mmx = vlc_CPU_MMXEXT();
#else
const bool b_mmx = false;
if (vlc_CPU_MMXEXT())
motion_in_block = TestForMotionInBlockMMX;
#endif
int i_score = 0;
......@@ -419,10 +426,9 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
for( int bx = 0; bx < i_mbx; ++bx )
{
int i_top_temp, i_bot_temp;
i_score += TestForMotionInBlock( p_pix_p, p_pix_c,
i_pitch_prev, i_pitch_curr,
b_mmx,
&i_top_temp, &i_bot_temp );
i_score += motion_in_block( p_pix_p, p_pix_c,
i_pitch_prev, i_pitch_curr,
&i_top_temp, &i_bot_temp );
i_score_top += i_top_temp;
i_score_bot += i_bot_temp;
......@@ -440,39 +446,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
return i_score;
}
/* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
{
/*
We use the comb metric from the IVTC filter of Transcode 1.1.5.
This was found to work better for the particular purpose of IVTC
than RenderX()'s comb metric.
Note that we *must not* subsample at all in order to catch interlacing
in telecined frames with localized motion (e.g. anime with characters
talking, where only mouths move and everything else stays still.)
*/
assert( p_pic_top != NULL );
assert( p_pic_bot != NULL );
/* Threshold (value from Transcode 1.1.5) */
#define T 100
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
{
assert( p_pic_top->i_planes == p_pic_bot->i_planes );
/* Amount of bits must be known for MMX, thus int32_t.
Doesn't hurt the C implementation. */
int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
#ifdef CAN_COMPILE_MMXEXT
# ifndef __SSE__
const unsigned u_cpu = vlc_CPU();
if( u_cpu & VLC_CPU_MMXEXT )
# endif
pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
#endif
pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
{
......@@ -505,60 +494,52 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
int x = 0;
/* Threshold (value from Transcode 1.1.5) */
#define T 100
#ifdef CAN_COMPILE_MMXEXT
/* Easy-to-read C version further below.
Assumptions: 0 < T < 127
# of pixels < (2^32)/255
Note: calculates score * 255
*/
# ifndef __SSE__
if( u_cpu & VLC_CPU_MMXEXT )
# endif
{
static const mmx_t b0 = { .uq = 0x0000000000000000ULL };
static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
static const mmx_t b0 = { .uq = 0x0000000000000000ULL };
static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
for( ; x < w8; x += 8 )
{
movq_m2r( *((int64_t*)p_c), mm0 );
movq_m2r( *((int64_t*)p_p), mm1 );
movq_m2r( *((int64_t*)p_n), mm2 );
psubb_m2r( b128, mm0 );
psubb_m2r( b128, mm1 );
psubb_m2r( b128, mm2 );
psubsb_r2r( mm0, mm1 );
psubsb_r2r( mm0, mm2 );
pxor_r2r( mm3, mm3 );
pxor_r2r( mm4, mm4 );
pxor_r2r( mm5, mm5 );
pxor_r2r( mm6, mm6 );
punpcklbw_r2r( mm1, mm3 );
punpcklbw_r2r( mm2, mm4 );
punpckhbw_r2r( mm1, mm5 );
punpckhbw_r2r( mm2, mm6 );
pmulhw_r2r( mm3, mm4 );
pmulhw_r2r( mm5, mm6 );
packsswb_r2r(mm4, mm6);
pcmpgtb_m2r( bT, mm6 );
psadbw_m2r( b0, mm6 );
paddd_r2r( mm6, mm7 );
p_c += 8;
p_p += 8;
p_n += 8;
}
for( ; x < w8; x += 8 )
{
movq_m2r( *((int64_t*)p_c), mm0 );
movq_m2r( *((int64_t*)p_p), mm1 );
movq_m2r( *((int64_t*)p_n), mm2 );
psubb_m2r( b128, mm0 );
psubb_m2r( b128, mm1 );
psubb_m2r( b128, mm2 );
psubsb_r2r( mm0, mm1 );
psubsb_r2r( mm0, mm2 );
pxor_r2r( mm3, mm3 );
pxor_r2r( mm4, mm4 );
pxor_r2r( mm5, mm5 );
pxor_r2r( mm6, mm6 );
punpcklbw_r2r( mm1, mm3 );
punpcklbw_r2r( mm2, mm4 );
punpckhbw_r2r( mm1, mm5 );
punpckhbw_r2r( mm2, mm6 );
pmulhw_r2r( mm3, mm4 );
pmulhw_r2r( mm5, mm6 );
packsswb_r2r(mm4, mm6);
pcmpgtb_m2r( bT, mm6 );
psadbw_m2r( b0, mm6 );
paddd_r2r( mm6, mm7 );
p_c += 8;
p_p += 8;
p_n += 8;
}
#endif
for( ; x < w; ++x )
{
/* Worst case: need 17 bits for "comb". */
......@@ -594,17 +575,102 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
}
}
movd_r2m( mm7, i_score_mmx );
emms();
return i_score_mmx/255 + i_score_c;
}
#endif
/* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
{
/*
We use the comb metric from the IVTC filter of Transcode 1.1.5.
This was found to work better for the particular purpose of IVTC
than RenderX()'s comb metric.
Note that we *must not* subsample at all in order to catch interlacing
in telecined frames with localized motion (e.g. anime with characters
talking, where only mouths move and everything else stays still.)
*/
assert( p_pic_top != NULL );
assert( p_pic_bot != NULL );
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
#ifdef CAN_COMPILE_MMXEXT
# ifndef __SSE__
if( u_cpu & VLC_CPU_MMXEXT )
# endif
if (vlc_CPU_MMXEXT())
return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
#endif
int32_t i_score = 0;
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
{
movd_r2m( mm7, i_score_mmx );
emms();
i_score_mmx /= 255;
/* Sanity check */
if( p_pic_top->p[i_plane].i_visible_lines !=
p_pic_bot->p[i_plane].i_visible_lines )
return -1;
const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
p_pic_bot->p[i_plane].i_visible_pitch );
/* Current line / neighbouring lines picture pointers */
const picture_t *cur = p_pic_bot;
const picture_t *ngh = p_pic_top;
int wc = cur->p[i_plane].i_pitch;
int wn = ngh->p[i_plane].i_pitch;
/* Transcode 1.1.5 only checks every other line. Checking every line
works better for anime, which may contain horizontal,
one pixel thick cartoon outlines.
*/
for( int y = 1; y < i_lasty; ++y )
{
uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */
uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
for( int x = 0; x < w; ++x )
{
/* Worst case: need 17 bits for "comb". */
int_fast32_t C = *p_c;
int_fast32_t P = *p_p;
int_fast32_t N = *p_n;
/* Comments in Transcode's filter_ivtc.c attribute this
combing metric to Gunnar Thalin.
The idea is that if the picture is interlaced, both
expressions will have the same sign, and this comes
up positive. The value T = 100 has been chosen such
that a pixel difference of 10 (on average) will
trigger the detector.
*/
int_fast32_t comb = (P - C) * (N - C);
if( comb > T )
++i_score;
++p_c;
++p_p;
++p_n;
}
/* Now the other field - swap current and neighbour pictures */
const picture_t *tmp = cur;
cur = ngh;
ngh = tmp;
int tmp_pitch = wc;
wc = wn;
wn = tmp_pitch;
}
}
#endif
return i_score_mmx + i_score_c;
return i_score;
}
#undef T
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment