deinterlace: split MMXEXT acceleration for IVTC to separate functions

This introduces some code duplication, but it is required for the clobber lists.

deinterlace: split MMXEXT acceleration for IVTC to separate functions
This introduces some code duplication, but it is required for the clobber lists.
4c983898 · Rémi Denis-Courmont · b3683d44 · 4c983898
Commit 4c983898 authored Dec 17, 2012 by Rémi Denis-Courmont
Hide whitespace changes
Inline Side-by-side

Showing with 242 additions and 176 deletions

modules/video_filter/deinterlace/helpers.c modules/video_filter/deinterlace/helpers.c +242 -176

No files found.
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
        p_dst->p_pixels += p_src->i_pitch;
 }

+#define T 10
 /**
 * Internal helper function for EstimateNumBlocksWithMotion():
 * estimates whether there is motion in the given 8x8 block on one plane
@@ -113,112 +114,44 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
 * @param[in] p_pix_c Base pointer to the same block in current picture
 * @param i_pitch_prev i_pitch of previous picture
 * @param i_pitch_curr i_pitch of current picture
- * @param b_mmx (vlc_CPU() & VLC_CPU_MMXEXT) or false.
 * @param[out] pi_top 1 if top field of the block had motion, 0 if no
 * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
 * @return 1 if the block had motion, 0 if no
 * @see EstimateNumBlocksWithMotion()
 */
-static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
-                                        int i_pitch_prev, int i_pitch_curr,
-                                        bool b_mmx,
-                                        int* pi_top, int* pi_bot )
+static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
+                                 int i_pitch_prev, int i_pitch_curr,
+                                 int* pi_top, int* pi_bot )
 {
 /* Pixel luma/chroma difference threshold to detect motion. */
-#define T 10

    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

-/* See below for the C version to see more quickly what this does. */
-#ifdef CAN_COMPILE_MMXEXT
-    if( b_mmx )
-    {
-        static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
-        pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
-        movq_m2r( bT,  mm5 );
-
-        pxor_r2r( mm3, mm3 ); /* score (top field) */
-        pxor_r2r( mm4, mm4 ); /* score (bottom field) */
-        for( int y = 0; y < 8; y+=2 )
-        {
-            /* top field */
-            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-            movq_r2r( mm0, mm2 );
-            psubusb_r2r( mm1, mm2 );
-            psubusb_r2r( mm0, mm1 );
-
-            pcmpgtb_r2r( mm5, mm2 );
-            pcmpgtb_r2r( mm5, mm1 );
-            psadbw_r2r(  mm6, mm2 );
-            psadbw_r2r(  mm6, mm1 );
-
-            paddd_r2r( mm2, mm1 );
-            paddd_r2r( mm1, mm3 ); /* add to top field score */
-
-            p_pix_c += i_pitch_curr;
-            p_pix_p += i_pitch_prev;
-
-            /* bottom field - handling identical to top field, except... */
-            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-            movq_r2r( mm0, mm2 );
-            psubusb_r2r( mm1, mm2 );
-            psubusb_r2r( mm0, mm1 );
-
-            pcmpgtb_r2r( mm5, mm2 );
-            pcmpgtb_r2r( mm5, mm1 );
-            psadbw_r2r(  mm6, mm2 );
-            psadbw_r2r(  mm6, mm1 );
-
-            paddd_r2r( mm2, mm1 );
-            paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
-
-            p_pix_c += i_pitch_curr;
-            p_pix_p += i_pitch_prev;
-        }
-        movq_r2r(  mm3, mm7 ); /* score (total) */
-        paddd_r2r( mm4, mm7 );
-        movd_r2m( mm3, i_top_motion );
-        movd_r2m( mm4, i_bot_motion );
-        movd_r2m( mm7, i_motion );
-
-        /* The loop counts actual score * 255. */
-        i_top_motion /= 255;
-        i_bot_motion /= 255;
-        i_motion     /= 255;
-
-        emms();
-    }
-    else
-#endif
+    for( int y = 0; y < 8; ++y )
    {
-        for( int y = 0; y < 8; ++y )
+        uint8_t *pc = p_pix_c;
+        uint8_t *pp = p_pix_p;
+        int score = 0;
+        for( int x = 0; x < 8; ++x )
        {
-            uint8_t *pc = p_pix_c;
-            uint8_t *pp = p_pix_p;
-            int score = 0;
-            for( int x = 0; x < 8; ++x )
-            {
-                int_fast16_t C = abs((*pc) - (*pp));
-                if( C > T )
-                    ++score;
+            int_fast16_t C = abs((*pc) - (*pp));
+            if( C > T )
+                ++score;

-                ++pc;
-                ++pp;
-            }
+            ++pc;
+            ++pp;
+        }

-            i_motion += score;
-            if( y % 2 == 0 )
-                i_top_motion += score;
-            else
-                i_bot_motion += score;
+        i_motion += score;
+        if( y % 2 == 0 )
+            i_top_motion += score;
+        else
+            i_bot_motion += score;

-            p_pix_c += i_pitch_curr;
-            p_pix_p += i_pitch_prev;
-        }
+        p_pix_c += i_pitch_curr;
+        p_pix_p += i_pitch_prev;
    }

    /* Field motion thresholds.
@@ -239,6 +172,79 @@ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
       changes "enough". */
    return (i_motion >= 8);
 }
+
+#ifdef CAN_COMPILE_MMXEXT
+VLC_MMX
+static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
+                                    int i_pitch_prev, int i_pitch_curr,
+                                    int* pi_top, int* pi_bot )
+{
+    int32_t i_motion = 0;
+    int32_t i_top_motion = 0;
+    int32_t i_bot_motion = 0;
+
+    static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
+    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
+    movq_m2r( bT,  mm5 );
+
+    pxor_r2r( mm3, mm3 ); /* score (top field) */
+    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
+    for( int y = 0; y < 8; y+=2 )
+    {
+        /* top field */
+        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
+        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
+        movq_r2r( mm0, mm2 );
+        psubusb_r2r( mm1, mm2 );
+        psubusb_r2r( mm0, mm1 );
+
+        pcmpgtb_r2r( mm5, mm2 );
+        pcmpgtb_r2r( mm5, mm1 );
+        psadbw_r2r(  mm6, mm2 );
+        psadbw_r2r(  mm6, mm1 );
+
+        paddd_r2r( mm2, mm1 );
+        paddd_r2r( mm1, mm3 ); /* add to top field score */
+
+        p_pix_c += i_pitch_curr;
+        p_pix_p += i_pitch_prev;
+
+        /* bottom field - handling identical to top field, except... */
+        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
+        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
+        movq_r2r( mm0, mm2 );
+        psubusb_r2r( mm1, mm2 );
+        psubusb_r2r( mm0, mm1 );
+
+        pcmpgtb_r2r( mm5, mm2 );
+        pcmpgtb_r2r( mm5, mm1 );
+        psadbw_r2r(  mm6, mm2 );
+        psadbw_r2r(  mm6, mm1 );
+
+        paddd_r2r( mm2, mm1 );
+        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
+
+        p_pix_c += i_pitch_curr;
+        p_pix_p += i_pitch_prev;
+    }
+    movq_r2r(  mm3, mm7 ); /* score (total) */
+    paddd_r2r( mm4, mm7 );
+    movd_r2m( mm3, i_top_motion );
+    movd_r2m( mm4, i_bot_motion );
+    movd_r2m( mm7, i_motion );
+
+    /* The loop counts actual score * 255. */
+    i_top_motion /= 255;
+    i_bot_motion /= 255;
+    i_motion     /= 255;
+
+    emms();
+
+    (*pi_top) = ( i_top_motion >= 8 );
+    (*pi_bot) = ( i_bot_motion >= 8 );
+    return (i_motion >= 8);
+}
+#endif
 #undef T

 /*****************************************************************************
@@ -386,11 +392,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
    if( p_prev->i_planes != p_curr->i_planes )
        return -1;

+    int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
+        TestForMotionInBlock;
    /* We must tell our inline helper whether to use MMX acceleration. */
 #ifdef CAN_COMPILE_MMXEXT
-    const bool b_mmx = vlc_CPU_MMXEXT();
-#else
-    const bool b_mmx = false;
+    if (vlc_CPU_MMXEXT())
+        motion_in_block = TestForMotionInBlockMMX;
 #endif

    int i_score = 0;
@@ -419,10 +426,9 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
            for( int bx = 0; bx < i_mbx; ++bx )
            {
                int i_top_temp, i_bot_temp;
-                i_score += TestForMotionInBlock( p_pix_p, p_pix_c,
-                                                 i_pitch_prev, i_pitch_curr,
-                                                 b_mmx,
-                                                 &i_top_temp, &i_bot_temp );
+                i_score += motion_in_block( p_pix_p, p_pix_c,
+                                            i_pitch_prev, i_pitch_curr,
+                                            &i_top_temp, &i_bot_temp );
                i_score_top += i_top_temp;
                i_score_bot += i_bot_temp;

@@ -440,39 +446,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
    return i_score;
 }

-/* See header for function doc. */
-int CalculateInterlaceScore( const picture_t* p_pic_top,
-                             const picture_t* p_pic_bot )
-{
-    /*
-        We use the comb metric from the IVTC filter of Transcode 1.1.5.
-        This was found to work better for the particular purpose of IVTC
-        than RenderX()'s comb metric.
-
-        Note that we *must not* subsample at all in order to catch interlacing
-        in telecined frames with localized motion (e.g. anime with characters
-        talking, where only mouths move and everything else stays still.)
-    */
-
-    assert( p_pic_top != NULL );
-    assert( p_pic_bot != NULL );
+/* Threshold (value from Transcode 1.1.5) */
+#define T 100

-    if( p_pic_top->i_planes != p_pic_bot->i_planes )
-        return -1;
+#ifdef CAN_COMPILE_MMXEXT
+VLC_MMX
+static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
+                                       const picture_t* p_pic_bot )
+{
+    assert( p_pic_top->i_planes == p_pic_bot->i_planes );

    /* Amount of bits must be known for MMX, thus int32_t.
       Doesn't hurt the C implementation. */
    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */

-#ifdef CAN_COMPILE_MMXEXT
-# ifndef __SSE__
-    const unsigned u_cpu = vlc_CPU();
-
-    if( u_cpu & VLC_CPU_MMXEXT )
-# endif
-        pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
-#endif
+    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
@@ -505,60 +494,52 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,

            int x = 0;

-/* Threshold (value from Transcode 1.1.5) */
-#define T 100
-#ifdef CAN_COMPILE_MMXEXT
            /* Easy-to-read C version further below.

               Assumptions: 0 < T < 127
                            # of pixels < (2^32)/255
               Note: calculates score * 255
            */
-# ifndef __SSE__
-            if( u_cpu & VLC_CPU_MMXEXT )
-# endif
-            {
-                static const mmx_t b0   = { .uq = 0x0000000000000000ULL };
-                static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
-                static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
+            static const mmx_t b0   = { .uq = 0x0000000000000000ULL };
+            static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
+            static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };

-                for( ; x < w8; x += 8 )
-                {
-                    movq_m2r( *((int64_t*)p_c), mm0 );
-                    movq_m2r( *((int64_t*)p_p), mm1 );
-                    movq_m2r( *((int64_t*)p_n), mm2 );
-
-                    psubb_m2r( b128, mm0 );
-                    psubb_m2r( b128, mm1 );
-                    psubb_m2r( b128, mm2 );
-
-                    psubsb_r2r( mm0, mm1 );
-                    psubsb_r2r( mm0, mm2 );
-
-                    pxor_r2r( mm3, mm3 );
-                    pxor_r2r( mm4, mm4 );
-                    pxor_r2r( mm5, mm5 );
-                    pxor_r2r( mm6, mm6 );
-
-                    punpcklbw_r2r( mm1, mm3 );
-                    punpcklbw_r2r( mm2, mm4 );
-                    punpckhbw_r2r( mm1, mm5 );
-                    punpckhbw_r2r( mm2, mm6 );
-
-                    pmulhw_r2r( mm3, mm4 );
-                    pmulhw_r2r( mm5, mm6 );
-
-                    packsswb_r2r(mm4, mm6);
-                    pcmpgtb_m2r( bT, mm6 );
-                    psadbw_m2r( b0, mm6 );
-                    paddd_r2r( mm6, mm7 );
-
-                    p_c += 8;
-                    p_p += 8;
-                    p_n += 8;
-                }
+            for( ; x < w8; x += 8 )
+            {
+                movq_m2r( *((int64_t*)p_c), mm0 );
+                movq_m2r( *((int64_t*)p_p), mm1 );
+                movq_m2r( *((int64_t*)p_n), mm2 );
+
+                psubb_m2r( b128, mm0 );
+                psubb_m2r( b128, mm1 );
+                psubb_m2r( b128, mm2 );
+
+                psubsb_r2r( mm0, mm1 );
+                psubsb_r2r( mm0, mm2 );
+
+                pxor_r2r( mm3, mm3 );
+                pxor_r2r( mm4, mm4 );
+                pxor_r2r( mm5, mm5 );
+                pxor_r2r( mm6, mm6 );
+
+                punpcklbw_r2r( mm1, mm3 );
+                punpcklbw_r2r( mm2, mm4 );
+                punpckhbw_r2r( mm1, mm5 );
+                punpckhbw_r2r( mm2, mm6 );
+
+                pmulhw_r2r( mm3, mm4 );
+                pmulhw_r2r( mm5, mm6 );
+
+                packsswb_r2r(mm4, mm6);
+                pcmpgtb_m2r( bT, mm6 );
+                psadbw_m2r( b0, mm6 );
+                paddd_r2r( mm6, mm7 );
+
+                p_c += 8;
+                p_p += 8;
+                p_n += 8;
            }
-#endif
+
            for( ; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
@@ -594,17 +575,102 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
        }
    }

+    movd_r2m( mm7, i_score_mmx );
+    emms();
+
+    return i_score_mmx/255 + i_score_c;
+}
+#endif
+
+/* See header for function doc. */
+int CalculateInterlaceScore( const picture_t* p_pic_top,
+                             const picture_t* p_pic_bot )
+{
+    /*
+        We use the comb metric from the IVTC filter of Transcode 1.1.5.
+        This was found to work better for the particular purpose of IVTC
+        than RenderX()'s comb metric.
+
+        Note that we *must not* subsample at all in order to catch interlacing
+        in telecined frames with localized motion (e.g. anime with characters
+        talking, where only mouths move and everything else stays still.)
+    */
+
+    assert( p_pic_top != NULL );
+    assert( p_pic_bot != NULL );
+
+    if( p_pic_top->i_planes != p_pic_bot->i_planes )
+        return -1;
+
 #ifdef CAN_COMPILE_MMXEXT
-# ifndef __SSE__
-    if( u_cpu & VLC_CPU_MMXEXT )
-# endif
+    if (vlc_CPU_MMXEXT())
+        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
+#endif
+
+    int32_t i_score = 0;
+
+    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
-        movd_r2m( mm7, i_score_mmx );
-        emms();
-        i_score_mmx /= 255;
+        /* Sanity check */
+        if( p_pic_top->p[i_plane].i_visible_lines !=
+            p_pic_bot->p[i_plane].i_visible_lines )
+            return -1;
+
+        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
+        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
+                             p_pic_bot->p[i_plane].i_visible_pitch );
+
+        /* Current line / neighbouring lines picture pointers */
+        const picture_t *cur = p_pic_bot;
+        const picture_t *ngh = p_pic_top;
+        int wc = cur->p[i_plane].i_pitch;
+        int wn = ngh->p[i_plane].i_pitch;
+
+        /* Transcode 1.1.5 only checks every other line. Checking every line
+           works better for anime, which may contain horizontal,
+           one pixel thick cartoon outlines.
+        */
+        for( int y = 1; y < i_lasty; ++y )
+        {
+            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
+            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
+            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
+
+            for( int x = 0; x < w; ++x )
+            {
+                /* Worst case: need 17 bits for "comb". */
+                int_fast32_t C = *p_c;
+                int_fast32_t P = *p_p;
+                int_fast32_t N = *p_n;
+
+                /* Comments in Transcode's filter_ivtc.c attribute this
+                   combing metric to Gunnar Thalin.
+
+                    The idea is that if the picture is interlaced, both
+                    expressions will have the same sign, and this comes
+                    up positive. The value T = 100 has been chosen such
+                    that a pixel difference of 10 (on average) will
+                    trigger the detector.
+                */
+                int_fast32_t comb = (P - C) * (N - C);
+                if( comb > T )
+                    ++i_score;
+
+                ++p_c;
+                ++p_p;
+                ++p_n;
+            }
+
+            /* Now the other field - swap current and neighbour pictures */
+            const picture_t *tmp = cur;
+            cur = ngh;
+            ngh = tmp;
+            int tmp_pitch = wc;
+            wc = wn;
+            wn = tmp_pitch;
+        }
    }
-#endif

-    return i_score_mmx + i_score_c;
+    return i_score;
 }
 #undef T