deinterlace: split MMXEXT acceleration for IVTC to separate functions

This causes some code duplication, but it is required for the clobber lists.

deinterlace: split MMXEXT acceleration for IVTC to separate functions
This causes some code duplication, but it is required for the clobber lists.
4c983898 · Rémi Denis-Courmont · b3683d44 · 4c983898
Commit 4c983898 authored Dec 17, 2012 by Rémi Denis-Courmont
Show whitespace changes
Inline Side-by-side

Showing with 242 additions and 176 deletions

modules/video_filter/deinterlace/helpers.c modules/video_filter/deinterlace/helpers.c +242 -176

No files found.
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
        p_dst->p_pixels += p_src->i_pitch;
 }

+#define T 10
 /**
 * Internal helper function for EstimateNumBlocksWithMotion():
 * estimates whether there is motion in the given 8x8 block on one plane
@@ -113,28 +114,75 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
 * @param[in] p_pix_c Base pointer to the same block in current picture
 * @param i_pitch_prev i_pitch of previous picture
 * @param i_pitch_curr i_pitch of current picture
- * @param b_mmx (vlc_CPU() & VLC_CPU_MMXEXT) or false.
 * @param[out] pi_top 1 if top field of the block had motion, 0 if no
 * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
 * @return 1 if the block had motion, 0 if no
 * @see EstimateNumBlocksWithMotion()
 */
-static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
+static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                 int i_pitch_prev, int i_pitch_curr,
-                                        bool b_mmx,
                                 int* pi_top, int* pi_bot )
 {
 /* Pixel luma/chroma difference threshold to detect motion. */
-#define T 10

    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

-/* See below for the C version to see more quickly what this does. */
-#ifdef CAN_COMPILE_MMXEXT
-    if( b_mmx )
+    for( int y = 0; y < 8; ++y )
+    {
+        uint8_t *pc = p_pix_c;
+        uint8_t *pp = p_pix_p;
+        int score = 0;
+        for( int x = 0; x < 8; ++x )
        {
+            int_fast16_t C = abs((*pc) - (*pp));
+            if( C > T )
+                ++score;
+
+            ++pc;
+            ++pp;
+        }
+
+        i_motion += score;
+        if( y % 2 == 0 )
+            i_top_motion += score;
+        else
+            i_bot_motion += score;
+
+        p_pix_c += i_pitch_curr;
+        p_pix_p += i_pitch_prev;
+    }
+
+    /* Field motion thresholds.
+
+       Empirical value - works better in practice than the "4" that
+       would be consistent with the full-block threshold.
+
+       Especially the opening scene of The Third ep. 1 (just after the OP)
+       works better with this. It also fixes some talking scenes in
+       Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
+       leading to more interlacing artifacts than by just using the emergency
+       mode frame composer.
+    */
+    (*pi_top) = ( i_top_motion >= 8 );
+    (*pi_bot) = ( i_bot_motion >= 8 );
+
+    /* Full-block threshold = (8*8)/8: motion is detected if 1/8 of the block
+       changes "enough". */
+    return (i_motion >= 8);
+}
+
+#ifdef CAN_COMPILE_MMXEXT
+VLC_MMX
+static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
+                                    int i_pitch_prev, int i_pitch_curr,
+                                    int* pi_top, int* pi_bot )
+{
+    int32_t i_motion = 0;
+    int32_t i_top_motion = 0;
+    int32_t i_bot_motion = 0;
+
    static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
    movq_m2r( bT,  mm5 );
@@ -191,54 +239,12 @@ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
    i_motion     /= 255;

    emms();
-    }
-    else
-#endif
-    {
-        for( int y = 0; y < 8; ++y )
-        {
-            uint8_t *pc = p_pix_c;
-            uint8_t *pp = p_pix_p;
-            int score = 0;
-            for( int x = 0; x < 8; ++x )
-            {
-                int_fast16_t C = abs((*pc) - (*pp));
-                if( C > T )
-                    ++score;
-
-                ++pc;
-                ++pp;
-            }
-
-            i_motion += score;
-            if( y % 2 == 0 )
-                i_top_motion += score;
-            else
-                i_bot_motion += score;
-
-            p_pix_c += i_pitch_curr;
-            p_pix_p += i_pitch_prev;
-        }
-    }

-    /* Field motion thresholds.
-
-       Empirical value - works better in practice than the "4" that
-       would be consistent with the full-block threshold.
-
-       Especially the opening scene of The Third ep. 1 (just after the OP)
-       works better with this. It also fixes some talking scenes in
-       Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
-       leading to more interlacing artifacts than by just using the emergency
-       mode frame composer.
-    */
    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );
-
-    /* Full-block threshold = (8*8)/8: motion is detected if 1/8 of the block
-       changes "enough". */
    return (i_motion >= 8);
 }
+#endif
 #undef T

 /*****************************************************************************
@@ -386,11 +392,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
    if( p_prev->i_planes != p_curr->i_planes )
        return -1;

+    int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
+        TestForMotionInBlock;
    /* We must tell our inline helper whether to use MMX acceleration. */
 #ifdef CAN_COMPILE_MMXEXT
-    const bool b_mmx = vlc_CPU_MMXEXT();
-#else
-    const bool b_mmx = false;
+    if (vlc_CPU_MMXEXT())
+        motion_in_block = TestForMotionInBlockMMX;
 #endif

    int i_score = 0;
@@ -419,9 +426,8 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
            for( int bx = 0; bx < i_mbx; ++bx )
            {
                int i_top_temp, i_bot_temp;
-                i_score += TestForMotionInBlock( p_pix_p, p_pix_c,
+                i_score += motion_in_block( p_pix_p, p_pix_c,
                                            i_pitch_prev, i_pitch_curr,
-                                                 b_mmx,
                                            &i_top_temp, &i_bot_temp );
                i_score_top += i_top_temp;
                i_score_bot += i_bot_temp;
@@ -440,39 +446,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
    return i_score;
 }

-/* See header for function doc. */
-int CalculateInterlaceScore( const picture_t* p_pic_top,
+/* Threshold (value from Transcode 1.1.5) */
+#define T 100
+
+#ifdef CAN_COMPILE_MMXEXT
+VLC_MMX
+static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
                                       const picture_t* p_pic_bot )
 {
-    /*
-        We use the comb metric from the IVTC filter of Transcode 1.1.5.
-        This was found to work better for the particular purpose of IVTC
-        than RenderX()'s comb metric.
-
-        Note that we *must not* subsample at all in order to catch interlacing
-        in telecined frames with localized motion (e.g. anime with characters
-        talking, where only mouths move and everything else stays still.)
-    */
-
-    assert( p_pic_top != NULL );
-    assert( p_pic_bot != NULL );
-
-    if( p_pic_top->i_planes != p_pic_bot->i_planes )
-        return -1;
+    assert( p_pic_top->i_planes == p_pic_bot->i_planes );

    /* Amount of bits must be known for MMX, thus int32_t.
       Doesn't hurt the C implementation. */
    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */

-#ifdef CAN_COMPILE_MMXEXT
-# ifndef __SSE__
-    const unsigned u_cpu = vlc_CPU();
-
-    if( u_cpu & VLC_CPU_MMXEXT )
-# endif
    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
-#endif

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
@@ -505,19 +494,12 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,

            int x = 0;

-/* Threshold (value from Transcode 1.1.5) */
-#define T 100
-#ifdef CAN_COMPILE_MMXEXT
            /* Easy-to-read C version further below.

               Assumptions: 0 < T < 127
                            # of pixels < (2^32)/255
               Note: calculates score * 255
            */
-# ifndef __SSE__
-            if( u_cpu & VLC_CPU_MMXEXT )
-# endif
-            {
            static const mmx_t b0   = { .uq = 0x0000000000000000ULL };
            static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
            static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
@@ -557,8 +539,7 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
                p_p += 8;
                p_n += 8;
            }
-            }
-#endif
+
            for( ; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
@@ -594,17 +575,102 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
        }
    }

-#ifdef CAN_COMPILE_MMXEXT
-# ifndef __SSE__
-    if( u_cpu & VLC_CPU_MMXEXT )
-# endif
-    {
    movd_r2m( mm7, i_score_mmx );
    emms();
-        i_score_mmx /= 255;
-    }
+
+    return i_score_mmx/255 + i_score_c;
+}
 #endif

-    return i_score_mmx + i_score_c;
+/* See header for function doc. */
+int CalculateInterlaceScore( const picture_t* p_pic_top,
+                             const picture_t* p_pic_bot )
+{
+    /*
+        We use the comb metric from the IVTC filter of Transcode 1.1.5.
+        This was found to work better for the particular purpose of IVTC
+        than RenderX()'s comb metric.
+
+        Note that we *must not* subsample at all in order to catch interlacing
+        in telecined frames with localized motion (e.g. anime with characters
+        talking, where only mouths move and everything else stays still.)
+    */
+
+    assert( p_pic_top != NULL );
+    assert( p_pic_bot != NULL );
+
+    if( p_pic_top->i_planes != p_pic_bot->i_planes )
+        return -1;
+
+#ifdef CAN_COMPILE_MMXEXT
+    if (vlc_CPU_MMXEXT())
+        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
+#endif
+
+    int32_t i_score = 0;
+
+    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
+    {
+        /* Sanity check */
+        if( p_pic_top->p[i_plane].i_visible_lines !=
+            p_pic_bot->p[i_plane].i_visible_lines )
+            return -1;
+
+        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
+        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
+                             p_pic_bot->p[i_plane].i_visible_pitch );
+
+        /* Current line / neighbouring lines picture pointers */
+        const picture_t *cur = p_pic_bot;
+        const picture_t *ngh = p_pic_top;
+        int wc = cur->p[i_plane].i_pitch;
+        int wn = ngh->p[i_plane].i_pitch;
+
+        /* Transcode 1.1.5 only checks every other line. Checking every line
+           works better for anime, which may contain horizontal,
+           one pixel thick cartoon outlines.
+        */
+        for( int y = 1; y < i_lasty; ++y )
+        {
+            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
+            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
+            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
+
+            for( int x = 0; x < w; ++x )
+            {
+                /* Worst case: need 17 bits for "comb". */
+                int_fast32_t C = *p_c;
+                int_fast32_t P = *p_p;
+                int_fast32_t N = *p_n;
+
+                /* Comments in Transcode's filter_ivtc.c attribute this
+                   combing metric to Gunnar Thalin.
+
+                    The idea is that if the picture is interlaced, both
+                    expressions will have the same sign, and this comes
+                    up positive. The value T = 100 has been chosen such
+                    that a pixel difference of 10 (on average) will
+                    trigger the detector.
+                */
+                int_fast32_t comb = (P - C) * (N - C);
+                if( comb > T )
+                    ++i_score;
+
+                ++p_c;
+                ++p_p;
+                ++p_n;
+            }
+
+            /* Now the other field - swap current and neighbour pictures */
+            const picture_t *tmp = cur;
+            cur = ngh;
+            ngh = tmp;
+            int tmp_pitch = wc;
+            wc = wn;
+            wn = tmp_pitch;
+        }
+    }
+
+    return i_score;
 }
 #undef T