deinterlace: move Phosphor MMX acceleration to a separate function

86ca8255 · Rémi Denis-Courmont · 4c983898 · 86ca8255
Commit 86ca8255 authored Dec 17, 2012 by Rémi Denis-Courmont
Show whitespace changes
Inline Side-by-side

Showing with 124 additions and 67 deletions

modules/video_filter/deinterlace/algo_phosphor.c modules/video_filter/deinterlace/algo_phosphor.c +124 -67

No files found.
--- a/modules/video_filter/deinterlace/algo_phosphor.c
+++ b/modules/video_filter/deinterlace/algo_phosphor.c
@@ -78,11 +78,6 @@ static void DarkenField( picture_t *p_dst,
    /* Bitwise ANDing with this clears the i_strength highest bits
       of each byte */
-#ifdef CAN_COMPILE_MMXEXT
-    const bool mmxext = vlc_CPU_MMXEXT();
-    uint64_t i_strength_u64 = i_strength; /* for MMX version (needs to know
-                                             number of bits) */
-#endif
    const uint8_t  remove_high_u8 = 0xFF >> i_strength;
    const uint64_t remove_high_u64 = remove_high_u8 *
                                            INT64_C(0x0101010101010101);
@@ -92,7 +87,7 @@ static void DarkenField( picture_t *p_dst,
       For luma, the operation is just a shift + bitwise AND, so we vectorize
       even in the C version.
-       There is an MMX version, too, because it performs about twice faster.
+       There is an MMX version too, because it performs about twice faster.
    */
    int i_plane = Y_PLANE;
    uint8_t *p_out, *p_out_end;
@@ -112,9 +107,80 @@ static void DarkenField( picture_t *p_dst,
        uint64_t *po = (uint64_t *)p_out;
        int x = 0;
+        for( ; x < w8; x += 8, ++po )
+            (*po) = ( ((*po) >> i_strength) & remove_high_u64 );
+        /* handle the width remainder */
+        uint8_t *po_temp = (uint8_t *)po;
+        for( ; x < w; ++x, ++po_temp )
+            (*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
+    }
+    /* Process chroma if the field chromas are independent.
+       The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
+       The chroma processing is a bit more complicated than luma,
+       and needs MMX for vectorization.
+    */
+    if( process_chroma )
+    {
+        for( i_plane++ /* luma already handled*/;
+             i_plane < p_dst->i_planes;
+             i_plane++ )
+        {
+            int w = p_dst->p[i_plane].i_visible_pitch;
+            p_out = p_dst->p[i_plane].p_pixels;
+            p_out_end = p_out + p_dst->p[i_plane].i_pitch
+                              * p_dst->p[i_plane].i_visible_lines;
+            /* skip first line for bottom field */
+            if( i_field == 1 )
+                p_out += p_dst->p[i_plane].i_pitch;
+            for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
+            {
+                /* Handle the width remainder */
+                uint8_t *po = p_out;
+                for( int x = 0; x < w; ++x, ++po )
+                    (*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
+            } /* for p_out... */
+        } /* for i_plane... */
+    } /* if process_chroma */
+}
 #ifdef CAN_COMPILE_MMXEXT
-        if( mmxext )
+VLC_MMX
+static void DarkenFieldMMX( picture_t *p_dst,
+                            const int i_field, const int i_strength,
+                            bool process_chroma )
+{
+    assert( p_dst != NULL );
+    assert( i_field == 0 || i_field == 1 );
+    assert( i_strength >= 1 && i_strength <= 3 );
+    uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
+    const uint8_t  remove_high_u8 = 0xFF >> i_strength;
+    const uint64_t remove_high_u64 = remove_high_u8 *
+                                            INT64_C(0x0101010101010101);
+    int i_plane = Y_PLANE;
+    uint8_t *p_out, *p_out_end;
+    int w = p_dst->p[i_plane].i_visible_pitch;
+    p_out = p_dst->p[i_plane].p_pixels;
+    p_out_end = p_out + p_dst->p[i_plane].i_pitch
+                      * p_dst->p[i_plane].i_visible_lines;
+    /* skip first line for bottom field */
+    if( i_field == 1 )
+        p_out += p_dst->p[i_plane].i_pitch;
+    int wm8 = w % 8;   /* remainder */
+    int w8  = w - wm8; /* part of width that is divisible by 8 */
+    for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
    {
+        uint64_t *po = (uint64_t *)p_out;
+        int x = 0;
        movq_m2r( i_strength_u64,  mm1 );
        movq_m2r( remove_high_u64, mm2 );
        for( ; x < w8; x += 8 )
@@ -126,13 +192,6 @@ static void DarkenField( picture_t *p_dst,
            movq_r2m( mm0, (*po++) );
        }
-        }
-        else
-#endif
-        {
-            for( ; x < w8; x += 8, ++po )
-                (*po) = ( ((*po) >> i_strength) & remove_high_u64 );
-        }
        /* handle the width remainder */
        uint8_t *po_temp = (uint8_t *)po;
@@ -148,16 +207,14 @@ static void DarkenField( picture_t *p_dst,
    */
    if( process_chroma )
    {
-        for( i_plane = 0 ; i_plane < p_dst->i_planes ; i_plane++ )
+        for( i_plane++ /* luma already handled */;
+             i_plane < p_dst->i_planes;
+             i_plane++ )
        {
-            if( i_plane == Y_PLANE )
-                continue; /* luma already handled */
            int w = p_dst->p[i_plane].i_visible_pitch;
-#ifdef CAN_COMPILE_MMXEXT
            int wm8 = w % 8;   /* remainder */
            int w8  = w - wm8; /* part of width that is divisible by 8 */
-#endif
            p_out = p_dst->p[i_plane].p_pixels;
            p_out_end = p_out + p_dst->p[i_plane].i_pitch
                              * p_dst->p[i_plane].i_visible_lines;
@@ -170,19 +227,16 @@ static void DarkenField( picture_t *p_dst,
            {
                int x = 0;
-#ifdef CAN_COMPILE_MMXEXT
                /* See also easy-to-read C version below. */
-                if( mmxext )
-                {
                static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
                movq_m2r( b128, mm5 );
                movq_m2r( i_strength_u64,  mm6 );
                movq_m2r( remove_high_u64, mm7 );
-                    uint64_t *po = (uint64_t *)p_out;
+                uint64_t *po8 = (uint64_t *)p_out;
                for( ; x < w8; x += 8 )
                {
-                        movq_m2r( (*po), mm0 );
+                    movq_m2r( (*po8), mm0 );
                    movq_r2r( mm5, mm2 ); /* 128 */
                    movq_r2r( mm0, mm1 ); /* copy of data */
@@ -199,13 +253,10 @@ static void DarkenField( picture_t *p_dst,
                    psubb_r2r( mm2, mm1 );
                    paddb_r2r( mm5, mm1 );
-                        movq_r2m( mm1, (*po++) );
+                    movq_r2m( mm1, (*po8++) );
-                    }
                }
-#endif
-                /* C version - handle the width remainder
+                /* C version - handle the width remainder */
-                   (or everything if no MMX) */
                uint8_t *po = p_out;
                for( ; x < w; ++x, ++po )
                    (*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
@@ -213,11 +264,9 @@ static void DarkenField( picture_t *p_dst,
        } /* for i_plane... */
    } /* if process_chroma */
-#ifdef CAN_COMPILE_MMXEXT
-    if( mmxext )
    emms();
-#endif
 }
+#endif
 /*****************************************************************************
 * Public functions
@@ -303,9 +352,17 @@ int RenderPhosphor( filter_t *p_filter,
       In most use cases the dimmer is used.
    */
    if( p_sys->phosphor.i_dimmer_strength > 0 )
+    {
+#ifdef CAN_COMPILE_MMXEXT
+        if( vlc_CPU_MMXEXT() )
+            DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
+                p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
+                p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
+        else
+#endif
            DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
                p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
                p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
+    }
    return VLC_SUCCESS;
 }