Sepia filter ASM optimization

Added functions to improve YUV performance; still needs a bit of work on RGB. Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>

Sepia filter ASM optimization
Added functions to improve YUV performance; still needs a bit of work on RGB. Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>
972c39b6 · Martin Briza · Jean-Baptiste Kempf · 23e6f1bf · 972c39b6
Commit 972c39b6 authored Apr 09, 2011 by Martin Briza Committed by Jean-Baptiste Kempf Apr 09, 2011
Show whitespace changes
Inline Side-by-side

Showing with 250 additions and 53 deletions

modules/video_filter/sepia.c modules/video_filter/sepia.c +250 -53

No files found.
--- a/modules/video_filter/sepia.c
+++ b/modules/video_filter/sepia.c
@@ -32,6 +32,7 @@
 #include <vlc_common.h>
 #include <vlc_plugin.h>
 #include <vlc_filter.h>
+#include <vlc_cpu.h>
 #include <assert.h>
 #include "filter_picture.h"
@@ -46,7 +47,8 @@ static void RVSepia( picture_t *, picture_t *, int );
 static void PlanarI420Sepia( picture_t *, picture_t *, int);
 static void PackedYUVSepia( picture_t *, picture_t *, int);
 static picture_t *Filter( filter_t *, picture_t * );
+/* Internal linkage is required here: a plain C99 `inline` declaration does
+ * not emit an out-of-line definition, so any call the compiler decides not
+ * to inline (e.g. at -O0) would be left unresolved at link time. */
+static inline void Sepia8ySSE41( uint8_t *, const uint8_t *, volatile uint8_t * );
+static inline void Memcpy8BMMX( uint8_t *, const uint8_t * );
 static const char *const ppsz_filter_options[] = {
    "intensity", NULL
 };
@@ -212,6 +214,94 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
    // prepared values to copy for U and V channels
    const uint8_t filling_const_8u = 128 - i_intensity / 6;
    const uint8_t filling_const_8v = 128 + i_intensity / 14;
+    #if defined(CAN_COMPILE_SSE4_1) && 1
+    if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+    {
+        /*prepare array of values to copy with mmx, compute only once
+          to improve speed */
+        volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
+            i_intensity, i_intensity, i_intensity, i_intensity,
+            i_intensity, i_intensity };
+        const uint8_t filling_array_8u[8] =
+            { filling_const_8u, filling_const_8u, filling_const_8u,
+            filling_const_8u, filling_const_8u, filling_const_8u,
+            filling_const_8u, filling_const_8u };
+        const uint8_t filling_array_8v[8] =
+            { filling_const_8v, filling_const_8v, filling_const_8v,
+            filling_const_8v, filling_const_8v, filling_const_8v,
+            filling_const_8v, filling_const_8v };
+        /* iterate for every two visible line in the frame */
+        for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
+        {
+            const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
+            const int i_dy_line2_start =
+            (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
+            const int i_du_line_start =
+            (y / 2) * p_outpic->p[U_PLANE].i_pitch;
+            const int i_dv_line_start =
+            (y / 2) * p_outpic->p[V_PLANE].i_pitch;
+            int x = 0;
+            /* walk across the line, sixteen luma samples per iteration */
+            for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16)
+            {
+                /* Compute luma (Y) channel values with asm function */
+                Sepia8ySSE41(
+                          &p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
+                          &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
+                          intensity_array );
+                Sepia8ySSE41(
+                          &p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
+                          &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
+                          intensity_array );
+                Sepia8ySSE41(
+                          &p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
+                          &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
+                          intensity_array );
+                Sepia8ySSE41(
+                          &p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
+                          &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
+                          intensity_array );
+                /* Copy precomputed values to destination image memory location */
+                Memcpy8BMMX(
+                          &p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
+                          filling_array_8u );
+                Memcpy8BMMX(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
+                          filling_array_8v );
+            }
+            /* Completing the job, the cycle above takes really big chunks, so
+              this makes sure the job will be done completely */
+            for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2)
+            {
+                // y = y - y/4 {to prevent overflow} + intensity / 4
+                p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
+                    p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
+                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
+                    (i_intensity >> 2);
+                p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
+                    p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
+                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
+                    (i_intensity >> 2);
+                p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
+                    p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
+                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
+                    (i_intensity >> 2);
+                p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
+                    p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
+                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
+                    (i_intensity >> 2);
+                // u = 128 {half => B&W} - intensity / 6
+                p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
+                    filling_const_8u;
+                // v = 128 {half => B&W} + intensity / 14
+                p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
+                    filling_const_8v;
+            }
+        }
+    } else
+#endif
+    {
        /* iterate for every two visible line in the frame */
        for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
        {
@@ -252,6 +342,7 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
                    filling_const_8v;
            }
        }
+    }
 }
 /*****************************************************************************
@@ -278,7 +369,69 @@ static void PackedYUVSepia( picture_t *p_pic, picture_t *p_outpic,
    p_in_end = p_in + p_pic->p[0].i_visible_lines
        * p_pic->p[0].i_pitch;
    p_out = p_outpic->p[0].p_pixels;
+#if defined(CAN_COMPILE_SSE4_1)
+    if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+    {
+        /*prepare array of values to copy with mmx, compute only once
+          to improve speed */
+        volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
+            i_intensity, i_intensity, i_intensity, i_intensity,
+            i_intensity,
+            i_intensity
+        };
+        const uint8_t filling_array_8u[8] =
+            { filling_const_8u, filling_const_8u,
+            filling_const_8u, filling_const_8u, filling_const_8u,
+            filling_const_8u,
+            filling_const_8u, filling_const_8u
+        };
+        const uint8_t filling_array_8v[8] =
+            { filling_const_8v, filling_const_8v,
+            filling_const_8v, filling_const_8v, filling_const_8v,
+            filling_const_8v,
+            filling_const_8v, filling_const_8v
+        };
+        /* iterate over every visible line in the frame */
+        while (p_in < p_in_end)
+        {
+            p_line_end = p_in + p_pic->p[0].i_visible_pitch;
+            while (p_in < p_line_end)
+            {
+                Sepia8ySSE41(&p_out[i_yindex], &p_in[i_yindex],
+                          intensity_array);
+                Sepia8ySSE41(&p_out[i_yindex + 8], &p_in[i_yindex + 8],
+                          intensity_array);
+                Sepia8ySSE41(&p_out[i_yindex + 16], &p_in[i_yindex + 16],
+                          intensity_array);
+                Sepia8ySSE41(&p_out[i_yindex + 24], &p_in[i_yindex + 24],
+                          intensity_array);
+                Memcpy8BMMX(&p_out[i_uindex], filling_array_8u);
+                Memcpy8BMMX(&p_out[i_vindex], filling_array_8v);
+                p_in += 32;
+                p_out += 32;
+            }
+            while (p_in < p_line_end)
+            {
+                p_out[i_yindex] =
+                    p_in[i_yindex] - (p_in[i_yindex] >> 2) +
+                    (i_intensity >> 2);
+                p_out[i_yindex + 2] =
+                    p_in[i_yindex + 2] - (p_in[i_yindex + 2] >> 2) +
+                    (i_intensity >> 2);
+                p_out[i_uindex] = filling_const_8u;
+                p_out[i_vindex] = filling_const_8v;
+                p_in += 4;
+                p_out += 4;
+            }
+            p_in += p_pic->p[0].i_pitch - p_pic->p[0].i_visible_pitch;
+            p_out += p_outpic->p[0].i_pitch
+            - p_outpic->p[0].i_visible_pitch;
+        }
+    } else
+#endif
+    {
        while( p_in < p_in_end )
        {
            p_line_end = p_in + p_pic->p[0].i_visible_pitch;
@@ -299,6 +452,7 @@ static void PackedYUVSepia( picture_t *p_pic, picture_t *p_outpic,
            p_out += p_outpic->p[0].i_pitch
                - p_outpic->p[0].i_visible_pitch;
        }
+    }
 }
 /*****************************************************************************
@@ -314,7 +468,6 @@ static void RVSepia( picture_t *p_pic, picture_t *p_outpic, int i_intensity )
 #define ONE_HALF  (1 << (SCALEBITS - 1))
 #define FIX(x)    ((int) ((x) * (1<<SCALEBITS) + 0.5))
    uint8_t *p_in, *p_in_end, *p_line_end, *p_out;
-    int i_r, i_g, i_b;
    bool b_isRV32 = p_pic->format.i_chroma == VLC_CODEC_RGB32;
    int i_rindex = 0, i_gindex = 1, i_bindex = 2;
@@ -372,6 +525,50 @@ static void RVSepia( picture_t *p_pic, picture_t *p_outpic, int i_intensity )
 #undef FIX
 }
+/*****************************************************************************
+ * Sepia8ySSE41
+ *****************************************************************************
+ * Applies the sepia transform to eight consecutive luma (Y) bytes:
+ *
+ *     dst[i] = src[i] - src[i] / 4 + i_intensity[i] / 4      (i = 0..7)
+ *
+ * dst:         receives 8 result bytes
+ * src:         8 input luma bytes
+ * i_intensity: 8 bytes, one intensity value per lane
+ *
+ * The SSE4.1 path widens the bytes to 16-bit words, does the arithmetic on
+ * words and narrows the result back to bytes. For 8-bit inputs the result
+ * never exceeds 255 (at most 192 + 63), so nothing is lost when narrowing.
+ * When SSE4.1 assembly cannot be compiled, the same formula is evaluated in
+ * plain C so the function never silently leaves dst untouched.
+ *****************************************************************************/
+inline void Sepia8ySSE41(uint8_t * dst, const uint8_t * src,
+               volatile uint8_t * i_intensity)
+{
+#if defined(CAN_COMPILE_SSE4_1)
+    /* AT&T syntax: "op src, dst" computes dst = dst op src, so the running
+     * result must always be the second operand. */
+    __asm__ volatile (
+              "pmovzxbw      (%1),   %%xmm1\n"    // xmm1 = y, bytes widened to words
+              "movdqa        %%xmm1, %%xmm2\n"    // xmm2 = y
+              "pmovzxbw      (%2),   %%xmm3\n"    // xmm3 = intensity, widened to words
+              "psrlw          $2,    %%xmm2\n"    // xmm2 = y / 4
+              "psubw         %%xmm2, %%xmm1\n"    // xmm1 = y - y / 4
+              "psrlw          $2,    %%xmm3\n"    // xmm3 = intensity / 4
+              "paddw         %%xmm3, %%xmm1\n"    // xmm1 = y - y / 4 + intensity / 4
+              "packuswb      %%xmm1, %%xmm1\n"    // narrow words back to bytes
+              "movq          %%xmm1, (%0)  \n"    // store 8 bytes to dst
+              :
+              :"r" (dst), "r"(src), "r"(i_intensity)
+              :"memory", "xmm1", "xmm2", "xmm3");
+#else
+    for (int i = 0; i < 8; i++)
+        dst[i] = (uint8_t)(src[i] - (src[i] >> 2) + (i_intensity[i] >> 2));
+#endif
+}
+/*****************************************************************************
+ * Memcpy8BMMX: Copies 8 bytes of memory in two instructions
+ *****************************************************************************
+ * dst: receives 8 bytes
+ * src: 8 bytes to copy
+ *
+ * Despite the name, the assembly path moves the data through an XMM
+ * register (not an MMX one), so it is gated on the same macro as the SSE4.1
+ * code paths that call it in this change. Builds without that support use a
+ * plain byte copy instead of silently doing nothing.
+ *****************************************************************************/
+inline void Memcpy8BMMX(uint8_t * dst, const uint8_t * src)
+{
+#if defined(CAN_COMPILE_SSE4_1)
+    __asm__ volatile (
+              "movq       (%1), %%xmm0\n"    // load 8 bytes from src
+              "movq       %%xmm0, (%0)\n"    // store them to dst
+              :
+              :"r" (dst), "r"(src)
+              :"memory", "xmm0");            // xmm0 is used as scratch
+#else
+    for (int i = 0; i < 8; i++)
+        dst[i] = src[i];
+#endif
+}
 static int FilterCallback ( vlc_object_t *p_this, char const *psz_var,
                            vlc_value_t oldval, vlc_value_t newval,
                            void *p_data )