sepia: clobber XMM registers correctly

(cherry picked from commit a865ced4888701e8caf0137672a4857c2b0d47d7) Conflicts: modules/video_filter/sepia.c

sepia: clobber XMM registers correctly
(cherry picked from commit a865ced4888701e8caf0137672a4857c2b0d47d7) Conflicts: modules/video_filter/sepia.c
708e92e9 · Rémi Denis-Courmont · d879cd1f · 708e92e9
Commit 708e92e9 authored Oct 13, 2012 by Rémi Denis-Courmont
Hide whitespace changes
Inline Side-by-side

Showing with 117 additions and 119 deletions

modules/video_filter/sepia.c modules/video_filter/sepia.c +117 -119

No files found.
--- a/modules/video_filter/sepia.c
+++ b/modules/video_filter/sepia.c
@@ -206,6 +206,7 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
 * instructions. It copies those 8 bytes to 128b register and fills the gaps
 * with zeroes and following operations are made with word-operating instructs.
 *****************************************************************************/
+VLC_SSE
 static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
                         int i_intensity_spread)
 {
@@ -225,7 +226,82 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
        "movq          %%xmm1, (%0)  \n"    // load to dest
        :
        :"r" (dst), "r"(src), "r"(i_intensity_spread)
-        :"memory");
+        :"memory", "xmm1", "xmm2", "xmm3");
+}
+
+VLC_SSE
+static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
+                                int i_intensity )
+{ /* SSE2 path of the planar I420 sepia filter: writes Y, U and V of p_outpic */
+    /* constant chroma bytes written to every U and V sample of the output */
+    const uint8_t filling_const_8u = 128 - i_intensity / 6;
+    const uint8_t filling_const_8v = 128 + i_intensity / 14;
+    /* low byte of i_intensity replicated into both 16-bit halves of an int, handed to Sepia8ySSE2 */
+    int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
+
+    __asm__ volatile(
+        "pxor      %%xmm7, %%xmm7\n"
+        ::: "xmm7"); /* zeroes xmm7; NOTE(review): presumably the zero register Sepia8ySSE2 relies on — confirm */
+
+    /* walk the visible luma lines two at a time (one chroma line per pair) */
+    for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
+    {
+        const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
+        const int i_dy_line2_start = (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
+        const int i_du_line_start =  (y / 2) * p_outpic->p[U_PLANE].i_pitch;
+        const int i_dv_line_start =  (y / 2) * p_outpic->p[V_PLANE].i_pitch;
+        int x = 0;
+        /* main loop: 16 luma pixels per iteration on both lines, as four 8-pixel Sepia8ySSE2 calls */
+        for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 )
+        {
+            /* Compute luma (Y plane) values with the asm helper */
+            Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
+                        &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
+                        i_intensity_spread );
+            Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
+                        &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
+                        i_intensity_spread );
+            Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
+                        &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
+                        i_intensity_spread );
+            Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
+                        &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
+                        i_intensity_spread );
+            /* Fill the 8 matching U and V samples with the precomputed constants */
+            memset(&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
+                   filling_const_8u, 8 );
+            memset(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
+                   filling_const_8v, 8 );
+        }
+        /* Scalar tail: the loop above only consumes whole 16-pixel chunks,
+           so the remaining pixels of both lines are handled two at a time */
+        for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2 )
+        {
+            // y = y - y/4 {to prevent overflow} + intensity / 4
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
+                (i_intensity >> 2);
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
+                (i_intensity >> 2);
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
+                (i_intensity >> 2);
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
+                (i_intensity >> 2);
+            // u = 128 {half => B&W} - intensity / 6
+            p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
+                filling_const_8u;
+            // v = 128 {half => B&W} + intensity / 14
+            p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
+                filling_const_8v;
+        }
+    }
 }
 #endif

@@ -240,131 +316,53 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
 static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
                               int i_intensity )
 {
-    // prepared values to copy for U and V channels
-    const uint8_t filling_const_8u = 128 - i_intensity / 6;
-    const uint8_t filling_const_8v = 128 + i_intensity / 14;
-
 #if defined(CAN_COMPILE_SSE2)
    if (vlc_CPU() & CPU_CAPABILITY_SSE2)
-    {
-        /* prepared value for faster broadcasting in xmm register */
-        int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
+        return PlanarI420SepiaSSE( p_pic, p_outpic, i_intensity ); /* NOTE(review): returns a void expression from a void function — not valid ISO C, compilers accept it as an extension */
+#endif

-        __asm__ volatile(
-            "pxor      %%xmm7, %%xmm7\n"
-        ::);
+    // constant chroma bytes written to every U and V sample of the output
+    const uint8_t filling_const_8u = 128 - i_intensity / 6;
+    const uint8_t filling_const_8v = 128 + i_intensity / 14;

-        /* iterate for every two visible line in the frame */
-        for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
-        {
-            const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
-            const int i_dy_line2_start =
-            (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
-            const int i_du_line_start =
-            (y / 2) * p_outpic->p[U_PLANE].i_pitch;
-            const int i_dv_line_start =
-            (y / 2) * p_outpic->p[V_PLANE].i_pitch;
-            int x = 0;
-            /* iterate for every visible line in the frame (eight values at once) */
-            for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 )
-            {
-                /* Compute yellow channel values with asm function */
-                Sepia8ySSE2(
-                    &p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
-                    &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
-                    i_intensity_spread );
-                Sepia8ySSE2(
-                    &p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
-                    &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
-                    i_intensity_spread );
-                Sepia8ySSE2(
-                    &p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
-                    &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
-                    i_intensity_spread );
-                Sepia8ySSE2(
-                    &p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
-                    &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
-                    i_intensity_spread );
-                /* Copy precomputed values to destination memory location */
-                vlc_memset(
-                    &p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
-                    filling_const_8u, 8 );
-                vlc_memset(
-                    &p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
-                    filling_const_8v, 8 );
-            }
-            /* Completing the job, the cycle above takes really big chunks, so
-              this makes sure the job will be done completely */
-            for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2 )
-            {
-                // y = y - y/4 {to prevent overflow} + intensity / 4
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
-                    (i_intensity >> 2);
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
-                    (i_intensity >> 2);
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
-                    (i_intensity >> 2);
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
-                    (i_intensity >> 2);
-                // u = 128 {half => B&W} - intensity / 6
-                p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
-                    filling_const_8u;
-                // v = 128 {half => B&W} + intensity / 14
-                p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
-                    filling_const_8v;
-            }
-        }
-    }
-    else
-#endif
+    /* walk the visible luma lines two at a time (one chroma line per pair) */
+    for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
    {
+        const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
+        const int i_dy_line2_start = ( y + 1 ) * p_outpic->p[Y_PLANE].i_pitch;
+        const int i_du_line_start = (y/2) * p_outpic->p[U_PLANE].i_pitch;
+        const int i_dv_line_start = (y/2) * p_outpic->p[V_PLANE].i_pitch;
+        // bound x by the narrower of the two visible Y pitches so neither picture is overrun
+        int i_picture_size_limit = p_pic->p[Y_PLANE].i_visible_pitch
+                  < p_outpic->p[Y_PLANE].i_visible_pitch
+                  ? (p_pic->p[Y_PLANE].i_visible_pitch - 1) :
+                  (p_outpic->p[Y_PLANE].i_visible_pitch - 1);
        /* iterate for every two visible line in the frame */
-        for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
+        for( int x = 0; x < i_picture_size_limit; x += 2)
        {
-            const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
-            const int i_dy_line2_start = ( y + 1 ) * p_outpic->p[Y_PLANE].i_pitch;
-            const int i_du_line_start = (y/2) * p_outpic->p[U_PLANE].i_pitch;
-            const int i_dv_line_start = (y/2) * p_outpic->p[V_PLANE].i_pitch;
-            // to prevent sigsegv if one pic is smaller (theoretically)
-            int i_picture_size_limit = p_pic->p[Y_PLANE].i_visible_pitch
-                      < p_outpic->p[Y_PLANE].i_visible_pitch
-                      ? (p_pic->p[Y_PLANE].i_visible_pitch - 1) :
-                      (p_outpic->p[Y_PLANE].i_visible_pitch - 1);
-            /* iterate for every two visible line in the frame */
-            for( int x = 0; x < i_picture_size_limit; x += 2)
-            {
-                // y = y - y/4 {to prevent overflow} + intensity / 4
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
-                    (i_intensity >> 2);
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
-                    (i_intensity >> 2);
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
-                    (i_intensity >> 2);
-                p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
-                    p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
-                    (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
-                    (i_intensity >> 2);
-                // u = 128 {half => B&W} - intensity / 6
-                p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
-                    filling_const_8u;
-                // v = 128 {half => B&W} + intensity / 14
-                p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
-                    filling_const_8v;
-            }
+            // y = y - y/4 {to prevent overflow} + intensity / 4
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
+                (i_intensity >> 2);
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
+                (i_intensity >> 2);
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
+                (i_intensity >> 2);
+            p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
+                p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
+                (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
+                (i_intensity >> 2);
+            // u = 128 {half => B&W} - intensity / 6
+            p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
+                filling_const_8u;
+            // v = 128 {half => B&W} + intensity / 14
+            p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
+                filling_const_8v;
        }
    }
 }