chromas: more SSE2/MMX fixes, added I420_RGBA conversion

5e4dc54c · Damien Fouilleul · c23c9ae9 · 5e4dc54c · 5e4dc54c · 5e4dc54c
Commit 5e4dc54c authored Aug 02, 2007 by Damien Fouilleul
5 changed files
--- a/modules/video_chroma/i420_rgb.c
+++ b/modules/video_chroma/i420_rgb.c
@@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this )
                    {
                        /* R8G8B8A8 pixel format */
                        msg_Dbg(p_this, "RGB pixel format is R8G8B8A8");
-                        //p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
-                        return -1;
+                        p_vout->chroma.pf_convert = E_(I420_R8G8B8A8);
                    }
                    else if( p_vout->output.i_rmask == 0x0000ff00
                          && p_vout->output.i_gmask == 0x00ff0000

--- a/modules/video_chroma/i420_rgb.h
+++ b/modules/video_chroma/i420_rgb.h
@@ -64,6 +64,7 @@ void E_(I420_RGB32)        ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_R5G5B5)       ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_R5G6B5)       ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_A8R8G8B8)     ( vout_thread_t *, picture_t *, picture_t * );
+void E_(I420_R8G8B8A8)     ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_B8G8R8A8)     ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_A8B8G8R8)     ( vout_thread_t *, picture_t *, picture_t * );
 #endif

--- a/modules/video_chroma/i420_rgb16.c
+++ b/modules/video_chroma/i420_rgb16.c
@@ -1140,6 +1140,265 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
 #endif
 }

+/*
+ * I420_R8G8B8A8: convert a planar I420 picture to 32-bit R8G8B8A8 pixels.
+ *
+ *  p_vout  video output; provides the render/output sizes and the chroma
+ *          module's conversion buffer and offset array
+ *  p_src   source picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
+ *  p_dest  destination picture, written through p_dest->p->p_pixels
+ *
+ * Compiled as the SSE2 variant (16 pixels per step) when
+ * MODULE_NAME_IS_i420_rgb_sse2 is defined, otherwise as the MMX variant
+ * (8 pixels per step). With b_hscale set, p_buffer points at the
+ * conversion buffer rather than straight into the destination picture.
+ * NOTE(review): SCALE_WIDTH / SCALE_HEIGHT are defined outside this hunk;
+ * presumably they consume p_pic_start, p_offset, i_right_margin,
+ * i_scale_count and i_chroma_width, which are not read here - confirm.
+ */
+void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
+                                            picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    vlc_bool_t  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
+    uint32_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
+    int *       p_offset;
+
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
+               p_vout->output.i_width, p_vout->output.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_vout->output.i_height : p_vout->render.i_height;
+
+#if defined (MODULE_NAME_IS_i420_rgb_sse2)
+
+    if( p_vout->render.i_width & 15 )
+    {
+        i_rewind = 16 - ( p_vout->render.i_width & 15 );
+    }
+    else
+    {
+        i_rewind = 0;
+    }
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_vout->render.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Width is not a multiple of 16: step back i_rewind pixels so
+             * the final 16-pixel block ends at the end of the line. Here we
+             * do some unaligned reads and duplicate conversions, but at
+             * least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                /* 16 luma samples use 8 chroma samples, as in the main loop
+                 * and in the unaligned path below */
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_vout->render.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
+
+#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
+
+    if( p_vout->render.i_width & 7 )
+    {
+        i_rewind = 8 - ( p_vout->render.i_width & 7 );
+    }
+    else
+    {
+        i_rewind = 0;
+    }
+
+    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_vout->render.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_RGBA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_RGBA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
 {

--- a/modules/video_chroma/i420_rgb_mmx.h
+++ b/modules/video_chroma/i420_rgb_mmx.h
--- a/modules/video_chroma/i420_yuy2.h
+++ b/modules/video_chroma/i420_yuy2.h
@@ -138,56 +138,56 @@ movq      %%mm1, (%1)   # Store YUYV                                      \n\
 #define MMX_END _mm_empty()
    
 #define MMX_YUV420_YUYV                     \
-    mm1 = _mm_cvtsi32_si64((int)*p_u);      \
-    mm2 = _mm_cvtsi32_si64((int)*p_v);      \
+    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
+    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
-    *(uin64_t)p_line1 = (uint64)mm2;        \
+    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
-    *(uin64_t)(p_line1 + 4) = (uint64)mm0;  \
+    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
    mm4 = mm3;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);       \
-    *(uin64_t)p_line2 = (uint64)mm4;        \
+    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);       \
-    *(uin64_t)(p_line2 + 4) = (uint64)mm4;
+    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

 #define MMX_YUV420_YVYU                     \
-    mm2 = _mm_cvtsi32_si64((int)*p_u);      \
-    mm1 = _mm_cvtsi32_si64((int)*p_v);      \
+    mm2 = _mm_cvtsi32_si64(*(int*)p_u);     \
+    mm1 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
-    *(uin64_t)p_line1 = (uint64)mm2;        \
+    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
-    *(uin64_t)(p_line1 + 4) = (uint64)mm0;  \
+    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
    mm4 = mm3;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);       \
-    *(uin64_t)p_line2 = (uint64)mm4;        \
+    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);       \
-    *(uin64_t)(p_line2 + 4) = (uint64)mm4;
+    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

 #define MMX_YUV420_UYVY                     \
-    mm1 = _mm_cvtsi32_si64((int)*p_u);      \
-    mm2 = _mm_cvtsi32_si64((int)*p_v);      \
+    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
+    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm1;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);       \
-    *(uin64_t)p_line1 = (uint64)mm2;        \
+    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm2 = mm1;                              \
    mm2 = _mm_unpackhi_pi8(mm2, mm0);       \
-    *(uin64_t)(p_line1 + 4) = (uint64)mm2;  \
+    *(uint64_t*)(p_line1+8) = (uint64_t)mm2;\
    mm4 = mm1;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm3);       \
-    *(uin64_t)p_line2 = (uint64)mm4;        \
+    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);       \
-    *(uin64_t)(p_line2 + 4) = (uint64)mm1;
+    *(uint64_t*)(p_line2+8) = (uint64_t)mm1;

 #endif