Commit 9da31971 authored by Sam Hocevar

* modules/video_chroma/i420_rgb_mmx.h: wrote an MMX intrinsics version of
    the conversion routines. Now that we no longer use inline asm, these
    modules no longer generate non-PIC code when built with -fPIC.
parent 31de0101
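
As background on the -fPIC claim: when inline asm references a global constant by name, the assembler emits an absolute relocation inside the text section, so the resulting object is not position-independent even when compiled with -fPIC. With intrinsics, the compiler generates the memory access itself and can address the constant through the GOT. Below is a minimal sketch of the contrast, using a hypothetical constant k_80w standing in for the real mmx_80w table (illustration only, not part of this commit):

#include <mmintrin.h>
#include <stdint.h>

/* Hypothetical bias constant, standing in for mmx_80w. */
uint64_t k_80w = 0x0080008000800080ULL;

void sub_bias_asm(uint64_t *p)
{
    /* The asm body names k_80w directly: on i386 this becomes an
     * absolute R_386_32 relocation in .text, i.e. non-PIC code. */
    __asm__( "movq   (%0), %%mm0  \n\t"
             "psubsw k_80w, %%mm0 \n\t"
             "movq   %%mm0, (%0)  \n\t"
             : : "r" (p) );
}

void sub_bias_intrinsics(uint64_t *p)
{
    /* The compiler emits the load of k_80w itself, so under -fPIC it
     * can use GOT-relative addressing; no text relocation is needed. */
    __m64 mm0 = (__m64)*p;                /* same GCC cast idiom as the patch */
    mm0 = _mm_subs_pi16(mm0, (__m64)k_80w);
    *p = (uint64_t)mm0;
    _mm_empty();                          /* emms: leave MMX state clean */
}
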
@@ -34,6 +34,9 @@
#if defined (MODULE_NAME_IS_i420_rgb)
# include "i420_rgb_c.h"
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
# if defined(HAVE_MMX_INTRINSICS)
# include <mmintrin.h>
# endif
# include "i420_rgb_mmx.h"
#endif
@@ -278,6 +281,13 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
/* 15bpp 5/5/5 */
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
# if defined (HAVE_MMX_INTRINSICS)
__m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
INTRINSICS_INIT_16
INTRINSICS_YUV_MUL
INTRINSICS_YUV_ADD
INTRINSICS_UNPACK_15
# else
__asm__( MMX_INIT_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
@@ -286,6 +296,7 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
MMX_YUV_ADD
MMX_UNPACK_15
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
p_y += 8;
p_u += 4;
@@ -298,6 +309,13 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
/* 16bpp 5/6/5 */
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
# if defined (HAVE_MMX_INTRINSICS)
__m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
INTRINSICS_INIT_16
INTRINSICS_YUV_MUL
INTRINSICS_YUV_ADD
INTRINSICS_UNPACK_16
# else
__asm__( MMX_INIT_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
@@ -306,6 +324,7 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
MMX_YUV_ADD
MMX_UNPACK_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
p_y += 8;
p_u += 4;
@@ -319,6 +338,11 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
* at least we have all the pixels */
if( i_rewind )
{
#if defined (MODULE_NAME_IS_i420_rgb_mmx)
# if defined (HAVE_MMX_INTRINSICS)
__m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
# endif
#endif
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
@@ -329,26 +353,43 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2);
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
# if defined (HAVE_MMX_INTRINSICS)
INTRINSICS_INIT_16
# else
__asm__( MMX_INIT_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
if( p_vout->output.i_rmask == 0x7c00 )
{
/* 15bpp 5/5/5 */
# if defined (HAVE_MMX_INTRINSICS)
INTRINSICS_YUV_MUL
INTRINSICS_YUV_ADD
INTRINSICS_UNPACK_15
# else
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_15
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
}
else
{
# if defined (HAVE_MMX_INTRINSICS)
INTRINSICS_YUV_MUL
INTRINSICS_YUV_ADD
INTRINSICS_UNPACK_16
# else
/* 16bpp 5/6/5 */
__asm__( ".align 8"
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_16
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
}
p_y += 8;
@@ -453,6 +494,13 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
# if defined (HAVE_MMX_INTRINSICS)
__m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
INTRINSICS_INIT_32
INTRINSICS_YUV_MUL
INTRINSICS_YUV_ADD
INTRINSICS_UNPACK_32
# else
__asm__( MMX_INIT_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
@@ -461,6 +509,7 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
MMX_YUV_ADD
MMX_UNPACK_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
p_y += 8;
p_u += 4;
@@ -473,6 +522,11 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
* at least we have all the pixels */
if( i_rewind )
{
#if defined (MODULE_NAME_IS_i420_rgb_mmx)
# if defined (HAVE_MMX_INTRINSICS)
__m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
# endif
#endif
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
@@ -483,6 +537,12 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
# if defined (HAVE_MMX_INTRINSICS)
INTRINSICS_INIT_32
INTRINSICS_YUV_MUL
INTRINSICS_YUV_ADD
INTRINSICS_UNPACK_32
# else
__asm__( MMX_INIT_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
@@ -491,6 +551,7 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
MMX_YUV_ADD
MMX_UNPACK_32
: : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
# endif
p_y += 8;
p_u += 4;
@@ -53,6 +53,13 @@ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
#movl $0, (%3) # cache preload for image \n\
"
#define INTRINSICS_INIT_16 \
mm0 = (__m64)(uint64_t)*(uint32_t *)p_u; \
mm1 = (__m64)(uint64_t)*(uint32_t *)p_v; \
mm4 = (__m64)(uint64_t)0; \
mm6 = (__m64)*(uint64_t *)p_y; \
/* *(uint16_t *)p_buffer = 0; */
#define MMX_INIT_16_GRAY " \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
#movl $0, (%3) # cache preload for image \n\
@@ -66,6 +73,13 @@ pxor %%mm4, %%mm4 # zero mm4 \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define INTRINSICS_INIT_32 \
mm0 = (__m64)(uint64_t)*(uint32_t *)p_u; \
*(uint16_t *)p_buffer = 0; \
mm1 = (__m64)(uint64_t)*(uint32_t *)p_v; \
mm4 = (__m64)(uint64_t)0; \
mm6 = (__m64)*(uint64_t *)p_y;
/*
* Do the multiply part of the conversion for even and odd pixels,
* register usage:
@@ -101,6 +115,30 @@ pmulhw mmx_Y_coeff, %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\
pmulhw mmx_Y_coeff, %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
"
#define INTRINSICS_YUV_MUL \
mm0 = _mm_unpacklo_pi8(mm0, mm4); \
mm1 = _mm_unpacklo_pi8(mm1, mm4); \
mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
mm0 = _mm_slli_pi16(mm0, 3); \
mm1 = _mm_slli_pi16(mm1, 3); \
mm2 = mm0; \
mm3 = mm1; \
mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
mm2 = _mm_adds_pi16(mm2, mm3); \
\
mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
mm7 = mm6; \
mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
mm7 = _mm_srli_pi16(mm7, 8); \
mm6 = _mm_slli_pi16(mm6, 3); \
mm7 = _mm_slli_pi16(mm7, 3); \
mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
/*
* Do the addition part of the conversion for even and odd pixels,
* register usage:
@@ -137,6 +175,29 @@ punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
#define INTRINSICS_YUV_ADD \
mm3 = mm0; \
mm4 = mm1; \
mm5 = mm2; \
mm0 = _mm_adds_pi16(mm0, mm6); \
mm3 = _mm_adds_pi16(mm3, mm7); \
mm1 = _mm_adds_pi16(mm1, mm6); \
mm4 = _mm_adds_pi16(mm4, mm7); \
mm2 = _mm_adds_pi16(mm2, mm6); \
mm5 = _mm_adds_pi16(mm5, mm7); \
\
mm0 = _mm_packs_pu16(mm0, mm0); \
mm1 = _mm_packs_pu16(mm1, mm1); \
mm2 = _mm_packs_pu16(mm2, mm2); \
\
mm3 = _mm_packs_pu16(mm3, mm3); \
mm4 = _mm_packs_pu16(mm4, mm4); \
mm5 = _mm_packs_pu16(mm5, mm5); \
\
mm0 = _mm_unpacklo_pi8(mm0, mm3); \
mm1 = _mm_unpacklo_pi8(mm1, mm4); \
mm2 = _mm_unpacklo_pi8(mm2, mm5);
/*
* Grayscale case, only use Y
*/
@@ -215,6 +276,31 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
movq %%mm5, 8(%3) # store pixel 4-7 \n\
"
#define INTRINSICS_UNPACK_15 \
mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
mm0 = _mm_srli_pi16(mm0, 3); \
mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \
mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
mm1 = _mm_srli_pi16(mm1, 1); \
mm4 = (__m64)(uint64_t)0; \
mm5 = mm0; \
mm7 = mm2; \
\
mm2 = _mm_unpacklo_pi8(mm2, mm4); \
mm0 = _mm_unpacklo_pi8(mm0, mm1); \
mm2 = _mm_slli_pi16(mm2, 2); \
mm0 = _mm_or_si64(mm0, mm2); \
mm6 = (__m64)*(uint64_t *)(p_y + 8); \
*(uint64_t *)p_buffer = (uint64_t)mm0; \
\
mm7 = _mm_unpackhi_pi8(mm7, mm4); \
mm5 = _mm_unpackhi_pi8(mm5, mm1); \
mm7 = _mm_slli_pi16(mm7, 2); \
mm0 = (__m64)(uint64_t)*(uint32_t *)(p_u + 4); \
mm5 = _mm_or_si64(mm5, mm7); \
mm1 = (__m64)(uint64_t)*(uint32_t *)(p_v + 4); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
/*
* convert RGB plane to RGB 16 bits,
* mm0 -> B, mm1 -> R, mm2 -> G,
@@ -250,6 +336,30 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
movq %%mm5, 8(%3) # store pixel 4-7 \n\
"
#define INTRINSICS_UNPACK_16 \
mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \
mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
mm0 = _mm_srli_pi16(mm0, 3); \
mm4 = (__m64)(uint64_t)0; \
mm5 = mm0; \
mm7 = mm2; \
\
mm2 = _mm_unpacklo_pi8(mm2, mm4); \
mm0 = _mm_unpacklo_pi8(mm0, mm1); \
mm2 = _mm_slli_pi16(mm2, 3); \
mm0 = _mm_or_si64(mm0, mm2); \
mm6 = (__m64)*(uint64_t *)(p_y + 8); \
*(uint64_t *)p_buffer = (uint64_t)mm0; \
\
mm7 = _mm_unpackhi_pi8(mm7, mm4); \
mm5 = _mm_unpackhi_pi8(mm5, mm1); \
mm7 = _mm_slli_pi16(mm7, 3); \
mm0 = (__m64)(uint64_t)*(uint32_t *)(p_u + 4); \
mm5 = _mm_or_si64(mm5, mm7); \
mm1 = (__m64)(uint64_t)*(uint32_t *)(p_v + 4); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
/*
* convert RGB plane to RGB packed format,
* mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
@@ -286,3 +396,26 @@ movq %%mm4, 24(%3) # Store ARGB7 ARGB6 \n\
#movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define INTRINSICS_UNPACK_32 \
mm3 = (__m64)(uint64_t)0; \
mm6 = mm0; \
mm7 = mm1; \
mm4 = mm0; \
mm5 = mm1; \
mm6 = _mm_unpacklo_pi8(mm6, mm2); \
mm7 = _mm_unpacklo_pi8(mm7, mm3); \
mm6 = _mm_unpacklo_pi16(mm6, mm7); \
*(uint64_t *)p_buffer = (uint64_t)mm6; \
mm6 = mm0; \
mm6 = _mm_unpacklo_pi8(mm6, mm2); \
mm6 = _mm_unpackhi_pi16(mm6, mm7); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \
mm4 = _mm_unpackhi_pi8(mm4, mm2); \
mm5 = _mm_unpackhi_pi8(mm5, mm3); \
mm4 = _mm_unpacklo_pi16(mm4, mm5); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm4; \
mm4 = mm0; \
mm4 = _mm_unpackhi_pi8(mm4, mm2); \
mm4 = _mm_unpackhi_pi16(mm4, mm5); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm4;
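
A note on the load idiom used throughout the new macros: instead of _mm_cvt* helpers, the code relies on GCC's same-size scalar/vector casts, e.g. (__m64)(uint64_t)*(uint32_t *)p_u. The intermediate (uint64_t) cast zero-extends the four chroma bytes into the low half of the register, matching what the movd instruction does in the asm version. A small standalone check, with hypothetical test values (x86 with MMX assumed):

#include <mmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t p_u[4] = { 0x11, 0x22, 0x33, 0x44 };

    /* Same load idiom as INTRINSICS_INIT_16: widening the 32-bit read
     * to 64 bits zero-fills the high half of the MMX register. */
    __m64 mm0 = (__m64)(uint64_t)*(uint32_t *)p_u;

    /* Prints 0000000044332211 on little-endian x86. */
    printf("%016llx\n", (unsigned long long)(uint64_t)mm0);

    _mm_empty();   /* emms: restore the FP register stack after MMX use */
    return 0;
}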