Commit 5e4dc54c authored by Damien Fouilleul's avatar Damien Fouilleul

chromas: more SSE2/MMX fixes, added I420_RGBA conversion

parent c23c9ae9
......@@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this )
{
/* R8G8B8A8 pixel format */
msg_Dbg(p_this, "RGB pixel format is R8G8B8A8");
//p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
return -1;
p_vout->chroma.pf_convert = E_(I420_R8G8B8A8);
}
else if( p_vout->output.i_rmask == 0x0000ff00
&& p_vout->output.i_gmask == 0x00ff0000
......
......@@ -64,6 +64,7 @@ void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R8G8B8A8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_A8B8G8R8) ( vout_thread_t *, picture_t *, picture_t * );
#endif
......
......@@ -1140,6 +1140,245 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
#endif
}
void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
picture_t *p_dest )
{
/* We got this one from the old arguments */
uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
uint8_t *p_y = p_src->Y_PIXELS;
uint8_t *p_u = p_src->U_PIXELS;
uint8_t *p_v = p_src->V_PIXELS;
vlc_bool_t b_hscale; /* horizontal scaling type */
unsigned int i_vscale; /* vertical scaling type */
unsigned int i_x, i_y; /* horizontal and vertical indexes */
int i_right_margin;
int i_rewind;
int i_scale_count; /* scale modulo counter */
int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
uint32_t * p_pic_start; /* beginning of the current line for copy */
/* Conversion buffer pointer */
uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
uint32_t * p_buffer;
/* Offset array pointer */
int * p_offset_start = p_vout->chroma.p_sys->p_offset;
int * p_offset;
const int i_source_margin = p_src->p[0].i_pitch
- p_src->p[0].i_visible_pitch;
const int i_source_margin_c = p_src->p[1].i_pitch
- p_src->p[1].i_visible_pitch;
i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
* on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
* then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
SetOffset( p_vout->render.i_width, p_vout->render.i_height,
p_vout->output.i_width, p_vout->output.i_height,
&b_hscale, &i_vscale, p_offset_start );
/*
* Perform conversion
*/
i_scale_count = ( i_vscale == 1 ) ?
p_vout->output.i_height : p_vout->render.i_height;
#if defined (MODULE_NAME_IS_i420_rgb_sse2)
if( p_vout->render.i_width & 15 )
{
i_rewind = 16 - ( p_vout->render.i_width & 15 );
}
else
{
i_rewind = 0;
}
/*
** SSE2 128 bits fetch/store instructions are faster
** if memory access is 16 bytes aligned
*/
p_buffer = b_hscale ? p_buffer_start : p_pic;
if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
p_dest->p->i_pitch|
((int)p_y)|
((int)p_buffer))) )
{
/* use faster SSE2 aligned fetch and store */
for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
{
p_pic_start = p_pic;
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
SSE2_CALL (
SSE2_INIT_32_ALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_ALIGNED
);
p_y += 16;
p_u += 8;
p_v += 8;
p_buffer += 16;
}
/* Here we do some unaligned reads and duplicate conversions, but
* at least we have all the pixels */
if( i_rewind )
{
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
SSE2_CALL (
SSE2_INIT_32_UNALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_UNALIGNED
);
p_y += 16;
p_u += 4;
p_v += 4;
}
SCALE_WIDTH;
SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
{
p_u += i_source_margin_c;
p_v += i_source_margin_c;
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
}
else
{
/* use slower SSE2 unaligned fetch and store */
for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
{
p_pic_start = p_pic;
p_buffer = b_hscale ? p_buffer_start : p_pic;
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
SSE2_CALL (
SSE2_INIT_32_UNALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_UNALIGNED
);
p_y += 16;
p_u += 8;
p_v += 8;
p_buffer += 16;
}
/* Here we do some unaligned reads and duplicate conversions, but
* at least we have all the pixels */
if( i_rewind )
{
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
SSE2_CALL (
SSE2_INIT_32_UNALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_UNALIGNED
);
p_y += 16;
p_u += 8;
p_v += 8;
}
SCALE_WIDTH;
SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
{
p_u += i_source_margin_c;
p_v += i_source_margin_c;
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
}
/* make sure all SSE2 stores are visible thereafter */
SSE2_END;
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
if( p_vout->render.i_width & 7 )
{
i_rewind = 8 - ( p_vout->render.i_width & 7 );
}
else
{
i_rewind = 0;
}
for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
{
p_pic_start = p_pic;
p_buffer = b_hscale ? p_buffer_start : p_pic;
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
MMX_CALL (
MMX_INIT_32
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_32_RGBA
);
p_y += 8;
p_u += 4;
p_v += 4;
p_buffer += 8;
}
/* Here we do some unaligned reads and duplicate conversions, but
* at least we have all the pixels */
if( i_rewind )
{
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
MMX_CALL (
MMX_INIT_32
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_32_RGBA
);
p_y += 8;
p_u += 4;
p_v += 4;
p_buffer += 8;
}
SCALE_WIDTH;
SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
{
p_u += i_source_margin_c;
p_v += i_source_margin_c;
}
}
/* re-enable FPU registers */
MMX_END;
#endif
}
void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
picture_t *p_dest )
{
......
......@@ -300,6 +300,26 @@ punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
"
#define MMX_UNPACK_32_RGBA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\
movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\
punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\
pxor %%mm6, %%mm6 # zero mm6 \n\
punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\
movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
"
#define MMX_UNPACK_32_BGRA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
......@@ -356,15 +376,15 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
#define MMX_END _mm_empty()
#define MMX_INIT_16 \
mm0 = _mm_cvtsi32_si64((int)*p_u); \
mm1 = _mm_cvtsi32_si64((int)*p_v); \
mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y
mm6 = (__m64)*(uint64_t *)p_y;
#define MMX_INIT_32 \
mm0 = _mm_cvtsi32_si64((int)*p_u); \
mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
*(uint16_t *)p_buffer = 0; \
mm1 = _mm_cvtsi32_si64((int)*p_v); \
mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y;
......@@ -483,6 +503,25 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
mm0 = _mm_unpackhi_pi16(mm0, mm1); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_RGBA \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
mm3 = _mm_unpacklo_pi8(mm3, mm0); \
mm5 = mm3; \
mm3 = _mm_unpacklo_pi16(mm3, mm4); \
*(uint64_t *)p_buffer = (uint64_t)mm3; \
mm5 = _mm_unpackhi_pi16(mm5, mm4); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
mm6 = _mm_setzero_si64(); \
mm2 = _mm_unpackhi_pi8(mm2, mm1); \
mm6 = _mm_unpackhi_pi8(mm6, mm0); \
mm0 = mm6; \
mm6 = _mm_unpacklo_pi16(mm6, mm2); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
mm0 = _mm_unpackhi_pi16(mm0, mm2); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_BGRA \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
......@@ -503,7 +542,23 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_ABGR \
;
mm3 = _mm_setzero_si64(); \
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm2); \
mm5 = mm0; \
mm5 = _mm_unpacklo_pi8(mm5, mm3); \
mm6 = mm4; \
mm4 = _mm_unpacklo_pi16(mm4, mm5); \
*(uint64_t *)p_buffer = (uint64_t)mm4; \
mm6 = _mm_unpackhi_pi16(mm6, mm5); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
mm1 = _mm_unpackhi_pi8(mm1, mm2); \
mm0 = _mm_unpackhi_pi8(mm0, mm3); \
mm2 = mm1; \
mm1 = _mm_unpacklo_pi16(mm1, mm0); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
mm2 = _mm_unpackhi_pi16(mm2, mm0); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
#endif
......@@ -795,6 +850,46 @@ punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
#define SSE2_UNPACK_32_RGBA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 RGBA8 \n\
punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
"
#define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
"
#define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
......@@ -881,11 +976,11 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
#include <emmintrin.h>
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
__m128i xmm0, xmm1, xmm2, xmm3, \
xmm4, xmm5, xmm6, xmm7; \
SSE2_INSTRUCTIONS \
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
__m128i xmm0, xmm1, xmm2, xmm3, \
xmm4, xmm5, xmm6, xmm7; \
SSE2_INSTRUCTIONS \
} while(0)
#define SSE2_END _mm_sfence()
......@@ -971,179 +1066,249 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
#define SSE2_UNPACK_15_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_15_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_15_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_15_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_storeu_si128((__m128i*)(p_buffer+16), xmm5);
#define SSE2_UNPACK_16_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_16_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_16_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_16_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_32_ARGB_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm1; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
#define SSE2_UNPACK_32_ARGB_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm1; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ARGB_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm1; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
#define SSE2_UNPACK_32_ARGB_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm1; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_BGRA_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
#define SSE2_UNPACK_32_RGBA_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_BGRA_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
#define SSE2_UNPACK_32_RGBA_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_ALIGNED \
;
#define SSE2_UNPACK_32_BGRA_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_BGRA_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
;
#define SSE2_UNPACK_32_ABGR_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
xmm2 = xmm1; \
xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm1); \
xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
xmm2 = xmm1; \
xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); \
xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
#endif
......
......@@ -138,56 +138,56 @@ movq %%mm1, (%1) # Store YUYV \n\
#define MMX_END _mm_empty()
#define MMX_YUV420_YUYV \
mm1 = _mm_cvtsi32_si64((int)*p_u); \
mm2 = _mm_cvtsi32_si64((int)*p_v); \
mm1 = _mm_cvtsi32_si64(*(int*)p_u); \
mm2 = _mm_cvtsi32_si64(*(int*)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin64_t)p_line1 = (uint64)mm2; \
*(uint64_t*)p_line1 = (uint64_t)mm2; \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin64_t)(p_line1 + 4) = (uint64)mm0; \
*(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin64_t)p_line2 = (uint64)mm4; \
*(uint64_t*)p_line2 = (uint64_t)mm4; \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin64_t)(p_line2 + 4) = (uint64)mm4;
*(uint64_t*)(p_line2+8) = (uint64_t)mm3;
#define MMX_YUV420_YVYU \
mm2 = _mm_cvtsi32_si64((int)*p_u); \
mm1 = _mm_cvtsi32_si64((int)*p_v); \
mm2 = _mm_cvtsi32_si64(*(int*)p_u); \
mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin64_t)p_line1 = (uint64)mm2; \
*(uint64_t*)p_line1 = (uint64_t)mm2; \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin64_t)(p_line1 + 4) = (uint64)mm0; \
*(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin64_t)p_line2 = (uint64)mm4; \
*(uint64_t*)p_line2 = (uint64_t)mm4; \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin64_t)(p_line2 + 4) = (uint64)mm4;
*(uint64_t*)(p_line2+8) = (uint64_t)mm3;
#define MMX_YUV420_UYVY \
mm1 = _mm_cvtsi32_si64((int)*p_u); \
mm2 = _mm_cvtsi32_si64((int)*p_v); \
mm1 = _mm_cvtsi32_si64(*(int*)p_u); \
mm2 = _mm_cvtsi32_si64(*(int*)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm1; \
mm2 = _mm_unpacklo_pi8(mm2, mm0); \
*(uin64_t)p_line1 = (uint64)mm2; \
*(uint64_t*)p_line1 = (uint64_t)mm2; \
mm2 = mm1; \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
*(uin64_t)(p_line1 + 4) = (uint64)mm2; \
*(uint64_t*)(p_line1+8) = (uint64_t)mm2;\
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm3); \
*(uin64_t)p_line2 = (uint64)mm4; \
*(uint64_t*)p_line2 = (uint64_t)mm4; \
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
*(uin64_t)(p_line2 + 4) = (uint64)mm1;
*(uint64_t*)(p_line2+8) = (uint64_t)mm1;
#endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment