Commit f4f90e67 authored by Damien Fouilleul

video_chroma: a few SSE2 fixes

parent a3eb2a70
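
The hunks below touch the SSE2 fast paths of the I420 to RGB converters (`I420_R5G5B5`, `I420_R5G6B5`, `I420_A8R8G8B8`, `I420_B8G8R8A8`) and the shared asm/intrinsics helper macros. Read together, the fixes group into three: the trailing `sfence` moves so it executes after both the aligned and unaligned store loops and gains a `"memory"` clobber, the chroma bias constant in the SSE2 multiply step becomes 0x00800080, and the misspelled intrinsic `_xmm_unpackhi_pi16` becomes the real `_mm_unpackhi_epi16`.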
@@ -448,12 +448,6 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
- /* make sure all SSE2 stores are visible thereafter */
- #if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" );
- #else
- _mm_sfence();
- #endif
}
else
{
@@ -526,6 +520,14 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
}
+ /* make sure all SSE2 stores are visible thereafter */
+ #if defined (CAN_COMPILE_SSE2)
+ __asm__ __volatile__ ( "sfence" ::: "memory" );
+ #else
+ _mm_sfence();
+ #endif
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
if( p_vout->render.i_width & 7 )
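
The pair of hunks above moves the single `sfence` from inside the first branch to a point after the whole if/else, so it runs no matter which loop wrote the picture, and adds a `"memory"` clobber so the compiler cannot hoist the fence across the stores it is supposed to order. The fence matters because the aligned stores in these loops are non-temporal (`movntdq` / `_mm_stream_si128`, as in the unpack macros further down), and non-temporal stores are weakly ordered. A minimal sketch of the pattern, assuming a GCC-style compiler and a 16-byte-aligned destination (names here are illustrative, not VLC's):

```c
#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>

/* Stream 16-byte blocks to dst (must be 16-byte aligned), then fence. */
static void stream_copy(uint8_t *dst, const uint8_t *src, size_t n)
{
    for (size_t i = 0; i + 16 <= n; i += 16)
        /* movntdq: non-temporal store, bypasses the cache, weakly ordered */
        _mm_stream_si128((__m128i *)(dst + i),
                         _mm_loadu_si128((const __m128i *)(src + i)));

    /* Make the streamed data globally visible; the "memory" clobber keeps
     * the compiler from reordering the fence across the stores. */
#ifdef __GNUC__
    __asm__ __volatile__ ("sfence" ::: "memory");
#else
    _mm_sfence();
#endif
}
```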
@@ -755,12 +757,6 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
- /* make sure all SSE2 stores are visible thereafter */
- #if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" );
- #else
- _mm_sfence();
- #endif
}
else
{
@@ -833,6 +829,14 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
}
+ /* make sure all SSE2 stores are visible thereafter */
+ #if defined (CAN_COMPILE_SSE2)
+ __asm__ __volatile__ ( "sfence" ::: "memory" );
+ #else
+ _mm_sfence();
+ #endif
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
if( p_vout->render.i_width & 7 )
@@ -1179,12 +1183,6 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
- /* make sure all SSE2 stores are visible thereafter */
- #if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" );
- #else
- _mm_sfence();
- #endif
}
else
{
@@ -1263,7 +1261,14 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
}
}
+ /* make sure all SSE2 stores are visible thereafter */
+ #if defined (CAN_COMPILE_SSE2)
+ __asm__ __volatile__ ( "sfence" ::: "memory" );
+ #else
+ _mm_sfence();
+ #endif
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
if( p_vout->render.i_width & 7 )
{
@@ -1500,12 +1505,6 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
- /* make sure all SSE2 stores are visible thereafter */
- #if defined (CAN_COMPILE_SSE2)
- __asm__ __volatile__ ( "sfence" );
- #else
- _mm_sfence();
- #endif
}
else
{
......
@@ -61,7 +61,6 @@ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_16_ALIGNED " \n\
- prefetcht1 (%3) # cache preload for image \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
@@ -69,11 +68,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_16_UNALIGNED " \n\
- prefetcht1 (%3) # cache preload for image \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
"
#define MMX_INTRINSICS_INIT_16 \
@@ -91,11 +90,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
xmm6 = _mm_load_si128((__m128i *)p_y); \
#define SSE2_INTRINSICS_INIT_16_UNALIGNED \
- _mm_prefetch(p_buffer, _MM_HINT_T1); \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_loadu_si128((__m128i *)p_y); \
+ _mm_prefetch(p_buffer, _MM_HINT_NTA); \
#define MMX_INIT_16_GRAY " \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
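
A note on the chroma loads in these init macros: I420 subsamples chroma 2x2, so the 16 Y samples one SSE2 iteration consumes share only 8 Cb and 8 Cr samples. That is why chroma comes in through `movq` / `_mm_loadl_epi64`, which fill the low eight bytes of an XMM register and zero the upper half, while luma uses a full 16-byte `movdqa`/`movdqu`. (The inline comments still picture only four chroma bytes, `u3 u2 u1 u0`; a `movq` load in fact brings in `u7 ... u0`.) A sketch of the same step with intrinsics, reusing the macros' pointer names but otherwise illustrative:

```c
#include <stdint.h>
#include <emmintrin.h>

/* One iteration's worth of input: 16 Y, 8 Cb, 8 Cr (sketch, not VLC code). */
static void init_16_unaligned(const uint8_t *p_y, const uint8_t *p_u,
                              const uint8_t *p_v,
                              __m128i *y, __m128i *u, __m128i *v)
{
    *u = _mm_loadl_epi64((const __m128i *)p_u); /* movq: 8 Cb, high qword 0 */
    *v = _mm_loadl_epi64((const __m128i *)p_v); /* movq: 8 Cr, high qword 0 */
    *y = _mm_loadu_si128((const __m128i *)p_y); /* movdqu: 16 Y, any align  */
}
```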
@@ -118,11 +117,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_32_UNALIGNED " \n\
- prefetcht1 (%3) # cache preload for image \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+ prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
"
#define MMX_INTRINSICS_INIT_32 \
@@ -141,11 +140,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
xmm6 = _mm_load_si128((__m128i *)p_y); \
#define SSE2_INTRINSICS_INIT_32_UNALIGNED \
- _mm_prefetch(p_buffer, _MM_HINT_T1); \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_loadu_si128((__m128i *)p_y); \
+ _mm_prefetch(p_buffer, _MM_HINT_NTA); \
/*
* Do the multiply part of the conversion for even and odd pixels,
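
Judging by the hunk counts (7 old lines to 6 new, and 11 to 11), the change running through the macro hunks above is dropping the `prefetcht1` preload of the output pointer and, in the unaligned variants, adding `prefetchnta` in its place. That reading fits the comments: warming the destination into the cache is wasted work when the aligned path writes it with non-temporal `movntdq`, while the `prefetchnta` hint keeps the write-once RGB output from evicting useful cache lines on the remaining paths.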
@@ -260,7 +259,7 @@ pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
#define SSE2_INTRINSICS_YUV_MUL \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
- xmm5 = _mm_set1_epi32(0x80808080UL); \
+ xmm5 = _mm_set1_epi32(0x00800080UL); \
xmm0 = _mm_subs_epi16(xmm0, xmm5); \
xmm1 = _mm_subs_epi16(xmm1, xmm5); \
xmm0 = _mm_slli_epi16(xmm0, 3); \
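
This one-word change is the heart of the fix: `_mm_unpacklo_epi8(xmm0, xmm4)` zero-extends each Cb/Cr byte into its own 16-bit lane, so the 128 bias must appear as 0x0080 per lane, i.e. `_mm_set1_epi32(0x00800080)`. The old 0x80808080 put 0x8080 in every 16-bit lane, which `_mm_subs_epi16` reads as -32640, so the code was adding 32640 instead of subtracting 128. A standalone check (my example, not VLC code):

```c
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    /* A chroma sample of 80, already zero-extended into 16-bit lanes,
     * as SSE2_INTRINSICS_YUV_MUL does with _mm_unpacklo_epi8. */
    __m128i u = _mm_set1_epi16(80);

    __m128i good = _mm_subs_epi16(u, _mm_set1_epi32(0x00800080UL));
    __m128i bad  = _mm_subs_epi16(u, _mm_set1_epi32(0x80808080UL));

    printf("0x00800080 bias: %d\n", (short)_mm_extract_epi16(good, 0)); /* -48 = 80 - 128 */
    printf("0x80808080 bias: %d\n", (short)_mm_extract_epi16(bad, 0));  /* 32720 = 80 + 32640 */
    return 0;
}
```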
@@ -1001,7 +1000,7 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \
- xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \
+ xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \
#define SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED \
@@ -1021,6 +1020,6 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \
- xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \
+ xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \
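
`_xmm_unpackhi_pi16` is not a real intrinsic: the `_pi16` interleave family (`_mm_unpackhi_pi16`) is the 64-bit MMX set from mmintrin.h, operating on `__m64`, and the stray underscore before `xmm` was wrong besides. The 128-bit SSE2 form substituted by these two hunks is `_mm_unpackhi_epi16` from emmintrin.h. The pair of macros also shows why the fence change matters: the aligned variant stores with non-temporal `_mm_stream_si128`, the unaligned one with plain `_mm_storeu_si128`. A sketch of the corrected step, with illustrative names:

```c
#include <emmintrin.h>

/* Interleave the upper four 16-bit lanes of a and b and store the
 * result, matching the corrected BGRA unpack tail (sketch, not VLC). */
static inline void unpack_hi_store(__m128i a, __m128i b,
                                   __m128i *dst, int dst_is_aligned)
{
    __m128i hi = _mm_unpackhi_epi16(a, b); /* punpckhwd, SSE2 */
    if (dst_is_aligned)
        _mm_stream_si128(dst, hi);  /* movntdq: non-temporal, wants sfence */
    else
        _mm_storeu_si128(dst, hi);  /* movdqu: plain unaligned store */
}
```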