Commit f4f90e67 authored by Damien Fouilleul

video_chroma: a few SSE2 fixes

parent a3eb2a70
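The functional change in the C hunks below is that the store barrier is hoisted out of the per-branch scaling loops and issued once after the whole conversion, and the inline-asm variant gains a "memory" clobber so the compiler cannot reorder ordinary memory accesses around it. The following sketch is not VLC code; fill_rgb_nt, its parameters and the 16-byte alignment assumption are invented for illustration. It only shows the pattern the patch relies on: pixels written with weakly ordered non-temporal stores (_mm_stream_si128 / movntdq) need a single sfence before other code reads the buffer.

/*
 * Minimal sketch of the store/fence pattern, assuming an aligned output
 * buffer and a pixel count that is a multiple of 4.  CAN_COMPILE_SSE2 is
 * VLC's configure macro; in a standalone build it is undefined and the
 * intrinsic branch is used instead.
 */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static void fill_rgb_nt( uint32_t *p_pic, uint32_t argb, size_t n_pixels )
{
    __m128i v = _mm_set1_epi32( (int)argb );

    /* non-temporal stores bypass the cache and are weakly ordered */
    for( size_t i = 0; i < n_pixels; i += 4 )
        _mm_stream_si128( (__m128i *)(p_pic + i), v );

    /* make sure all SSE2 stores are visible thereafter */
#if defined (CAN_COMPILE_SSE2) && defined (__GNUC__)
    __asm__ __volatile__ ( "sfence" ::: "memory" );
#else
    _mm_sfence();
#endif
}

Without the "memory" clobber, GCC may still move later loads and stores of p_pic across the asm statement even though it is volatile, which is what the added clobber in this commit prevents.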
@@ -448,12 +448,6 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
 }
 p_buffer = b_hscale ? p_buffer_start : p_pic;
 }
-/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-__asm__ __volatile__ ( "sfence" );
-#else
-_mm_sfence();
-#endif
 }
 else
 {
@@ -526,6 +520,14 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
 p_buffer = b_hscale ? p_buffer_start : p_pic;
 }
 }
+/* make sure all SSE2 stores are visible thereafter */
+#if defined (CAN_COMPILE_SSE2)
+__asm__ __volatile__ ( "sfence" ::: "memory" );
+#else
+_mm_sfence();
+#endif
 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 if( p_vout->render.i_width & 7 )
@@ -755,12 +757,6 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
 }
 p_buffer = b_hscale ? p_buffer_start : p_pic;
 }
-/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-__asm__ __volatile__ ( "sfence" );
-#else
-_mm_sfence();
-#endif
 }
 else
 {
@@ -833,6 +829,14 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
 p_buffer = b_hscale ? p_buffer_start : p_pic;
 }
 }
+/* make sure all SSE2 stores are visible thereafter */
+#if defined (CAN_COMPILE_SSE2)
+__asm__ __volatile__ ( "sfence" ::: "memory" );
+#else
+_mm_sfence();
+#endif
 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 if( p_vout->render.i_width & 7 )
@@ -1179,12 +1183,6 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
 }
 p_buffer = b_hscale ? p_buffer_start : p_pic;
 }
-/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-__asm__ __volatile__ ( "sfence" );
-#else
-_mm_sfence();
-#endif
 }
 else
 {
@@ -1263,7 +1261,14 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
 }
 }
+/* make sure all SSE2 stores are visible thereafter */
+#if defined (CAN_COMPILE_SSE2)
+__asm__ __volatile__ ( "sfence" ::: "memory" );
 #else
+_mm_sfence();
+#endif
+#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 if( p_vout->render.i_width & 7 )
 {
@@ -1500,12 +1505,6 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
 }
 p_buffer = b_hscale ? p_buffer_start : p_pic;
 }
-/* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-__asm__ __volatile__ ( "sfence" );
-#else
-_mm_sfence();
-#endif
 }
 else
 {
...
@@ -61,7 +61,6 @@ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 "
 #define SSE2_INIT_16_ALIGNED " \n\
-prefetcht1 (%3) # cache preload for image \n\
 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
 pxor %%xmm4, %%xmm4 # zero mm4 \n\
@@ -69,11 +68,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 "
 #define SSE2_INIT_16_UNALIGNED " \n\
-prefetcht1 (%3) # cache preload for image \n\
 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
 pxor %%xmm4, %%xmm4 # zero mm4 \n\
 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
 "
 #define MMX_INTRINSICS_INIT_16 \
@@ -91,11 +90,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 xmm6 = _mm_load_si128((__m128i *)p_y); \
 #define SSE2_INTRINSICS_INIT_16_UNALIGNED \
-_mm_prefetch(p_buffer, _MM_HINT_T1); \
 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
 xmm4 = _mm_setzero_si128(); \
 xmm6 = _mm_loadu_si128((__m128i *)p_y); \
+_mm_prefetch(p_buffer, _MM_HINT_NTA); \
 #define MMX_INIT_16_GRAY " \n\
 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
@@ -118,11 +117,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 "
 #define SSE2_INIT_32_UNALIGNED " \n\
-prefetcht1 (%3) # cache preload for image \n\
 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
 pxor %%xmm4, %%xmm4 # zero mm4 \n\
 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
 "
 #define MMX_INTRINSICS_INIT_32 \
@@ -141,11 +140,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 xmm6 = _mm_load_si128((__m128i *)p_y); \
 #define SSE2_INTRINSICS_INIT_32_UNALIGNED \
-_mm_prefetch(p_buffer, _MM_HINT_T1); \
 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
 xmm4 = _mm_setzero_si128(); \
 xmm6 = _mm_loadu_si128((__m128i *)p_y); \
+_mm_prefetch(p_buffer, _MM_HINT_NTA); \
 /*
 * Do the multiply part of the conversion for even and odd pixels,
@@ -260,7 +259,7 @@ pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
 #define SSE2_INTRINSICS_YUV_MUL \
 xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
 xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
-xmm5 = _mm_set1_epi32(0x80808080UL); \
+xmm5 = _mm_set1_epi32(0x00800080UL); \
 xmm0 = _mm_subs_epi16(xmm0, xmm5); \
 xmm1 = _mm_subs_epi16(xmm1, xmm5); \
 xmm0 = _mm_slli_epi16(xmm0, 3); \
@@ -1001,7 +1000,7 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
 xmm5 = xmm3; \
 xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \
 _mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \
-xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \
+xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
 _mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \
 #define SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED \
@@ -1021,6 +1020,6 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
 xmm5 = xmm3; \
 xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \
 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \
-xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \
+xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \
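Two of the header fixes are easy to sanity-check outside VLC. The SSE2_INTRINSICS_YUV_MUL constant changes from 0x80808080 to 0x00800080 because _mm_unpacklo_epi8 has just zero-extended each chroma byte into a 16-bit lane, so the 128 bias must be 0x0080 per 16-bit lane rather than per byte; the other intrinsics fix simply replaces the non-existent _xmm_unpackhi_pi16 with the real SSE2 intrinsic _mm_unpackhi_epi16. The snippet below is only a standalone illustration of the bias arithmetic (the test values in u[] are made up), not VLC code.

/*
 * Sketch: unpack 8 chroma bytes to 16-bit lanes, subtract the 128 bias
 * with a per-32-bit constant of 0x00800080, and print the centred values.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t u[8] = { 0, 64, 128, 192, 255, 16, 100, 200 };

    __m128i zero = _mm_setzero_si128();
    __m128i cb   = _mm_loadl_epi64( (const __m128i *)u );  /* load 8 bytes   */
    __m128i cb16 = _mm_unpacklo_epi8( cb, zero );           /* 8 x u16 lanes  */
    __m128i bias = _mm_set1_epi32( 0x00800080 );            /* 0x0080 per u16 */
    __m128i c    = _mm_subs_epi16( cb16, bias );            /* centre on zero */

    int16_t out[8];
    _mm_storeu_si128( (__m128i *)out, c );
    for( int i = 0; i < 8; i++ )
        printf( "%4d ", out[i] );                           /* -128 .. 127    */
    printf( "\n" );
    return 0;
}

The remaining header hunks drop the prefetcht1 of the output buffer in the aligned variants and, in the unaligned ones, move the prefetch after the loads and switch it to prefetchnta / _MM_HINT_NTA, matching the new comment in the diff ("Tell CPU not to cache output RGB data").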