Commit df3b5eec authored by Damien Fouilleul's avatar Damien Fouilleul

video chromas: finalize SSE2 improvements

parent 30900cb3
...@@ -190,6 +190,9 @@ E: Damien.Fouilleul@laposte.net ...@@ -190,6 +190,9 @@ E: Damien.Fouilleul@laposte.net
C: Quovodis C: Quovodis
D: ActiveX control D: ActiveX control
D: Safari/Firefox plugin for MacOS X D: Safari/Firefox plugin for MacOS X
D: Direct3D Video output
D: SSE2 chroma converters
D: improved MMX chroma converters
S: Ireland S: Ireland
N: Alexis Guillard N: Alexis Guillard
......
...@@ -81,6 +81,8 @@ Video output and filters: ...@@ -81,6 +81,8 @@ Video output and filters:
was previously part of the mosaic module. was previously part of the mosaic module.
* Fix random characters problem in RSS filter. * Fix random characters problem in RSS filter.
* Add rotate-deciangle for more precision on rotate filter * Add rotate-deciangle for more precision on rotate filter
* Support for Intel SSE2 instruction set in chroma converters
* Improved use of Intel MMX instruction set in chroma converters
Audio output Audio output
* Replay gain support. * Replay gain support.
......
...@@ -366,8 +366,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\ ...@@ -366,8 +366,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
#define SSE2_YUV420_YUYV_UNALIGNED \ #define SSE2_YUV420_YUYV_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \ xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \ xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_load_si128((__m128i *)p_y1); \ xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
xmm3 = _mm_load_si128((__m128i *)p_y2); \ xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \ _mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \ _mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
...@@ -402,8 +402,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\ ...@@ -402,8 +402,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
#define SSE2_YUV420_YVYU_UNALIGNED \ #define SSE2_YUV420_YVYU_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm2 = _mm_loadl_epi64((__m128i *)p_u); \ xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
xmm0 = _mm_load_si128((__m128i *)p_y1); \ xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
xmm3 = _mm_load_si128((__m128i *)p_y2); \ xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \ _mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \ _mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
...@@ -439,8 +439,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\ ...@@ -439,8 +439,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
#define SSE2_YUV420_UYVY_UNALIGNED \ #define SSE2_YUV420_UYVY_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \ xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \ xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_load_si128((__m128i *)p_y1); \ xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
xmm3 = _mm_load_si128((__m128i *)p_y2); \ xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \ _mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \ _mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
......
...@@ -442,6 +442,61 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source, ...@@ -442,6 +442,61 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
int i_x, i_y; int i_x, i_y;
const int i_source_margin = p_source->p[0].i_pitch
- p_source->p[0].i_visible_pitch;
const int i_source_margin_c = p_source->p[1].i_pitch
- p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
((int)p_line|(int)p_y))) )
{
/* use faster SSE2 aligned fetch and store */
for( i_y = p_vout->render.i_height ; i_y-- ; )
{
p_line -= 2 * p_dest->p->i_pitch;
for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
{
SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED );
}
for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
{
C_YUV422_UYVY( p_line, p_y, p_u, p_v );
}
p_y += i_source_margin;
p_u += i_source_margin_c;
p_v += i_source_margin_c;
p_line += i_dest_margin;
}
}
else {
/* use slower SSE2 unaligned fetch and store */
for( i_y = p_vout->render.i_height ; i_y-- ; )
{
p_line -= 2 * p_dest->p->i_pitch;
for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
{
SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED );
}
for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
{
C_YUV422_UYVY( p_line, p_y, p_u, p_v );
}
p_y += i_source_margin;
p_u += i_source_margin_c;
p_v += i_source_margin_c;
p_line += i_dest_margin;
}
}
SSE2_END;
#else
for( i_y = p_vout->render.i_height ; i_y-- ; ) for( i_y = p_vout->render.i_height ; i_y-- ; )
{ {
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
...@@ -457,12 +512,18 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source, ...@@ -457,12 +512,18 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
MMX_CALL( MMX_YUV422_UYVY ); MMX_CALL( MMX_YUV422_UYVY );
#endif #endif
} }
p_y += i_source_margin;
p_u += i_source_margin_c;
p_v += i_source_margin_c;
p_line += i_dest_margin;
} }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx) #if defined (MODULE_NAME_IS_i422_yuy2_mmx)
MMX_END; MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2) #elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
SSE2_END; SSE2_END;
#endif #endif
#endif
} }
/***************************************************************************** /*****************************************************************************
......
...@@ -233,9 +233,82 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\ ...@@ -233,9 +233,82 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\
#include <emmintrin.h> #include <emmintrin.h>
/*
 * Run one SSE2 conversion step and advance the caller's cursors.
 *
 * Declares the three __m128i temporaries (xmm0, xmm1, xmm2) that the
 * SSE2_YUV422_* instruction macros operate on, expands SSE2_INSTRUCTIONS,
 * then moves p_y forward by 16 bytes, p_u and p_v by 8 bytes each and
 * p_line by 32 bytes.  The caller must have p_line, p_y, p_u and p_v in
 * scope.  Wrapped in do/while(0) so the expansion is a single statement.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
        __m128i xmm0;                   \
        __m128i xmm1;                   \
        __m128i xmm2;                   \
        SSE2_INSTRUCTIONS               \
        p_y    += 16;                   \
        p_u    += 8;                    \
        p_v    += 8;                    \
        p_line += 32;                   \
    } while(0)
#define SSE2_END _mm_sfence() #define SSE2_END _mm_sfence()
/*
 * Planar 4:2:2 -> packed YUYV, aligned variant.
 *
 * Reads 16 bytes from p_y (aligned load) and 8 bytes each from p_u and
 * p_v, interleaves them as Y U Y V ... and writes the resulting 32 bytes
 * to p_line with streaming (non-temporal) stores.  p_y and p_line must be
 * 16-byte aligned.  Expects __m128i xmm0, xmm1, xmm2 in scope and does
 * not advance any pointer.
 */
#define SSE2_YUV422_YUYV_ALIGNED                                            \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                                 \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                                 \
    xmm0 = _mm_load_si128((__m128i *)p_y);                                  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);      /* U0 V0 U1 V1 ...        */ \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);      /* low:  Y0 U0 Y1 V0 ...  */ \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);      /* high: Y8 U4 Y9 V4 ...  */ \
    _mm_stream_si128((__m128i *)(p_line), xmm2);                            \
    _mm_stream_si128((__m128i *)(p_line + 16), xmm0);
/*
 * Planar 4:2:2 -> packed YUYV, unaligned variant.
 *
 * Same byte layout as the aligned version (Y U Y V ...), but uses
 * unaligned load/store intrinsics for p_y and p_line, so neither pointer
 * needs 16-byte alignment.  Expects __m128i xmm0, xmm1, xmm2 in scope and
 * does not advance any pointer.
 */
#define SSE2_YUV422_YUYV_UNALIGNED                                          \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                                 \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                                 \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);                                 \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);      /* U0 V0 U1 V1 ...        */ \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);      /* low:  Y0 U0 Y1 V0 ...  */ \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);      /* high: Y8 U4 Y9 V4 ...  */ \
    _mm_storeu_si128((__m128i *)(p_line), xmm2);                            \
    _mm_storeu_si128((__m128i *)(p_line + 16), xmm0);
/*
 * Planar 4:2:2 -> packed YVYU, aligned variant.
 *
 * Reads 16 bytes from p_y (aligned load) and 8 bytes each from p_u and
 * p_v, interleaves them as Y V Y U ... and writes the resulting 32 bytes
 * to p_line with streaming (non-temporal) stores.  p_y and p_line must be
 * 16-byte aligned.  Expects __m128i xmm0, xmm1, xmm2 in scope and does
 * not advance any pointer.
 */
#define SSE2_YUV422_YVYU_ALIGNED                                            \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);                                 \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);                                 \
    xmm0 = _mm_load_si128((__m128i *)p_y);                                  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);      /* V0 U0 V1 U1 ...        */ \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);      /* low:  Y0 V0 Y1 U0 ...  */ \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);      /* high: Y8 V4 Y9 U4 ...  */ \
    _mm_stream_si128((__m128i *)(p_line), xmm2);                            \
    _mm_stream_si128((__m128i *)(p_line + 16), xmm0);
/*
 * Planar 4:2:2 -> packed YVYU, unaligned variant.
 *
 * Same byte layout as the aligned version (Y V Y U ...), but uses
 * unaligned load/store intrinsics for p_y and p_line, so neither pointer
 * needs 16-byte alignment.  Expects __m128i xmm0, xmm1, xmm2 in scope and
 * does not advance any pointer.
 */
#define SSE2_YUV422_YVYU_UNALIGNED                                          \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);                                 \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);                                 \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);                                 \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);      /* V0 U0 V1 U1 ...        */ \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);      /* low:  Y0 V0 Y1 U0 ...  */ \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);      /* high: Y8 V4 Y9 U4 ...  */ \
    _mm_storeu_si128((__m128i *)(p_line), xmm2);                            \
    _mm_storeu_si128((__m128i *)(p_line + 16), xmm0);
/*
 * Planar 4:2:2 -> packed UYVY, aligned variant.
 *
 * Reads 16 bytes from p_y (aligned load) and 8 bytes each from p_u and
 * p_v, interleaves them as U Y V Y ... and writes the resulting 32 bytes
 * to p_line with streaming (non-temporal) stores.  p_y and p_line must be
 * 16-byte aligned.  Expects __m128i xmm0, xmm1, xmm2 in scope and does
 * not advance any pointer.
 */
#define SSE2_YUV422_UYVY_ALIGNED                                            \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                                 \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                                 \
    xmm0 = _mm_load_si128((__m128i *)p_y);                                  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);      /* U0 V0 U1 V1 ...        */ \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);      /* low:  U0 Y0 V0 Y1 ...  */ \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);      /* high: U4 Y8 V4 Y9 ...  */ \
    _mm_stream_si128((__m128i *)(p_line), xmm2);                            \
    _mm_stream_si128((__m128i *)(p_line + 16), xmm1);
/*
 * Planar 4:2:2 -> packed UYVY, unaligned variant.
 *
 * Same byte layout as the aligned version (U Y V Y ...), but uses
 * unaligned load/store intrinsics for p_y and p_line, so neither pointer
 * needs 16-byte alignment.  Expects __m128i xmm0, xmm1, xmm2 in scope and
 * does not advance any pointer.
 */
#define SSE2_YUV422_UYVY_UNALIGNED                                          \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                                 \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                                 \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);                                 \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);      /* U0 V0 U1 V1 ...        */ \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);      /* low:  U0 Y0 V0 Y1 ...  */ \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);      /* high: U4 Y8 V4 Y9 ...  */ \
    _mm_storeu_si128((__m128i *)(p_line), xmm2);                            \
    _mm_storeu_si128((__m128i *)(p_line + 16), xmm1);
#endif #endif
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment