Commit 5e4dc54c authored by Damien Fouilleul's avatar Damien Fouilleul

chromas: more SSE2/MMX fixes, added I420_RGBA conversion

parent c23c9ae9
......@@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this )
{
/* R8G8B8A8 pixel format */
msg_Dbg(p_this, "RGB pixel format is R8G8B8A8");
//p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
return -1;
p_vout->chroma.pf_convert = E_(I420_R8G8B8A8);
}
else if( p_vout->output.i_rmask == 0x0000ff00
&& p_vout->output.i_gmask == 0x00ff0000
......
......@@ -64,6 +64,7 @@ void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R8G8B8A8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_A8B8G8R8) ( vout_thread_t *, picture_t *, picture_t * );
#endif
......
......@@ -1140,6 +1140,245 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
#endif
}
void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
picture_t *p_dest )
{
/* We got this one from the old arguments */
uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
uint8_t *p_y = p_src->Y_PIXELS;
uint8_t *p_u = p_src->U_PIXELS;
uint8_t *p_v = p_src->V_PIXELS;
vlc_bool_t b_hscale; /* horizontal scaling type */
unsigned int i_vscale; /* vertical scaling type */
unsigned int i_x, i_y; /* horizontal and vertical indexes */
int i_right_margin;
int i_rewind;
int i_scale_count; /* scale modulo counter */
int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
uint32_t * p_pic_start; /* beginning of the current line for copy */
/* Conversion buffer pointer */
uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
uint32_t * p_buffer;
/* Offset array pointer */
int * p_offset_start = p_vout->chroma.p_sys->p_offset;
int * p_offset;
const int i_source_margin = p_src->p[0].i_pitch
- p_src->p[0].i_visible_pitch;
const int i_source_margin_c = p_src->p[1].i_pitch
- p_src->p[1].i_visible_pitch;
i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
/* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
* on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
* then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
SetOffset( p_vout->render.i_width, p_vout->render.i_height,
p_vout->output.i_width, p_vout->output.i_height,
&b_hscale, &i_vscale, p_offset_start );
/*
* Perform conversion
*/
i_scale_count = ( i_vscale == 1 ) ?
p_vout->output.i_height : p_vout->render.i_height;
#if defined (MODULE_NAME_IS_i420_rgb_sse2)
if( p_vout->render.i_width & 15 )
{
i_rewind = 16 - ( p_vout->render.i_width & 15 );
}
else
{
i_rewind = 0;
}
/*
** SSE2 128 bits fetch/store instructions are faster
** if memory access is 16 bytes aligned
*/
p_buffer = b_hscale ? p_buffer_start : p_pic;
if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
p_dest->p->i_pitch|
((int)p_y)|
((int)p_buffer))) )
{
/* use faster SSE2 aligned fetch and store */
for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
{
p_pic_start = p_pic;
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
SSE2_CALL (
SSE2_INIT_32_ALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_ALIGNED
);
p_y += 16;
p_u += 8;
p_v += 8;
p_buffer += 16;
}
/* Here we do some unaligned reads and duplicate conversions, but
* at least we have all the pixels */
if( i_rewind )
{
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
SSE2_CALL (
SSE2_INIT_32_UNALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_UNALIGNED
);
p_y += 16;
p_u += 4;
p_v += 4;
}
SCALE_WIDTH;
SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
{
p_u += i_source_margin_c;
p_v += i_source_margin_c;
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
}
else
{
/* use slower SSE2 unaligned fetch and store */
for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
{
p_pic_start = p_pic;
p_buffer = b_hscale ? p_buffer_start : p_pic;
for ( i_x = p_vout->render.i_width / 16; i_x--; )
{
SSE2_CALL (
SSE2_INIT_32_UNALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_UNALIGNED
);
p_y += 16;
p_u += 8;
p_v += 8;
p_buffer += 16;
}
/* Here we do some unaligned reads and duplicate conversions, but
* at least we have all the pixels */
if( i_rewind )
{
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
SSE2_CALL (
SSE2_INIT_32_UNALIGNED
SSE2_YUV_MUL
SSE2_YUV_ADD
SSE2_UNPACK_32_RGBA_UNALIGNED
);
p_y += 16;
p_u += 8;
p_v += 8;
}
SCALE_WIDTH;
SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
{
p_u += i_source_margin_c;
p_v += i_source_margin_c;
}
p_buffer = b_hscale ? p_buffer_start : p_pic;
}
}
/* make sure all SSE2 stores are visible thereafter */
SSE2_END;
#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
if( p_vout->render.i_width & 7 )
{
i_rewind = 8 - ( p_vout->render.i_width & 7 );
}
else
{
i_rewind = 0;
}
for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
{
p_pic_start = p_pic;
p_buffer = b_hscale ? p_buffer_start : p_pic;
for ( i_x = p_vout->render.i_width / 8; i_x--; )
{
MMX_CALL (
MMX_INIT_32
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_32_RGBA
);
p_y += 8;
p_u += 4;
p_v += 4;
p_buffer += 8;
}
/* Here we do some unaligned reads and duplicate conversions, but
* at least we have all the pixels */
if( i_rewind )
{
p_y -= i_rewind;
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
MMX_CALL (
MMX_INIT_32
MMX_YUV_MUL
MMX_YUV_ADD
MMX_UNPACK_32_RGBA
);
p_y += 8;
p_u += 4;
p_v += 4;
p_buffer += 8;
}
SCALE_WIDTH;
SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
{
p_u += i_source_margin_c;
p_v += i_source_margin_c;
}
}
/* re-enable FPU registers */
MMX_END;
#endif
}
void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
picture_t *p_dest )
{
......
This diff is collapsed.
......@@ -138,56 +138,56 @@ movq %%mm1, (%1) # Store YUYV \n\
#define MMX_END _mm_empty()
#define MMX_YUV420_YUYV \
mm1 = _mm_cvtsi32_si64((int)*p_u); \
mm2 = _mm_cvtsi32_si64((int)*p_v); \
mm1 = _mm_cvtsi32_si64(*(int*)p_u); \
mm2 = _mm_cvtsi32_si64(*(int*)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin64_t)p_line1 = (uint64)mm2; \
*(uint64_t*)p_line1 = (uint64_t)mm2; \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin64_t)(p_line1 + 4) = (uint64)mm0; \
*(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin64_t)p_line2 = (uint64)mm4; \
*(uint64_t*)p_line2 = (uint64_t)mm4; \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin64_t)(p_line2 + 4) = (uint64)mm4;
*(uint64_t*)(p_line2+8) = (uint64_t)mm3;
#define MMX_YUV420_YVYU \
mm2 = _mm_cvtsi32_si64((int)*p_u); \
mm1 = _mm_cvtsi32_si64((int)*p_v); \
mm2 = _mm_cvtsi32_si64(*(int*)p_u); \
mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin64_t)p_line1 = (uint64)mm2; \
*(uint64_t*)p_line1 = (uint64_t)mm2; \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin64_t)(p_line1 + 4) = (uint64)mm0; \
*(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin64_t)p_line2 = (uint64)mm4; \
*(uint64_t*)p_line2 = (uint64_t)mm4; \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin64_t)(p_line2 + 4) = (uint64)mm4;
*(uint64_t*)(p_line2+8) = (uint64_t)mm3;
#define MMX_YUV420_UYVY \
mm1 = _mm_cvtsi32_si64((int)*p_u); \
mm2 = _mm_cvtsi32_si64((int)*p_v); \
mm1 = _mm_cvtsi32_si64(*(int*)p_u); \
mm2 = _mm_cvtsi32_si64(*(int*)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm1; \
mm2 = _mm_unpacklo_pi8(mm2, mm0); \
*(uin64_t)p_line1 = (uint64)mm2; \
*(uint64_t*)p_line1 = (uint64_t)mm2; \
mm2 = mm1; \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
*(uin64_t)(p_line1 + 4) = (uint64)mm2; \
*(uint64_t*)(p_line1+8) = (uint64_t)mm2;\
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm3); \
*(uin64_t)p_line2 = (uint64)mm4; \
*(uint64_t*)p_line2 = (uint64_t)mm4; \
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
*(uin64_t)(p_line2 + 4) = (uint64)mm1;
*(uint64_t*)(p_line2+8) = (uint64_t)mm1;
#endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment