Commit a865ced4 authored by Rémi Denis-Courmont

sepia: clobber XMM registers correctly

parent 18d7c971
...@@ -206,6 +206,7 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic ) ...@@ -206,6 +206,7 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
* instructions. It copies those 8 bytes to 128b register and fills the gaps * instructions. It copies those 8 bytes to 128b register and fills the gaps
* with zeroes and following operations are made with word-operating instructs. * with zeroes and following operations are made with word-operating instructs.
*****************************************************************************/ *****************************************************************************/
VLC_SSE
static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src, static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
int i_intensity_spread) int i_intensity_spread)
{ {
...@@ -225,72 +226,51 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src, ...@@ -225,72 +226,51 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
"movq %%xmm1, (%0) \n" // load to dest "movq %%xmm1, (%0) \n" // load to dest
: :
:"r" (dst), "r"(src), "r"(i_intensity_spread) :"r" (dst), "r"(src), "r"(i_intensity_spread)
:"memory"); :"memory", "xmm1", "xmm2", "xmm3");
} }
#endif
/***************************************************************************** VLC_SSE
* PlanarI420Sepia: Applies sepia to one frame of the planar I420 video static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
*****************************************************************************
* This function applies sepia effect to one frame of the video by iterating
* through video lines. We iterate for every two lines and for every two pixels
* in line to calculate new sepia values for four y components as well for u
* and v components.
*****************************************************************************/
static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
int i_intensity ) int i_intensity )
{ {
// prepared values to copy for U and V channels /* prepared values to copy for U and V channels */
const uint8_t filling_const_8u = 128 - i_intensity / 6; const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14; const uint8_t filling_const_8v = 128 + i_intensity / 14;
#if defined(CAN_COMPILE_SSE2)
if (vlc_CPU_SSE2())
{
/* prepared value for faster broadcasting in xmm register */ /* prepared value for faster broadcasting in xmm register */
int i_intensity_spread = 0x10001 * (uint8_t) i_intensity; int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
__asm__ volatile( __asm__ volatile(
"pxor %%xmm7, %%xmm7\n" "pxor %%xmm7, %%xmm7\n"
::); ::: "xmm7");
/* iterate for every two visible line in the frame */ /* iterate for every two visible line in the frame */
for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2) for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{ {
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch; const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
const int i_dy_line2_start = const int i_dy_line2_start = (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
(y + 1) * p_outpic->p[Y_PLANE].i_pitch; const int i_du_line_start = (y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_du_line_start = const int i_dv_line_start = (y / 2) * p_outpic->p[V_PLANE].i_pitch;
(y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start =
(y / 2) * p_outpic->p[V_PLANE].i_pitch;
int x = 0; int x = 0;
/* iterate for every visible line in the frame (eight values at once) */ /* iterate for every visible line in the frame (eight values at once) */
for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 ) for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 )
{ {
/* Compute yellow channel values with asm function */ /* Compute yellow channel values with asm function */
Sepia8ySSE2( Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x], &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
i_intensity_spread ); i_intensity_spread );
Sepia8ySSE2( Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x], &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
i_intensity_spread ); i_intensity_spread );
Sepia8ySSE2( Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8], &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
i_intensity_spread ); i_intensity_spread );
Sepia8ySSE2( Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8], &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
i_intensity_spread ); i_intensity_spread );
/* Copy precomputed values to destination memory location */ /* Copy precomputed values to destination memory location */
memset( memset(&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
filling_const_8u, 8 ); filling_const_8u, 8 );
memset( memset(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
filling_const_8v, 8 ); filling_const_8v, 8 );
} }
/* Completing the job, the cycle above takes really big chunks, so /* Completing the job, the cycle above takes really big chunks, so
...@@ -322,10 +302,29 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic, ...@@ -322,10 +302,29 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
filling_const_8v; filling_const_8v;
} }
} }
} }
else
#endif #endif
{
/*****************************************************************************
* PlanarI420Sepia: Applies sepia to one frame of the planar I420 video
*****************************************************************************
* This function applies sepia effect to one frame of the video by iterating
* through video lines. We iterate for every two lines and for every two pixels
* in line to calculate new sepia values for four y components as well for u
* and v components.
*****************************************************************************/
static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
int i_intensity )
{
#if defined(CAN_COMPILE_SSE2)
if (vlc_CPU_SSE2())
return PlanarI420SepiaSSE( p_pic, p_outpic, i_intensity );
#endif
// prepared values to copy for U and V channels
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
/* iterate for every two visible line in the frame */ /* iterate for every two visible line in the frame */
for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2) for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{ {
...@@ -366,7 +365,6 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic, ...@@ -366,7 +365,6 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
filling_const_8v; filling_const_8v;
} }
} }
}
} }
/***************************************************************************** /*****************************************************************************
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment