Commit 708e92e9 authored by Rémi Denis-Courmont

sepia: clobber XMM registers correctly

(cherry picked from commit a865ced4888701e8caf0137672a4857c2b0d47d7)

Conflicts:
	modules/video_filter/sepia.c
parent d879cd1f
......@@ -206,6 +206,7 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
* instructions. It copies those 8 bytes to a 128-bit register, filling the gaps
* with zeroes; the following operations use word-operating instructions.
*****************************************************************************/
VLC_SSE
static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
int i_intensity_spread)
{
......@@ -225,72 +226,51 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
"movq %%xmm1, (%0) \n" // load to dest
:
:"r" (dst), "r"(src), "r"(i_intensity_spread)
:"memory");
:"memory", "xmm1", "xmm2", "xmm3");
}
#endif
/*****************************************************************************
* PlanarI420Sepia: Applies sepia to one frame of the planar I420 video
*****************************************************************************
* This function applies sepia effect to one frame of the video by iterating
* through video lines. We iterate for every two lines and for every two pixels
* in line to calculate new sepia values for four y components as well as for
* the u and v components.
*****************************************************************************/
static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
VLC_SSE
static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
int i_intensity )
{
// prepared values to copy for U and V channels
/* prepared values to copy for U and V channels */
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
#if defined(CAN_COMPILE_SSE2)
if (vlc_CPU() & CPU_CAPABILITY_SSE2)
{
/* prepared value for faster broadcasting in xmm register */
int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
__asm__ volatile(
"pxor %%xmm7, %%xmm7\n"
::);
::: "xmm7");
/* iterate for every two visible line in the frame */
for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
const int i_dy_line2_start =
(y + 1) * p_outpic->p[Y_PLANE].i_pitch;
const int i_du_line_start =
(y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start =
(y / 2) * p_outpic->p[V_PLANE].i_pitch;
const int i_dy_line2_start = (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
const int i_du_line_start = (y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start = (y / 2) * p_outpic->p[V_PLANE].i_pitch;
int x = 0;
/* iterate over the visible pixels of the line pair (sixteen values per line at once) */
for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 )
{
/* Compute Y (luma) channel values with the asm function */
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
i_intensity_spread );
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
i_intensity_spread );
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
i_intensity_spread );
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
i_intensity_spread );
/* Copy precomputed values to destination memory location */
vlc_memset(
&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
memset(&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
filling_const_8u, 8 );
vlc_memset(
&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
memset(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
filling_const_8v, 8 );
}
/* Completing the job, the cycle above takes really big chunks, so
......@@ -322,10 +302,29 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
filling_const_8v;
}
}
}
else
}
#endif
{
/*****************************************************************************
* PlanarI420Sepia: Applies sepia to one frame of the planar I420 video
*****************************************************************************
* This function applies sepia effect to one frame of the video by iterating
* through video lines. We iterate for every two lines and for every two pixels
* in line to calculate new sepia values for four y components as well as for
* the u and v components.
*****************************************************************************/
static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
int i_intensity )
{
#if defined(CAN_COMPILE_SSE2)
if (vlc_CPU() & CPU_CAPABILITY_SSE2)
return PlanarI420SepiaSSE( p_pic, p_outpic, i_intensity );
#endif
// prepared values to copy for U and V channels
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
/* iterate for every two visible line in the frame */
for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
......@@ -366,7 +365,6 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
filling_const_8v;
}
}
}
}
/*****************************************************************************
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment