Commit 708e92e9 authored by Rémi Denis-Courmont

sepia: clobber XMM registers correctly

(cherry picked from commit a865ced4888701e8caf0137672a4857c2b0d47d7)

Conflicts:
	modules/video_filter/sepia.c
parent d879cd1f
...@@ -206,6 +206,7 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic ) ...@@ -206,6 +206,7 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
* instructions. It copies those 8 bytes to 128b register and fills the gaps * instructions. It copies those 8 bytes to 128b register and fills the gaps
* with zeroes and following operations are made with word-operating instructs. * with zeroes and following operations are made with word-operating instructs.
*****************************************************************************/ *****************************************************************************/
VLC_SSE
static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src, static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
int i_intensity_spread) int i_intensity_spread)
{ {
...@@ -225,7 +226,82 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src, ...@@ -225,7 +226,82 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
"movq %%xmm1, (%0) \n" // load to dest "movq %%xmm1, (%0) \n" // load to dest
: :
:"r" (dst), "r"(src), "r"(i_intensity_spread) :"r" (dst), "r"(src), "r"(i_intensity_spread)
:"memory"); :"memory", "xmm1", "xmm2", "xmm3");
}
/*****************************************************************************
 * PlanarI420SepiaSSE: SSE2-assisted sepia pass over a planar Y/U/V picture.
 *
 * Walks the visible area of the Y plane two lines at a time. For each pair of
 * lines, the Y samples are rewritten (via Sepia8ySSE2 for the bulk, and a
 * scalar formula for the remaining columns) and the matching U and V samples
 * are overwritten with two constants derived from i_intensity.
 *
 * p_pic       source picture (its Y plane is read)
 * p_outpic    destination picture (its Y, U and V planes are written)
 * i_intensity sepia strength; only its low 8 bits feed the SSE path
 *
 * NOTE(review): all line offsets are computed from p_outpic's pitches and are
 * reused to index p_pic as well -- this assumes both pictures share the same
 * Y pitch; confirm against the caller.
 *****************************************************************************/
VLC_SSE
static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
int i_intensity )
{
/* Constant chroma values written to every U / V sample (the results are
   truncated to 8 bits by the uint8_t conversion) */
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
/* Low byte of the intensity replicated into both 16-bit halves of a 32-bit
   word, so the helper can broadcast it into an xmm register cheaply */
int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
/* Zero xmm7 once up front and declare it clobbered.
   Presumably Sepia8ySSE2 uses xmm7 as its zero register -- its asm body is
   not fully visible from here, TODO confirm.
   NOTE(review): nothing in this function forces xmm7 to stay zero across the
   later C statements and memset() calls; verify this is actually safe. */
__asm__ volatile(
"pxor %%xmm7, %%xmm7\n"
::: "xmm7");
/* Process the frame two visible lines at a time (one chroma line covers
   two luma lines, see the y / 2 offsets below) */
for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
/* Byte offsets of the two Y lines and of the shared U / V line */
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
const int i_dy_line2_start = (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
const int i_du_line_start = (y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start = (y / 2) * p_outpic->p[V_PLANE].i_pitch;
int x = 0;
/* Bulk loop: 16 Y samples per line per iteration, as two helper calls at
   x and x + 8 on each of the two lines */
for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 )
{
/* Compute the Y-plane values with the inline-asm helper */
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
i_intensity_spread );
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
i_intensity_spread );
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
i_intensity_spread );
Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
i_intensity_spread );
/* Fill the 8 U and 8 V samples that correspond to these 16 Y columns
   with the precomputed constants */
memset(&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
filling_const_8u, 8 );
memset(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
filling_const_8v, 8 );
}
/* Scalar tail: the bulk loop only consumes whole 16-column chunks, so the
   remaining columns are finished here two at a time (x continues from
   where the bulk loop stopped) */
for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2 )
{
// y = y - y/4 {to prevent overflow} + intensity / 4
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
(i_intensity >> 2);
// u = 128 {half => B&W} - intensity / 6
p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
filling_const_8u;
// v = 128 {half => B&W} + intensity / 14
p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
filling_const_8v;
}
}
} }
#endif #endif
...@@ -240,131 +316,53 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src, ...@@ -240,131 +316,53 @@ static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic, static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
int i_intensity ) int i_intensity )
{ {
// prepared values to copy for U and V channels
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
#if defined(CAN_COMPILE_SSE2) #if defined(CAN_COMPILE_SSE2)
if (vlc_CPU() & CPU_CAPABILITY_SSE2) if (vlc_CPU() & CPU_CAPABILITY_SSE2)
{ return PlanarI420SepiaSSE( p_pic, p_outpic, i_intensity );
/* prepared value for faster broadcasting in xmm register */ #endif
int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
__asm__ volatile( // prepared values to copy for U and V channels
"pxor %%xmm7, %%xmm7\n" const uint8_t filling_const_8u = 128 - i_intensity / 6;
::); const uint8_t filling_const_8v = 128 + i_intensity / 14;
/* iterate for every two visible line in the frame */ /* iterate for every two visible line in the frame */
for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2) for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
const int i_dy_line2_start =
(y + 1) * p_outpic->p[Y_PLANE].i_pitch;
const int i_du_line_start =
(y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start =
(y / 2) * p_outpic->p[V_PLANE].i_pitch;
int x = 0;
/* iterate for every visible line in the frame (eight values at once) */
for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16 )
{
/* Compute yellow channel values with asm function */
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
i_intensity_spread );
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
i_intensity_spread );
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
i_intensity_spread );
Sepia8ySSE2(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
i_intensity_spread );
/* Copy precomputed values to destination memory location */
vlc_memset(
&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
filling_const_8u, 8 );
vlc_memset(
&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
filling_const_8v, 8 );
}
/* Completing the job, the cycle above takes really big chunks, so
this makes sure the job will be done completely */
for ( ; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2 )
{
// y = y - y/4 {to prevent overflow} + intensity / 4
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
(i_intensity >> 2);
// u = 128 {half => B&W} - intensity / 6
p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
filling_const_8u;
// v = 128 {half => B&W} + intensity / 14
p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
filling_const_8v;
}
}
}
else
#endif
{ {
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
const int i_dy_line2_start = ( y + 1 ) * p_outpic->p[Y_PLANE].i_pitch;
const int i_du_line_start = (y/2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start = (y/2) * p_outpic->p[V_PLANE].i_pitch;
// to prevent sigsegv if one pic is smaller (theoretically)
int i_picture_size_limit = p_pic->p[Y_PLANE].i_visible_pitch
< p_outpic->p[Y_PLANE].i_visible_pitch
? (p_pic->p[Y_PLANE].i_visible_pitch - 1) :
(p_outpic->p[Y_PLANE].i_visible_pitch - 1);
/* iterate for every two visible line in the frame */ /* iterate for every two visible line in the frame */
for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2) for( int x = 0; x < i_picture_size_limit; x += 2)
{ {
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch; // y = y - y/4 {to prevent overflow} + intensity / 4
const int i_dy_line2_start = ( y + 1 ) * p_outpic->p[Y_PLANE].i_pitch; p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
const int i_du_line_start = (y/2) * p_outpic->p[U_PLANE].i_pitch; p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
const int i_dv_line_start = (y/2) * p_outpic->p[V_PLANE].i_pitch; (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
// to prevent sigsegv if one pic is smaller (theoretically) (i_intensity >> 2);
int i_picture_size_limit = p_pic->p[Y_PLANE].i_visible_pitch p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
< p_outpic->p[Y_PLANE].i_visible_pitch p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
? (p_pic->p[Y_PLANE].i_visible_pitch - 1) : (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
(p_outpic->p[Y_PLANE].i_visible_pitch - 1); (i_intensity >> 2);
/* iterate for every two visible line in the frame */ p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
for( int x = 0; x < i_picture_size_limit; x += 2) p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
{ (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
// y = y - y/4 {to prevent overflow} + intensity / 4 (i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] = p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] - p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) + (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
(i_intensity >> 2); (i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] = // u = 128 {half => B&W} - intensity / 6
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] - p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) + filling_const_8u;
(i_intensity >> 2); // v = 128 {half => B&W} + intensity / 14
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] = p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] - filling_const_8v;
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
(i_intensity >> 2);
// u = 128 {half => B&W} - intensity / 6
p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
filling_const_8u;
// v = 128 {half => B&W} + intensity / 14
p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
filling_const_8v;
}
} }
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment