Commit 972c39b6, authored by Martin Briza, committed by Jean-Baptiste Kempf

Sepia filter ASM optimization

Added functions to improve YUV performance, still needs a bit of work on RGB
Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>
parent 23e6f1bf
......@@ -32,6 +32,7 @@
#include <vlc_common.h>
#include <vlc_plugin.h>
#include <vlc_filter.h>
#include <vlc_cpu.h>
#include <assert.h>
#include "filter_picture.h"
......@@ -46,7 +47,8 @@ static void RVSepia( picture_t *, picture_t *, int );
static void PlanarI420Sepia( picture_t *, picture_t *, int);
static void PackedYUVSepia( picture_t *, picture_t *, int);
static picture_t *Filter( filter_t *, picture_t * );
/* File-local SIMD helpers, defined further down in this file.  They are
 * declared static: with a plain `inline` declaration C99 only provides an
 * inline definition, so any call the compiler chooses not to inline would
 * reference an external symbol that no translation unit defines. */
static inline void Sepia8ySSE41( uint8_t *, const uint8_t *, volatile uint8_t * );
static inline void Memcpy8BMMX( uint8_t *, const uint8_t * );
static const char *const ppsz_filter_options[] = {
"intensity", NULL
};
......@@ -212,6 +214,94 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
// prepared values to copy for U and V channels
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
#if defined(CAN_COMPILE_SSE4_1) && 1
if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
{
/*prepare array of values to copy with mmx, compute only once
to improve speed */
volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
i_intensity, i_intensity, i_intensity, i_intensity,
i_intensity, i_intensity };
const uint8_t filling_array_8u[8] =
{ filling_const_8u, filling_const_8u, filling_const_8u,
filling_const_8u, filling_const_8u, filling_const_8u,
filling_const_8u, filling_const_8u };
const uint8_t filling_array_8v[8] =
{ filling_const_8v, filling_const_8v, filling_const_8v,
filling_const_8v, filling_const_8v, filling_const_8v,
filling_const_8v, filling_const_8v };
/* iterate for every two visible line in the frame */
for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
const int i_dy_line2_start =
(y + 1) * p_outpic->p[Y_PLANE].i_pitch;
const int i_du_line_start =
(y / 2) * p_outpic->p[U_PLANE].i_pitch;
const int i_dv_line_start =
(y / 2) * p_outpic->p[V_PLANE].i_pitch;
int x = 0;
/* iterate over the pixels of the current pair of lines (sixteen luma values at once) */
for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16)
{
/* Compute luma (Y) channel values with asm function */
Sepia8ySSE41(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
intensity_array );
Sepia8ySSE41(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
intensity_array );
Sepia8ySSE41(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
intensity_array );
Sepia8ySSE41(
&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
&p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
intensity_array );
/* Copy precomputed values to destination image memory location */
Memcpy8BMMX(
&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
filling_array_8u );
Memcpy8BMMX(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
filling_array_8v );
}
/* Completing the job, the cycle above takes really big chunks, so
this makes sure the job will be done completely */
for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2)
{
// y = y - y/4 {to prevent overflow} + intensity / 4
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
(i_intensity >> 2);
p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
(p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
(i_intensity >> 2);
// u = 128 {half => B&W} - intensity / 6
p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
filling_const_8u;
// v = 128 {half => B&W} + intensity / 14
p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
filling_const_8v;
}
}
} else
#endif
{
/* iterate for every two visible line in the frame */
for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
......@@ -252,6 +342,7 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
filling_const_8v;
}
}
}
}
/*****************************************************************************
......@@ -278,7 +369,69 @@ static void PackedYUVSepia( picture_t *p_pic, picture_t *p_outpic,
p_in_end = p_in + p_pic->p[0].i_visible_lines
* p_pic->p[0].i_pitch;
p_out = p_outpic->p[0].p_pixels;
#if defined(CAN_COMPILE_SSE4_1)
if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
{
/*prepare array of values to copy with mmx, compute only once
to improve speed */
volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
i_intensity, i_intensity, i_intensity, i_intensity,
i_intensity,
i_intensity
};
const uint8_t filling_array_8u[8] =
{ filling_const_8u, filling_const_8u,
filling_const_8u, filling_const_8u, filling_const_8u,
filling_const_8u,
filling_const_8u, filling_const_8u
};
const uint8_t filling_array_8v[8] =
{ filling_const_8v, filling_const_8v,
filling_const_8v, filling_const_8v, filling_const_8v,
filling_const_8v,
filling_const_8v, filling_const_8v
};
/* iterate over every visible line in the frame */
while (p_in < p_in_end)
{
p_line_end = p_in + p_pic->p[0].i_visible_pitch;
while (p_in < p_line_end)
{
Sepia8ySSE41(&p_out[i_yindex], &p_in[i_yindex],
intensity_array);
Sepia8ySSE41(&p_out[i_yindex + 8], &p_in[i_yindex + 8],
intensity_array);
Sepia8ySSE41(&p_out[i_yindex + 16], &p_in[i_yindex + 16],
intensity_array);
Sepia8ySSE41(&p_out[i_yindex + 24], &p_in[i_yindex + 24],
intensity_array);
Memcpy8BMMX(&p_out[i_uindex], filling_array_8u);
Memcpy8BMMX(&p_out[i_vindex], filling_array_8v);
p_in += 32;
p_out += 32;
}
while (p_in < p_line_end)
{
p_out[i_yindex] =
p_in[i_yindex] - (p_in[i_yindex] >> 2) +
(i_intensity >> 2);
p_out[i_yindex + 2] =
p_in[i_yindex + 2] - (p_in[i_yindex + 2] >> 2) +
(i_intensity >> 2);
p_out[i_uindex] = filling_const_8u;
p_out[i_vindex] = filling_const_8v;
p_in += 4;
p_out += 4;
}
p_in += p_pic->p[0].i_pitch - p_pic->p[0].i_visible_pitch;
p_out += p_outpic->p[0].i_pitch
- p_outpic->p[0].i_visible_pitch;
}
} else
#endif
{
while( p_in < p_in_end )
{
p_line_end = p_in + p_pic->p[0].i_visible_pitch;
......@@ -299,6 +452,7 @@ static void PackedYUVSepia( picture_t *p_pic, picture_t *p_outpic,
p_out += p_outpic->p[0].i_pitch
- p_outpic->p[0].i_visible_pitch;
}
}
}
/*****************************************************************************
......@@ -314,7 +468,6 @@ static void RVSepia( picture_t *p_pic, picture_t *p_outpic, int i_intensity )
#define ONE_HALF (1 << (SCALEBITS - 1))
#define FIX(x) ((int) ((x) * (1<<SCALEBITS) + 0.5))
uint8_t *p_in, *p_in_end, *p_line_end, *p_out;
int i_r, i_g, i_b;
bool b_isRV32 = p_pic->format.i_chroma == VLC_CODEC_RGB32;
int i_rindex = 0, i_gindex = 1, i_bindex = 2;
......@@ -372,6 +525,50 @@ static void RVSepia( picture_t *p_pic, picture_t *p_outpic, int i_intensity )
#undef FIX
}
/*****************************************************************************
 * Sepia8ySSE41
 *****************************************************************************
 * Applies the sepia luma transform
 *     y' = y - y / 4 + intensity / 4
 * to eight consecutive bytes.
 *
 *  dst         - receives the eight transformed bytes
 *  src         - the eight input bytes
 *  i_intensity - eight bytes, one intensity value per input byte (the callers
 *                in this file fill all eight with the same value)
 *
 * With SSE4.1 inline assembly available, the bytes are zero-extended to
 * 16-bit words, processed with word instructions and packed back to bytes.
 * Otherwise an equivalent scalar loop is used, so the function never silently
 * leaves dst untouched.
 *
 * The definition is `extern inline` so that an out-of-line copy is emitted:
 * a plain C99 `inline` definition gives the linker nothing to resolve a
 * non-inlined call against.
 *****************************************************************************/
extern inline void Sepia8ySSE41(uint8_t * dst, const uint8_t * src,
                                volatile uint8_t * i_intensity)
{
#if defined(CAN_COMPILE_SSE4_1) && 1
    /* AT&T operand order is "source, destination": every arithmetic result
       below accumulates in xmm1, which is what finally gets stored. */
    __asm__ volatile (
        "pmovzxbw      (%1), %%xmm1\n"    // xmm1 = y, bytes widened to words
        "pmovzxbw      (%2), %%xmm3\n"    // xmm3 = intensity, widened to words
        "movdqa      %%xmm1, %%xmm2\n"    // xmm2 = y
        "psrlw           $2, %%xmm2\n"    // xmm2 = y / 4
        "psrlw           $2, %%xmm3\n"    // xmm3 = intensity / 4
        "psubw       %%xmm2, %%xmm1\n"    // xmm1 = y - y / 4
        "paddw       %%xmm3, %%xmm1\n"    // xmm1 = y - y / 4 + intensity / 4
        "packuswb    %%xmm1, %%xmm1\n"    // pack words back to bytes (unsigned saturation)
        "movq        %%xmm1, (%0)  \n"    // store the eight result bytes to dst
        :
        :"r" (dst), "r"(src), "r"(i_intensity)
        :"memory", "xmm1", "xmm2", "xmm3");
#else
    /* Portable equivalent: at most 192 + 63 = 255, so the sum always fits in
       a byte. */
    for (int i = 0; i < 8; i++)
        dst[i] = (uint8_t)(src[i] - (src[i] >> 2) + (i_intensity[i] >> 2));
#endif
}
/*****************************************************************************
 * Memcpy8BMMX: Copies 8 bytes of memory from src to dst
 *****************************************************************************
 * Despite the name, the fast path moves the data through an XMM register
 * (an SSE2 `movq`), not an MMX one.  It is therefore guarded by the same
 * CAN_COMPILE_SSE4_1 switch as the callers in this file, so the copy can
 * never be compiled out while its callers are compiled in.  Without that
 * switch a plain byte loop performs the same copy.
 *
 * The definition is `extern inline` so that an out-of-line copy is emitted:
 * a plain C99 `inline` definition gives the linker nothing to resolve a
 * non-inlined call against.
 *****************************************************************************/
extern inline void Memcpy8BMMX(uint8_t * dst, const uint8_t * src)
{
#if defined(CAN_COMPILE_SSE4_1) && 1
    __asm__ volatile (
        "movq        (%1), %%xmm0\n"    // load eight bytes from src
        "movq      %%xmm0, (%0)\n"      // store them to dst
        :
        :"r" (dst), "r"(src)
        :"memory", "xmm0");             // xmm0 is overwritten: tell the compiler
#else
    for (int i = 0; i < 8; i++)
        dst[i] = src[i];
#endif
}
static int FilterCallback ( vlc_object_t *p_this, char const *psz_var,
vlc_value_t oldval, vlc_value_t newval,
void *p_data )
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment