Commit 19ba572b authored by michael

mmx2 optimization of huffyuv median encoding


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@2372 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent e4b65a2c
...@@ -2526,6 +2526,24 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ ...@@ -2526,6 +2526,24 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
dst[i+0] = src1[i+0]-src2[i+0]; dst[i+0] = src1[i+0]-src2[i+0];
} }
/**
 * Plain C version of huffyuv's median-prediction subtraction.
 *
 * For every position x in [0, w) the predictor is
 * mid_pred(L, T, (L + T - LT) & 0xFF), where T = src1[x] and L / LT are the
 * running "left" and "top-left" bytes. dst[x] receives src2[x] minus that
 * predictor (stored as a byte).
 *
 * @param dst      output residuals, w bytes are written
 * @param src1     row supplying the T (and subsequently LT) samples
 * @param src2     row supplying the current (and subsequently L) samples
 * @param w        number of bytes to process; nothing is processed if w <= 0
 * @param left     in: initial L value, out: last byte taken from src2
 * @param left_top in: initial LT value, out: last byte taken from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    /* the running state is kept as bytes, so the incoming ints are narrowed */
    uint8_t cur_left    = *left;
    uint8_t cur_topleft = *left_top;
    int x = 0;

    while(x < w){
        const uint8_t top      = src1[x];
        const int     gradient = (cur_left + top - cur_topleft) & 0xFF;
        const int     pred     = mid_pred(cur_left, top, gradient);

        /* slide the neighbourhood one sample to the right */
        cur_topleft = top;
        cur_left    = src2[x];
        dst[x]      = cur_left - pred;
        x++;
    }

    *left     = cur_left;
    *left_top = cur_topleft;
}
#define BUTTERFLY2(o1,o2,i1,i2) \ #define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\ o1= (i1)+(i2);\
o2= (i1)-(i2); o2= (i1)-(i2);
...@@ -3007,6 +3025,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) ...@@ -3007,6 +3025,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->add_bytes= add_bytes_c; c->add_bytes= add_bytes_c;
c->diff_bytes= diff_bytes_c; c->diff_bytes= diff_bytes_c;
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
c->bswap_buf= bswap_buf; c->bswap_buf= bswap_buf;
#ifdef HAVE_MMX #ifdef HAVE_MMX
......
...@@ -234,6 +234,11 @@ typedef struct DSPContext { ...@@ -234,6 +234,11 @@ typedef struct DSPContext {
/* huffyuv specific */ /* huffyuv specific */
void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w); void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w); void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
/**
* subtract huffyuv's variant of median prediction
* note, this might read from src1[-1], src2[-1]
*/
void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
void (*bswap_buf)(uint32_t *dst, uint32_t *src, int w); void (*bswap_buf)(uint32_t *dst, uint32_t *src, int w);
/* (I)DCT */ /* (I)DCT */
......
...@@ -153,25 +153,6 @@ static inline void add_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *d ...@@ -153,25 +153,6 @@ static inline void add_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *d
*left_top= lt; *left_top= lt;
} }
//FIXME optimize
/**
 * Compute huffyuv median-prediction residuals for w bytes.
 *
 * Each output byte is src2[pos] minus mid_pred() of the left sample, the
 * src1[pos] sample and their byte-wrapped gradient (left + src1[pos] - top-left).
 * The left / top-left state is read from and written back through
 * left and left_top so that consecutive calls can continue a row.
 */
static inline void sub_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev    = *left;     /* narrowed to a byte on purpose */
    uint8_t prev_up = *left_top;
    int pos;

    for(pos = 0; pos < w; pos++){
        const uint8_t up   = src1[pos];
        const int     pred = mid_pred(prev, up, (prev + up - prev_up) & 0xFF);

        prev_up  = up;
        prev     = src2[pos];
        dst[pos] = prev - pred;
    }

    *left     = prev;
    *left_top = prev_up;
}
static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w, int *red, int *green, int *blue){ static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w, int *red, int *green, int *blue){
int i; int i;
int r,g,b; int r,g,b;
...@@ -999,9 +980,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, ...@@ -999,9 +980,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
lefttopy= p->data[0][3]; lefttopy= p->data[0][3];
lefttopu= p->data[1][1]; lefttopu= p->data[1][1];
lefttopv= p->data[2][1]; lefttopv= p->data[2][1];
sub_median_prediction(s->temp[0], p->data[0]+4, p->data[0] + fake_ystride+4, width-4 , &lefty, &lefttopy); s->dsp.sub_hfyu_median_prediction(s->temp[0], p->data[0]+4, p->data[0] + fake_ystride+4, width-4 , &lefty, &lefttopy);
sub_median_prediction(s->temp[1], p->data[1]+2, p->data[1] + fake_ustride+2, width2-2, &leftu, &lefttopu); s->dsp.sub_hfyu_median_prediction(s->temp[1], p->data[1]+2, p->data[1] + fake_ustride+2, width2-2, &leftu, &lefttopu);
sub_median_prediction(s->temp[2], p->data[2]+2, p->data[2] + fake_vstride+2, width2-2, &leftv, &lefttopv); s->dsp.sub_hfyu_median_prediction(s->temp[2], p->data[2]+2, p->data[2] + fake_vstride+2, width2-2, &leftv, &lefttopv);
encode_422_bitstream(s, width-4); encode_422_bitstream(s, width-4);
y++; cy++; y++; cy++;
...@@ -1011,7 +992,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, ...@@ -1011,7 +992,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
if(s->bitstream_bpp==12){ if(s->bitstream_bpp==12){
while(2*cy > y){ while(2*cy > y){
ydst= p->data[0] + p->linesize[0]*y; ydst= p->data[0] + p->linesize[0]*y;
sub_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy); s->dsp.sub_hfyu_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
encode_gray_bitstream(s, width); encode_gray_bitstream(s, width);
y++; y++;
} }
...@@ -1021,9 +1002,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, ...@@ -1021,9 +1002,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
udst= p->data[1] + p->linesize[1]*cy; udst= p->data[1] + p->linesize[1]*cy;
vdst= p->data[2] + p->linesize[2]*cy; vdst= p->data[2] + p->linesize[2]*cy;
sub_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy); s->dsp.sub_hfyu_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
sub_median_prediction(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu); s->dsp.sub_hfyu_median_prediction(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
sub_median_prediction(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv); s->dsp.sub_hfyu_median_prediction(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
encode_422_bitstream(s, width); encode_422_bitstream(s, width);
} }
......
...@@ -583,6 +583,43 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ ...@@ -583,6 +583,43 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
for(; i<w; i++) for(; i<w; i++)
dst[i+0] = src1[i+0]-src2[i+0]; dst[i+0] = src1[i+0]-src2[i+0];
} }
/**
 * MMX2 version of huffyuv's median-prediction subtraction.
 *
 * Per byte, with T = src1[i], LT = src1[i-1], L = src2[i-1], X = src2[i]:
 *   pred   = median(L, T, L + T - LT)   (byte arithmetic, wrapping)
 *   dst[i] = X - pred
 *
 * The vector loop handles 8 bytes per iteration, is entered unconditionally
 * and repeats while i < w (unsigned compare). Consequences visible below:
 *  - it loads from src1[-1] and src2[-1];
 *  - when w is not a multiple of 8 it reads and writes up to the next
 *    multiple of 8 bytes, so the buffers must be large enough for that.
 * Position 0 is afterwards recomputed in C from *left / *left_top, because
 * the vector code used src2[-1] / src1[-1] as its left neighbours there.
 *
 * @param dst      output residuals
 * @param src1     row supplying the T / LT samples
 * @param src2     row supplying the X / L samples
 * @param w        number of bytes to process
 * @param left     in: L for position 0, out: src2[w-1]
 * @param left_top in: LT for position 0, out: src1[w-1]
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t" // min(L + T - LT, max(T, L))
        "pmaxub %%mm1, %%mm4            \n\t" // median of the three = pred
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "addl $8, %0                    \n\t"
        "cmpl %4, %0                    \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
        /* The asm reads src1/src2 and writes dst through pointers that are
         * only passed as register operands; the "memory" clobber keeps the
         * compiler from moving or caching the C accesses below (notably the
         * dst[0] store) across it. */
        : "memory"
    );

    l= *left;
    lt= *left_top;

    /* redo the first sample with the caller supplied neighbours */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
#define LBUTTERFLY2(a1,b1,a2,b2)\ #define LBUTTERFLY2(a1,b1,a2,b2)\
"paddw " #b1 ", " #a1 " \n\t"\ "paddw " #b1 ", " #a1 " \n\t"\
"paddw " #b2 ", " #a2 " \n\t"\ "paddw " #b2 ", " #a2 " \n\t"\
...@@ -1699,6 +1736,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) ...@@ -1699,6 +1736,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif #endif
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
} else if (mm_flags & MM_3DNOW) { } else if (mm_flags & MM_3DNOW) {
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment