Commit 655f16c5 authored by michael

avoid one transpose (730->680 dezicycles on duron)


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@4332 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent df8039a8
...@@ -333,6 +333,8 @@ typedef struct H264Context{ ...@@ -333,6 +333,8 @@ typedef struct H264Context{
uint8_t *direct_table; uint8_t *direct_table;
uint8_t direct_cache[5*8]; uint8_t direct_cache[5*8];
uint8_t zigzag_scan[16];
uint8_t field_scan[16];
}H264Context; }H264Context;
static VLC coeff_token_vlc[4]; static VLC coeff_token_vlc[4];
...@@ -2721,6 +2723,18 @@ static int decode_init(AVCodecContext *avctx){ ...@@ -2721,6 +2723,18 @@ static int decode_init(AVCodecContext *avctx){
s->low_delay= 1; s->low_delay= 1;
avctx->pix_fmt= PIX_FMT_YUV420P; avctx->pix_fmt= PIX_FMT_YUV420P;
if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
}else{
int i;
for(i=0; i<16; i++){
#define T(x) (x>>2) | ((x<<2) & 0xF)
h->zigzag_scan[i] = T(zigzag_scan[i]);
h-> field_scan[i] = T( field_scan[i]);
}
}
decode_init_vlc(h); decode_init_vlc(h);
if(avctx->extradata_size > 0 && avctx->extradata && if(avctx->extradata_size > 0 && avctx->extradata &&
...@@ -4591,10 +4605,10 @@ decode_intra_mb: ...@@ -4591,10 +4605,10 @@ decode_intra_mb:
// fill_non_zero_count_cache(h); // fill_non_zero_count_cache(h);
if(IS_INTERLACED(mb_type)){ if(IS_INTERLACED(mb_type)){
scan= field_scan; scan= h->field_scan;
dc_scan= luma_dc_field_scan; dc_scan= luma_dc_field_scan;
}else{ }else{
scan= zigzag_scan; scan= h->zigzag_scan;
dc_scan= luma_dc_zigzag_scan; dc_scan= luma_dc_zigzag_scan;
} }
...@@ -5575,10 +5589,10 @@ decode_intra_mb: ...@@ -5575,10 +5589,10 @@ decode_intra_mb:
int dqp; int dqp;
if(IS_INTERLACED(mb_type)){ if(IS_INTERLACED(mb_type)){
scan= field_scan; scan= h->field_scan;
dc_scan= luma_dc_field_scan; dc_scan= luma_dc_field_scan;
}else{ }else{
scan= zigzag_scan; scan= h->zigzag_scan;
dc_scan= luma_dc_zigzag_scan; dc_scan= luma_dc_zigzag_scan;
} }
......
...@@ -673,14 +673,11 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride) ...@@ -673,14 +673,11 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
/* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
IDCT4_1D( %%mm3, %%mm2, %%mm1, %%mm0, %%mm4, %%mm5 ) IDCT4_1D( %%mm3, %%mm2, %%mm1, %%mm0, %%mm4, %%mm5 )
/* in: 2,4,1,3 out: 2,3,0,1 */
TRANSPOSE4( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 )
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
:: "m"(ff_pw_32)); :: "m"(ff_pw_32));
STORE_DIFF_4P( %%mm2, %%mm4, %%mm7, &dst[0*stride] ); STORE_DIFF_4P( %%mm2, %%mm0, %%mm7, &dst[0*stride] );
STORE_DIFF_4P( %%mm3, %%mm4, %%mm7, &dst[1*stride] ); STORE_DIFF_4P( %%mm4, %%mm0, %%mm7, &dst[1*stride] );
STORE_DIFF_4P( %%mm0, %%mm4, %%mm7, &dst[2*stride] ); STORE_DIFF_4P( %%mm1, %%mm0, %%mm7, &dst[2*stride] );
STORE_DIFF_4P( %%mm1, %%mm4, %%mm7, &dst[3*stride] ); STORE_DIFF_4P( %%mm3, %%mm0, %%mm7, &dst[3*stride] );
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment