Commit 6f3e7b0b authored by astrange

H.264: Use 64-/128-bit write-combining macros for copies

2-3% faster decode on x86-32 core2.


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@21440 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent 5c734382
...@@ -945,6 +945,7 @@ int ff_h264_frame_start(H264Context *h){ ...@@ -945,6 +945,7 @@ int ff_h264_frame_start(H264Context *h){
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
uint8_t *top_border;
int top_idx = 1; int top_idx = 1;
src_y -= linesize; src_y -= linesize;
...@@ -954,11 +955,11 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src ...@@ -954,11 +955,11 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
if(!simple && FRAME_MBAFF){ if(!simple && FRAME_MBAFF){
if(s->mb_y&1){ if(s->mb_y&1){
if(!MB_MBAFF){ if(!MB_MBAFF){
*(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize); top_border = h->top_borders[0][s->mb_x];
*(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize); AV_COPY128(top_border, src_y + 15*linesize);
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
*(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize); AV_COPY64(top_border+16, src_cb+7*uvlinesize);
*(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize); AV_COPY64(top_border+24, src_cr+7*uvlinesize);
} }
} }
}else if(MB_MBAFF){ }else if(MB_MBAFF){
...@@ -967,15 +968,14 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src ...@@ -967,15 +968,14 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
return; return;
} }
top_border = h->top_borders[top_idx][s->mb_x];
// There are two lines saved, the line above the top macroblock of a pair, // There are two lines saved, the line above the top macroblock of a pair,
// and the line above the bottom macroblock // and the line above the bottom macroblock
AV_COPY128(top_border, src_y + 16*linesize);
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); AV_COPY64(top_border+16, src_cb+8*uvlinesize);
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); AV_COPY64(top_border+24, src_cr+8*uvlinesize);
} }
} }
...@@ -987,6 +987,8 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c ...@@ -987,6 +987,8 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c
int deblock_top; int deblock_top;
int mb_xy; int mb_xy;
int top_idx = 1; int top_idx = 1;
uint8_t *top_border_m1 = h->top_borders[top_idx][s->mb_x-1];
uint8_t *top_border = h->top_borders[top_idx][s->mb_x];
if(!simple && FRAME_MBAFF){ if(!simple && FRAME_MBAFF){
if(s->mb_y&1){ if(s->mb_y&1){
...@@ -1010,31 +1012,29 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c ...@@ -1010,31 +1012,29 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c
src_cb -= uvlinesize + 1; src_cb -= uvlinesize + 1;
src_cr -= uvlinesize + 1; src_cr -= uvlinesize + 1;
#define XCHG(a,b,t,xchg)\ #define XCHG(a,b,xchg)\
t= a;\ if (xchg) AV_SWAP64(b,a);\
if(xchg)\ else AV_COPY64(b,a);
a= b;\
b= t;
if(deblock_top){ if(deblock_top){
if(deblock_left){ if(deblock_left){
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+8), *(uint64_t*)(src_y -7), temp64, 1); XCHG(top_border_m1+8, src_y -7, 1);
} }
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg); XCHG(top_border+0, src_y +1, xchg);
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1); XCHG(top_border+8, src_y +9, 1);
if(s->mb_x+1 < s->mb_width){ if(s->mb_x+1 < s->mb_width){
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1); XCHG(h->top_borders[top_idx][s->mb_x+1], src_y +17, 1);
} }
} }
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
if(deblock_top){ if(deblock_top){
if(deblock_left){ if(deblock_left){
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+16), *(uint64_t*)(src_cb -7), temp64, 1); XCHG(top_border_m1+16, src_cb -7, 1);
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+24), *(uint64_t*)(src_cr -7), temp64, 1); XCHG(top_border_m1+24, src_cr -7, 1);
} }
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1); XCHG(top_border+16, src_cb+1, 1);
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1); XCHG(top_border+24, src_cr+1, 1);
} }
} }
} }
......
...@@ -312,7 +312,7 @@ typedef struct H264Context{ ...@@ -312,7 +312,7 @@ typedef struct H264Context{
/** /**
* Motion vector cache. * Motion vector cache.
*/ */
DECLARE_ALIGNED_8(int16_t, mv_cache)[2][5*8][2]; DECLARE_ALIGNED_16(int16_t, mv_cache)[2][5*8][2];
DECLARE_ALIGNED_8(int8_t, ref_cache)[2][5*8]; DECLARE_ALIGNED_8(int8_t, ref_cache)[2][5*8];
#define LIST_NOT_USED -1 //FIXME rename? #define LIST_NOT_USED -1 //FIXME rename?
#define PART_NOT_AVAILABLE -2 #define PART_NOT_AVAILABLE -2
...@@ -475,7 +475,7 @@ typedef struct H264Context{ ...@@ -475,7 +475,7 @@ typedef struct H264Context{
uint8_t *chroma_pred_mode_table; uint8_t *chroma_pred_mode_table;
int last_qscale_diff; int last_qscale_diff;
int16_t (*mvd_table[2])[2]; int16_t (*mvd_table[2])[2];
DECLARE_ALIGNED_8(int16_t, mvd_cache)[2][5*8][2]; DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2];
uint8_t *direct_table; uint8_t *direct_table;
uint8_t direct_cache[5*8]; uint8_t direct_cache[5*8];
...@@ -809,11 +809,11 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb ...@@ -809,11 +809,11 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb
if(IS_INTRA(mb_type)) if(IS_INTRA(mb_type))
return 0; return 0;
*((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]); AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]);
*((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]); AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]);
*((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]); *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]);
*((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]); *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]);
*((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]); AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]);
h->cbp= h->cbp_table[mb_xy]; h->cbp= h->cbp_table[mb_xy];
...@@ -825,7 +825,10 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb ...@@ -825,7 +825,10 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb
int list; int list;
for(list=0; list<h->list_count; list++){ for(list=0; list<h->list_count; list++){
int8_t *ref; int8_t *ref;
int y, b_xy; int y, b_stride;
int16_t (*mv_dst)[2];
int16_t (*mv_src)[2];
if(!USES_LIST(mb_type, list)){ if(!USES_LIST(mb_type, list)){
fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] = *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
...@@ -845,10 +848,11 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb ...@@ -845,10 +848,11 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101; *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101;
} }
b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; b_stride = h->b_stride;
mv_dst = &h->mv_cache[list][scan8[0]];
mv_src = &s->current_picture.motion_val[list][4*s->mb_x + 4*s->mb_y*b_stride];
for(y=0; y<4; y++){ for(y=0; y<4; y++){
*(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]; AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride);
*(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
} }
} }
...@@ -1059,8 +1063,7 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb ...@@ -1059,8 +1063,7 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb
if(USES_LIST(top_type, list)){ if(USES_LIST(top_type, list)){
const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
*(uint64_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0]; AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
*(uint64_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2];
if(for_deblock){ if(for_deblock){
int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
h->ref_cache[list][scan8[0] + 0 - 1*8]= h->ref_cache[list][scan8[0] + 0 - 1*8]=
...@@ -1074,8 +1077,7 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb ...@@ -1074,8 +1077,7 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb
h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1]; h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
} }
}else{ }else{
*(uint64_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]= AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]);
*(uint64_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= 0;
*(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= (((for_deblock||top_type) ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101; *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= (((for_deblock||top_type) ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
} }
...@@ -1143,15 +1145,9 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb ...@@ -1143,15 +1145,9 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb
/* XXX beurk, Load mvd */ /* XXX beurk, Load mvd */
if(USES_LIST(top_type, list)){ if(USES_LIST(top_type, list)){
const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
*(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0]; AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
*(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
*(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
*(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
}else{ }else{
*(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]= AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
*(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
*(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
*(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
} }
if(USES_LIST(left_type[0], list)){ if(USES_LIST(left_type[0], list)){
const int b_xy= h->mb2b_xy[left_xy[0]] + 3; const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
...@@ -1275,11 +1271,11 @@ static inline int pred_intra_mode(H264Context *h, int n){ ...@@ -1275,11 +1271,11 @@ static inline int pred_intra_mode(H264Context *h, int n){
static inline void write_back_non_zero_count(H264Context *h){ static inline void write_back_non_zero_count(H264Context *h){
const int mb_xy= h->mb_xy; const int mb_xy= h->mb_xy;
*((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]); AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]);
*((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]); AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]);
*((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]); *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]);
*((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]); *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]);
*((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]); AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]);
} }
static inline void write_back_motion(H264Context *h, int mb_type){ static inline void write_back_motion(H264Context *h, int mb_type){
...@@ -1292,21 +1288,27 @@ static inline void write_back_motion(H264Context *h, int mb_type){ ...@@ -1292,21 +1288,27 @@ static inline void write_back_motion(H264Context *h, int mb_type){
fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1); fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
for(list=0; list<h->list_count; list++){ for(list=0; list<h->list_count; list++){
int y; int y, b_stride;
int16_t (*mv_dst)[2];
int16_t (*mv_src)[2];
if(!USES_LIST(mb_type, list)) if(!USES_LIST(mb_type, list))
continue; continue;
b_stride = h->b_stride;
mv_dst = &s->current_picture.motion_val[list][b_xy];
mv_src = &h->mv_cache[list][scan8[0]];
for(y=0; y<4; y++){ for(y=0; y<4; y++){
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]; AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
} }
if( CABAC ) { if( CABAC ) {
int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
if(IS_SKIP(mb_type)) if(IS_SKIP(mb_type))
fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4); fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4);
else else
for(y=0; y<4; y++){ for(y=0; y<4; y++){
*(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y]; AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y);
*(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment