Commit 6253132f authored by michael

Get rid of mb2b8_xy and b8_stride; change the arrays organized by b8_stride to ones organized by mb_stride in h264.
About 20 CPU cycles faster overall per MB.


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@22065 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent a4010a01
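For context, a minimal sketch (not part of the commit; the helper names and toy dimensions are invented for illustration) of the addressing change the diff below makes to the per-8x8-block ref_index arrays: the old layout is a row-major grid of 8x8 blocks with row stride b8_stride = 2*mb_width, while the new layout keeps the four 8x8 entries of each macroblock consecutive at 4*mb_xy with in-MB offset x8 + 2*y8, which is why the new code uses small constants ("+ 1", "+ 2", "ref += 2") instead of a separate stride.

/* Sketch only: contrast the two ref_index addressing schemes.
 * old_ref_index_offset/new_ref_index_offset are hypothetical helpers,
 * not functions from FFmpeg. */
#include <stdio.h>

/* Old layout: a row-major grid of 8x8 blocks, row stride
 * b8_stride = 2*mb_width (the stride this commit removes). */
static int old_ref_index_offset(int mb_x, int mb_y, int x8, int y8, int b8_stride)
{
    return (2*mb_x + x8) + (2*mb_y + y8)*b8_stride;
}

/* New layout: four consecutive entries per macroblock, indexed by
 * mb_xy = mb_x + mb_y*mb_stride; within the MB the offset is x8 + 2*y8,
 * so "+ 2"/"+ 3" select the bottom row and "+ 1"/"+ 3" the right column
 * of a neighbouring MB, as in the new code below. */
static int new_ref_index_offset(int mb_x, int mb_y, int x8, int y8, int mb_stride)
{
    const int mb_xy = mb_x + mb_y*mb_stride;
    return 4*mb_xy + x8 + 2*y8;
}

int main(void)
{
    const int mb_width = 4, mb_height = 2;    /* toy frame size */
    const int mb_stride = mb_width + 1;       /* mb_stride typically carries padding */
    const int b8_stride = 2*mb_width;         /* as set in decode_slice_header before this commit */

    /* Both schemes enumerate every 8x8 block exactly once; only the order
     * in memory differs, and the new one needs no extra stride variable. */
    for (int mb_y = 0; mb_y < mb_height; mb_y++)
        for (int mb_x = 0; mb_x < mb_width; mb_x++)
            for (int y8 = 0; y8 < 2; y8++)
                for (int x8 = 0; x8 < 2; x8++)
                    printf("MB(%d,%d) blk(%d,%d): old %2d  new %2d\n",
                           mb_x, mb_y, x8, y8,
                           old_ref_index_offset(mb_x, mb_y, x8, y8, b8_stride),
                           new_ref_index_offset(mb_x, mb_y, x8, y8, mb_stride));
    return 0;
}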
@@ -692,7 +692,7 @@ void ff_er_frame_end(MpegEncContext *s){
         av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
         for(i=0; i<2; i++){
-            pic->ref_index[i]= av_mallocz(size * sizeof(uint8_t));
+            pic->ref_index[i]= av_mallocz(s->mb_stride * s->mb_height * 4 * sizeof(uint8_t));
             pic->motion_val_base[i]= av_mallocz((size+4) * 2 * sizeof(uint16_t));
             pic->motion_val[i]= pic->motion_val_base[i]+4;
         }
@@ -661,7 +661,6 @@ static void free_tables(H264Context *h){
     av_freep(&h->mb2b_xy);
     av_freep(&h->mb2br_xy);
-    av_freep(&h->mb2b8_xy);
     for(i = 0; i < MAX_THREADS; i++) {
         hx = h->thread_context[i];
@@ -764,16 +763,13 @@ int ff_h264_alloc_tables(H264Context *h){
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy  , big_mb_num * sizeof(uint32_t), fail);
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2br_xy , big_mb_num * sizeof(uint32_t), fail);
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b8_xy , big_mb_num * sizeof(uint32_t), fail);
     for(y=0; y<s->mb_height; y++){
         for(x=0; x<s->mb_width; x++){
             const int mb_xy= x + y*s->mb_stride;
             const int b_xy = 4*x + 4*y*h->b_stride;
-            const int b8_xy= 2*x + 2*y*h->b8_stride;
             h->mb2b_xy [mb_xy]= b_xy;
             h->mb2br_xy[mb_xy]= 8*(FMO ? mb_xy : (mb_xy % (2*s->mb_stride)));
-            h->mb2b8_xy[mb_xy]= b8_xy;
         }
     }
@@ -798,7 +794,6 @@ static void clone_tables(H264Context *dst, H264Context *src){
     dst->cbp_table              = src->cbp_table;
     dst->mb2b_xy                = src->mb2b_xy;
     dst->mb2br_xy               = src->mb2br_xy;
-    dst->mb2b8_xy               = src->mb2b8_xy;
     dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
     dst->mvd_table[0]           = src->mvd_table[0];
     dst->mvd_table[1]           = src->mvd_table[1];
@@ -1768,7 +1763,6 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
     h->b_stride=  s->mb_width*4;
-    h->b8_stride= s->mb_width*2;
     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
     if(h->sps.frame_mbs_only_flag)
@@ -347,9 +347,7 @@ typedef struct H264Context{
     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
     uint32_t *mb2br_xy;
-    uint32_t *mb2b8_xy;
     int b_stride; //FIXME use s->b4_stride
-    int b8_stride;
     int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
     int mb_uvlinesize;
@@ -990,12 +988,11 @@ static void fill_decode_caches(H264Context *h, int mb_type){
         if(USES_LIST(top_type, list)){
             const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-            const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
             AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
             h->ref_cache[list][scan8[0] + 0 - 1*8]=
-            h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
+            h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 2];
             h->ref_cache[list][scan8[0] + 2 - 1*8]=
-            h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
+            h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 3];
         }else{
             AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]);
             AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
@@ -1005,11 +1002,11 @@ static void fill_decode_caches(H264Context *h, int mb_type){
             int cache_idx = scan8[0] - 1 + i*2*8;
             if(USES_LIST(left_type[i], list)){
                 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
-                const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
+                const int b8_xy= 4*left_xy[i] + 1;
                 AV_COPY32(h->mv_cache[list][cache_idx  ], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]]);
                 AV_COPY32(h->mv_cache[list][cache_idx+8], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]]);
-                h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
-                h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
+                h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + (left_block[0+i*2]&~1)];
+                h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + (left_block[1+i*2]&~1)];
             }else{
                 AV_ZERO32(h->mv_cache [list][cache_idx  ]);
                 AV_ZERO32(h->mv_cache [list][cache_idx+8]);
@@ -1020,7 +1017,7 @@ static void fill_decode_caches(H264Context *h, int mb_type){
         if(USES_LIST(topleft_type, list)){
             const int b_xy = h->mb2b_xy [topleft_xy] + 3 + h->b_stride + (h->topleft_partition & 2*h->b_stride);
-            const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (h->topleft_partition & h->b8_stride);
+            const int b8_xy= 4*topleft_xy + 1 + (h->topleft_partition & 2);
             AV_COPY32(h->mv_cache[list][scan8[0] - 1 - 1*8], s->current_picture.motion_val[list][b_xy]);
             h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
         }else{
@@ -1030,9 +1027,8 @@ static void fill_decode_caches(H264Context *h, int mb_type){
         if(USES_LIST(topright_type, list)){
             const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
-            const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
             AV_COPY32(h->mv_cache[list][scan8[0] + 4 - 1*8], s->current_picture.motion_val[list][b_xy]);
-            h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
+            h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][4*topright_xy + 2];
         }else{
             AV_ZERO32(h->mv_cache [list][scan8[0] + 4 - 1*8]);
             h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
@@ -1241,12 +1237,12 @@ static int fill_filter_caches(H264Context *h, int mb_type){
                 continue;
             }
-            ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+            ref = &s->current_picture.ref_index[list][4*mb_xy];
             {
                 int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
                 AV_WN32A(&h->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
                 AV_WN32A(&h->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
-                ref += h->b8_stride;
+                ref += 2;
                 AV_WN32A(&h->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
                 AV_WN32A(&h->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
             }
@@ -1319,7 +1315,7 @@ static int fill_filter_caches(H264Context *h, int mb_type){
         for(list=0; list<h->list_count; list++){
             if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
-                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
+                const int b8_xy= 4*top_xy + 2;
                 int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
                 AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]);
                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
@@ -1334,16 +1330,16 @@ static int fill_filter_caches(H264Context *h, int mb_type){
             if(!IS_INTERLACED(mb_type^left_type[0])){
                 if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
-                    const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
+                    const int b8_xy= 4*left_xy[0] + 1;
                     int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[0]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 0 ], s->current_picture.motion_val[list][b_xy + h->b_stride*0]);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 8 ], s->current_picture.motion_val[list][b_xy + h->b_stride*1]);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 +16 ], s->current_picture.motion_val[list][b_xy + h->b_stride*2]);
                     AV_COPY32(h->mv_cache[list][scan8[0] - 1 +24 ], s->current_picture.motion_val[list][b_xy + h->b_stride*3]);
                     h->ref_cache[list][scan8[0] - 1 + 0 ]=
-                    h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*0]];
+                    h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*0]];
                     h->ref_cache[list][scan8[0] - 1 +16 ]=
-                    h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*1]];
+                    h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*1]];
                 }else{
                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 0 ]);
                     AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 8 ]);
@@ -1388,12 +1384,12 @@ static inline void write_back_non_zero_count(H264Context *h){
 static inline void write_back_motion(H264Context *h, int mb_type){
     MpegEncContext * const s = &h->s;
-    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
-    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
+    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; //try mb2b(8)_xy
+    const int b8_xy= 4*h->mb_xy;
     int list;
     if(!USES_LIST(mb_type, 0))
-        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
+        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
     for(list=0; list<h->list_count; list++){
         int y, b_stride;
@@ -1424,10 +1420,10 @@ static inline void write_back_motion(H264Context *h, int mb_type){
         {
             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
-            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
-            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
-            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
-            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
+            ref_index[0+0*2]= h->ref_cache[list][scan8[0]];
+            ref_index[1+0*2]= h->ref_cache[list][scan8[4]];
+            ref_index[0+1*2]= h->ref_cache[list][scan8[8]];
+            ref_index[1+1*2]= h->ref_cache[list][scan8[12]];
         }
     }
@@ -142,7 +142,7 @@ void ff_h264_direct_ref_list_init(H264Context * const h){
 static void pred_spatial_direct_motion(H264Context * const h, int *mb_type){
     MpegEncContext * const s = &h->s;
-    int b8_stride = h->b8_stride;
+    int b8_stride = 2;
     int b4_stride = h->b_stride;
     int mb_xy = h->mb_xy;
     int mb_type_col[2];
@@ -228,7 +228,7 @@ static void pred_spatial_direct_motion(H264Context * const h, int *mb_type){
             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
-            b8_stride *= 3;
+            b8_stride = 2+4*s->mb_stride;
             b4_stride *= 6;
             sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
@@ -262,12 +262,12 @@ single_col:
     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
-    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
-    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
+    l1ref0 = &h->ref_list[1][0].ref_index [0][4*mb_xy];
+    l1ref1 = &h->ref_list[1][0].ref_index [1][4*mb_xy];
     if(!b8_stride){
         if(s->mb_y&1){
-            l1ref0 += h->b8_stride;
-            l1ref1 += h->b8_stride;
+            l1ref0 += 2;
+            l1ref1 += 2;
             l1mv0  += 2*b4_stride;
             l1mv1  += 2*b4_stride;
         }
@@ -342,11 +342,12 @@ single_col:
             fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
             fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
+            assert(b8_stride==2);
             /* col_zero_flag */
-            if(!IS_INTRA(mb_type_col[0]) && !h->ref_list[1][0].long_ref && (   l1ref0[x8 + y8*b8_stride] == 0
-                                                                            || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
+            if(!IS_INTRA(mb_type_col[0]) && !h->ref_list[1][0].long_ref && (   l1ref0[i8] == 0
+                                                                            || (l1ref0[i8] < 0 && l1ref1[i8] == 0
                                                                                 && h->x264_build>33U))){
-                const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
+                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
                 if(IS_SUB_8X8(sub_mb_type)){
                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                     if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
@@ -381,7 +382,7 @@ single_col:
 static void pred_temp_direct_motion(H264Context * const h, int *mb_type){
     MpegEncContext * const s = &h->s;
-    int b8_stride = h->b8_stride;
+    int b8_stride = 2;
     int b4_stride = h->b_stride;
     int mb_xy = h->mb_xy;
     int mb_type_col[2];
@@ -406,7 +407,7 @@ static void pred_temp_direct_motion(H264Context * const h, int *mb_type){
             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
-            b8_stride *= 3;
+            b8_stride = 2+4*s->mb_stride;
             b4_stride *= 6;
             sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
@@ -441,12 +442,12 @@ single_col:
     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
-    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
-    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
+    l1ref0 = &h->ref_list[1][0].ref_index [0][4*mb_xy];
+    l1ref1 = &h->ref_list[1][0].ref_index [1][4*mb_xy];
     if(!b8_stride){
         if(s->mb_y&1){
-            l1ref0 += h->b8_stride;
-            l1ref1 += h->b8_stride;
+            l1ref0 += 2;
+            l1ref1 += 2;
             l1mv0  += 2*b4_stride;
             l1mv1  += 2*b4_stride;
         }
@@ -549,11 +550,12 @@ single_col:
                 continue;
             }
-            ref0 = l1ref0[x8 + y8*b8_stride];
+            assert(b8_stride == 2);
+            ref0 = l1ref0[i8];
             if(ref0 >= 0)
                 ref0 = map_col_to_list0[0][ref0 + ref_offset];
             else{
-                ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
+                ref0 = map_col_to_list0[1][l1ref1[i8] + ref_offset];
                 l1mv= l1mv1;
             }
             scale = dist_scale_factor[ref0];
@@ -43,15 +43,15 @@ static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, in
      * make mbaff happy, so we can't move all this logic to fill_caches */
     if(FRAME_MBAFF){
-#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
-                const int x4 = X4, y4 = Y4;\
-                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
+#define SET_DIAG_MV(MV_OP, REF_OP, XY, Y4)\
+                const int xy = XY, y4 = Y4;\
+                const int mb_type = mb_types[xy+(y4>>2)*s->mb_stride];\
                 if(!USES_LIST(mb_type,list))\
                     return LIST_NOT_USED;\
-                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
+                mv = s->current_picture_ptr->motion_val[list][h->mb2b_xy[xy]+3 + y4*h->b_stride];\
                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
-                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
+                return s->current_picture_ptr->ref_index[list][4*xy+1 + (y4&~1)] REF_OP;
         if(topright_ref == PART_NOT_AVAILABLE
            && i >= scan8[0]+8 && (i&7)==4
@@ -63,12 +63,13 @@ static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, in
         if(!MB_FIELD
            && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
-            SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
+            SET_DIAG_MV(*2, >>1, h->left_mb_xy[0]+s->mb_stride, (s->mb_y&1)*2+(i>>4)-1);
+            assert(h->left_mb_xy[0] == h->left_mb_xy[1]);
         }
         if(MB_FIELD
            && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
             // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
-            SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
+            SET_DIAG_MV(/2, <<1, h->left_mb_xy[i>=36], (- 1 + ((i-scan8[0])>>3)*2)&3);
         }
     }
 #undef SET_DIAG_MV
@@ -257,7 +257,7 @@ int ff_alloc_picture(MpegEncContext *s, Picture *pic, int shared){
         for(i=0; i<2; i++){
             FF_ALLOCZ_OR_GOTO(s->avctx, pic->motion_val_base[i], 2 * (b4_array_size+4) * sizeof(int16_t), fail)
             pic->motion_val[i]= pic->motion_val_base[i]+4;
-            FF_ALLOCZ_OR_GOTO(s->avctx, pic->ref_index[i], b8_array_size * sizeof(uint8_t), fail)
+            FF_ALLOCZ_OR_GOTO(s->avctx, pic->ref_index[i], 4*mb_array_size * sizeof(uint8_t), fail)
         }
         pic->motion_subsample_log2= 2;
     }else if(s->out_format == FMT_H263 || s->encoding || (s->avctx->debug&FF_DEBUG_MV) || (s->avctx->debug_mv)){