Commit 008ae412 authored by lorenm's avatar lorenm

Add ff_h264_idct8_add_sse2.

Compared to the MMX version: 280 -> 192 cycles on Core 2, 395 -> 288 cycles on K8.


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@11826 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent edadbb70
...@@ -348,7 +348,7 @@ typedef struct H264Context{ ...@@ -348,7 +348,7 @@ typedef struct H264Context{
GetBitContext *intra_gb_ptr; GetBitContext *intra_gb_ptr;
GetBitContext *inter_gb_ptr; GetBitContext *inter_gb_ptr;
DECLARE_ALIGNED_8(DCTELEM, mb[16*24]); DECLARE_ALIGNED_16(DCTELEM, mb[16*24]);
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not to large or ensure that there is some unused stuff after mb DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not to large or ensure that there is some unused stuff after mb
/** /**
......
...@@ -98,7 +98,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1* ...@@ -98,7 +98,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
} }
/* general case, bilinear */ /* general case, bilinear */
rnd_reg = rnd ? &ff_pw_32 : &ff_pw_28; rnd_reg = rnd ? ff_pw_32 : &ff_pw_28;
asm volatile("movd %2, %%mm4\n\t" asm volatile("movd %2, %%mm4\n\t"
"movd %3, %%mm6\n\t" "movd %3, %%mm6\n\t"
"punpcklwd %%mm4, %%mm4\n\t" "punpcklwd %%mm4, %%mm4\n\t"
...@@ -250,7 +250,7 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1* ...@@ -250,7 +250,7 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*
"sub $2, %2 \n\t" "sub $2, %2 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+r"(dst), "+r"(src), "+r"(h) : "+r"(dst), "+r"(src), "+r"(h)
: "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y) : "r"((long)stride), "m"(*ff_pw_32), "m"(x), "m"(y)
); );
} }
...@@ -301,7 +301,7 @@ static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1* ...@@ -301,7 +301,7 @@ static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*
"sub $1, %2\n\t" "sub $1, %2\n\t"
"jnz 1b\n\t" "jnz 1b\n\t"
: "+r" (dst), "+r"(src), "+r"(h) : "+r" (dst), "+r"(src), "+r"(h)
: "m" (ff_pw_32), "r"((long)stride) : "m" (*ff_pw_32), "r"((long)stride)
: "%esi"); : "%esi");
} }
......
...@@ -54,7 +54,7 @@ DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL; ...@@ -54,7 +54,7 @@ DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL; DECLARE_ALIGNED_16(const uint64_t, ff_pw_32[2]) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
...@@ -3328,6 +3328,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) ...@@ -3328,6 +3328,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->h264_idct_add= ff_h264_idct_add_mmx; c->h264_idct_add= ff_h264_idct_add_mmx;
c->h264_idct8_dc_add= c->h264_idct8_dc_add=
c->h264_idct8_add= ff_h264_idct8_add_mmx; c->h264_idct8_add= ff_h264_idct8_add_mmx;
if (mm_flags & MM_SSE2)
c->h264_idct8_add= ff_h264_idct8_add_sse2;
if (mm_flags & MM_MMXEXT) { if (mm_flags & MM_MMXEXT) {
c->prefetch = prefetch_mmx2; c->prefetch = prefetch_mmx2;
......
...@@ -36,7 +36,7 @@ extern const uint64_t ff_pw_8; ...@@ -36,7 +36,7 @@ extern const uint64_t ff_pw_8;
extern const uint64_t ff_pw_15; extern const uint64_t ff_pw_15;
extern const uint64_t ff_pw_16; extern const uint64_t ff_pw_16;
extern const uint64_t ff_pw_20; extern const uint64_t ff_pw_20;
extern const uint64_t ff_pw_32; extern const uint64_t ff_pw_32[2];
extern const uint64_t ff_pw_42; extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_64; extern const uint64_t ff_pw_64;
extern const uint64_t ff_pw_96; extern const uint64_t ff_pw_96;
......
...@@ -75,7 +75,7 @@ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) ...@@ -75,7 +75,7 @@ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
:: "m"(ff_pw_32)); :: "m"(*ff_pw_32));
asm volatile( asm volatile(
STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
...@@ -211,6 +211,93 @@ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) ...@@ -211,6 +211,93 @@ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
add_pixels_clamped_mmx(b2, dst, stride); add_pixels_clamped_mmx(b2, dst, stride);
} }
/*
 * STORE_DIFF_8P(p, d, t, z): add one row of 8 residual words to 8 pixels
 * and store the result back.
 *
 *   p  register holding 8 signed 16-bit values; it is overwritten.
 *   d  memory operand addressing 8 bytes; read first, then written.
 *   t  scratch register; it is overwritten.
 *   z  register used as the high byte when widening d; the caller in this
 *      file passes a register it has just zeroed with pxor.
 *
 * Steps: load 8 bytes from d into t, arithmetic-shift p right by 6,
 * interleave the bytes of t with the bytes of z to widen them to words,
 * add t to p with signed saturation, pack p to unsigned bytes with
 * saturation, and store the low 8 bytes of p to d.
 */
#define STORE_DIFF_8P( p, d, t, z )\
"movq "#d", "#t" \n"\
"psraw $6, "#p" \n"\
"punpcklbw "#z", "#t" \n"\
"paddsw "#t", "#p" \n"\
"packuswb "#p", "#p" \n"\
"movq "#p", "#d" \n"
/*
 * H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h): one 1-D pass of the 8-point
 * transform used by ff_h264_idct8_add_sse2, operating on eight 16-bit
 * lanes per register.
 *
 * Inputs:
 *   b, c, d, f, g, h  registers that must already hold their rows
 *                     (the first pass in the caller loads rows 1, 2, 3,
 *                     5, 6, 7 into them).
 *   0x00(%1), 0x40(%1) the two remaining rows are read from memory by
 *                     the macro itself, near the end, into a and f.
 *   a, e              need not be initialised: both are overwritten
 *                     before they are read.
 *
 * Before the two memory loads the registers hold (in terms of the
 * original input values, with >> an arithmetic shift):
 *   c = (c>>1) - g            g = (g>>1) + c
 *   e' = b + (b>>1) + d + f   a' = f + (f>>1) + h - b
 *   b' = b - d + h - (d>>1)   f' = f - d - h - (h>>1)
 *   d = (e'>>2) + f'          h = (a'>>2) + b'
 *   e = e' - (f'>>2)          b = (b'>>2) - a'
 * a and f are then reloaded from 0x00(%1) and 0x40(%1) and the results
 * are combined with a chain of SUMSUB_BA butterflies.
 *
 * NOTE(review): SUMSUB_BA is defined outside this view; it is presumably
 * a sum/difference butterfly on its two operands -- confirm against its
 * definition. The caller consumes the eight results as rows 0..7 in the
 * register order e, b, h, d, f, a, c, g.
 *
 * All eight registers are modified. The instruction order matters because
 * registers are reused as soon as their previous value is dead.
 */
#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
"movdqa "#c", "#a" \n"\
"movdqa "#g", "#e" \n"\
"psraw $1, "#c" \n"\
"psraw $1, "#g" \n"\
"psubw "#e", "#c" \n"\
"paddw "#a", "#g" \n"\
"movdqa "#b", "#e" \n"\
"psraw $1, "#e" \n"\
"paddw "#b", "#e" \n"\
"paddw "#d", "#e" \n"\
"paddw "#f", "#e" \n"\
"movdqa "#f", "#a" \n"\
"psraw $1, "#a" \n"\
"paddw "#f", "#a" \n"\
"paddw "#h", "#a" \n"\
"psubw "#b", "#a" \n"\
"psubw "#d", "#b" \n"\
"psubw "#d", "#f" \n"\
"paddw "#h", "#b" \n"\
"psubw "#h", "#f" \n"\
"psraw $1, "#d" \n"\
"psraw $1, "#h" \n"\
"psubw "#d", "#b" \n"\
"psubw "#h", "#f" \n"\
"movdqa "#e", "#d" \n"\
"movdqa "#a", "#h" \n"\
"psraw $2, "#d" \n"\
"psraw $2, "#h" \n"\
"paddw "#f", "#d" \n"\
"paddw "#b", "#h" \n"\
"psraw $2, "#f" \n"\
"psraw $2, "#b" \n"\
"psubw "#f", "#e" \n"\
"psubw "#a", "#b" \n"\
"movdqa 0x00(%1), "#a" \n"\
"movdqa 0x40(%1), "#f" \n"\
SUMSUB_BA(f, a)\
SUMSUB_BA(g, f)\
SUMSUB_BA(c, a)\
SUMSUB_BA(e, g)\
SUMSUB_BA(b, c)\
SUMSUB_BA(h, a)\
SUMSUB_BA(d, f)
/**
 * 8x8 inverse transform of block, added to the 8x8 pixel area at dst (SSE2).
 *
 * @param dst    top-left pixel of the destination area; 8 bytes are read
 *               and written on each of 8 rows.
 * @param block  64 coefficients as 8 rows of 8 int16_t. It is accessed
 *               with movdqa, so it must be 16-byte aligned. The buffer is
 *               also used as scratch space and is overwritten.
 * @param stride distance in bytes between consecutive rows of dst.
 *
 * Sequence:
 *  1. Rows 1, 2, 3, 5, 6, 7 are loaded into xmm registers; rows 0 and 4
 *     are loaded from block inside H264_IDCT8_1D_SSE2.
 *  2. First 1-D pass, then TRANSPOSE8 (defined elsewhere; it is given
 *     (%1), i.e. block, as a memory argument).
 *  3. ff_pw_32 (32 in every word) is added to xmm4 so that the final
 *     arithmetic shift by 6 in STORE_DIFF_8P rounds. xmm4 and xmm2 are
 *     written back to offsets 0x00 and 0x40 of block because the second
 *     pass reloads those two rows from memory.
 *  4. Second 1-D pass.
 *  5. xmm6 and xmm7 are spilled to block (0x60, 0x70) because
 *     STORE_DIFF_8P needs xmm6 as scratch and xmm7 as the zero register;
 *     they are reloaded into xmm0/xmm1 for the last two rows.
 *  6. Eight STORE_DIFF_8P steps add the result rows to dst, advancing
 *     dst by 4*stride after the first four rows.
 */
static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
{
    asm volatile(
        "movdqa 0x10(%1), %%xmm1 \n"
        "movdqa 0x20(%1), %%xmm2 \n"
        "movdqa 0x30(%1), %%xmm3 \n"
        "movdqa 0x50(%1), %%xmm5 \n"
        "movdqa 0x60(%1), %%xmm6 \n"
        "movdqa 0x70(%1), %%xmm7 \n"
        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
        "paddw %4, %%xmm4 \n"
        "movdqa %%xmm4, 0x00(%1) \n"
        "movdqa %%xmm2, 0x40(%1) \n"
        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
        "movdqa %%xmm6, 0x60(%1) \n"
        "movdqa %%xmm7, 0x70(%1) \n"
        "pxor %%xmm7, %%xmm7 \n"
        STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
        "lea (%0,%2,4), %0 \n"
        STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
        "movdqa 0x60(%1), %%xmm0 \n"
        "movdqa 0x70(%1), %%xmm1 \n"
        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
        :"+r"(dst)
        :"r"(block), "r"((long)stride), "r"(3L*stride), "m"(*ff_pw_32)
        /* block and dst are only passed as addresses in registers, yet the
         * asm loads from and stores to both; tell the compiler so it does
         * not cache or reorder accesses to that memory around this
         * statement. */
        :"memory"
    );
}
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{ {
int dc = (block[0] + 32) >> 6; int dc = (block[0] + 32) >> 6;
...@@ -839,7 +926,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in ...@@ -839,7 +926,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\ : "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"((long)dstStride), "m"(ff_pw_32)\ : "S"((long)dstStride), "m"(*ff_pw_32)\
: "memory"\ : "memory"\
);\ );\
}\ }\
...@@ -1113,7 +1200,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst ...@@ -1113,7 +1200,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\ : "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"((long)dstStride), "m"(ff_pw_32)\ : "S"((long)dstStride), "m"(*ff_pw_32)\
: "memory"\ : "memory"\
);\ );\
tmp += 8 - size*24;\ tmp += 8 - size*24;\
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment