Commit 01ff07b7 authored by benoit's avatar benoit

Make LOAD4/STORE4 macros more generic.

Patch by Victor Pollex (victor.pollex at web.de)
Original thread: [PATCH] mmx implementation of vc-1 inverse transformations
Date: 06/21/2008 03:37 PM


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@14108 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent b92f6be5
...@@ -57,6 +57,18 @@ extern const uint64_t ff_pb_FC; ...@@ -57,6 +57,18 @@ extern const uint64_t ff_pb_FC;
extern const double ff_pd_1[2]; extern const double ff_pd_1[2];
extern const double ff_pd_2[2]; extern const double ff_pd_2[2];
/* Load four MMX registers (a..d) from four consecutive rows of memory.
 * stride: byte distance between rows (token-pasted into the address
 *         expression, so it must be a literal/constant expression);
 * in:     base memory operand (e.g. an asm constraint like 0(%1));
 * a..d:   destination MMX registers.
 * Generic replacement for the old LOAD4 that hardcoded operand (%1)
 * and an 8-byte stride. */
#define LOAD4(stride,in,a,b,c,d)\
"movq 0*"#stride"+"#in", "#a"\n\t"\
"movq 1*"#stride"+"#in", "#b"\n\t"\
"movq 2*"#stride"+"#in", "#c"\n\t"\
"movq 3*"#stride"+"#in", "#d"\n\t"
/* Store four MMX registers (a..d) to four consecutive rows of memory.
 * Mirror of LOAD4: stride is the byte distance between rows (must be a
 * constant expression, it is string-pasted into the address), out is the
 * base memory operand, a..d are the source MMX registers. */
#define STORE4(stride,out,a,b,c,d)\
"movq "#a", 0*"#stride"+"#out"\n\t"\
"movq "#b", 1*"#stride"+"#out"\n\t"\
"movq "#c", 2*"#stride"+"#out"\n\t"\
"movq "#d", 3*"#stride"+"#out"\n\t"
/* in/out: mma=mma+mmb, mmb=mmb-mma */ /* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \ #define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\ "paddw "#b", "#a" \n\t"\
......
...@@ -998,18 +998,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t ...@@ -998,18 +998,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
"paddusw %%xmm1, %%xmm0 \n\t" "paddusw %%xmm1, %%xmm0 \n\t"
#endif #endif
/* Old LOAD4/STORE4 (removed by this commit): hardcode asm operand (%1)
 * as the base pointer and a fixed 8-byte row stride (offsets o, o+8,
 * o+16, o+24). Superseded by the generic stride/address-parameterized
 * versions above. */
#define LOAD4(o, a, b, c, d)\
"movq "#o"(%1), "#a" \n\t"\
"movq "#o"+8(%1), "#b" \n\t"\
"movq "#o"+16(%1), "#c" \n\t"\
"movq "#o"+24(%1), "#d" \n\t"\
/* Store counterpart of the old LOAD4: same hardcoded (%1) base and
 * 8-byte stride. Note the trailing backslash continues the macro onto
 * the following line in this extract. */
#define STORE4(o, a, b, c, d)\
"movq "#a", "#o"(%1) \n\t"\
"movq "#b", "#o"+8(%1) \n\t"\
"movq "#c", "#o"+16(%1) \n\t"\
"movq "#d", "#o"+24(%1) \n\t"\
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
* about 100k on extreme inputs. But that's very unlikely to occur in natural video, * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
* and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
...@@ -1053,11 +1041,11 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid ...@@ -1053,11 +1041,11 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid
"movq %%mm7, 96(%1) \n\t"\ "movq %%mm7, 96(%1) \n\t"\
\ \
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\ STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\ \
"movq 96(%1), %%mm7 \n\t"\ "movq 96(%1), %%mm7 \n\t"\
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\ STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\ \
: "=r" (sum)\ : "=r" (sum)\
: "r"(temp)\ : "r"(temp)\
...@@ -1071,7 +1059,7 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid ...@@ -1071,7 +1059,7 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid
"movq %%mm7, 96(%1) \n\t"\ "movq %%mm7, 96(%1) \n\t"\
\ \
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\ \
"movq 96(%1), %%mm7 \n\t"\ "movq 96(%1), %%mm7 \n\t"\
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
...@@ -1079,7 +1067,7 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid ...@@ -1079,7 +1067,7 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid
"movq %%mm6, %%mm7 \n\t"\ "movq %%mm6, %%mm7 \n\t"\
"movq %%mm0, %%mm6 \n\t"\ "movq %%mm0, %%mm6 \n\t"\
\ \
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\ \
HADAMARD48\ HADAMARD48\
"movq %%mm7, 64(%1) \n\t"\ "movq %%mm7, 64(%1) \n\t"\
...@@ -1095,8 +1083,8 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid ...@@ -1095,8 +1083,8 @@ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int strid
"paddusw %%mm1, %%mm0 \n\t"\ "paddusw %%mm1, %%mm0 \n\t"\
"movq %%mm0, 64(%1) \n\t"\ "movq %%mm0, 64(%1) \n\t"\
\ \
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\ \
HADAMARD48\ HADAMARD48\
"movq %%mm7, (%1) \n\t"\ "movq %%mm7, (%1) \n\t"\
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment