Commit 09fcf89b authored by michael

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64...

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@3578 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent fbfb4b78
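The whole patch follows one pattern: inline-assembly operands used for addressing must be pointer sized, so hard-coded 32-bit registers (%%eax, %%ebx, ...) are replaced by macros (REG_a, REG_b, ...) that expand to the 64-bit names on x86_64, pointer-manipulating mnemonics drop their "l" suffix (addl/popl/movzbl become add/pop/movzb), and loop counters that feed address arithmetic are widened from int to long. Below is a minimal, hypothetical sketch (plain C, not taken from the diff) of how such a macro is spliced into an asm template via string-literal concatenation; ARCH_X86_64 is assumed to be defined by the build, as configure arranges in this patch.

```c
#include <stdio.h>

/* Pick the pointer-sized accumulator name at preprocessing time,
 * mirroring the REG_a macro this patch introduces (illustrative only). */
#ifdef ARCH_X86_64
# define REG_a "rax"
#else
# define REG_a "eax"
#endif

int main(void)
{
    /* String-literal concatenation turns "movq (%1, %%"REG_a"), %%mm0"
     * into either "movq (%1, %%rax), %%mm0" or "movq (%1, %%eax), %%mm0",
     * so a single asm template serves both 32- and 64-bit builds. */
    const char *tmpl =
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "add  %3, %%"REG_a"          \n\t";
    puts(tmpl);
    return 0;
}
```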
@@ -106,6 +106,14 @@ case "$cpu" in
 i386|i486|i586|i686|i86pc|BePC)
 cpu="x86"
 ;;
+x86_64)
+if [ "`$cc -dumpmachine | grep x86_64 | cut -d- -f1`" = "x86_64" -a \
+-z "`echo $CFLAGS | grep -- -m32`" ]; then
+cpu="x86_64"
+else
+cpu="x86"
+fi
+;;
 # armv4l is a subset of armv5tel
 armv4l|armv5tel)
 cpu="armv4l"
@@ -500,7 +508,7 @@ fi
 # compute mmx state
 if test $mmx = "default"; then
-if test $cpu = "x86"; then
+if test $cpu = "x86" -o $cpu = "x86_64"; then
 mmx="yes"
 else
 mmx="no"
@@ -827,6 +835,7 @@ done
 # test gcc version to see if vector builtins can be used
 # currently only used on i386 for MMX builtins
 cat > $TMPC << EOF
+#include <xmmintrin.h>
 int main(void) {
 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)
 return 0;
@@ -985,7 +994,7 @@ echo "CPU $cpu ($tune)"
 echo "Big Endian $bigendian"
 echo "inttypes.h $inttypes"
 echo "broken inttypes.h $emu_fast_int"
-if test $cpu = "x86"; then
+if test $cpu = "x86" -o $cpu = "x86_64"; then
 echo "MMX enabled $mmx"
 echo "Vector Builtins $builtin_vector"
 fi
@@ -1074,6 +1083,9 @@ echo "TARGET_OS=$TARGET_OS" >> config.mak
 if test "$cpu" = "x86" ; then
 echo "TARGET_ARCH_X86=yes" >> config.mak
 echo "#define ARCH_X86 1" >> $TMPH
+elif test "$cpu" = "x86_64" ; then
+echo "TARGET_ARCH_X86_64=yes" >> config.mak
+echo "#define ARCH_X86_64 1" >> $TMPH
 elif test "$cpu" = "armv4l" ; then
 echo "TARGET_ARCH_ARMV4L=yes" >> config.mak
 echo "#define ARCH_ARMV4L 1" >> $TMPH
...
@@ -10,17 +10,23 @@
 #include <byteswap.h>
 #else
-#ifdef ARCH_X86
-static inline unsigned short ByteSwap16(unsigned short x)
+#ifdef ARCH_X86_64
+# define LEGACY_REGS "=Q"
+#else
+# define LEGACY_REGS "=q"
+#endif
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline uint16_t ByteSwap16(uint16_t x)
 {
 __asm("xchgb %b0,%h0" :
-"=q" (x) :
+LEGACY_REGS (x) :
 "0" (x));
 return x;
 }
 #define bswap_16(x) ByteSwap16(x)
-static inline unsigned int ByteSwap32(unsigned int x)
+static inline uint32_t ByteSwap32(uint32_t x)
 {
 #if __CPU__ > 386
 __asm("bswap %0":
@@ -29,21 +35,28 @@ static inline unsigned int ByteSwap32(unsigned int x)
 __asm("xchgb %b0,%h0\n"
 " rorl $16,%0\n"
 " xchgb %b0,%h0":
-"=q" (x) :
+LEGACY_REGS (x) :
 #endif
 "0" (x));
 return x;
 }
 #define bswap_32(x) ByteSwap32(x)
-static inline unsigned long long int ByteSwap64(unsigned long long int x)
+static inline uint64_t ByteSwap64(uint64_t x)
 {
+#ifdef ARCH_X86_64
+__asm("bswap %0":
+"=r" (x) :
+"0" (x));
+return x;
+#else
 register union { __extension__ uint64_t __ll;
 uint32_t __l[2]; } __x;
 asm("xchgl %0,%1":
 "=r"(__x.__l[0]),"=r"(__x.__l[1]):
-"0"(bswap_32((unsigned long)x)),"1"(bswap_32((unsigned long)(x>>32))));
+"0"(bswap_32((uint32_t)x)),"1"(bswap_32((uint32_t)(x>>32))));
 return __x.__ll;
+#endif
 }
 #define bswap_64(x) ByteSwap64(x)
...
@@ -254,7 +254,7 @@ inline void dprintf(const char* fmt,...) {}
 extern const uint32_t inverse[256];
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 # define FASTDIV(a,b) \
 ({\
 int ret,dmy;\
@@ -271,7 +271,7 @@ extern const uint32_t inverse[256];
 # define FASTDIV(a,b) ((a)/(b))
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // avoid +32 for shift optimization (gcc should do that ...)
 static inline int32_t NEG_SSR32( int32_t a, int8_t s){
 asm ("sarl %1, %0\n\t"
@@ -390,7 +390,7 @@ typedef struct RL_VLC_ELEM {
 #endif
 /* used to avoid missaligned exceptions on some archs (alpha, ...) */
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 # define unaligned32(a) (*(const uint32_t*)(a))
 #else
 # ifdef __GNUC__
@@ -460,7 +460,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 {
 # ifdef ALIGNED_BITSTREAM_WRITER
-# ifdef ARCH_X86
+# if defined(ARCH_X86) || defined(ARCH_X86_64)
 asm volatile(
 "movl %0, %%ecx \n\t"
 "xorl %%eax, %%eax \n\t"
@@ -491,7 +491,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 s->index= index;
 # endif
 # else //ALIGNED_BITSTREAM_WRITER
-# ifdef ARCH_X86
+# if defined(ARCH_X86) || defined(ARCH_X86_64)
 asm volatile(
 "movl $7, %%ecx \n\t"
 "andl %0, %%ecx \n\t"
@@ -738,7 +738,7 @@ static inline int get_bits_count(GetBitContext *s){
 name##_bit_count-= 32;\
 }\
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 # define SKIP_CACHE(name, gb, num)\
 asm(\
 "shldl %2, %1, %0 \n\t"\
@@ -1218,7 +1218,7 @@ static inline int ff_get_fourcc(const char *s){
 #define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24))
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #define MASK_ABS(mask, level)\
 asm volatile(\
 "cdq \n\t"\
@@ -1252,7 +1252,7 @@ if((y)<(x)){\
 }
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline long long rdtsc(void)
 {
 long long l;
...
@@ -4,12 +4,20 @@
 #include <stdlib.h>
 #include "../dsputil.h"
+#ifdef ARCH_X86_64
+# define REG_b "rbx"
+# define REG_S "rsi"
+#else
+# define REG_b "ebx"
+# define REG_S "esi"
+#endif
 /* ebx saving is necessary for PIC. gcc seems unable to see it alone */
 #define cpuid(index,eax,ebx,ecx,edx)\
 __asm __volatile\
-("movl %%ebx, %%esi\n\t"\
+("mov %%"REG_b", %%"REG_S"\n\t"\
 "cpuid\n\t"\
-"xchgl %%ebx, %%esi"\
+"xchg %%"REG_b", %%"REG_S\
 : "=a" (eax), "=S" (ebx),\
 "=c" (ecx), "=d" (edx)\
 : "0" (index));
@@ -24,7 +32,7 @@ int mm_support(void)
 /* See if CPUID instruction is supported ... */
 /* ... Get copies of EFLAGS into eax and ecx */
 "pushf\n\t"
-"popl %0\n\t"
+"pop %0\n\t"
 "movl %0, %1\n\t"
 /* ... Toggle the ID bit in one copy and store */
@@ -35,7 +43,7 @@ int mm_support(void)
 /* ... Get the (hopefully modified) EFLAGS */
 "pushf\n\t"
-"popl %0\n\t"
+"pop %0\n\t"
 : "=a" (eax), "=c" (ecx)
 :
 : "cc"
...
@@ -47,13 +47,13 @@ static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5
 };
-static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
-static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
+static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
 struct
 {
-const long fdct_r_row_sse2[4] ATTR_ALIGN(16);
+const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
 } fdct_r_row_sse2 ATTR_ALIGN(16)=
 {{
 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
...
@@ -5,6 +5,12 @@
 #ifndef AVCODEC_I386MMX_H
 #define AVCODEC_I386MMX_H
+#ifdef ARCH_X86_64
+# define REG_a "rax"
+#else
+# define REG_a "eax"
+#endif
 /*
 * The type of an value that fits in an MMX register (note that long
 * long constant values MUST be suffixed by LL and unsigned long long
...
@@ -20,6 +20,7 @@
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */
 #include "../dsputil.h"
+#include "mmx.h"
 static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
 0x0000000000000000ULL,
@@ -31,19 +32,19 @@ static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x010101010101
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
-"movq (%2, %%eax), %%mm4 \n\t"
+"movq (%2, %%"REG_a"), %%mm4 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
 "psubusb %%mm0, %%mm2 \n\t"
 "psubusb %%mm4, %%mm0 \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
-"movq (%2, %%eax), %%mm5 \n\t"
+"movq (%2, %%"REG_a"), %%mm5 \n\t"
 "psubusb %%mm1, %%mm3 \n\t"
 "psubusb %%mm5, %%mm1 \n\t"
 "por %%mm2, %%mm0 \n\t"
@@ -58,116 +59,116 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "paddw %%mm3, %%mm2 \n\t"
 "paddw %%mm2, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
 "psadbw %%mm2, %%mm0 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
 "psadbw %%mm1, %%mm3 \n\t"
 "paddw %%mm3, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
 "pavgb %%mm2, %%mm0 \n\t"
-"movq (%3, %%eax), %%mm2 \n\t"
+"movq (%3, %%"REG_a"), %%mm2 \n\t"
 "psadbw %%mm2, %%mm0 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
 "pavgb %%mm1, %%mm3 \n\t"
-"movq (%3, %%eax), %%mm1 \n\t"
+"movq (%3, %%"REG_a"), %%mm1 \n\t"
 "psadbw %%mm1, %%mm3 \n\t"
 "paddw %%mm3, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 { //FIXME reuse src
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "movq "MANGLE(bone)", %%mm5 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
-"movq 1(%1, %%eax), %%mm1 \n\t"
+"movq 1(%1, %%"REG_a"), %%mm1 \n\t"
-"movq 1(%2, %%eax), %%mm3 \n\t"
+"movq 1(%2, %%"REG_a"), %%mm3 \n\t"
 "pavgb %%mm2, %%mm0 \n\t"
 "pavgb %%mm1, %%mm3 \n\t"
 "psubusb %%mm5, %%mm3 \n\t"
 "pavgb %%mm3, %%mm0 \n\t"
-"movq (%3, %%eax), %%mm2 \n\t"
+"movq (%3, %%"REG_a"), %%mm2 \n\t"
 "psadbw %%mm2, %%mm0 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
-"movq 1(%1, %%eax), %%mm2 \n\t"
+"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
-"movq 1(%2, %%eax), %%mm4 \n\t"
+"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
 "pavgb %%mm3, %%mm1 \n\t"
 "pavgb %%mm4, %%mm2 \n\t"
 "psubusb %%mm5, %%mm2 \n\t"
 "pavgb %%mm1, %%mm2 \n\t"
-"movq (%3, %%eax), %%mm1 \n\t"
+"movq (%3, %%"REG_a"), %%mm1 \n\t"
 "psadbw %%mm1, %%mm2 \n\t"
 "paddw %%mm2, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm1 \n\t"
+"movq (%2, %%"REG_a"), %%mm1 \n\t"
-"movq (%1, %%eax), %%mm2 \n\t"
+"movq (%1, %%"REG_a"), %%mm2 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
 "punpcklbw %%mm7, %%mm0 \n\t"
 "punpcklbw %%mm7, %%mm1 \n\t"
 "punpckhbw %%mm7, %%mm2 \n\t"
 "punpckhbw %%mm7, %%mm3 \n\t"
 "paddw %%mm0, %%mm1 \n\t"
 "paddw %%mm2, %%mm3 \n\t"
-"movq (%3, %%eax), %%mm4 \n\t"
+"movq (%3, %%"REG_a"), %%mm4 \n\t"
-"movq (%3, %%eax), %%mm2 \n\t"
+"movq (%3, %%"REG_a"), %%mm2 \n\t"
 "paddw %%mm5, %%mm1 \n\t"
 "paddw %%mm5, %%mm3 \n\t"
 "psrlw $1, %%mm1 \n\t"
@@ -181,21 +182,21 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
 "punpckhbw %%mm7, %%mm1 \n\t"
 "paddw %%mm1, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm1 \n\t"
+"movq (%2, %%"REG_a"), %%mm1 \n\t"
 "movq %%mm0, %%mm4 \n\t"
 "movq %%mm1, %%mm2 \n\t"
 "punpcklbw %%mm7, %%mm0 \n\t"
@@ -204,8 +205,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "punpckhbw %%mm7, %%mm2 \n\t"
 "paddw %%mm1, %%mm0 \n\t"
 "paddw %%mm2, %%mm4 \n\t"
-"movq 1(%1, %%eax), %%mm2 \n\t"
+"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
-"movq 1(%2, %%eax), %%mm3 \n\t"
+"movq 1(%2, %%"REG_a"), %%mm3 \n\t"
 "movq %%mm2, %%mm1 \n\t"
 "punpcklbw %%mm7, %%mm2 \n\t"
 "punpckhbw %%mm7, %%mm1 \n\t"
@@ -216,8 +217,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "punpckhbw %%mm7, %%mm4 \n\t"
 "paddw %%mm3, %%mm2 \n\t"
 "paddw %%mm4, %%mm1 \n\t"
-"movq (%3, %%eax), %%mm3 \n\t"
+"movq (%3, %%"REG_a"), %%mm3 \n\t"
-"movq (%3, %%eax), %%mm4 \n\t"
+"movq (%3, %%"REG_a"), %%mm4 \n\t"
 "paddw %%mm5, %%mm2 \n\t"
 "paddw %%mm5, %%mm1 \n\t"
 "psrlw $2, %%mm2 \n\t"
@@ -231,10 +232,10 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "punpckhbw %%mm7, %%mm2 \n\t"
 "paddw %%mm2, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
 );
 }
...
@@ -36,7 +36,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 DCTELEM *block, int n,
 int qscale, int *overflow)
 {
-int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ...
+long last_non_zero_p1;
+int level=0, q; //=0 is cuz gcc says uninitalized ...
 const uint16_t *qmat, *bias;
 __align8 int16_t temp_block[64];
@@ -90,18 +91,18 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 if(s->out_format == FMT_H263 && s->mpeg_quant==0){
 asm volatile(
-"movd %%eax, %%mm3 \n\t" // last_non_zero_p1
+"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
 SPREADW(%%mm3)
 "pxor %%mm7, %%mm7 \n\t" // 0
 "pxor %%mm4, %%mm4 \n\t" // 0
 "movq (%2), %%mm5 \n\t" // qmat[0]
 "pxor %%mm6, %%mm6 \n\t"
 "psubw (%3), %%mm6 \n\t" // -bias[0]
-"movl $-128, %%eax \n\t"
+"mov $-128, %%"REG_a" \n\t"
 ".balign 16 \n\t"
 "1: \n\t"
 "pxor %%mm1, %%mm1 \n\t" // 0
-"movq (%1, %%eax), %%mm0 \n\t" // block[i]
+"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
@@ -110,13 +111,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 "por %%mm0, %%mm4 \n\t"
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-"movq %%mm0, (%5, %%eax) \n\t"
+"movq %%mm0, (%5, %%"REG_a") \n\t"
 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
-"movq (%4, %%eax), %%mm1 \n\t"
+"movq (%4, %%"REG_a"), %%mm1 \n\t"
-"movq %%mm7, (%1, %%eax) \n\t" // 0
+"movq %%mm7, (%1, %%"REG_a") \n\t" // 0
 "pandn %%mm1, %%mm0 \n\t"
 PMAXW(%%mm0, %%mm3)
-"addl $8, %%eax \n\t"
+"add $8, %%"REG_a" \n\t"
 " js 1b \n\t"
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $32, %%mm3 \n\t"
@@ -124,8 +125,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $16, %%mm3 \n\t"
 PMAXW(%%mm0, %%mm3)
-"movd %%mm3, %%eax \n\t"
+"movd %%mm3, %%"REG_a" \n\t"
-"movzbl %%al, %%eax \n\t" // last_non_zero_p1
+"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
 : "+a" (last_non_zero_p1)
 : "r" (block+64), "r" (qmat), "r" (bias),
 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
@@ -142,32 +143,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 );
 }else{ // FMT_H263
 asm volatile(
-"movd %%eax, %%mm3 \n\t" // last_non_zero_p1
+"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
 SPREADW(%%mm3)
 "pxor %%mm7, %%mm7 \n\t" // 0
 "pxor %%mm4, %%mm4 \n\t" // 0
-"movl $-128, %%eax \n\t"
+"mov $-128, %%"REG_a" \n\t"
 ".balign 16 \n\t"
 "1: \n\t"
 "pxor %%mm1, %%mm1 \n\t" // 0
-"movq (%1, %%eax), %%mm0 \n\t" // block[i]
+"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
-"movq (%3, %%eax), %%mm6 \n\t" // bias[0]
+"movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0]
 "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
-"movq (%2, %%eax), %%mm5 \n\t" // qmat[i]
+"movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i]
 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
 "por %%mm0, %%mm4 \n\t"
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-"movq %%mm0, (%5, %%eax) \n\t"
+"movq %%mm0, (%5, %%"REG_a") \n\t"
 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
-"movq (%4, %%eax), %%mm1 \n\t"
+"movq (%4, %%"REG_a"), %%mm1 \n\t"
-"movq %%mm7, (%1, %%eax) \n\t" // 0
+"movq %%mm7, (%1, %%"REG_a") \n\t" // 0
 "pandn %%mm1, %%mm0 \n\t"
 PMAXW(%%mm0, %%mm3)
-"addl $8, %%eax \n\t"
+"add $8, %%"REG_a" \n\t"
 " js 1b \n\t"
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $32, %%mm3 \n\t"
@@ -175,8 +176,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $16, %%mm3 \n\t"
 PMAXW(%%mm0, %%mm3)
-"movd %%mm3, %%eax \n\t"
+"movd %%mm3, %%"REG_a" \n\t"
-"movzbl %%al, %%eax \n\t" // last_non_zero_p1
+"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
 : "+a" (last_non_zero_p1)
 : "r" (block+64), "r" (qmat+64), "r" (bias+64),
 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
...
@@ -119,7 +119,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 # define always_inline inline
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
 static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
 static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
@@ -172,7 +172,7 @@ static char *replaceTable[]=
 };
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline void prefetchnta(void *p)
 {
 asm volatile( "prefetchnta (%0)\n\t"
@@ -597,7 +597,7 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #endif //HAVE_ALTIVEC
 #endif //ARCH_POWERPC
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_MMX
@@ -616,13 +616,11 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
 #undef HAVE_ALTIVEC
-#undef ARCH_X86
 #ifdef COMPILE_C
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
-#undef ARCH_X86
 #define RENAME(a) a ## _C
 #include "postprocess_template.c"
 #endif
@@ -643,7 +641,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #define HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _MMX
 #include "postprocess_template.c"
 #endif
@@ -654,7 +651,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #define HAVE_MMX
 #define HAVE_MMX2
 #undef HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _MMX2
 #include "postprocess_template.c"
 #endif
@@ -665,7 +661,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #define HAVE_MMX
 #undef HAVE_MMX2
 #define HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _3DNow
 #include "postprocess_template.c"
 #endif
@@ -683,7 +678,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
 // difference wouldnt be messureable here but its much better because
 // someone might exchange the cpu whithout restarting mplayer ;)
 #ifdef RUNTIME_CPUDETECT
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // ordered per speed fasterst first
 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
...
@@ -716,7 +716,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n,
 necessitate to modify mpegvideo.c. The problem comes from the
 fact they decided to store the quantized DC (which would lead
 to problems if Q could vary !) */
-#if defined ARCH_X86 && !defined PIC
+#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined PIC
 asm volatile(
 "movl %3, %%eax \n\t"
 "shrl $1, %%eax \n\t"
...