Commit 09fcf89b authored by michael

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64...

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@3578 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent fbfb4b78
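The whole patch follows one pattern: inline-assembly operands used for addressing must be pointer sized, so hard-coded 32-bit registers (%%eax, %%ebx, ...) are replaced by macros (REG_a, REG_b, ...) that expand to the 64-bit names on x86_64, pointer-manipulating mnemonics drop their "l" suffix (addl/popl/movzbl become add/pop/movzb), and loop counters that feed address arithmetic are widened from int to long. Below is a minimal, hypothetical sketch (plain C, not taken from the diff) of how such a macro is spliced into an asm template via string-literal concatenation; ARCH_X86_64 is assumed to be defined by the build, as configure arranges in this patch.

```c
#include <stdio.h>

/* Pick the pointer-sized accumulator name at preprocessing time,
 * mirroring the REG_a macro this patch introduces (illustrative only). */
#ifdef ARCH_X86_64
# define REG_a "rax"
#else
# define REG_a "eax"
#endif

int main(void)
{
    /* String-literal concatenation turns "movq (%1, %%"REG_a"), %%mm0"
     * into either "movq (%1, %%rax), %%mm0" or "movq (%1, %%eax), %%mm0",
     * so a single asm template serves both 32- and 64-bit builds. */
    const char *tmpl =
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "add  %3, %%"REG_a"          \n\t";
    puts(tmpl);
    return 0;
}
```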
@@ -106,6 +106,14 @@ case "$cpu" in
 i386|i486|i586|i686|i86pc|BePC)
 cpu="x86"
 ;;
+x86_64)
+if [ "`$cc -dumpmachine | grep x86_64 | cut -d- -f1`" = "x86_64" -a \
+-z "`echo $CFLAGS | grep -- -m32`" ]; then
+cpu="x86_64"
+else
+cpu="x86"
+fi
+;;
 # armv4l is a subset of armv5tel
 armv4l|armv5tel)
 cpu="armv4l"
@@ -500,7 +508,7 @@ fi
 # compute mmx state
 if test $mmx = "default"; then
-if test $cpu = "x86"; then
+if test $cpu = "x86" -o $cpu = "x86_64"; then
 mmx="yes"
 else
 mmx="no"
@@ -827,6 +835,7 @@ done
 # test gcc version to see if vector builtins can be used
 # currently only used on i386 for MMX builtins
 cat > $TMPC << EOF
+#include <xmmintrin.h>
 int main(void) {
 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)
 return 0;
@@ -985,7 +994,7 @@ echo "CPU $cpu ($tune)"
 echo "Big Endian $bigendian"
 echo "inttypes.h $inttypes"
 echo "broken inttypes.h $emu_fast_int"
-if test $cpu = "x86"; then
+if test $cpu = "x86" -o $cpu = "x86_64"; then
 echo "MMX enabled $mmx"
 echo "Vector Builtins $builtin_vector"
 fi
@@ -1074,6 +1083,9 @@ echo "TARGET_OS=$TARGET_OS" >> config.mak
 if test "$cpu" = "x86" ; then
 echo "TARGET_ARCH_X86=yes" >> config.mak
 echo "#define ARCH_X86 1" >> $TMPH
+elif test "$cpu" = "x86_64" ; then
+echo "TARGET_ARCH_X86_64=yes" >> config.mak
+echo "#define ARCH_X86_64 1" >> $TMPH
 elif test "$cpu" = "armv4l" ; then
 echo "TARGET_ARCH_ARMV4L=yes" >> config.mak
 echo "#define ARCH_ARMV4L 1" >> $TMPH
...
@@ -10,17 +10,23 @@
 #include <byteswap.h>
 #else
-#ifdef ARCH_X86
-static inline unsigned short ByteSwap16(unsigned short x)
+#ifdef ARCH_X86_64
+# define LEGACY_REGS "=Q"
+#else
+# define LEGACY_REGS "=q"
+#endif
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline uint16_t ByteSwap16(uint16_t x)
 {
 __asm("xchgb %b0,%h0" :
-"=q" (x) :
+LEGACY_REGS (x) :
 "0" (x));
 return x;
 }
 #define bswap_16(x) ByteSwap16(x)
-static inline unsigned int ByteSwap32(unsigned int x)
+static inline uint32_t ByteSwap32(uint32_t x)
 {
 #if __CPU__ > 386
 __asm("bswap %0":
@@ -29,21 +35,28 @@ static inline unsigned int ByteSwap32(unsigned int x)
 __asm("xchgb %b0,%h0\n"
 " rorl $16,%0\n"
 " xchgb %b0,%h0":
-"=q" (x) :
+LEGACY_REGS (x) :
 #endif
 "0" (x));
 return x;
 }
 #define bswap_32(x) ByteSwap32(x)
-static inline unsigned long long int ByteSwap64(unsigned long long int x)
+static inline uint64_t ByteSwap64(uint64_t x)
 {
+#ifdef ARCH_X86_64
+__asm("bswap %0":
+"=r" (x) :
+"0" (x));
+return x;
+#else
 register union { __extension__ uint64_t __ll;
 uint32_t __l[2]; } __x;
 asm("xchgl %0,%1":
 "=r"(__x.__l[0]),"=r"(__x.__l[1]):
-"0"(bswap_32((unsigned long)x)),"1"(bswap_32((unsigned long)(x>>32))));
+"0"(bswap_32((uint32_t)x)),"1"(bswap_32((uint32_t)(x>>32))));
 return __x.__ll;
+#endif
 }
 #define bswap_64(x) ByteSwap64(x)
...
@@ -254,7 +254,7 @@ inline void dprintf(const char* fmt,...) {}
 extern const uint32_t inverse[256];
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 # define FASTDIV(a,b) \
 ({\
 int ret,dmy;\
@@ -271,7 +271,7 @@ extern const uint32_t inverse[256];
 # define FASTDIV(a,b) ((a)/(b))
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // avoid +32 for shift optimization (gcc should do that ...)
 static inline int32_t NEG_SSR32( int32_t a, int8_t s){
 asm ("sarl %1, %0\n\t"
@@ -390,7 +390,7 @@ typedef struct RL_VLC_ELEM {
 #endif
 /* used to avoid missaligned exceptions on some archs (alpha, ...) */
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 # define unaligned32(a) (*(const uint32_t*)(a))
 #else
 # ifdef __GNUC__
@@ -460,7 +460,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 {
 # ifdef ALIGNED_BITSTREAM_WRITER
-# ifdef ARCH_X86
+# if defined(ARCH_X86) || defined(ARCH_X86_64)
 asm volatile(
 "movl %0, %%ecx \n\t"
 "xorl %%eax, %%eax \n\t"
@@ -491,7 +491,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 s->index= index;
 # endif
 # else //ALIGNED_BITSTREAM_WRITER
-# ifdef ARCH_X86
+# if defined(ARCH_X86) || defined(ARCH_X86_64)
 asm volatile(
 "movl $7, %%ecx \n\t"
 "andl %0, %%ecx \n\t"
@@ -738,7 +738,7 @@ static inline int get_bits_count(GetBitContext *s){
 name##_bit_count-= 32;\
 }\
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 # define SKIP_CACHE(name, gb, num)\
 asm(\
 "shldl %2, %1, %0 \n\t"\
@@ -1218,7 +1218,7 @@ static inline int ff_get_fourcc(const char *s){
 #define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24))
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #define MASK_ABS(mask, level)\
 asm volatile(\
 "cdq \n\t"\
@@ -1252,7 +1252,7 @@ if((y)<(x)){\
 }
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline long long rdtsc(void)
 {
 long long l;
...
@@ -4,12 +4,20 @@
 #include <stdlib.h>
 #include "../dsputil.h"
+#ifdef ARCH_X86_64
+# define REG_b "rbx"
+# define REG_S "rsi"
+#else
+# define REG_b "ebx"
+# define REG_S "esi"
+#endif
 /* ebx saving is necessary for PIC. gcc seems unable to see it alone */
 #define cpuid(index,eax,ebx,ecx,edx)\
 __asm __volatile\
-("movl %%ebx, %%esi\n\t"\
+("mov %%"REG_b", %%"REG_S"\n\t"\
 "cpuid\n\t"\
-"xchgl %%ebx, %%esi"\
+"xchg %%"REG_b", %%"REG_S\
 : "=a" (eax), "=S" (ebx),\
 "=c" (ecx), "=d" (edx)\
 : "0" (index));
@@ -24,7 +32,7 @@ int mm_support(void)
 /* See if CPUID instruction is supported ... */
 /* ... Get copies of EFLAGS into eax and ecx */
 "pushf\n\t"
-"popl %0\n\t"
+"pop %0\n\t"
 "movl %0, %1\n\t"
 /* ... Toggle the ID bit in one copy and store */
@@ -35,7 +43,7 @@ int mm_support(void)
 /* ... Get the (hopefully modified) EFLAGS */
 "pushf\n\t"
-"popl %0\n\t"
+"pop %0\n\t"
 : "=a" (eax), "=c" (ecx)
 :
 : "cc"
...
@@ -47,13 +47,13 @@ static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5
 };
-static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
-static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
+static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
 struct
 {
-const long fdct_r_row_sse2[4] ATTR_ALIGN(16);
+const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
 } fdct_r_row_sse2 ATTR_ALIGN(16)=
 {{
 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
...
@@ -5,6 +5,12 @@
 #ifndef AVCODEC_I386MMX_H
 #define AVCODEC_I386MMX_H
+#ifdef ARCH_X86_64
+# define REG_a "rax"
+#else
+# define REG_a "eax"
+#endif
 /*
 * The type of an value that fits in an MMX register (note that long
 * long constant values MUST be suffixed by LL and unsigned long long
...
@@ -20,6 +20,7 @@
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */
 #include "../dsputil.h"
+#include "mmx.h"
 static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
 0x0000000000000000ULL,
@@ -31,19 +32,19 @@ static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x010101010101
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
-"movq (%2, %%eax), %%mm4 \n\t"
+"movq (%2, %%"REG_a"), %%mm4 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
 "psubusb %%mm0, %%mm2 \n\t"
 "psubusb %%mm4, %%mm0 \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
-"movq (%2, %%eax), %%mm5 \n\t"
+"movq (%2, %%"REG_a"), %%mm5 \n\t"
 "psubusb %%mm1, %%mm3 \n\t"
 "psubusb %%mm5, %%mm1 \n\t"
 "por %%mm2, %%mm0 \n\t"
@@ -58,116 +59,116 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "paddw %%mm3, %%mm2 \n\t"
 "paddw %%mm2, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
 "psadbw %%mm2, %%mm0 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
 "psadbw %%mm1, %%mm3 \n\t"
 "paddw %%mm3, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %3, %%eax \n\t"
+"add %3, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
 "pavgb %%mm2, %%mm0 \n\t"
-"movq (%3, %%eax), %%mm2 \n\t"
+"movq (%3, %%"REG_a"), %%mm2 \n\t"
 "psadbw %%mm2, %%mm0 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
 "pavgb %%mm1, %%mm3 \n\t"
-"movq (%3, %%eax), %%mm1 \n\t"
+"movq (%3, %%"REG_a"), %%mm1 \n\t"
 "psadbw %%mm1, %%mm3 \n\t"
 "paddw %%mm3, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 { //FIXME reuse src
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "movq "MANGLE(bone)", %%mm5 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm2 \n\t"
+"movq (%2, %%"REG_a"), %%mm2 \n\t"
-"movq 1(%1, %%eax), %%mm1 \n\t"
+"movq 1(%1, %%"REG_a"), %%mm1 \n\t"
-"movq 1(%2, %%eax), %%mm3 \n\t"
+"movq 1(%2, %%"REG_a"), %%mm3 \n\t"
 "pavgb %%mm2, %%mm0 \n\t"
 "pavgb %%mm1, %%mm3 \n\t"
 "psubusb %%mm5, %%mm3 \n\t"
 "pavgb %%mm3, %%mm0 \n\t"
-"movq (%3, %%eax), %%mm2 \n\t"
+"movq (%3, %%"REG_a"), %%mm2 \n\t"
 "psadbw %%mm2, %%mm0 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
-"movq (%1, %%eax), %%mm1 \n\t"
+"movq (%1, %%"REG_a"), %%mm1 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
-"movq 1(%1, %%eax), %%mm2 \n\t"
+"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
-"movq 1(%2, %%eax), %%mm4 \n\t"
+"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
 "pavgb %%mm3, %%mm1 \n\t"
 "pavgb %%mm4, %%mm2 \n\t"
 "psubusb %%mm5, %%mm2 \n\t"
 "pavgb %%mm1, %%mm2 \n\t"
-"movq (%3, %%eax), %%mm1 \n\t"
+"movq (%3, %%"REG_a"), %%mm1 \n\t"
 "psadbw %%mm1, %%mm2 \n\t"
 "paddw %%mm2, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm1 \n\t"
+"movq (%2, %%"REG_a"), %%mm1 \n\t"
-"movq (%1, %%eax), %%mm2 \n\t"
+"movq (%1, %%"REG_a"), %%mm2 \n\t"
-"movq (%2, %%eax), %%mm3 \n\t"
+"movq (%2, %%"REG_a"), %%mm3 \n\t"
 "punpcklbw %%mm7, %%mm0 \n\t"
 "punpcklbw %%mm7, %%mm1 \n\t"
 "punpckhbw %%mm7, %%mm2 \n\t"
 "punpckhbw %%mm7, %%mm3 \n\t"
 "paddw %%mm0, %%mm1 \n\t"
 "paddw %%mm2, %%mm3 \n\t"
-"movq (%3, %%eax), %%mm4 \n\t"
+"movq (%3, %%"REG_a"), %%mm4 \n\t"
-"movq (%3, %%eax), %%mm2 \n\t"
+"movq (%3, %%"REG_a"), %%mm2 \n\t"
 "paddw %%mm5, %%mm1 \n\t"
 "paddw %%mm5, %%mm3 \n\t"
 "psrlw $1, %%mm1 \n\t"
@@ -181,21 +182,21 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
 "punpckhbw %%mm7, %%mm1 \n\t"
 "paddw %%mm1, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
 );
 }
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-int len= -(stride*h);
+long len= -(stride*h);
 asm volatile(
 ".balign 16 \n\t"
 "1: \n\t"
-"movq (%1, %%eax), %%mm0 \n\t"
+"movq (%1, %%"REG_a"), %%mm0 \n\t"
-"movq (%2, %%eax), %%mm1 \n\t"
+"movq (%2, %%"REG_a"), %%mm1 \n\t"
 "movq %%mm0, %%mm4 \n\t"
 "movq %%mm1, %%mm2 \n\t"
 "punpcklbw %%mm7, %%mm0 \n\t"
@@ -204,8 +205,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "punpckhbw %%mm7, %%mm2 \n\t"
 "paddw %%mm1, %%mm0 \n\t"
 "paddw %%mm2, %%mm4 \n\t"
-"movq 1(%1, %%eax), %%mm2 \n\t"
+"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
-"movq 1(%2, %%eax), %%mm3 \n\t"
+"movq 1(%2, %%"REG_a"), %%mm3 \n\t"
 "movq %%mm2, %%mm1 \n\t"
 "punpcklbw %%mm7, %%mm2 \n\t"
 "punpckhbw %%mm7, %%mm1 \n\t"
@@ -216,8 +217,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "punpckhbw %%mm7, %%mm4 \n\t"
 "paddw %%mm3, %%mm2 \n\t"
 "paddw %%mm4, %%mm1 \n\t"
-"movq (%3, %%eax), %%mm3 \n\t"
+"movq (%3, %%"REG_a"), %%mm3 \n\t"
-"movq (%3, %%eax), %%mm4 \n\t"
+"movq (%3, %%"REG_a"), %%mm4 \n\t"
 "paddw %%mm5, %%mm2 \n\t"
 "paddw %%mm5, %%mm1 \n\t"
 "psrlw $2, %%mm2 \n\t"
@@ -231,10 +232,10 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 "punpckhbw %%mm7, %%mm2 \n\t"
 "paddw %%mm2, %%mm0 \n\t"
 "paddw %%mm0, %%mm6 \n\t"
-"addl %4, %%eax \n\t"
+"add %4, %%"REG_a" \n\t"
 " js 1b \n\t"
 : "+a" (len)
-: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
+: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
 );
 }
...
@@ -36,7 +36,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 DCTELEM *block, int n,
 int qscale, int *overflow)
 {
-int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ...
+long last_non_zero_p1;
+int level=0, q; //=0 is cuz gcc says uninitalized ...
 const uint16_t *qmat, *bias;
 __align8 int16_t temp_block[64];
@@ -90,18 +91,18 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 if(s->out_format == FMT_H263 && s->mpeg_quant==0){
 asm volatile(
-"movd %%eax, %%mm3 \n\t" // last_non_zero_p1
+"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
 SPREADW(%%mm3)
 "pxor %%mm7, %%mm7 \n\t" // 0
 "pxor %%mm4, %%mm4 \n\t" // 0
 "movq (%2), %%mm5 \n\t" // qmat[0]
 "pxor %%mm6, %%mm6 \n\t"
 "psubw (%3), %%mm6 \n\t" // -bias[0]
-"movl $-128, %%eax \n\t"
+"mov $-128, %%"REG_a" \n\t"
 ".balign 16 \n\t"
 "1: \n\t"
 "pxor %%mm1, %%mm1 \n\t" // 0
-"movq (%1, %%eax), %%mm0 \n\t" // block[i]
+"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
@@ -110,13 +111,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 "por %%mm0, %%mm4 \n\t"
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-"movq %%mm0, (%5, %%eax) \n\t"
+"movq %%mm0, (%5, %%"REG_a") \n\t"
 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
-"movq (%4, %%eax), %%mm1 \n\t"
+"movq (%4, %%"REG_a"), %%mm1 \n\t"
-"movq %%mm7, (%1, %%eax) \n\t" // 0
+"movq %%mm7, (%1, %%"REG_a") \n\t" // 0
 "pandn %%mm1, %%mm0 \n\t"
 PMAXW(%%mm0, %%mm3)
-"addl $8, %%eax \n\t"
+"add $8, %%"REG_a" \n\t"
 " js 1b \n\t"
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $32, %%mm3 \n\t"
@@ -124,8 +125,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $16, %%mm3 \n\t"
 PMAXW(%%mm0, %%mm3)
-"movd %%mm3, %%eax \n\t"
+"movd %%mm3, %%"REG_a" \n\t"
-"movzbl %%al, %%eax \n\t" // last_non_zero_p1
+"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
 : "+a" (last_non_zero_p1)
 : "r" (block+64), "r" (qmat), "r" (bias),
 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
@@ -142,32 +143,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 );
 }else{ // FMT_H263
 asm volatile(
-"movd %%eax, %%mm3 \n\t" // last_non_zero_p1
+"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
 SPREADW(%%mm3)
 "pxor %%mm7, %%mm7 \n\t" // 0
 "pxor %%mm4, %%mm4 \n\t" // 0
-"movl $-128, %%eax \n\t"
+"mov $-128, %%"REG_a" \n\t"
 ".balign 16 \n\t"
 "1: \n\t"
 "pxor %%mm1, %%mm1 \n\t" // 0
-"movq (%1, %%eax), %%mm0 \n\t" // block[i]
+"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
-"movq (%3, %%eax), %%mm6 \n\t" // bias[0]
+"movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0]
 "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
-"movq (%2, %%eax), %%mm5 \n\t" // qmat[i]
+"movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i]
 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
 "por %%mm0, %%mm4 \n\t"
 "pxor %%mm1, %%mm0 \n\t"
 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-"movq %%mm0, (%5, %%eax) \n\t"
+"movq %%mm0, (%5, %%"REG_a") \n\t"
 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
-"movq (%4, %%eax), %%mm1 \n\t"
+"movq (%4, %%"REG_a"), %%mm1 \n\t"
-"movq %%mm7, (%1, %%eax) \n\t" // 0
+"movq %%mm7, (%1, %%"REG_a") \n\t" // 0
 "pandn %%mm1, %%mm0 \n\t"
 PMAXW(%%mm0, %%mm3)
-"addl $8, %%eax \n\t"
+"add $8, %%"REG_a" \n\t"
 " js 1b \n\t"
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $32, %%mm3 \n\t"
@@ -175,8 +176,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 "movq %%mm3, %%mm0 \n\t"
 "psrlq $16, %%mm3 \n\t"
 PMAXW(%%mm0, %%mm3)
-"movd %%mm3, %%eax \n\t"
+"movd %%mm3, %%"REG_a" \n\t"
-"movzbl %%al, %%eax \n\t" // last_non_zero_p1
+"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
 : "+a" (last_non_zero_p1)
 : "r" (block+64), "r" (qmat+64), "r" (bias+64),
 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
...
@@ -119,7 +119,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 # define always_inline inline
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
 static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
 static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
@@ -172,7 +172,7 @@ static char *replaceTable[]=
 };
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline void prefetchnta(void *p)
 {
 asm volatile( "prefetchnta (%0)\n\t"
@@ -597,7 +597,7 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #endif //HAVE_ALTIVEC
 #endif //ARCH_POWERPC
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_MMX
@@ -616,13 +616,11 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
 #undef HAVE_ALTIVEC
-#undef ARCH_X86
 #ifdef COMPILE_C
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
-#undef ARCH_X86
 #define RENAME(a) a ## _C
 #include "postprocess_template.c"
 #endif
@@ -643,7 +641,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #define HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _MMX
 #include "postprocess_template.c"
 #endif
@@ -654,7 +651,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #define HAVE_MMX
 #define HAVE_MMX2
 #undef HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _MMX2
 #include "postprocess_template.c"
 #endif
@@ -665,7 +661,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
 #define HAVE_MMX
 #undef HAVE_MMX2
 #define HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _3DNow
 #include "postprocess_template.c"
 #endif
@@ -683,7 +678,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
 // difference wouldnt be messureable here but its much better because
 // someone might exchange the cpu whithout restarting mplayer ;)
 #ifdef RUNTIME_CPUDETECT
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // ordered per speed fasterst first
 if(c->cpuCaps & PP_CPU_CAPS_MMX2)
 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
...
@@ -716,7 +716,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n,
 necessitate to modify mpegvideo.c. The problem comes from the
 fact they decided to store the quantized DC (which would lead
 to problems if Q could vary !) */
-#if defined ARCH_X86 && !defined PIC
+#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined PIC
 asm volatile(
 "movl %3, %%eax \n\t"
 "shrl $1, %%eax \n\t"
...