From 5f232025efa9d525fb709f6373d685db60e43316 Mon Sep 17 00:00:00 2001
From: Renaud Dartus <reno@videolan.org>
Date: Wed, 16 May 2001 14:51:29 +0000
Subject: [PATCH] * Add 3D Now! imdct * Remove kmudge for ac3 on MacOS X

---
 Makefile                         |   7 +-
 configure                        |   2 +-
 configure.in                     |   2 +-
 plugins/imdct/Makefile           |  18 +-
 plugins/imdct/ac3_imdct_3dn.c    | 559 +++++++++++++++++++++++++++++++
 plugins/imdct/ac3_imdct_c.c      |  62 +---
 plugins/imdct/ac3_imdct_common.c |  65 +---
 plugins/imdct/ac3_imdct_common.h |   3 +-
 plugins/imdct/ac3_imdct_sse.c    |  58 +---
 plugins/imdct/ac3_retables.h     |  83 +++++
 plugins/imdct/ac3_srfft_3dn.c    | 344 +++++++++++++++++++
 plugins/imdct/ac3_srfft_sse.c    |  18 +-
 plugins/imdct/imdct3dn.c         | 152 +++++++++
 src/ac3_decoder/ac3_imdct.c      |   4 +-
 14 files changed, 1180 insertions(+), 197 deletions(-)
 create mode 100644 plugins/imdct/ac3_imdct_3dn.c
 create mode 100644 plugins/imdct/ac3_retables.h
 create mode 100644 plugins/imdct/ac3_srfft_3dn.c
 create mode 100644 plugins/imdct/imdct3dn.c

diff --git a/Makefile b/Makefile
index 9a868def9f..4aa0e32453 100644
--- a/Makefile
+++ b/Makefile
@@ -26,7 +26,7 @@ PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin dsp/dsp dummy/dummy \
 		dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gnome/gnome gtk/gtk \
 		downmix/downmix downmix/downmixsse downmix/downmix3dn \
 		idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext \
-		imdct/imdct imdct/imdctsse \
+		imdct/imdct imdct/imdct3dn imdct/imdctsse \
 		macosx/macosx mga/mga \
 		motion/motion motion/motionmmx motion/motionmmxext \
 		mpeg/es mpeg/ps mpeg/ts null/null qt/qt sdl/sdl \
@@ -317,12 +317,7 @@ endif
 $(C_OBJ): %.o: Makefile.opts Makefile.dep Makefile
 $(C_OBJ): %.o: .dep/%.d
 $(C_OBJ): %.o: %.c
-ifneq (,$(findstring darwin,$(SYS)))
-#this is uglier of all
-	@if test "src/ac3_decoder/ac3_imdct.c" = "$<"; then $(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<; echo "(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<"; else $(CC) $(CFLAGS) -c -o $@ $<; echo "$(CC) $(CFLAGS) -c -o $@ $<"; fi
-else
 	$(CC) $(CFLAGS) -c -o $@ $<
-endif
 
 $(CPP_OBJ): %.o: Makefile.opts Makefile.dep Makefile
 $(CPP_OBJ): %.o: .dep/%.dpp
diff --git a/configure b/configure
index b75b5741a5..a03a4d4cad 100755
--- a/configure
+++ b/configure
@@ -3285,7 +3285,7 @@ int main() {
 EOF
 if { (eval echo configure:3287: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
-  ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse"
+  ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse"
   echo "$ac_t""yes" 1>&6
 else
   echo "configure: failed program was:" >&5
diff --git a/configure.in b/configure.in
index 4ff3bade95..23766c7b0b 100644
--- a/configure.in
+++ b/configure.in
@@ -162,7 +162,7 @@ AC_TRY_COMPILE([void quux(){void *p;asm("packuswb %%mm1,%%mm2"::"r"(p));}],,
 
 AC_MSG_CHECKING([if \$CC groks MMX EXT or SSE inline assembly])
 AC_TRY_COMPILE([void quux(){void *p;asm("maskmovq %%mm1,%%mm2"::"r"(p));}],,
-  ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse"
+  ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse"
   AC_MSG_RESULT(yes), AC_MSG_RESULT(no))
 
 dnl
diff --git a/plugins/imdct/Makefile b/plugins/imdct/Makefile
index 330287c9b6..b4dad20ac9 100644
--- a/plugins/imdct/Makefile
+++ b/plugins/imdct/Makefile
@@ -9,15 +9,18 @@
 
 PLUGIN_IMDCT = imdct.o ac3_imdct_c.o ac3_srfft_c.o
 PLUGIN_IMDCTSSE = imdctsse.o ac3_imdct_sse.o ac3_srfft_sse.o
+PLUGIN_IMDCT3DN = imdct3dn.o ac3_imdct_3dn.o ac3_srfft_3dn.o
 PLUGIN_IMDCTCOMMON = ac3_imdct_common.o
 
 BUILTIN_IMDCT = $(PLUGIN_IMDCT:%.o=BUILTIN_IMDCT_%.o) \
 		$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT_%.o)
 BUILTIN_IMDCTSSE = $(PLUGIN_IMDCTSSE:%.o=BUILTIN_IMDCTSSE_%.o) \
 		$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCTSSE_%.o)
+BUILTIN_IMDCT3DN = $(PLUGIN_IMDCT3DN:%.o=BUILTIN_IMDCT3DN_%.o) \
+		$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT3DN_%.o)
 
-PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCTCOMMON)
-ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE)
+PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON)
+ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE) $(BUILTIN_IMDCT3DN)
 
 #
 # Virtual targets
@@ -33,6 +36,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: .dep/%.d
 $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c
 	$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdctsse -c -o $@ $<
 
+$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: .dep/%.d
+$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: %.c
+	$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdct3dn -c -o $@ $<
+
 #
 # Real targets
 #
@@ -51,3 +58,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c
 	ar r $@ $^
 	$(RANLIB) $@
 
+../../lib/imdct3dn.so: $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON)
+	$(CC) $(PCFLAGS) -o $@ $^ $(PLCFLAGS) 
+
+../../lib/imdct3dn.a: $(BUILTIN_IMDCT3DN)
+	ar r $@ $^
+	$(RANLIB) $@
+
diff --git a/plugins/imdct/ac3_imdct_3dn.c b/plugins/imdct/ac3_imdct_3dn.c
new file mode 100644
index 0000000000..5bb5e5d9a9
--- /dev/null
+++ b/plugins/imdct/ac3_imdct_3dn.c
@@ -0,0 +1,559 @@
+/*****************************************************************************
+ * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
+ *****************************************************************************
+ * Copyright (C) 1999, 2000 VideoLAN
+ * $Id: ac3_imdct_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#define MODULE_NAME imdct3dn
+#include "modules_inner.h"
+
+/*****************************************************************************
+ * Preamble
+ *****************************************************************************/
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "ac3_imdct.h"
+#include "ac3_imdct_common.h"
+#include "ac3_retables.h"
+
+void _M( fft_64p )  ( complex_t *x );
+void _M( fft_128p ) ( complex_t *a );
+
+static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
+static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
+static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+
+
+void _M( imdct_init ) (imdct_t * p_imdct)
+{
+	int i;
+	float scale = 181.019;
+
+	/* Four floats per i: { c, -s, -s, -c } with c/s = scale * cos/sin(2*pi*(8*i+1)/(8*N)). */
+	for (i=0; i < 128; i++)
+	{
+		float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+		float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+		p_imdct->xcos_sin_sse[i * 4]     = xcos_i;
+		p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
+		p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
+		p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
+	}
+	/* Deliberately silent: this init routine writes no debug traces to stderr. */
+}
+
+void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
+{	/* 512-sample IMDCT with overlap-add: data[] is read, then overwritten; delay[] is added in, then refilled. */
+	imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);	/* data[] -> buf, indexed through pm128 */
+	_M( fft_128p ) (p_imdct->buf);	/* NOTE(review): defined in another file; presumably a 128-point complex FFT on buf */
+	imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);	/* rotates every buf entry in place */
+    imdct512_window_delay_3dn (p_imdct->buf, data, window, delay);	/* data = window*buf + delay; delay refilled from buf */
+}
+
+void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
+{	/* Same pipeline as imdct_do_512, but the last stage does not add delay[] into data[]. */
+	imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);  	/* data[] -> buf, indexed through pm128 */
+	_M( fft_128p ) (p_imdct->buf);	/* NOTE(review): defined in another file; presumably a 128-point complex FFT on buf */
+	imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);	/* rotates every buf entry in place */
+    imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay);	/* data = window*buf only; delay refilled from buf */
+}
+
+static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
+{
+    /*
+     * Pre-IFFT twiddle (3D Now!). With t = xcos_sin_sse and j = pmt[i], i = 0..127,
+     * the i-th 8-byte slot of buf receives two floats:
+     *   low  = data[255-2j] * t[4j]   + data[2j] * t[4j+2]
+     *   high = data[255-2j] * t[4j+1] + data[2j] * t[4j+3]
+     *
+     * The arguments reach the asm as operands instead of being read from fixed
+     * %ebp offsets, so the block does not depend on the stack layout chosen by
+     * the compiler, and the loop uses a numeric local label so the asm can be
+     * emitted more than once in one object file.
+     *
+     * Register use inside the loop:
+     *   %eax  pmt cursor   (operand 0, +4 per iteration)
+     *   %edi  buf cursor   (operand 1, +8 per iteration)
+     *   %ecx  data         (input)
+     *   %edx  xcos_sin_sse (input)
+     *   %esi  scratch      (clobbered): j, then 2j, then -2j
+     *   %2    i_left       (memory operand, counted down to zero)
+     *
+     * NOTE(review): i386 only; MMX/x87 register state is not listed as clobbered.
+     */
+    int i_left = 128;   /* iterations left; kept in memory, %ebx is left alone */
+
+    __asm__ __volatile__ (
+"1:\n"
+	"movl  (%%eax), %%esi\n"            /* j = *pmt */
+	"movd (%%ecx, %%esi, 8), %%mm1\n"   /* 2j */
+    "punpckldq %%mm1, %%mm1\n"          /* 2j | 2j */
+
+	"shll $1, %%esi\n"
+
+	"movq (%%edx, %%esi, 8), %%mm0\n"   /* -s_j | c_j */
+	"movq 8(%%edx, %%esi, 8), %%mm2\n"  /* -c_j | -s_j */
+
+	"negl %%esi\n"
+
+	"movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
+    "punpckldq %%mm4, %%mm4\n"  /* 255-2j | 255-2j */
+	"addl $4, %%eax\n"
+
+	"pfmul   %%mm4, %%mm0\n"    /* 255-2j * -s_j | 255-2j  * c_j */
+	"pfmul   %%mm1, %%mm2\n"    /* 2j * -c_j | 2j * -s_j */
+	"addl    $8, %%edi\n"
+	"pfadd   %%mm2, %%mm0\n"    /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
+
+	"movq  %%mm0, -8(%%edi)\n"
+	"decl %2\n"
+	"jnz 1b\n"
+
+	"femms\n"
+
+    : "+a" (pmt), "+D" (buf), "+m" (i_left)
+    : "c" (data), "d" (xcos_sin_sse)
+    : "memory", "cc", "esi" );
+}
+
+static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
+{   /* In place, for each of the 128 (re, im) pairs of buf: re' = -c*re - s*im, im' = -s*re + c*im, c/s from xcos_sin_sse. */
+    __asm__ __volatile__ ( 
+	"pushl %%ebx\n"
+	"movl $64, %%ebx\n"         /* loop counter */
+    /* numeric local label: stays unique if the compiler emits this asm more than once */
+"1:\n"
+	"movq	(%%eax), %%mm0\n"   /* im0 | re0 */
+	"movq	  %%mm0, %%mm1\n"   /* im0 | re0 */
+    "punpckldq %%mm0, %%mm0\n"  /* re0 | re0 */
+    "punpckhdq %%mm1, %%mm1\n"  /* im0 | im0 */
+    
+	"movq  (%%ecx), %%mm2\n"    /* -s | c */
+	"movq 8(%%ecx), %%mm3\n"    /* -c | -s */
+    "movq    %%mm3, %%mm4\n"
+
+    "punpckhdq %%mm2,%%mm3\n"   /* -s | -c */
+    "punpckldq %%mm2,%%mm4\n"   /*  c | -s */
+    
+	"movq  8(%%eax), %%mm2\n"   /* im1 | re1 */
+	"movq   %%mm2, %%mm5\n"     /* im1 | re1 */
+    "punpckldq %%mm2, %%mm2\n"  /* re1 | re1 */
+    "punpckhdq %%mm5, %%mm5\n"  /* im1 | im1 */
+
+   	"pfmul %%mm3, %%mm0\n"      /* -s * re0 | -c * re0 */
+	"pfmul %%mm4, %%mm1\n"      /* c * im0 | -s * im0 */
+
+	"movq  16(%%ecx), %%mm6\n"  /* -s1 | c1 */
+	"movq  24(%%ecx), %%mm7\n"  /* -c1 | -s1 */
+    "movq   %%mm7, %%mm4\n"
+    
+    "punpckhdq %%mm6,%%mm7\n"   /* -s1 | -c1 */
+    "punpckldq %%mm6,%%mm4\n"   /*  c1 | -s1 */
+    
+	"pfmul %%mm7, %%mm2\n"      /* -s1*re1 | -c1*re1 */
+	"pfmul %%mm4, %%mm5\n"      /* c1*im1 | -s1*im1 */
+
+	"pfadd %%mm1, %%mm0\n"      /* -s * re0 + c * im0 | -c * re0 - s * im0 */
+	"pfadd %%mm5, %%mm2\n"      /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */
+
+	"movq %%mm0, (%%eax)\n"
+	"movq %%mm2, 8(%%eax)\n"
+	"addl $32, %%ecx\n"
+	"addl $16, %%eax\n"
+	"decl %%ebx\n"
+	"jnz 1b\n"
+
+	"popl %%ebx\n"
+	"femms\n"
+    : "=a" (buf), "=c" (xcos_sin_sse)   /* both cursors are advanced by the loop, so both are outputs */
+    : "0" (buf), "1" (xcos_sin_sse) : "memory", "cc" );
+}
+
+static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{   /* data_ptr[0..255] = window * buf + delay_prt; then delay_prt[0..255] = buf * window taken backwards. */
+    __asm__ __volatile__ (
+	"pushl %%ebp\n"
+	"movl  %%esp, %%ebp\n"
+	/* No asm operands: the arguments are fetched from 8..20(%ebp) of the frame built above. */
+	"pushl %%eax\n"
+	"pushl %%ebx\n"
+	"pushl %%ecx\n"
+	"pushl %%edx\n"
+	"pushl %%esi\n"
+	"pushl %%edi\n"
+	/* NOTE(review): those offsets assume %esp pointed at the return address on entry (no compiler prologue, no inlining) - confirm build flags. */
+	"movl 20(%%ebp), %%ebx\n"   /* delay */
+	"movl 16(%%ebp), %%edx\n"   /* window */
+
+	"movl 8(%%ebp), %%eax\n"    /* buf */
+	"movl $32, %%ecx\n"         /* loop count */
+	"leal 516(%%eax), %%esi\n"  /* buf[64].im */
+	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
+	"movl  12(%%ebp), %%eax\n"  /* data */
+
+".first_128_samples:\n"
+	"movd   (%%esi), %%mm0\n" /* im0 */
+	"movd  8(%%esi), %%mm2\n" /* im1 */
+	"movd   (%%edi), %%mm1\n" /* re0 */
+	"movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm0, %%mm4\n" /* -im0 */
+    "pfsub  %%mm2, %%mm5\n" /* -im1 */
+
+	"punpckldq %%mm1, %%mm4\n"      /* re0 | -im0 */
+	"punpckldq %%mm3, %%mm5\n"      /* re1 | -im1 */
+
+	"movq  (%%edx), %%mm0\n"      /* w1 | w0 */
+	"movq 8(%%edx), %%mm1\n"      /* w3 | w2 */
+	"movq  (%%ebx), %%mm2\n"      /* d1 | d0 */
+	"movq 8(%%ebx), %%mm3\n"      /* d3 | d2 */
+
+    "pfmul     %%mm4, %%mm0\n"      /* w1*re0 | -w0*im0 */
+	"pfmul     %%mm5, %%mm1\n"      /* w3*re1 | -w2*im1 */
+
+    "pfadd     %%mm2, %%mm0\n"      /* w1*re0+d1 | -w0*im0+d0 */
+    "pfadd     %%mm3, %%mm1\n"      /* w3*re1+d3 | -w2*im1+d2 */
+
+	"addl $16, %%edx\n"
+	"movq %%mm0,  (%%eax)\n"
+	"movq %%mm1, 8(%%eax)\n"
+	"addl $16, %%ebx\n"
+	"addl $16, %%esi\n"
+	"addl $16, %%eax\n"
+    "addl $-16, %%edi\n"
+	"decl %%ecx\n"
+	"jnz .first_128_samples\n"
+
+	"movl 8(%%ebp), %%esi\n"    /* buf[0].re */
+	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
+	"movl $32, %%ecx\n"         /* loop count */
+
+".second_128_samples:\n"
+	"movd   (%%esi), %%mm0\n" /* buf[i].re */
+	"movd  8(%%esi), %%mm2\n" /* re1 */
+	"movd   (%%edi), %%mm1\n" /* buf[127-i].im */
+	"movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm0, %%mm4\n" /* -re0 */
+    "pfsub  %%mm2, %%mm5\n" /* -re1 */
+
+	"punpckldq %%mm1, %%mm4\n"     /* im0 | -re0 */
+	"punpckldq %%mm3, %%mm5\n"     /* im1 | -re1 */
+
+	"movq (%%edx), %%mm0\n"  /* w1 | w0 */
+	"movq 8(%%edx), %%mm1\n"  /* w3 | w2 */
+	"movq (%%ebx), %%mm2\n"  /* d1 | d0 */
+	"movq 8(%%ebx), %%mm3\n"  /* d3 | d2 */
+
+	"addl $16, %%esi\n"
+
+    "pfmul     %%mm4, %%mm0\n"      /* w1*im0 | -w0*re0 */
+	"pfmul     %%mm5, %%mm1\n"      /* w3*im1 | -w2*re1 */
+
+	"pfadd %%mm2, %%mm0\n"      /* w1*im0+d1 | -w0*re0+d0 */
+	"pfadd %%mm3, %%mm1\n"      /* w3*im1+d3 | -w2*re1+d2 */
+
+	"addl $-16, %%edi\n"
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+
+    "addl $16, %%edx\n"
+	"addl $16, %%eax\n"
+	"addl $16, %%ebx\n"
+	"decl %%ecx\n"
+	"jnz .second_128_samples\n"
+
+	"movl   8(%%ebp), %%eax\n"
+	"leal 512(%%eax), %%esi\n"  /* buf[64].re */
+	"leal 508(%%eax), %%edi\n"  /* buf[63].im */
+	"movl $32, %%ecx\n"         /* loop count */
+	"movl  20(%%ebp), %%eax\n"  /* delay */
+
+".first_128_delay:\n"
+	"movd   (%%esi), %%mm0\n" /* re0 */
+	"movd  8(%%esi), %%mm2\n" /* re1 */
+	"movd   (%%edi), %%mm1\n" /* im0 */
+	"movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm0, %%mm4\n" /* -re0 */
+    "pfsub  %%mm2, %%mm5\n" /* -re1 */
+
+	"punpckldq %%mm1, %%mm4\n"     /* im0 | -re0 */
+	"punpckldq %%mm3, %%mm5\n"     /* im1 | -re1 */
+	/* The window is consumed backwards one float per delay sample, so each pair is loaded reversed (w-k = k floats below %edx). */
+	"movd       -4(%%edx), %%mm0\n"   /*  0  | w-1 */
+	"punpckldq  -8(%%edx), %%mm0\n"   /* w-2 | w-1 */
+	"movd      -12(%%edx), %%mm1\n"   /*  0  | w-3 */
+	"punpckldq -16(%%edx), %%mm1\n"   /* w-4 | w-3 */
+	"addl $-16, %%edx\n"
+
+    "pfmul     %%mm4, %%mm0\n"      /* w-2*im0 | -w-1*re0 */
+	"pfmul     %%mm5, %%mm1\n"      /* w-4*im1 | -w-3*re1 */
+
+
+	"movq %%mm0, (%%eax)\n"
+	"movq %%mm1, 8(%%eax)\n"
+	"addl  $16, %%esi\n"
+	"addl $-16, %%edi\n"
+	"addl  $16, %%eax\n"
+	"decl %%ecx\n"
+	"jnz .first_128_delay\n"
+
+	"movl    8(%%ebp), %%ebx\n"
+	"leal    4(%%ebx), %%esi\n" /* buf[0].im */
+	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+	"movl $32, %%ecx\n"         /* loop count */
+
+".second_128_delay:\n"
+	"movd   (%%esi), %%mm0\n" /* im0 */
+	"movd  8(%%esi), %%mm2\n" /* im1 */
+	"movd   (%%edi), %%mm1\n" /* re0 */
+	"movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm1, %%mm4\n" /* -re0 */
+    "pfsub  %%mm3, %%mm5\n" /* -re1 */
+
+	"punpckldq %%mm4, %%mm0\n"     /* -re0 | im0 */
+	"punpckldq %%mm5, %%mm2\n"     /* -re1 | im1 */
+	/* Same backwards window walk: the first output pair takes the two floats just below %edx, reversed. */
+	"movd       -4(%%edx), %%mm1\n"   /*  0  | w-1 */
+	"punpckldq  -8(%%edx), %%mm1\n"   /* w-2 | w-1 */
+	"movd      -12(%%edx), %%mm3\n"   /*  0  | w-3 */
+	"punpckldq -16(%%edx), %%mm3\n"   /* w-4 | w-3 */
+	"addl $-16, %%edx\n"
+
+    "pfmul     %%mm0, %%mm1\n"      /* -w-2*re0 | w-1*im0 */
+	"pfmul     %%mm2, %%mm3\n"      /* -w-4*re1 | w-3*im1 */
+
+
+	"movq %%mm1, (%%eax)\n"
+	"movq %%mm3, 8(%%eax)\n"
+	"addl  $16, %%esi\n"
+	"addl $-16, %%edi\n"
+	"addl  $16, %%eax\n"
+	"decl %%ecx\n"
+    "jnz .second_128_delay\n"
+
+	"popl %%edi\n"
+	"popl %%esi\n"
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"popl %%ebx\n"
+	"popl %%eax\n"
+
+	"leave\n"
+	"femms\n"
+    ::);
+}
+
+static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{   /* data_ptr[0..255] = window * buf (delay_prt is not added); then delay_prt[0..255] = buf * window taken backwards. */
+    __asm__ __volatile__ (
+	"pushl %%ebp\n"
+	"movl  %%esp, %%ebp\n"
+	/* No asm operands: the arguments are fetched from 8..20(%ebp) of the frame built above. */
+	"pushl %%eax\n"
+	"pushl %%ebx\n"
+	"pushl %%ecx\n"
+	"pushl %%edx\n"
+	"pushl %%esi\n"
+	"pushl %%edi\n"
+	/* NOTE(review): those offsets assume %esp pointed at the return address on entry (no compiler prologue, no inlining) - confirm build flags. */
+	"movl 20(%%ebp), %%ebx\n"   /* delay */
+	"movl 16(%%ebp), %%edx\n"   /* window */
+
+	"movl 8(%%ebp), %%eax\n"    /* buf */
+	"movl $32, %%ecx\n"         /* loop count */
+	"leal 516(%%eax), %%esi\n"  /* buf[64].im */
+	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
+	"movl  12(%%ebp), %%eax\n"  /* data */
+
+".first_128_samples2:\n"
+	"movd   (%%esi), %%mm0\n" /* im0 */
+	"movd  8(%%esi), %%mm2\n" /* im1 */
+	"movd   (%%edi), %%mm1\n" /* re0 */
+	"movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm0, %%mm4\n" /* -im0 */
+    "pfsub  %%mm2, %%mm5\n" /* -im1 */
+
+	"punpckldq %%mm1, %%mm4\n"      /* re0 | -im0 */
+	"punpckldq %%mm3, %%mm5\n"      /* re1 | -im1 */
+
+	"movq (%%edx), %%mm0\n"      /* w1 | w0 */
+	"movq 8(%%edx), %%mm1\n"     /* w3 | w2 */
+
+    "pfmul     %%mm4, %%mm0\n"      /* w1*re0 | -w0*im0 */
+	"pfmul     %%mm5, %%mm1\n"      /* w3*re1 | -w2*im1 */
+
+	"addl $16, %%edx\n"
+	"movq %%mm0, (%%eax)\n"
+	"movq %%mm1, 8(%%eax)\n"
+	"addl $16, %%ebx\n"         /* delay cursor: advanced, but never read in this variant */
+	"addl $16, %%esi\n"
+	"addl $16, %%eax\n"
+    "addl $-16, %%edi\n"
+	"decl %%ecx\n"
+	"jnz .first_128_samples2\n"
+
+	"movl 8(%%ebp), %%esi\n"    /* buf[0].re */
+	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
+	"movl $32, %%ecx\n"         /* loop count */
+
+".second_128_samples2:\n"
+	"movd   (%%esi), %%mm0\n" /* buf[i].re */
+	"movd  8(%%esi), %%mm2\n" /* re1 */
+	"movd   (%%edi), %%mm1\n" /* buf[127-i].im */
+	"movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm0, %%mm4\n" /* -re0 */
+    "pfsub  %%mm2, %%mm5\n" /* -re1 */
+
+	"punpckldq %%mm1, %%mm4\n"     /* im0 | -re0 */
+	"punpckldq %%mm3, %%mm5\n"     /* im1 | -re1 */
+
+	"movq (%%edx), %%mm0\n"  /* w1 | w0 */
+	"movq 8(%%edx), %%mm1\n"  /* w3 | w2 */
+
+	"addl $16, %%esi\n"
+
+    "pfmul     %%mm4, %%mm0\n"      /* w1*im0 | -w0*re0 */
+	"pfmul     %%mm5, %%mm1\n"      /* w3*im1 | -w2*re1 */
+
+	"addl $-16, %%edi\n"
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+
+    "addl $16, %%edx\n"
+	"addl $16, %%eax\n"
+	"addl $16, %%ebx\n"
+	"decl %%ecx\n"
+	"jnz .second_128_samples2\n"
+
+	"movl   8(%%ebp), %%eax\n"
+	"leal 512(%%eax), %%esi\n"  /* buf[64].re */
+	"leal 508(%%eax), %%edi\n"  /* buf[63].im */
+	"movl $32, %%ecx\n"         /* loop count */
+	"movl  20(%%ebp), %%eax\n"  /* delay */
+
+".first_128_delays:\n"
+	"movd   (%%esi), %%mm0\n" /* re0 */
+	"movd  8(%%esi), %%mm2\n" /* re1 */
+	"movd   (%%edi), %%mm1\n" /* im0 */
+	"movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm0, %%mm4\n" /* -re0 */
+    "pfsub  %%mm2, %%mm5\n" /* -re1 */
+
+	"punpckldq %%mm1, %%mm4\n"     /* im0 | -re0 */
+	"punpckldq %%mm3, %%mm5\n"     /* im1 | -re1 */
+	/* The window is consumed backwards one float per delay sample, so each pair is loaded reversed (w-k = k floats below %edx). */
+	"movd       -4(%%edx), %%mm0\n"   /*  0  | w-1 */
+	"punpckldq  -8(%%edx), %%mm0\n"   /* w-2 | w-1 */
+	"movd      -12(%%edx), %%mm1\n"   /*  0  | w-3 */
+	"punpckldq -16(%%edx), %%mm1\n"   /* w-4 | w-3 */
+	"addl $-16, %%edx\n"
+
+    "pfmul     %%mm4, %%mm0\n"      /* w-2*im0 | -w-1*re0 */
+	"pfmul     %%mm5, %%mm1\n"      /* w-4*im1 | -w-3*re1 */
+
+
+	"movq %%mm0, (%%eax)\n"
+	"movq %%mm1, 8(%%eax)\n"
+	"addl  $16, %%esi\n"
+	"addl $-16, %%edi\n"
+	"addl  $16, %%eax\n"
+	"decl %%ecx\n"
+	"jnz .first_128_delays\n"
+
+	"movl    8(%%ebp), %%ebx\n"
+	"leal    4(%%ebx), %%esi\n" /* buf[0].im */
+	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+	"movl $32, %%ecx\n"         /* loop count */
+
+".second_128_delays:\n"
+	"movd   (%%esi), %%mm0\n" /* im0 */
+	"movd  8(%%esi), %%mm2\n" /* im1 */
+	"movd   (%%edi), %%mm1\n" /* re0 */
+	"movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor   %%mm4, %%mm4\n"
+    "pxor   %%mm5, %%mm5\n"
+    "pfsub  %%mm1, %%mm4\n" /* -re0 */
+    "pfsub  %%mm3, %%mm5\n" /* -re1 */
+
+	"punpckldq %%mm4, %%mm0\n"     /* -re0 | im0 */
+	"punpckldq %%mm5, %%mm2\n"     /* -re1 | im1 */
+	/* Same backwards window walk: the first output pair takes the two floats just below %edx, reversed. */
+	"movd       -4(%%edx), %%mm1\n"   /*  0  | w-1 */
+	"punpckldq  -8(%%edx), %%mm1\n"   /* w-2 | w-1 */
+	"movd      -12(%%edx), %%mm3\n"   /*  0  | w-3 */
+	"punpckldq -16(%%edx), %%mm3\n"   /* w-4 | w-3 */
+	"addl $-16, %%edx\n"
+
+    "pfmul     %%mm0, %%mm1\n"      /* -w-2*re0 | w-1*im0 */
+	"pfmul     %%mm2, %%mm3\n"      /* -w-4*re1 | w-3*im1 */
+
+
+	"movq %%mm1, (%%eax)\n"
+	"movq %%mm3, 8(%%eax)\n"
+	"addl  $16, %%esi\n"
+	"addl $-16, %%edi\n"
+	"addl  $16, %%eax\n"
+	"decl %%ecx\n"
+    "jnz .second_128_delays\n"
+
+	"popl %%edi\n"
+	"popl %%esi\n"
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"popl %%ebx\n"
+	"popl %%eax\n"
+
+	"leave\n"
+	"femms\n"
+    ::);
+}
+
diff --git a/plugins/imdct/ac3_imdct_c.c b/plugins/imdct/ac3_imdct_c.c
index 3ebf16c9b3..106d7faaaf 100644
--- a/plugins/imdct/ac3_imdct_c.c
+++ b/plugins/imdct/ac3_imdct_c.c
@@ -2,7 +2,7 @@
  * ac3_imdct_c.c: ac3 DCT in C
  *****************************************************************************
  * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_c.c,v 1.1 2001/05/15 16:19:42 sam Exp $
+ * $Id: ac3_imdct_c.c,v 1.2 2001/05/16 14:51:29 reno Exp $
  *
  * Authors: Renaud Dartus <reno@videolan.org>
  *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -42,6 +42,7 @@
 
 #include "ac3_imdct.h"
 #include "ac3_imdct_common.h"
+#include "ac3_retables.h"
 
 #ifndef M_PI
 #   define M_PI 3.14159265358979323846
@@ -50,65 +51,6 @@
 void _M( fft_64p )  ( complex_t *x );
 void _M( fft_128p ) ( complex_t *x );
 
-static float window[] = {
-    0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
-    0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
-    0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
-    0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
-    0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
-    0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
-    0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
-    0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
-    0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
-    0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
-    0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
-    0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
-    0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
-    0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
-    0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
-    0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
-    0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
-    0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
-    0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
-    0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
-    0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
-    0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
-    0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
-    0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
-    0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
-    0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
-    0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
-    0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
-    0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
-    0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
-    0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
-    1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
-};
-
-static const int pm128[128] =
-{
-    0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
-    4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
-    2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
-    6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
-    1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
-    5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
-    3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
-    7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
-}; 
-
-static const int pm64[64] =
-{
-    0,  8, 16, 24, 32, 40, 48, 56,
-    4, 20, 36, 52, 12, 28, 44, 60,
-    2, 10, 18, 26, 34, 42, 50, 58,
-    6, 14, 22, 30, 38, 46, 54, 62,
-    1,  9, 17, 25, 33, 41, 49, 57,
-    5, 21, 37, 53, 13, 29, 45, 61,
-    3, 11, 19, 27, 35, 43, 51, 59,
-    7, 23, 39, 55, 15, 31, 47, 63
-};
-
 void _M( imdct_init ) (imdct_t * p_imdct)
 {
     int i;
diff --git a/plugins/imdct/ac3_imdct_common.c b/plugins/imdct/ac3_imdct_common.c
index 493eca2d12..b5bdc6d64b 100644
--- a/plugins/imdct/ac3_imdct_common.c
+++ b/plugins/imdct/ac3_imdct_common.c
@@ -2,7 +2,7 @@
  * ac3_imdct_common.c: common ac3 DCT functions
  *****************************************************************************
  * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_common.c,v 1.2 2001/05/15 19:36:27 sam Exp $
+ * $Id: ac3_imdct_common.c,v 1.3 2001/05/16 14:51:29 reno Exp $
  *
  * Authors: Renaud Dartus <reno@videolan.org>
  *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -45,71 +45,13 @@
 #include "mtime.h"
 
 #include "ac3_imdct.h"
+#include "ac3_retables.h"
 
 #ifndef M_PI
 #   define M_PI 3.14159265358979323846
 #endif
 
-static float window[] = {
-    0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
-    0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
-    0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
-    0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
-    0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
-    0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
-    0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
-    0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
-    0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
-    0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
-    0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
-    0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
-    0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
-    0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
-    0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
-    0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
-    0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
-    0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
-    0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
-    0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
-    0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
-    0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
-    0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
-    0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
-    0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
-    0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
-    0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
-    0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
-    0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
-    0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
-    0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
-    1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
-};
-
-static const int pm128[128] =
-{
-    0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
-    4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
-    2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
-    6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
-    1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
-    5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
-    3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
-    7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
-}; 
-
-static const int pm64[64] =
-{
-    0,  8, 16, 24, 32, 40, 48, 56,
-    4, 20, 36, 52, 12, 28, 44, 60,
-    2, 10, 18, 26, 34, 42, 50, 58,
-    6, 14, 22, 30, 38, 46, 54, 62,
-    1,  9, 17, 25, 33, 41, 49, 57,
-    5, 21, 37, 53, 13, 29, 45, 61,
-    3, 11, 19, 27, 35, 43, 51, 59,
-    7, 23, 39, 55, 15, 31, 47, 63
-};
-
-void _M( fft_64p ) ( complex_t *a );
+void _M( fft_64p )  ( complex_t *x );
 
 void _M( imdct_do_256 ) (imdct_t * p_imdct, float data[],float delay[])
 {
@@ -266,4 +208,3 @@ void _M( imdct_do_256_nol ) (imdct_t * p_imdct, float data[], float delay[])
         *delay_ptr++ = -buf2[64-i-1].real * *--window_ptr;
     }
 }
-
diff --git a/plugins/imdct/ac3_imdct_common.h b/plugins/imdct/ac3_imdct_common.h
index ce0a7ab6d8..3977b2a3ba 100644
--- a/plugins/imdct/ac3_imdct_common.h
+++ b/plugins/imdct/ac3_imdct_common.h
@@ -2,7 +2,7 @@
  * ac3_imdct_common.h: common ac3 DCT headers
  *****************************************************************************
  * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_common.h,v 1.1 2001/05/15 16:19:42 sam Exp $
+ * $Id: ac3_imdct_common.h,v 1.2 2001/05/16 14:51:29 reno Exp $
  *
  * Authors: Renaud Dartus <reno@videolan.org>
  *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -27,4 +27,3 @@ void _M( imdct_do_256 )     ( imdct_t * p_imdct, float data[], float delay[] );
 void _M( imdct_do_256_nol ) ( imdct_t * p_imdct, float data[], float delay[] );
 void _M( imdct_do_512  )    ( imdct_t * p_imdct, float data[], float delay[] );
 void _M( imdct_do_512_nol ) ( imdct_t * p_imdct, float data[], float delay[] );
-
diff --git a/plugins/imdct/ac3_imdct_sse.c b/plugins/imdct/ac3_imdct_sse.c
index d426f55a66..a9dad29166 100644
--- a/plugins/imdct/ac3_imdct_sse.c
+++ b/plugins/imdct/ac3_imdct_sse.c
@@ -2,7 +2,7 @@
  * ac3_imdct_sse.c: accelerated SSE ac3 DCT
  *****************************************************************************
  * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_sse.c,v 1.1 2001/05/15 16:19:42 sam Exp $
+ * $Id: ac3_imdct_sse.c,v 1.2 2001/05/16 14:51:29 reno Exp $
  *
  * Authors: Renaud Dartus <reno@videolan.org>
  *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -40,61 +40,15 @@
 
 #include "ac3_imdct.h"
 #include "ac3_imdct_common.h"
-
-static const float window[] = {
-    0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
-    0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
-    0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
-    0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
-    0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
-    0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
-    0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
-    0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
-    0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
-    0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
-    0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
-    0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
-    0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
-    0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
-    0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
-    0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
-    0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
-    0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
-    0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
-    0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
-    0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
-    0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
-    0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
-    0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
-    0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
-    0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
-    0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
-    0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
-    0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
-    0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
-    0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
-    1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
-};
-
-static const int pm128[128] =
-{
-    0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
-    4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
-    2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
-    6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
-    1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
-    5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
-    3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
-    7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
-}; 
+#include "ac3_retables.h"
 
 void _M( fft_64p )  ( complex_t *x );
 void _M( fft_128p ) ( complex_t *a );
 
 static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
 static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse);
-static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt);
-static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt);
+static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
 
 
 void _M( imdct_init ) (imdct_t * p_imdct)
@@ -260,7 +214,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
     : "a" (buf), "c" (xcos_sin_sse) );
 }
 
-static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt)
+static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
 {
     __asm__ __volatile__ (
     "pushl %%ebp\n"
@@ -448,7 +402,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const fl
     ::);
 }
 
-static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt)
+static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
 {
     __asm__ __volatile__ (
     "pushl %%ebp\n"
diff --git a/plugins/imdct/ac3_retables.h b/plugins/imdct/ac3_retables.h
new file mode 100644
index 0000000000..50e1d1c55d
--- /dev/null
+++ b/plugins/imdct/ac3_retables.h
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ * ac3_retables.h: ac3 DCT tables
+ *****************************************************************************
+ * Copyright (C) 1999, 2000 VideoLAN
+ * $Id: ac3_retables.h,v 1.1 2001/05/16 14:51:29 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *          Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+static float window[] = {  /* 256 window coefficients, rising from 0.00014 to 1.0 */
+    0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
+    0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
+    0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
+    0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
+    0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
+    0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
+    0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
+    0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
+    0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
+    0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
+    0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
+    0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
+    0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
+    0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
+    0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
+    0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
+    0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
+    0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
+    0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
+    0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
+    0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
+    0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
+    0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
+    0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
+    0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
+    0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
+    0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
+    0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
+    0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
+    0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
+    0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
+    1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
+};  /* NOTE(review): non-const, presumably so it can be passed to the float * window parameters - confirm */
+
+static const int pm128[128] =   /* each index 0..127 appears exactly once */
+{   /* NOTE(review): presumably the input reordering for the 128-point FFT - confirm against callers */
+    0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
+    4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
+    2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
+    6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
+    1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
+    5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
+    3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
+    7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
+}; 
+
+static const int pm64[64] =     /* each index 0..63 appears exactly once */
+{   /* NOTE(review): presumably the input reordering for the 64-point FFT - confirm against callers */
+    0,  8, 16, 24, 32, 40, 48, 56,
+    4, 20, 36, 52, 12, 28, 44, 60,
+    2, 10, 18, 26, 34, 42, 50, 58,
+    6, 14, 22, 30, 38, 46, 54, 62,
+    1,  9, 17, 25, 33, 41, 49, 57,
+    5, 21, 37, 53, 13, 29, 45, 61,
+    3, 11, 19, 27, 35, 43, 51, 59,
+    7, 23, 39, 55, 15, 31, 47, 63
+};
+
diff --git a/plugins/imdct/ac3_srfft_3dn.c b/plugins/imdct/ac3_srfft_3dn.c
new file mode 100644
index 0000000000..729f0981c8
--- /dev/null
+++ b/plugins/imdct/ac3_srfft_3dn.c
@@ -0,0 +1,344 @@
+/*****************************************************************************
+ * ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#define MODULE_NAME imdct3dn
+#include "modules_inner.h"
+
+/*****************************************************************************
+ * Preamble
+ *****************************************************************************/
+#include <stdio.h>
+
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "ac3_imdct.h"
+#include "ac3_srfft.h"
+
+void hsqrt2_3dn (void);
+void C_1_3dn (void);
+static void fft_4_3dn (complex_t *x);
+static void fft_8_3dn (complex_t *x);
+static void fft_asmb_3dn (int k, complex_t *x, complex_t *wTB,
+	     const complex_t *d, const complex_t *d_3);
+
+void _M( fft_64p ) ( complex_t *a )  /* in-place transform of a[0..63], built up from small blocks */
+{
+	fft_8_3dn(&a[0]); fft_4_3dn(&a[8]); fft_4_3dn(&a[12]);
+	fft_asmb_3dn(2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
+	/* a[0..15] is merged above; add two 8-point blocks to cover a[0..31] */
+	fft_8_3dn(&a[16]), fft_8_3dn(&a[24]);
+	fft_asmb_3dn(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+	/* a[32..47]: 8 + 4 + 4 points merged into one 16-point block */
+	fft_8_3dn(&a[32]); fft_4_3dn(&a[40]); fft_4_3dn(&a[44]);
+	fft_asmb_3dn(2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
+	/* a[48..63]: same 16-point construction */
+	fft_8_3dn(&a[48]); fft_4_3dn(&a[56]); fft_4_3dn(&a[60]);
+	fft_asmb_3dn(2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
+	/* final merge: a[0..31] with the two 16-point blocks at a[32] and a[48] */
+	fft_asmb_3dn(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+}
+
+void _M( fft_128p ) ( complex_t *a )  /* in-place transform of a[0..127], built up from small blocks */
+{
+    fft_8_3dn(&a[0]); fft_4_3dn(&a[8]); fft_4_3dn(&a[12]);
+	fft_asmb_3dn(2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
+	/* a[0..15] is merged above; add two 8-point blocks to cover a[0..31] */
+	fft_8_3dn(&a[16]), fft_8_3dn(&a[24]);
+	fft_asmb_3dn(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+	/* a[32..47]: 8 + 4 + 4 points merged into one 16-point block */
+	fft_8_3dn(&a[32]); fft_4_3dn(&a[40]); fft_4_3dn(&a[44]);
+	fft_asmb_3dn(2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
+	/* a[48..63]: same 16-point construction */
+	fft_8_3dn(&a[48]); fft_4_3dn(&a[56]); fft_4_3dn(&a[60]);
+	fft_asmb_3dn(2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
+	/* merge a[0..31] with the 16-point blocks at a[32] and a[48]: a[0..63] done */
+	fft_asmb_3dn(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+	/* second half starts here */
+	fft_8_3dn(&a[64]); fft_4_3dn(&a[72]); fft_4_3dn(&a[76]);
+	/* merge into a 16-point block at a[64] */
+	fft_asmb_3dn(2, &a[64], &a[72], &delta16[0], &delta16_3[0]);
+
+	fft_8_3dn(&a[80]); fft_8_3dn(&a[88]);
+  
+	/* merge into a 32-point block at a[64] */
+	fft_asmb_3dn(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
+
+	fft_8_3dn(&a[96]); fft_4_3dn(&a[104]), fft_4_3dn(&a[108]);
+	/* merge into a 16-point block at a[96] */
+	fft_asmb_3dn(2, &a[96], &a[104], &delta16[0], &delta16_3[0]);
+
+	fft_8_3dn(&a[112]), fft_8_3dn(&a[120]);
+	/* merge into a 32-point block at a[96] */
+	fft_asmb_3dn(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
+  
+	/* final merge: a[0..63] with the 32-point blocks at a[64] and a[96] */
+	fft_asmb_3dn(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
+}
+
+void hsqrt2_3dn (void)  /* data holder, not a routine: the body is four raw floats, so never call it */
+{
+    __asm__ (
+     ".float 0f0.707106781188\n"      /* +sqrt(2)/2 */
+     ".float 0f0.707106781188\n"      /* +sqrt(2)/2 */
+     ".float 0f-0.707106781188\n"     /* -sqrt(2)/2 */
+     ".float 0f-0.707106781188\n"     /* -sqrt(2)/2 */
+     );  /* NOTE(review): this file only declares and defines the symbol - confirm another file reads it */
+}
+
+void C_1_3dn (void)  /* data holder, not a routine: the body is four raw floats, so never call it */
+{
+    __asm__ (
+     ".float 0f-1.0\n"
+     ".float 0f1.0\n"
+     ".float 0f-1.0\n"
+     ".float 0f1.0\n"
+     );  /* NOTE(review): this file only declares and defines the symbol - confirm another file reads it */
+}
+
+/* In-place 4-point transform of x[0..3]; x is handed to the asm in %eax. */
+static void fft_4_3dn (complex_t *x)
+{
+    __asm__ __volatile__ (
+    "movq    (%%eax), %%mm0\n"      /* x[0] */
+    "movq   8(%%eax), %%mm1\n"      /* x[1] */
+    "movq  16(%%eax), %%mm2\n"      /* x[2] */
+    "movq  24(%%eax), %%mm3\n"      /* x[3] */
+    "movq    %%mm0, %%mm4\n"        /* x[0] */
+    "movq    %%mm1, %%mm5\n"        /* x[1] */
+    "pfadd   %%mm2, %%mm0\n"        /* x[0] + x[2] */
+    "pfadd   %%mm3, %%mm1\n"        /* x[1] + x[3] */
+    "pfsub   %%mm2, %%mm4\n"        /* x[0] - x[2] */
+    "pfsub   %%mm3, %%mm5\n"        /* x[1] - x[3] */
+
+    "movq    %%mm0, %%mm6\n"        /* x[0] + x[2]: copy after the add so x[2] is included */
+    "pfadd   %%mm1, %%mm0\n"        /* x[0] + x[2] + x[1] + x[3] */
+    "pfsub   %%mm1, %%mm6\n"        /* x[0] + x[2] - x[1] - x[3] */
+
+    "movq   %%mm0, (%%eax)\n"       /* -> x[0] */
+    "movq   %%mm6, 16(%%eax)\n"     /* -> x[2] */
+
+    "pxor    %%mm6, %%mm6\n"
+    "movq    %%mm5, %%mm2\n"        /* x[1] - x[3] */
+    "movq    %%mm4, %%mm3\n"        /* x[0] - x[2] */
+    "pfsub   %%mm5, %%mm6\n"        /* x[3] - x[1] */
+
+    "punpckhdq %%mm2,%%mm2\n"       /* both halves: high dword of x[1] - x[3] */
+    "punpckldq %%mm6,%%mm6\n"       /* both halves: low dword of x[3] - x[1] */
+    "punpckhdq %%mm6,%%mm2\n"       /* low: high of (x[1]-x[3]), high: low of (x[3]-x[1]) */
+
+    "pfsub   %%mm2, %%mm4\n"        /* (x[0] - x[2]) - swapped difference */
+    "pfadd   %%mm3, %%mm2\n"        /* (x[0] - x[2]) + swapped difference */
+    "movq  %%mm2,  8(%%eax)\n"      /* -> x[1] */
+    "movq  %%mm4, 24(%%eax)\n"      /* -> x[3] */
+    "femms\n"                       /* leave MMX/3D Now! state */
+    : "=a" (x)
+    : "a" (x) : "memory" );         /* the asm stores through x */
+}
+
+static void fft_8_3dn (complex_t *x)   /* in-place 8-point transform of x[0..7], plain C */
+{
+  register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
+  /* save odd inputs x[1] and x[3]: their slots are reused just below */
+  wT1_r = x[1].real;
+  wT1_i = x[1].imag;
+  wB1_r = x[3].real;
+  wB1_i = x[3].imag;
+  /* pack the even inputs x[0], x[2], x[4], x[6] into x[0..3] */
+  x[1] = x[2];
+  x[2] = x[4];
+  x[3] = x[6];
+  { /* 4-point transform of the packed even inputs, written back to x[0..3] */
+      register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
+  
+      yt_r = x[0].real;
+      yb_r = yt_r - x[2].real;
+      yt_r += x[2].real;
+
+      u_r = x[1].real;
+      vi_i = x[3].real - u_r;
+      u_r += x[3].real;
+  
+      u_i = x[1].imag;
+      vi_r = u_i - x[3].imag;
+      u_i += x[3].imag;
+
+      yt_i = yt_r;
+      yt_i += u_r;
+      x[0].real = yt_i;
+      yt_r -= u_r;
+      x[2].real = yt_r;
+      yt_i = yb_r;
+      yt_i += vi_r;
+      x[1].real = yt_i;
+      yb_r -= vi_r;
+      x[3].real = yb_r;
+
+      yt_i = x[0].imag;
+      yb_i = yt_i - x[2].imag;
+      yt_i += x[2].imag;
+
+      yt_r = yt_i;
+      yt_r += u_i;
+      x[0].imag = yt_r;
+      yt_i -= u_i;
+      x[2].imag = yt_i;
+      yt_r = yb_i;
+      yt_r += vi_i;
+      x[1].imag = yt_r;
+      yb_i -= vi_i;
+      x[3].imag = yb_i;
+  }
+  
+  /* outputs x[0] and x[4]: even part plus/minus the sum of the four odd inputs */
+  wT2_r = x[5].real;
+  wT2_r += x[7].real;
+  wT2_r += wT1_r;
+  wT2_r += wB1_r;
+  wT2_i = wT2_r;
+  wT2_r += x[0].real;
+  wT2_i = x[0].real - wT2_i;
+  x[0].real = wT2_r;
+  x[4].real = wT2_i;
+
+  wT2_i = x[5].imag;
+  wT2_i += x[7].imag;
+  wT2_i += wT1_i;
+  wT2_i += wB1_i;
+  wT2_r = wT2_i;
+  wT2_r += x[0].imag;
+  wT2_i = x[0].imag - wT2_i;
+  x[0].imag = wT2_r;
+  x[4].imag = wT2_i;
+  
+  /* outputs x[2] and x[6]: the odd term's parts are swapped, imag feeds real */
+  wT2_r = x[5].imag;
+  wT2_r -= x[7].imag;
+  wT2_r += wT1_i;
+  wT2_r -= wB1_i;
+  wT2_i = wT2_r;
+  wT2_r += x[2].real;
+  wT2_i = x[2].real - wT2_i;
+  x[2].real = wT2_r;
+  x[6].real = wT2_i;
+
+  wT2_i = x[5].real;
+  wT2_i -= x[7].real;
+  wT2_i += wT1_r;
+  wT2_i -= wB1_r;
+  wT2_r = wT2_i;
+  wT2_r += x[2].imag;
+  wT2_i = x[2].imag - wT2_i;
+  x[2].imag = wT2_i;
+  x[6].imag = wT2_r;
+
+  /* outputs x[1] and x[5]: the odd term is scaled by HSQRT2 */
+  wT2_r = wT1_r;
+  wT2_r += wB1_i;
+  wT2_r -= x[5].real;
+  wT2_r -= x[7].imag;
+  wT2_i = wT1_i;
+  wT2_i -= wB1_r;
+  wT2_i -= x[5].imag;
+  wT2_i += x[7].real;
+
+  wB2_r = wT2_r;
+  wB2_r += wT2_i;
+  wT2_i -= wT2_r;
+  wB2_r *= HSQRT2;
+  wT2_i *= HSQRT2;
+  wT2_r = wB2_r;
+  wB2_r += x[1].real;
+  wT2_r =  x[1].real - wT2_r;
+
+  wB2_i = x[5].real;    /* keep input x[5].real: the slot is overwritten two lines down */
+  x[1].real = wB2_r;
+  x[5].real = wT2_r;
+
+  wT2_r = wT2_i;
+  wT2_r += x[1].imag;
+  wT2_i = x[1].imag - wT2_i;
+  wB2_r = x[5].imag;    /* keep input x[5].imag: the slot is overwritten two lines down */
+  x[1].imag = wT2_r;
+  x[5].imag = wT2_i;
+
+  /* outputs x[3] and x[7]; wB2_i / wB2_r hold the input x[5] saved above */
+  wT1_r -= wB1_i;
+  wT1_i += wB1_r;
+  wB1_r = wB2_i - x[7].imag;
+  wB1_i = wB2_r + x[7].real;
+  wT1_r -= wB1_r;
+  wT1_i -= wB1_i;
+  wB1_r = wT1_r + wT1_i;
+  wB1_r *= HSQRT2;
+  wT1_i -= wT1_r;
+  wT1_i *= HSQRT2;
+  wB2_r = x[3].real;
+  wB2_i = wB2_r + wT1_i;
+  wB2_r -= wT1_i;
+  x[3].real = wB2_i;
+  x[7].real = wB2_r;
+  wB2_i = x[3].imag;
+  wB2_r = wB2_i + wB1_r;
+  wB2_i -= wB1_r;
+  x[3].imag = wB2_i;
+  x[7].imag = wB2_r;
+}
+
+    
+static void fft_asmb_3dn (int k, complex_t *x, complex_t *wTB,
+	     const complex_t *d, const complex_t *d_3)  /* merge step used by fft_64p / fft_128p */
+{
+  register complex_t  *x2k, *x3k, *x4k, *wB;
+  register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;  /* unused below - presumably scratch for the TRANS macros; confirm */
+  /* four cursors into x, 2*k elements apart; wB sits 2*k elements past wTB */
+  x2k = x + 2 * k;
+  x3k = x2k + 2 * k;
+  x4k = x3k + 2 * k;
+  wB = wTB + 2 * k;
+  /* elements 0 and 1 are handled before the loop */
+  TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
+  TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
+  /* then two elements per pass; k must be >= 2 on entry or --k never reaches 0 */
+  --k;
+  for(;;) {
+     TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
+     TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
+     if (!--k) break;  /* k-1 passes in total */
+     x += 2;
+     x2k += 2;
+     x3k += 2;
+     x4k += 2;
+     d += 2;
+     d_3 += 2;
+     wTB += 2;
+     wB += 2;
+  }
+}
diff --git a/plugins/imdct/ac3_srfft_sse.c b/plugins/imdct/ac3_srfft_sse.c
index 2de563b57b..741b2a255d 100644
--- a/plugins/imdct/ac3_srfft_sse.c
+++ b/plugins/imdct/ac3_srfft_sse.c
@@ -2,7 +2,7 @@
  * ac3_srfft_sse.c: accelerated SSE ac3 fft functions
  *****************************************************************************
  * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_srfft_sse.c,v 1.1 2001/05/15 16:19:42 sam Exp $
+ * $Id: ac3_srfft_sse.c,v 1.2 2001/05/16 14:51:29 reno Exp $
  *
  * Authors: Renaud Dartus <reno@videolan.org>
  *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -43,8 +43,8 @@
 #include "ac3_imdct.h"
 #include "ac3_srfft.h"
 
-void hsqrt2 (void);
-void C_1 (void);
+void hsqrt2_sse (void);
+void C_1_sse (void);
 static void fft_4_sse (complex_t *x);
 static void fft_8_sse (complex_t *x);
 static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
@@ -104,7 +104,7 @@ void _M( fft_128p ) ( complex_t *a )
     fft_asmb_sse(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
 }
 
-void hsqrt2 (void)
+void hsqrt2_sse (void)
 {
     __asm__ (
      ".float 0f0.707106781188\n"
@@ -114,7 +114,7 @@ void hsqrt2 (void)
      );
 }
 
-void C_1 (void)
+void C_1_sse (void)
 {
     __asm__ (
      ".float 0f-1.0\n"
@@ -174,7 +174,7 @@ static void fft_8_sse (complex_t *x)
     "subps   %%xmm5, %%xmm7\n"        /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
 
     "movhps 24(%%eax), %%xmm1\n"    /* x[3] | x[1] */
-    "movl   $hsqrt2, %%ebx\n"
+    "movl   $hsqrt2_sse, %%ebx\n"
     "movlps 40(%%eax), %%xmm2\n"    /* x[5] */
     "movhps 56(%%eax), %%xmm2\n"    /* x[7] | x[5] */
     "movups  %%xmm1, %%xmm3\n"        /* x[3] | x[1] */
@@ -191,7 +191,7 @@ static void fft_8_sse (complex_t *x)
     "movlhps %%xmm6, %%xmm1\n"        /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
     "shufps   $0xe4, %%xmm6, %%xmm5\n"    /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
     "movups  %%xmm1, %%xmm3\n"        /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
-    "movl      $C_1, %%ebx\n"
+    "movl  $C_1_sse, %%ebx\n"
     "addps   %%xmm5, %%xmm1\n"        /* u */
     "subps   %%xmm5, %%xmm3\n"        /* v */
     "movups  %%xmm0, %%xmm2\n"        /* yb */
@@ -258,7 +258,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
     "movhlps %%xmm5, %%xmm7\n"      /* wT[1].im * d[1].im | wT[1].re * d[1].im */
     "movlhps %%xmm6, %%xmm5\n"      /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
     "shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
-    "movl $C_1, %%edi\n"
+    "movl  $C_1_sse, %%edi\n"
     "movups (%%edi), %%xmm4\n"
     "mulps   %%xmm4, %%xmm7\n"
     "addps   %%xmm7, %%xmm5\n"      /* wB[1] * d3[1] | wT[1] * d[1] */
@@ -318,7 +318,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
     "mulps   %%xmm5, %%xmm4\n"  /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
     "mulps   %%xmm7, %%xmm6\n"  /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
     "shufps $0xb1, %%xmm2, %%xmm1\n"    /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
-    "movl $C_1, %%edi\n"
+    "movl  $C_1_sse, %%edi\n"
     "movups (%%edi), %%xmm3\n"  /* 1.0 | -1.0 | 1.0 | -1.0 */
 
     "movhlps %%xmm4, %%xmm5\n"  /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
diff --git a/plugins/imdct/imdct3dn.c b/plugins/imdct/imdct3dn.c
new file mode 100644
index 0000000000..7432b7164c
--- /dev/null
+++ b/plugins/imdct/imdct3dn.c
@@ -0,0 +1,152 @@
+/*****************************************************************************
+ * imdct3dn.c : accelerated 3D Now! IMDCT module
+ *****************************************************************************
+ * Copyright (C) 1999, 2000 VideoLAN
+ * $Id: imdct3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
+ *
+ * Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#define MODULE_NAME imdct3dn
+#include "modules_inner.h"
+
+/*****************************************************************************
+ * Preamble
+ *****************************************************************************/
+#include "defs.h"
+
+#include <stdlib.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+#include "tests.h"
+
+#include "ac3_imdct.h"
+#include "ac3_imdct_common.h"
+
+#include "modules.h"
+
+/*****************************************************************************
+ * Local and extern prototypes.
+ *****************************************************************************/
+static void imdct_getfunctions( function_list_t * p_function_list );
+static int  imdct_Probe       ( probedata_t *p_data );
+
+/*****************************************************************************
+ * Build configuration tree: one window holding a single comment, no options.
+ *****************************************************************************/
+MODULE_CONFIG_START
+ADD_WINDOW( "Configuration for IMDCT module" )
+    ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
+MODULE_CONFIG_END
+
+/*****************************************************************************
+ * InitModule: get the module structure and configuration.
+ *****************************************************************************
+ * We have to fill psz_name, psz_longname and psz_version. These variables
+ * will be strdup()ed later by the main application because the module can
+ * be unloaded later to save memory, and we want to be able to access this
+ * data even after the module has been unloaded.
+ *****************************************************************************/
+MODULE_INIT
+{
+    p_module->psz_name = MODULE_STRING;
+    p_module->psz_longname = "AC3 IMDCT module";  /* NOTE(review): does not mention 3D Now! - confirm this is intended */
+    p_module->psz_version = VERSION;
+    /* IMDCT is the only real capability advertised */
+    p_module->i_capabilities = MODULE_CAPABILITY_NULL
+                                | MODULE_CAPABILITY_IMDCT;
+
+    return( 0 );
+}
+
+/*****************************************************************************
+ * ActivateModule: set the module to an usable state.
+ *****************************************************************************
+ * This function fills the capability functions and the configuration
+ * structure. Once ActivateModule() has been called, the i_usage can
+ * be set to 0 and calls to NeedModule() be made to increment it. To unload
+ * the module, one has to wait until i_usage == 0 and call DeactivateModule().
+ *****************************************************************************/
+MODULE_ACTIVATE
+{
+    p_module->p_functions = malloc( sizeof( module_functions_t ) );
+    if( p_module->p_functions == NULL )
+    {
+        return( -1 );                   /* allocation failed */
+    }
+    /* fill the IMDCT slot of the freshly allocated function table */
+    imdct_getfunctions( &p_module->p_functions->imdct );
+    /* p_config is not declared in this file - presumably set up by the MODULE_CONFIG macros; confirm */
+    p_module->p_config = p_config;
+
+    return( 0 );
+}
+
+/*****************************************************************************
+ * DeactivateModule: make sure the module can be unloaded.
+ *****************************************************************************
+ * This function must only be called when i_usage == 0. If it successfully
+ * returns, i_usage can be set to -1 and the module unloaded. Be careful to
+ * lock usage_lock during the whole process.
+ *****************************************************************************/
+MODULE_DEACTIVATE
+{
+    free( p_module->p_functions );      /* releases the table from MODULE_ACTIVATE; the pointer is not reset */
+
+    return( 0 );
+}
+
+/* Following functions are local */
+
+/*****************************************************************************
+ * imdct_getfunctions: fill a function list with this module's probe routine
+ * and its five IMDCT entry points. Static, to keep the namespace clean.
+ *****************************************************************************/
+static void imdct_getfunctions( function_list_t * p_function_list )
+{
+    p_function_list->pf_probe = imdct_Probe;
+#define F p_function_list->functions.imdct
+    F.pf_imdct_init    = _M( imdct_init );
+    F.pf_imdct_256     = _M( imdct_do_256 );
+    F.pf_imdct_256_nol = _M( imdct_do_256_nol );
+    F.pf_imdct_512     = _M( imdct_do_512 );
+    F.pf_imdct_512_nol = _M( imdct_do_512_nol );
+#undef F
+}
+
+/*****************************************************************************
+ * imdct_Probe: returns 0 without 3D Now!, 999 if requested by name, else 200
+ *****************************************************************************/
+static int imdct_Probe( probedata_t *p_data )
+{
+    if( !TestCPU( CPU_CAPABILITY_3DNOW ) )
+    {
+        return( 0 );                    /* this CPU cannot run the plugin */
+    }
+
+    if( TestMethod( IMDCT_METHOD_VAR, "imdct3dn" ) )  /* IMDCT selector, not the downmix one */
+    {
+        return( 999 );
+    }
+
+    /* 3D Now! is available: default score when not explicitly requested */
+    return( 200 );
+}
+
diff --git a/src/ac3_decoder/ac3_imdct.c b/src/ac3_decoder/ac3_imdct.c
index 639ac439f3..6ad5fbc724 100644
--- a/src/ac3_decoder/ac3_imdct.c
+++ b/src/ac3_decoder/ac3_imdct.c
@@ -2,7 +2,7 @@
  * ac3_imdct.c: ac3 DCT
  *****************************************************************************
  * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct.c,v 1.20 2001/05/15 16:19:42 sam Exp $
+ * $Id: ac3_imdct.c,v 1.21 2001/05/16 14:51:29 reno Exp $
  *
  * Authors: Michel Kaempf <maxx@via.ecp.fr>
  *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -54,7 +54,7 @@ void imdct_init(imdct_t * p_imdct)
     int i;
     float scale = 181.019;
 
-        p_imdct->pf_imdct_init( p_imdct );
+    p_imdct->pf_imdct_init( p_imdct );
 
     /* More twiddle factors to turn IFFT into IMDCT */
     for (i=0; i < 64; i++) {
-- 
2.25.4