* Beginning of SSE/3DNow! support for imdct and downmix

If you have a PIII or an Athlon and you want to try this, just comment out the #if 0 in ac3_downmix.c and ac3_imdct.c, and add the following to the AC3_DECODER section of the Makefile: src/ac3_decoder/ac3_imdct_sse.o \ src/ac3_decoder/ac3_srfft_sse.o \ src/ac3_decoder/ac3_downmix_sse.o \ src/ac3_decoder/ac3_downmix_3dn.o \

* Beginning of SSE/3DNow! support for imdct and downmix
If you have a PIII or an Athlon and you want to try this, just comment out the #if 0 in ac3_downmix.c and ac3_imdct.c, and add the following to the AC3_DECODER section of the Makefile: src/ac3_decoder/ac3_imdct_sse.o \ src/ac3_decoder/ac3_srfft_sse.o \ src/ac3_decoder/ac3_downmix_sse.o \ src/ac3_decoder/ac3_downmix_3dn.o \
db7b0421 · Renaud Dartus · c1df8159 · db7b0421 · db7b0421 · db7b0421
Commit db7b0421 authored May 14, 2001 by Renaud Dartus
24 changed files
--- a/Makefile
+++ b/Makefile
--- a/include/tests.h
+++ b/include/tests.h
@@ -2,7 +2,7 @@
 * tests.h: several test functions needed by the plugins
 *****************************************************************************
 * Copyright (C) 1996, 1997, 1998, 1999, 2000 VideoLAN
- * $Id: tests.h,v 1.9 2001/03/21 13:42:33 sam Exp $
+ * $Id: tests.h,v 1.10 2001/05/14 15:58:03 reno Exp $
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *
@@ -28,6 +28,7 @@
 #define CPU_CAPABILITY_MMX     1<<3
 #define CPU_CAPABILITY_3DNOW   1<<4
 #define CPU_CAPABILITY_MMXEXT  1<<5
+#define CPU_CAPABILITY_SSE     1<<6
 #define CPU_CAPABILITY_ALTIVEC 1<<16

 /*****************************************************************************

--- a/src/ac3_decoder/ac3_bit_allocate.c
+++ b/src/ac3_decoder/ac3_bit_allocate.c
@@ -2,7 +2,7 @@
 * ac3_bit_allocate.c: ac3 allocation tables
 *****************************************************************************
 * Copyright (C) 2000 VideoLAN
- * $Id: ac3_bit_allocate.c,v 1.20 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_bit_allocate.c,v 1.21 2001/05/14 15:58:03 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -37,7 +37,7 @@
 #include "input_ext-dec.h"

 #include "ac3_decoder.h"
-#include "ac3_internal.h"
+#include "ac3_internal.h"                                 /* DELTA_BIT_REUSE */


 static void ba_compute_psd (bit_allocate_t * p_bit, s16 start, s16 end, s16 exps[]);

--- a/src/ac3_decoder/ac3_decoder.c
+++ b/src/ac3_decoder/ac3_decoder.c
@@ -2,7 +2,7 @@
 * ac3_decoder.c: core ac3 decoder
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.c,v 1.32 2001/05/07 03:14:09 stef Exp $
+ * $Id: ac3_decoder.c,v 1.33 2001/05/14 15:58:03 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Michel Lespinasse <walken@zoy.org>
@@ -40,21 +40,14 @@
 #include "audio_output.h"

 #include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
+#include "ac3_decoder_thread.h"                           /* ac3dec_thread_t */
 #include "ac3_internal.h"

-#include <stdio.h>
-
-void imdct_init (imdct_t * p_imdct);
-void downmix_init (downmix_t * p_downmix);
-
-static float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
-static float smixlev_lut[4] = { 0.707, 0.500, 0.0  , 0.500 };
+static const float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
+static const float smixlev_lut[4] = { 0.707, 0.500, 0.0  , 0.500 };

 int ac3_init (ac3dec_t * p_ac3dec)
 {
-//    p_ac3dec->bit_stream.buffer = 0;
-//    p_ac3dec->bit_stream.i_available = 0;
    p_ac3dec->mantissa.lfsr_state = 1;          /* dither_gen initialization */
    imdct_init(&p_ac3dec->imdct);
    downmix_init(&p_ac3dec->downmix);
@@ -69,7 +62,7 @@ int ac3_decode_frame (ac3dec_t * p_ac3dec, s16 * buffer)
    
    if (parse_bsi (p_ac3dec))
    {
-        intf_WarnMsg (3,"Error during ac3parsing");
+        intf_WarnMsg (3,"ac3dec warn: error during parsing");
        parse_auxdata (p_ac3dec);
        return 1;
    }
@@ -102,7 +95,7 @@ int ac3_decode_frame (ac3dec_t * p_ac3dec, s16 * buffer)
 
        if (parse_audblk (p_ac3dec, i))
        {
-            intf_WarnMsg (3,"Error during ac3audioblock");
+            intf_WarnMsg (3,"ac3dec warn: error during audioblock");
            parse_auxdata (p_ac3dec);
            return 1;
        }
@@ -114,7 +107,7 @@ int ac3_decode_frame (ac3dec_t * p_ac3dec, s16 * buffer)

        if (exponent_unpack (p_ac3dec))
        {
-            intf_WarnMsg (3,"Error during ac3unpack");
+            intf_WarnMsg (3,"ac3dec warn: error during unpack");
            parse_auxdata (p_ac3dec);
            return 1;
        }

--- a/src/ac3_decoder/ac3_decoder.h
+++ b/src/ac3_decoder/ac3_decoder.h
@@ -2,7 +2,7 @@
 * ac3_decoder.h : ac3 decoder interface
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.h,v 1.7 2001/04/30 21:04:20 reno Exp $
+ * $Id: ac3_decoder.h,v 1.8 2001/05/14 15:58:03 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Renaud Dartus <reno@videolan.org>

--- a/src/ac3_decoder/ac3_decoder_thread.h
+++ b/src/ac3_decoder/ac3_decoder_thread.h
@@ -2,7 +2,7 @@
 * ac3_decoder_thread.h : ac3 decoder thread interface
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.h,v 1.6 2001/05/01 04:18:18 sam Exp $
+ * $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *
@@ -30,14 +30,11 @@ typedef struct ac3dec_thread_s
     * Thread properties
     */
    vlc_thread_t        thread_id;                /* id for thread functions */
-//    bit_stream_t        bit_stream;
-        

    /*
     * Input properties
     */
    decoder_fifo_t *    p_fifo;                /* stores the PES stream data */
-//    data_packet_t *     p_data;
    int                 sync_ptr;          /* sync ptr from ac3 magic header */
    adec_config_t *     p_config;


--- a/src/ac3_decoder/ac3_downmix.c
+++ b/src/ac3_decoder/ac3_downmix.c
@@ -2,7 +2,7 @@
 * ac3_downmix.c: ac3 downmix functions
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_downmix.c,v 1.22 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_downmix.c,v 1.23 2001/05/14 15:58:03 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -31,29 +31,41 @@
 #include "threads.h"
 #include "mtime.h"

+#include "intf_msg.h"                        /* intf_DbgMsg(), intf_ErrMsg() */
 #include "tests.h"

 #include "stream_control.h"
 #include "input_ext-dec.h"

 #include "ac3_decoder.h"
-#include "ac3_internal.h"
 #include "ac3_downmix.h"

 void downmix_init (downmix_t * p_downmix)
 {
 #if 0
-    if ( TestCPU (CPU_CAPABILITY_MMX) )
+    if ( TestCPU (CPU_CAPABILITY_SSE) )
    {
-		fprintf(stderr,"Using MMX for downmix\n");
-		p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_kni;
-		p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_kni;
-		p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_kni;
-		p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_kni;
-		p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_kni;
-		p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_kni;
-    	p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_kni;
-    } else 
+		intf_WarnMsg (1,"ac3dec: using MMX_SSE for downmix");
+		p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_sse;
+		p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_sse;
+		p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_sse;
+		p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_sse;
+		p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_sse;
+		p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_sse;
+    	p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_sse;
+    } 
+    else if ( TestCPU (CPU_CAPABILITY_3DNOW) )
+    {
+		intf_WarnMsg (1,"ac3dec: using MMX_3DNOW for downmix");
+		p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_3dn;
+		p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_3dn;
+		p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_3dn;
+		p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_3dn;
+		p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_3dn;
+		p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_3dn;
+    	p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_3dn;
+    } 
+    else
 #endif
    {
 		p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_c;

--- a/src/ac3_decoder/ac3_downmix.h
+++ b/src/ac3_decoder/ac3_downmix.h
@@ -2,7 +2,7 @@
 * ac3_downmix.h: ac3 downmix functions
 *****************************************************************************
 * Copyright (C) 2000, 2001 VideoLAN
- * $Id: ac3_downmix.h,v 1.6 2001/04/30 21:04:20 reno Exp $
+ * $Id: ac3_downmix.h,v 1.7 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -30,13 +30,22 @@ void downmix_3f_0r_to_2ch_c(float *samples, dm_par_t * dm_par);
 void stream_sample_2ch_to_s16_c(s16 *s16_samples, float *left, float *right);
 void stream_sample_1ch_to_s16_c(s16 *s16_samples, float *center); 

-#if 0
-/* Kni functions */
-void downmix_3f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_3f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_2f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_2f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_3f_0r_to_2ch_kni(float *samples, dm_par_t * dm_par);            
-void stream_sample_2ch_to_s16_kni(s16 *s16_samples, float *left, float *right);
-void stream_sample_1ch_to_s16_kni(s16 *s16_samples, float *center);  
-#endif
+/* SSE functions */
+void downmix_3f_2r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_3f_1r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_2f_2r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_2f_1r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_3f_0r_to_2ch_sse(float *samples, dm_par_t * dm_par);            
+void stream_sample_2ch_to_s16_sse(s16 *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_sse(s16 *s16_samples, float *center);  
+
+/* 3DNow! functions */
+void downmix_3f_2r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_3f_1r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_2f_2r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_2f_1r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_3f_0r_to_2ch_3dn(float *samples, dm_par_t * dm_par);            
+void stream_sample_2ch_to_s16_3dn(s16 *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_3dn(s16 *s16_samples, float *center);  
+
+
--- a/src/ac3_decoder/ac3_downmix_3dn.c
+++ b/src/ac3_decoder/ac3_downmix_3dn.c
+/*****************************************************************************
+ * ac3_downmix_3dn.c: ac3 downmix functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_downmix_3dn.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "defs.h"
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+#include "tests.h"
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+#include "ac3_decoder.h"
+
+
+/*****************************************************************************
+ * downmix_3f_2r_to_2ch_3dn: mix five channels down to two with 3DNow!
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, center at 1024, right at 2048, left surround at 3072 and right
+ * surround at 4096. The 32-bit values at byte offsets 0, 4 and 8 of *dm_par
+ * (unit, clev, slev) are the mixing gains. 128 iterations handle two floats
+ * per channel each:
+ *     out_left  = unit * left  + clev * center + slev * leftsur
+ *     out_right = unit * right + clev * center + slev * rightsur
+ * out_left is stored back into block 0 and out_right into block 1, i.e. over
+ * the center input that has already been read for that pair.
+ *****************************************************************************/
+void downmix_3f_2r_to_2ch_3dn (float * samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+    "pushl %%ecx\n"
+	"movl  $128,  %%ecx\n"	        /* loop counter */
+
+	"movd	(%%ebx), %%mm5\n"	    /* unit */
+	"punpckldq %%mm5, %%mm5\n"	    /* unit | unit */
+
+	"movd	4(%%ebx), %%mm6\n"		/* clev */
+	"punpckldq %%mm6, %%mm6\n"	    /* clev | clev */
+
+	"movd	8(%%ebx), %%mm7\n"		/* slev */
+	"punpckldq %%mm7, %%mm7\n"	    /* slev | slev */
+
+".loop:\n"
+	"movq	(%%eax),     %%mm0\n"   /* left */
+	"movq	2048(%%eax), %%mm1\n"   /* right */
+	"movq   1024(%%eax), %%mm2\n"	/* center */
+	"movq	3072(%%eax), %%mm3\n"	/* leftsur */
+	"movq	4096(%%eax), %%mm4\n"	/* rightsur */
+	"pfmul	%%mm5, %%mm0\n"
+	"pfmul	%%mm5, %%mm1\n"
+	"pfmul	%%mm6, %%mm2\n"
+	"pfadd	%%mm2, %%mm0\n"
+	"pfadd 	%%mm2, %%mm1\n"
+	"pfmul  %%mm7, %%mm3\n"
+	"pfmul	%%mm7, %%mm4\n"
+	"pfadd	%%mm3, %%mm0\n"
+	"pfadd	%%mm4, %%mm1\n"
+
+	"movq	%%mm0, (%%eax)\n"
+	"movq	%%mm1, 1024(%%eax)\n"
+
+	"addl	$8, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop\n"
+    
+    "popl   %%ecx\n"
+    "femms\n"                       /* leave MMX/3DNow! state */
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+
+/*****************************************************************************
+ * downmix_2f_2r_to_2ch_3dn: mix four channels down to two with 3DNow!
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, right at 1024, left surround at 2048 and right surround at 3072.
+ * The 32-bit values at byte offsets 0 and 8 of *dm_par (unit, slev) are the
+ * mixing gains. 128 iterations handle two floats per channel each:
+ *     out_left  = unit * left  + slev * leftsur
+ *     out_right = unit * right + slev * rightsur
+ * Results are stored in place over blocks 0 and 1.
+ *****************************************************************************/
+void downmix_2f_2r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+	"pushl %%ecx\n"
+	"movl  $128, %%ecx\n"       /* loop counter */
+
+	"movd  (%%ebx), %%mm5\n"	/* unit */
+	"punpckldq %%mm5, %%mm5\n"  /* unit | unit */
+
+	"movd	8(%%ebx), %%mm7\n"	/* slev */
+	"punpckldq %%mm7, %%mm7\n"	/* slev | slev */
+
+".loop3:\n"
+	"movq   (%%eax), %%mm0\n"       /* left */
+	"movq   1024(%%eax), %%mm1\n"   /* right */
+	"movq   2048(%%eax), %%mm3\n"	/* leftsur */
+	"movq   3072(%%eax), %%mm4\n"	/* rightsur */
+	"pfmul	%%mm5, %%mm0\n"
+	"pfmul	%%mm5, %%mm1\n"
+	"pfmul	%%mm7, %%mm3\n"
+	"pfmul	%%mm7, %%mm4\n"
+	"pfadd	%%mm3, %%mm0\n"
+	"pfadd	%%mm4, %%mm1\n"
+
+	"movq	%%mm0, (%%eax)\n"
+	"movq	%%mm1, 1024(%%eax)\n"
+
+	"addl	$8, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop3\n"
+
+	"popl	%%ecx\n"
+    "femms\n"                   /* leave MMX/3DNow! state */
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                /* the asm reads and writes the sample buffer */
+}
+/*****************************************************************************
+ * downmix_3f_1r_to_2ch_3dn: mix four channels down to two with 3DNow!
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, center at 1024, right at 2048 and the single surround at 3072.
+ * The 32-bit values at byte offsets 0, 4 and 8 of *dm_par (unit, clev, slev)
+ * are the mixing gains. 128 iterations handle two floats per channel each:
+ *     out_left  = unit * left  + clev * center - slev * sur
+ *     out_right = unit * right + clev * center + slev * sur
+ * out_left is stored into block 0 and out_right into block 1, i.e. over the
+ * center input that has already been read for that pair.
+ *****************************************************************************/
+void downmix_3f_1r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+
+	"pushl	%%ecx\n"
+	"movl	$128, %%ecx\n"		    /* loop counter */
+
+	"movd	(%%ebx), %%mm5\n"	    /* unit */
+	"punpckldq %%mm5, %%mm5\n"	    /* unit | unit */
+
+	"movd	4(%%ebx), %%mm6\n"		/* clev */
+	"punpckldq %%mm6, %%mm6\n"	    /* clev | clev */
+
+    "movd	8(%%ebx), %%mm7\n"		/* slev */
+	"punpckldq %%mm7, %%mm7\n"  	/* slev | slev */
+
+".loop4:\n"
+	"movq	(%%eax), %%mm0\n"       /* left */
+	"movq	2048(%%eax), %%mm1\n"   /* right */
+	"movq	1024(%%eax), %%mm2\n"	/* center */
+    "movq	3072(%%eax), %%mm3\n"	/* sur */
+	"pfmul	%%mm5, %%mm0\n"
+	"pfmul	%%mm5, %%mm1\n"
+	"pfmul	%%mm6, %%mm2\n"
+	"pfadd	%%mm2, %%mm0\n"
+	"pfmul	%%mm7, %%mm3\n"
+	"pfadd 	%%mm2, %%mm1\n"
+	"pfsub	%%mm3, %%mm0\n"         /* surround is subtracted on the left */
+	"pfadd	%%mm3, %%mm1\n"         /* and added on the right */
+
+	"movq	%%mm0, (%%eax)\n"
+	"movq	%%mm1, 1024(%%eax)\n"
+
+	"addl	$8, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop4\n"
+
+	"popl	%%ecx\n"
+    "femms\n"                       /* leave MMX/3DNow! state */
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+/*****************************************************************************
+ * downmix_2f_1r_to_2ch_3dn: mix three channels down to two with 3DNow!
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, right at 1024 and the single surround at 2048. The 32-bit values
+ * at byte offsets 0 and 8 of *dm_par (unit, slev) are the mixing gains.
+ * 128 iterations handle two floats per channel each:
+ *     out_left  = unit * left  - slev * sur
+ *     out_right = unit * right + slev * sur
+ * Results are stored in place over blocks 0 and 1.
+ *****************************************************************************/
+void downmix_2f_1r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+	"pushl	%%ecx\n"
+	"movl	$128, %%ecx\n"		    /* loop counter */
+
+	"movd	(%%ebx), %%mm5\n"	    /* unit */
+	"punpckldq %%mm5, %%mm5\n"	    /* unit | unit */
+
+	"movd	8(%%ebx), %%mm7\n"		/* slev */
+	"punpckldq %%mm7, %%mm7\n"  	/* slev | slev */
+
+".loop5:\n"
+	"movq	(%%eax), %%mm0\n"       /* left */
+	"movq	1024(%%eax), %%mm1\n"   /* right */
+	"movq	2048(%%eax), %%mm3\n"	/* sur */
+	"pfmul	%%mm5, %%mm0\n"
+	"pfmul	%%mm5, %%mm1\n"
+	"pfmul	%%mm7, %%mm3\n"
+	"pfsub	%%mm3, %%mm0\n"         /* surround is subtracted on the left */
+	"pfadd	%%mm3, %%mm1\n"         /* and added on the right */
+
+	"movq	%%mm0, (%%eax)\n"
+	"movq	%%mm1, 1024(%%eax)\n"
+
+	"addl	$8, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop5\n"
+
+	"popl	%%ecx\n"
+    "femms\n"                       /* leave MMX/3DNow! state */
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+
+/*****************************************************************************
+ * downmix_3f_0r_to_2ch_3dn: mix three front channels down to two with 3DNow!
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, center at 1024 and right at 2048. The 32-bit values at byte
+ * offsets 0 and 4 of *dm_par (unit, clev) are the mixing gains.
+ * 128 iterations handle two floats per channel each:
+ *     out_left  = unit * left  + clev * center
+ *     out_right = unit * right + clev * center
+ * out_left is stored into block 0 and out_right into block 1, i.e. over the
+ * center input that has already been read for that pair.
+ *****************************************************************************/
+void downmix_3f_0r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+	"pushl	%%ecx\n"
+	"movl	$128, %%ecx\n"		    /* loop counter */
+
+	"movd	(%%ebx), %%mm5\n"	    /* unit */
+	"punpckldq %%mm5, %%mm5\n"	    /* unit | unit */
+
+	"movd	4(%%ebx), %%mm6\n"		/* clev */
+	"punpckldq %%mm6, %%mm6\n"  	/* clev | clev */
+
+".loop6:\n"
+	"movq	(%%eax), %%mm0\n"       /* left */
+	"movq	2048(%%eax), %%mm1\n"   /* right */
+	"movq   1024(%%eax), %%mm2\n"   /* center */
+	"pfmul	%%mm5, %%mm0\n"
+	"pfmul	%%mm5, %%mm1\n"
+	"pfmul	%%mm6, %%mm2\n"
+	"pfadd	%%mm2, %%mm0\n"
+	"pfadd 	%%mm2, %%mm1\n"
+
+	"movq	%%mm0, (%%eax)\n"
+	"movq	%%mm1, 1024(%%eax)\n"
+
+	"addl	$8, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop6\n"
+
+	"popl	%%ecx\n"
+    "femms\n"                       /* leave MMX/3DNow! state */
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+
+/*****************************************************************************
+ * stream_sample_1ch_to_s16_3dn: one float channel to duplicated s16 pairs
+ *****************************************************************************
+ * Reads 256 floats from left (two per iteration, 128 iterations), scales each
+ * by the 32-bit value stored at the external symbol sqrt2, converts to 32-bit
+ * integers with pf2id and packs to 16 bits with signed saturation. Every
+ * sample is written twice in a row, so each iteration emits c0 c0 c1 c1
+ * (8 bytes) to s16_samples.
+ *
+ * packssdw with identical source and destination produces c0 c1 c0 c1 in
+ * memory order; the following punpcklwd reorders that into the duplicated
+ * pairs described by the original "c1 c1 c0 c0" comment.
+ *****************************************************************************/
+void stream_sample_1ch_to_s16_3dn (s16 *s16_samples, float *left)
+{
+    __asm__ __volatile__ (
+    "pushl %%ecx\n"
+    "pushl %%edx\n"
+
+	"movl   $sqrt2, %%edx\n"
+	"movd  (%%edx), %%mm7\n"
+    "punpckldq %%mm7, %%mm7\n"   /* sqrt2 | sqrt2 */
+	"movl $128, %%ecx\n"
+
+".loop2:\n"
+	"movq (%%ebx), %%mm0\n"	    /* c1 | c0 */
+	"pfmul   %%mm7, %%mm0\n"
+
+	"pf2id %%mm0, %%mm0\n"	    /* c1 c0 --> mm0, int_32 */
+
+	"packssdw %%mm0, %%mm0\n"	    /* c1 c0 c1 c0 --> mm0, int_16 */
+	"punpcklwd %%mm0, %%mm0\n"	    /* c1 c1 c0 c0 --> mm0, int_16 */
+
+    "movq %%mm0, (%%eax)\n"
+	"addl $8, %%eax\n"
+	"addl $8, %%ebx\n"
+
+	"decl %%ecx\n"
+	"jnz .loop2\n"
+
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"femms\n"                    /* leave MMX/3DNow! state */
+    : "=a" (s16_samples), "=b" (left)
+    : "a" (s16_samples), "b" (left)
+    : "memory");                 /* the asm reads *left and writes *s16_samples */
+}
+
+/*****************************************************************************
+ * stream_sample_2ch_to_s16_3dn: two float channels to interleaved s16
+ *****************************************************************************
+ * Reads 256 floats from each of left and right (two per channel per
+ * iteration, 128 iterations), converts them to 32-bit integers with pf2id and
+ * packs them to 16 bits with signed saturation. Each iteration writes exactly
+ * 8 bytes to s16_samples in the order l0 r0 l1 r1.
+ *
+ * Only mm0 holds packed output. mm2 still contains 32-bit intermediates after
+ * the pack, so it must not be stored: doing so would write 8 stray bytes
+ * beyond the samples produced, and on the last iteration beyond the end of
+ * the data this loop generates.
+ *****************************************************************************/
+void stream_sample_2ch_to_s16_3dn (s16 *s16_samples, float *left, float *right)
+{
+
+	__asm__ __volatile__ (
+    "pushl %%ecx\n"
+	"movl $128, %%ecx\n"
+
+".loop1:\n"
+	"movq  (%%ebx), %%mm0\n"	/* l1 | l0 */
+	"movq  (%%edx), %%mm1\n"	/* r1 | r0 */
+	"movq   %%mm0,  %%mm2\n"	/* l1 | l0 */
+	"punpckldq %%mm1, %%mm0\n"	/* r0 | l0 */
+	"punpckhdq %%mm1, %%mm2\n"	/* r1 | l1 */
+
+	"pf2id    %%mm0, %%mm0\n"	/* r0 l0 --> mm0, int_32 */
+	"pf2id    %%mm2, %%mm2\n"	/* r1 l1 --> mm2, int_32 */
+    
+    "packssdw %%mm2, %%mm0\n"	/* r1 l1 r0 l0 --> mm0, int_16 */
+
+	"movq %%mm0, (%%eax)\n"
+	"addl $8, %%eax\n"
+	"addl $8, %%ebx\n"
+	"addl $8, %%edx\n"
+
+	"decl %%ecx\n"
+	"jnz .loop1\n"
+
+	"popl %%ecx\n"
+	"femms\n"                   /* leave MMX/3DNow! state */
+    : "=a" (s16_samples), "=b" (left), "=d" (right)
+    : "a" (s16_samples), "b" (left), "d" (right)
+    : "memory");                /* the asm reads the inputs and writes *s16_samples */
+    
+}
--- a/src/ac3_decoder/ac3_downmix_c.c
+++ b/src/ac3_decoder/ac3_downmix_c.c
@@ -2,7 +2,7 @@
 * ac3_downmix_c.c: ac3 downmix functions
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_c.c,v 1.7 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_downmix_c.c,v 1.8 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -35,11 +35,8 @@
 #include "input_ext-dec.h"

 #include "ac3_decoder.h"
-#include "ac3_internal.h"

-#include "ac3_downmix.h"
-
-void __inline__ downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
 {
    int i;
    float *left, *right, *center, *left_sur, *right_sur;
@@ -59,7 +56,7 @@ void __inline__ downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
    }
 }

-void __inline__ downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
 {
    int i;
    float *left, *right, *left_sur, *right_sur;
@@ -78,7 +75,7 @@ void __inline__ downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
    }
 }

-void __inline__ downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
 {
    int i;
    float *left, *right, *center, *right_sur;
@@ -98,7 +95,7 @@ void __inline__ downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
 }


-void __inline__ downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
 {
    int i;
    float *left, *right, *right_sur;
@@ -117,7 +114,7 @@ void __inline__ downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
 }


-void __inline__ downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
 {
    int i;
    float *left, *right, *center;
@@ -136,7 +133,7 @@ void __inline__ downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
 }


-void __inline__ stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *right)
+void stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *right)
 {
    int i;
    for (i=0; i < 256; i++) {
@@ -146,7 +143,7 @@ void __inline__ stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *ri
 }


-void __inline__ stream_sample_1ch_to_s16_c (s16 *out_buf, float *center)
+void stream_sample_1ch_to_s16_c (s16 *out_buf, float *center)
 {
    int i;
    float tmp;

--- a/src/ac3_decoder/ac3_downmix_sse.c
+++ b/src/ac3_decoder/ac3_downmix_sse.c
+/*****************************************************************************
+ * ac3_downmix_sse.c: ac3 downmix functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_downmix_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *          Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "defs.h"
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+#include "tests.h"
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+#include "ac3_decoder.h"
+
+
+/*
+ * sqrt2: not a callable routine. The body consists solely of an assembler
+ * directive that emits the single-precision constant 0.7071068 (roughly
+ * 1/sqrt(2), despite the name) inside this function's code.
+ * stream_sample_1ch_to_s16_sse() loads 4 bytes from the address of this
+ * symbol and uses them as its scaling factor.
+ * NOTE(review): that load only yields the constant if the compiler emits no
+ * instructions between the symbol and the directive (e.g. no frame-pointer
+ * prologue) -- confirm the build flags guarantee this.
+ */
+void sqrt2 (void)
+{
+    __asm__ (".float 0f0.7071068");
+}
+
+/*****************************************************************************
+ * downmix_3f_2r_to_2ch_sse: mix five channels down to two with SSE
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, center at 1024, right at 2048, left surround at 3072 and right
+ * surround at 4096. The scalars at byte offsets 0, 4 and 8 of *dm_par
+ * (unit, clev, slev) are the mixing gains. 64 iterations handle four floats
+ * per channel each, using unaligned loads and stores:
+ *     out_left  = unit * left  + clev * center + slev * leftsur
+ *     out_right = unit * right + clev * center + slev * rightsur
+ * out_left is stored into block 0 and out_right into block 1, i.e. over the
+ * center input that has already been read for that group.
+ *****************************************************************************/
+void downmix_3f_2r_to_2ch_sse (float * samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+    "pushl %%ecx\n"
+	"movl  $64,  %%ecx\n"	        /* loop counter */
+
+	"movss	(%%ebx), %%xmm5\n"	    /* unit */
+	"shufps	$0, %%xmm5, %%xmm5\n"	/* unit | unit | unit | unit */
+
+	"movss	4(%%ebx), %%xmm6\n"		/* clev */
+	"shufps	$0, %%xmm6, %%xmm6\n"	/* clev | clev | clev | clev */
+
+	"movss	8(%%ebx), %%xmm7\n"		/* slev */
+	"shufps	$0, %%xmm7, %%xmm7\n"	/* slev | slev | slev | slev */
+
+".loop:\n"
+	"movups	(%%eax),     %%xmm0\n"  /* left */
+	"movups	2048(%%eax), %%xmm1\n"  /* right */
+	"movups 1024(%%eax), %%xmm2\n"	/* center */
+	"movups	3072(%%eax), %%xmm3\n"	/* leftsur */
+	"movups	4096(%%eax), %%xmm4\n"	/* rightsur */
+	"mulps	%%xmm5, %%xmm0\n"
+	"mulps	%%xmm5, %%xmm1\n"
+	"mulps	%%xmm6, %%xmm2\n"
+	"addps	%%xmm2, %%xmm0\n"
+	"addps 	%%xmm2, %%xmm1\n"
+	"mulps	%%xmm7, %%xmm3\n"
+	"mulps	%%xmm7, %%xmm4\n"
+	"addps	%%xmm3, %%xmm0\n"
+	"addps	%%xmm4, %%xmm1\n"
+
+	"movups	%%xmm0, (%%eax)\n"
+	"movups	%%xmm1, 1024(%%eax)\n"
+
+	"addl	$16, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop\n"
+    
+    "popl   %%ecx\n"
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+
+/*****************************************************************************
+ * downmix_2f_2r_to_2ch_sse: mix four channels down to two with SSE
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, right at 1024, left surround at 2048 and right surround at 3072.
+ * The scalars at byte offsets 0 and 8 of *dm_par (unit, slev) are the mixing
+ * gains. 64 iterations handle four floats per channel each:
+ *     out_left  = unit * left  + slev * leftsur
+ *     out_right = unit * right + slev * rightsur
+ * Results are stored in place over blocks 0 and 1.
+ *****************************************************************************/
+void downmix_2f_2r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+	"pushl %%ecx\n"
+	"movl  $64, %%ecx\n"            /* loop counter */
+
+	"movss  (%%ebx), %%xmm5\n"	    /* unit */
+	"shufps $0, %%xmm5, %%xmm5\n"   /* unit | unit | unit | unit */
+
+	"movss	8(%%ebx), %%xmm7\n"		/* slev */
+	"shufps	$0, %%xmm7, %%xmm7\n"	/* slev | slev | slev | slev */
+
+".loop3:\n"
+	"movups	(%%eax), %%xmm0\n"      /* left */
+	"movups	1024(%%eax), %%xmm1\n"  /* right */
+	"movups 2048(%%eax), %%xmm3\n"	/* leftsur */
+	"movups	3072(%%eax), %%xmm4\n"	/* rightsur */
+	"mulps	%%xmm5, %%xmm0\n"
+	"mulps	%%xmm5, %%xmm1\n"
+	"mulps	%%xmm7, %%xmm3\n"
+	"mulps	%%xmm7, %%xmm4\n"
+	"addps	%%xmm3, %%xmm0\n"
+	"addps	%%xmm4, %%xmm1\n"
+
+	"movups	%%xmm0, (%%eax)\n"
+	"movups	%%xmm1, 1024(%%eax)\n"
+
+	"addl	$16, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop3\n"
+
+	"popl	%%ecx\n"
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+/*****************************************************************************
+ * downmix_3f_1r_to_2ch_sse: mix four channels down to two with SSE
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, center at 1024, right at 2048 and the single surround at 3072.
+ * The scalars at byte offsets 0, 4 and 8 of *dm_par (unit, clev, slev) are
+ * the mixing gains. 64 iterations handle four floats per channel each:
+ *     out_left  = unit * left  + clev * center - slev * sur
+ *     out_right = unit * right + clev * center + slev * sur
+ * out_left is stored into block 0 and out_right into block 1, i.e. over the
+ * center input that has already been read for that group.
+ *****************************************************************************/
+void downmix_3f_1r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+
+	"pushl	%%ecx\n"
+	"movl	$64, %%ecx\n"		    /* loop counter */
+
+	"movss	(%%ebx), %%xmm5\n"	    /* unit */
+	"shufps	$0, %%xmm5, %%xmm5\n"	/* unit | unit | unit | unit */
+
+	"movss	4(%%ebx), %%xmm6\n"		/* clev */
+	"shufps	$0, %%xmm6, %%xmm6\n"	/* clev | clev | clev | clev */
+
+	"movss	8(%%ebx), %%xmm7\n"		/* slev */
+	"shufps	$0, %%xmm7, %%xmm7\n"	/* slev | slev | slev | slev */
+
+".loop4:\n"
+	"movups	(%%eax), %%xmm0\n"      /* left */
+	"movups	2048(%%eax), %%xmm1\n"  /* right */
+	"movups	1024(%%eax), %%xmm2\n"	/* center */
+    "movups	3072(%%eax), %%xmm3\n"	/* sur */
+	"mulps	%%xmm5, %%xmm0\n"
+	"mulps	%%xmm5, %%xmm1\n"
+	"mulps	%%xmm6, %%xmm2\n"
+	"addps	%%xmm2, %%xmm0\n"
+	"mulps	%%xmm7, %%xmm3\n"
+	"addps 	%%xmm2, %%xmm1\n"
+	"subps	%%xmm3, %%xmm0\n"       /* surround is subtracted on the left */
+	"addps	%%xmm3, %%xmm1\n"       /* and added on the right */
+
+	"movups	%%xmm0, (%%eax)\n"
+	"movups	%%xmm1, 1024(%%eax)\n"
+
+	"addl	$16, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop4\n"
+
+	"popl	%%ecx\n"
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+
+}
+/*****************************************************************************
+ * downmix_2f_1r_to_2ch_sse: mix three channels down to two with SSE
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, right at 1024 and the single surround at 2048. The scalars at
+ * byte offsets 0 and 8 of *dm_par (unit, slev) are the mixing gains.
+ * 64 iterations handle four floats per channel each:
+ *     out_left  = unit * left  - slev * sur
+ *     out_right = unit * right + slev * sur
+ * Results are stored in place over blocks 0 and 1.
+ *****************************************************************************/
+void downmix_2f_1r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+	"pushl	%%ecx\n"
+	"movl	$64, %%ecx\n"		    /* loop counter */
+
+	"movss	(%%ebx), %%xmm5\n"	    /* unit */
+	"shufps	$0, %%xmm5, %%xmm5\n"	/* unit | unit | unit | unit */
+
+	"movss	8(%%ebx), %%xmm7\n"		/* slev */
+	"shufps	$0, %%xmm7, %%xmm7\n"	/* slev | slev | slev | slev */
+
+".loop5:\n"
+	"movups	(%%eax), %%xmm0\n"      /* left */
+	"movups	1024(%%eax), %%xmm1\n"  /* right */
+	"movups	2048(%%eax), %%xmm3\n"	/* sur */
+	"mulps	%%xmm5, %%xmm0\n"
+	"mulps	%%xmm5, %%xmm1\n"
+	"mulps	%%xmm7, %%xmm3\n"
+	"subps	%%xmm3, %%xmm0\n"       /* surround is subtracted on the left */
+	"addps	%%xmm3, %%xmm1\n"       /* and added on the right */
+
+	"movups	%%xmm0, (%%eax)\n"
+	"movups	%%xmm1, 1024(%%eax)\n"
+
+	"addl	$16, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop5\n"
+
+	"popl	%%ecx\n"
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+
+
+}
+/*****************************************************************************
+ * downmix_3f_0r_to_2ch_sse: mix three front channels down to two with SSE
+ *****************************************************************************
+ * samples is addressed as consecutive 1024-byte channel blocks: left at byte
+ * offset 0, center at 1024 and right at 2048. The scalars at byte offsets 0
+ * and 4 of *dm_par (unit, clev) are the mixing gains. 64 iterations handle
+ * four floats per channel each:
+ *     out_left  = unit * left  + clev * center
+ *     out_right = unit * right + clev * center
+ * out_left is stored into block 0 and out_right into block 1, i.e. over the
+ * center input that has already been read for that group.
+ *****************************************************************************/
+void downmix_3f_0r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+    __asm__ __volatile__ (
+	"pushl	%%ecx\n"
+	"movl	$64, %%ecx\n"		    /* loop counter */
+
+	"movss	(%%ebx), %%xmm5\n"	    /* unit */
+	"shufps	$0, %%xmm5, %%xmm5\n"	/* unit | unit | unit | unit */
+
+	"movss	4(%%ebx), %%xmm6\n"		/* clev */
+	"shufps	$0, %%xmm6, %%xmm6\n"	/* clev | clev | clev | clev */
+
+".loop6:\n"
+	"movups	(%%eax), %%xmm0\n"      /* left */
+	"movups	2048(%%eax), %%xmm1\n"  /* right */
+	"movups 1024(%%eax), %%xmm2\n"	/* center */
+	"mulps	%%xmm5, %%xmm0\n"
+	"mulps	%%xmm5, %%xmm1\n"
+	"mulps	%%xmm6, %%xmm2\n"
+	"addps	%%xmm2, %%xmm0\n"
+	"addps 	%%xmm2, %%xmm1\n"
+
+	"movups	%%xmm0, (%%eax)\n"
+	"movups	%%xmm1, 1024(%%eax)\n"
+
+	"addl	$16, %%eax\n"
+	"decl 	%%ecx\n"
+	"jnz	.loop6\n"
+
+	"popl	%%ecx\n"
+    : "=a" (samples)
+    : "a" (samples), "b" (dm_par)
+    : "memory");                    /* the asm reads and writes the sample buffer */
+}
+    
+/*****************************************************************************
+ * stream_sample_1ch_to_s16_sse: one float channel to duplicated s16 pairs
+ *****************************************************************************
+ * Reads 256 floats from left (four per iteration, 64 iterations), scales each
+ * by the 32-bit value stored at the symbol sqrt2, converts to 32-bit integers
+ * with cvtps2pi and packs to 16 bits with signed saturation. Every sample is
+ * written twice in a row, so each iteration emits c0 c0 c1 c1 c2 c2 c3 c3
+ * (16 bytes) to s16_samples.
+ *
+ * packssdw with identical source and destination produces c0 c1 c0 c1 in
+ * memory order; the following punpcklwd reorders that into the duplicated
+ * pairs described by the original "c1 c1 c0 c0" comments.
+ *****************************************************************************/
+void stream_sample_1ch_to_s16_sse (s16 *s16_samples, float *left)
+{
+    __asm__ __volatile__ (
+    "pushl %%ecx\n"
+    "pushl %%edx\n"
+
+	"movl   $sqrt2, %%edx\n"
+	"movss (%%edx), %%xmm7\n"
+    "shufps $0, %%xmm7, %%xmm7\n"   /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
+	"movl $64, %%ecx\n"
+
+".loop2:\n"
+	"movups (%%ebx), %%xmm0\n"	    /* c3 | c2 | c1 | c0 */
+	"mulps   %%xmm7, %%xmm0\n"
+	"movhlps %%xmm0, %%xmm2\n"	    /* c3 | c2 */
+
+	"cvtps2pi %%xmm0, %%mm0\n"	    /* c1 c0 --> mm0, int_32 */
+	"cvtps2pi %%xmm2, %%mm1\n"	    /* c3 c2 --> mm1, int_32 */
+
+	"packssdw %%mm0, %%mm0\n"	    /* c1 c0 c1 c0 --> mm0, int_16 */
+	"packssdw %%mm1, %%mm1\n"	    /* c3 c2 c3 c2 --> mm1, int_16 */
+	"punpcklwd %%mm0, %%mm0\n"	    /* c1 c1 c0 c0 --> mm0, int_16 */
+	"punpcklwd %%mm1, %%mm1\n"	    /* c3 c3 c2 c2 --> mm1, int_16 */
+
+    "movq %%mm0, (%%eax)\n"
+	"movq %%mm1, 8(%%eax)\n"
+	"addl $16, %%eax\n"
+	"addl $16, %%ebx\n"
+
+	"decl %%ecx\n"
+	"jnz .loop2\n"
+
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"emms\n"                        /* leave MMX state */
+    : "=a" (s16_samples), "=b" (left)
+    : "a" (s16_samples), "b" (left)
+    : "memory");                    /* the asm reads *left and writes *s16_samples */
+}
+
+/*****************************************************************************
+ * stream_sample_2ch_to_s16_sse: two float channels to interleaved s16
+ *****************************************************************************
+ * Reads 256 floats from each of left and right (four per channel per
+ * iteration, 64 iterations), interleaves them, converts to 32-bit integers
+ * with cvtps2pi and packs to 16 bits with signed saturation. Each iteration
+ * writes 16 bytes to s16_samples in the order l0 r0 l1 r1 l2 r2 l3 r3.
+ *****************************************************************************/
+void stream_sample_2ch_to_s16_sse (s16 *s16_samples, float *left, float *right)
+{
+
+	__asm__ __volatile__ (
+    "pushl %%ecx\n"
+	"movl $64, %%ecx\n"
+
+".loop1:\n"
+	"movups  (%%ebx), %%xmm0\n"	/* l3 | l2 | l1 | l0 */
+	"movups  (%%edx), %%xmm1\n"	/* r3 | r2 | r1 | r0 */
+	"movhlps  %%xmm0, %%xmm2\n"	/* l3 | l2 */
+	"movhlps  %%xmm1, %%xmm3\n"	/* r3 | r2 */
+	"unpcklps %%xmm1, %%xmm0\n"	/* r1 | l1 | r0 | l0 */
+	"unpcklps %%xmm3, %%xmm2\n"	/* r3 | l3 | r2 | l2 */
+
+	"cvtps2pi %%xmm0, %%mm0\n"	/* r0 l0 --> mm0, int_32 */
+	"movhlps  %%xmm0, %%xmm0\n"	/* bring r1 | l1 into the low half */
+	"cvtps2pi %%xmm0, %%mm1\n"	/* r1 l1 --> mm1, int_32 */
+	"cvtps2pi %%xmm2, %%mm2\n"	/* r2 l2 --> mm2, int_32 */
+	"movhlps  %%xmm2, %%xmm2\n"	/* bring r3 | l3 into the low half */
+	"cvtps2pi %%xmm2, %%mm3\n"	/* r3 l3 --> mm3, int_32 */
+    
+	"packssdw %%mm1, %%mm0\n"	/* r1 l1 r0 l0 --> mm0, int_16 */
+	"packssdw %%mm3, %%mm2\n"	/* r3 l3 r2 l2 --> mm2, int_16 */
+
+	"movq %%mm0, (%%eax)\n"
+	"movq %%mm2, 8(%%eax)\n"
+	"addl $16, %%eax\n"
+	"addl $16, %%ebx\n"
+	"addl $16, %%edx\n"
+
+	"decl %%ecx\n"
+	"jnz .loop1\n"
+
+	"popl %%ecx\n"
+	"emms\n"                    /* leave MMX state */
+    : "=a" (s16_samples), "=b" (left), "=d" (right)
+    : "a" (s16_samples), "b" (left), "d" (right)
+    : "memory");                /* the asm reads the inputs and writes *s16_samples */
+    
+}
--- a/src/ac3_decoder/ac3_exponent.c
+++ b/src/ac3_decoder/ac3_exponent.c
@@ -2,7 +2,7 @@
 * ac3_exponent.c: ac3 exponent calculations
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_exponent.c,v 1.23 2001/04/20 12:14:34 reno Exp $
+ * $Id: ac3_exponent.c,v 1.24 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Michel Lespinasse <walken@zoy.org>
@@ -31,15 +31,14 @@
 #include "threads.h"
 #include "mtime.h"

+#include "intf_msg.h"                        /* intf_DbgMsg(), intf_ErrMsg() */
+
 #include "stream_control.h"
 #include "input_ext-dec.h"

 #include "audio_output.h"

 #include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
-
-#include "intf_msg.h"

 #include "ac3_internal.h"


--- a/src/ac3_decoder/ac3_imdct.c
+++ b/src/ac3_decoder/ac3_imdct.c
@@ -2,7 +2,7 @@
 * ac3_imdct.c: ac3 DCT
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct.c,v 1.18 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_imdct.c,v 1.19 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -39,15 +39,11 @@
 #include "input_ext-dec.h"

 #include "ac3_decoder.h"
-#include "ac3_internal.h"

-#include "ac3_downmix.h"
-#include "ac3_imdct_c.h"
-#if 0
-#include "ac3_imdct_kni.h"
-#endif
+#include "ac3_imdct_c.h"                                     /* imdct_init_c */
+#include "ac3_imdct_sse.h"                                 /* imdct_init_sse */

-#include "tests.h"
+#include "tests.h"                                                /* TestCPU */

 #ifndef M_PI
 #   define M_PI 3.14159265358979323846
@@ -57,13 +53,13 @@
 void imdct_init(imdct_t * p_imdct)
 {
 	int i;
-	float scale = 255.99609372;
-
+	float scale = 181.019;
 #if 0
-	if ( TestCPU (CPU_CAPABILITY_MMX) )
+	if ( TestCPU (CPU_CAPABILITY_SSE) )
    {
-        imdct_init_kni (p_imdct);
-    } else 
+        imdct_init_sse (p_imdct);
+    }
+    else
 #endif
    {
        imdct_init_c (p_imdct);

--- a/src/ac3_decoder/ac3_imdct_c.c
+++ b/src/ac3_decoder/ac3_imdct_c.c
@@ -2,7 +2,7 @@
 * ac3_imdct_c.c: ac3 DCT
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_c.c,v 1.2 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_imdct_c.c,v 1.3 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -38,7 +38,7 @@
 #include "input_ext-dec.h"

 #include "ac3_decoder.h"
-#include "ac3_internal.h"
+#include "ac3_imdct_c.h"

 #ifndef M_PI
 #   define M_PI 3.14159265358979323846
@@ -46,9 +46,6 @@

 void fft_64p_c (complex_t *x);
 void fft_128p_c (complex_t *x);
-void imdct_do_512_c (imdct_t * p_imdct, float data[], float delay[]);
-void imdct_do_512_nol_c (imdct_t * p_imdct, float data[], float delay[]);
-

 static float window[] = {
 	0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
@@ -112,7 +109,7 @@ static const int pm64[64] =
 int imdct_init_c (imdct_t * p_imdct)
 {
 	int i;
-	float scale = 255.99609372;
+	float scale = 181.019;

 	p_imdct->imdct_do_512 = imdct_do_512_c;
 	p_imdct->imdct_do_512_nol = imdct_do_512_nol_c;

--- a/src/ac3_decoder/ac3_imdct_sse.c
+++ b/src/ac3_decoder/ac3_imdct_sse.c
+/*****************************************************************************
+ * ac3_imdct_sse.c: ac3 DCT
+ *****************************************************************************
+ * Copyright (C) 1999, 2000 VideoLAN
+ * $Id: ac3_imdct_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *          Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "intf_msg.h"                        /* intf_DbgMsg(), intf_ErrMsg() */
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+
+#include "ac3_decoder.h"
+
+#include "ac3_imdct_sse.h"
+
+static const float window[] = { /* 256 coefficients rising from 0.00014 to 1.0;
+	 * passed as the window table to the imdct512_window_delay*_sse
+	 * routines by imdct_do_512_sse / imdct_do_512_nol_sse. */
+	0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
+	0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
+	0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
+	0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
+	0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
+	0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
+	0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
+	0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
+	0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
+	0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
+	0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
+	0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
+	0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
+	0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
+	0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
+	0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
+	0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
+	0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
+	0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
+	0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
+	0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
+	0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
+	0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
+	0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
+	0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
+	0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
+	0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
+	0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
+	0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
+	0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
+	0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
+	1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
+};
+
+static const int pm128[128] = /* permutation of 0..127; handed as the pmt index
+	 * table to imdct512_pre_ifft_twiddle_sse, which reads it two
+	 * entries at a time to pick the data[] / twiddle entry for each
+	 * successive buf[] slot. */
+{
+	0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
+	4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
+	2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
+	6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
+	1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
+	5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
+	3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
+	7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
+}; 
+
+void fft_64p_sse (complex_t *x);
+void fft_128p_sse(complex_t *a);
+static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
+static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse);
+static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+
+
+/*****************************************************************************
+ * imdct_init_sse: select the SSE IMDCT routines and build their twiddle table
+ *****************************************************************************
+ * Logs which implementation is in use, stores the SSE entry points in
+ * *p_imdct and fills p_imdct->xcos_sin_sse with 128 entries of four floats:
+ *     { c, -s, -s, -c }
+ * where c = cos(2*pi*(8*i+1)/(8*N)) * scale and s is the matching sine.
+ * Always returns 0.
+ *****************************************************************************/
+int imdct_init_sse (imdct_t * p_imdct)
+{
+	const float scale = 181.019;
+	int entry;
+
+	intf_WarnMsg (1, "ac3dec: using MMX_SSE for imdct");
+
+	/* Hook up the SSE implementations. */
+	p_imdct->fft_64p          = fft_64p_sse;
+	p_imdct->imdct_do_512_nol = imdct_do_512_nol_sse;
+	p_imdct->imdct_do_512     = imdct_do_512_sse;
+
+	for (entry = 0; entry != 128; ++entry)
+	{
+		const int base = 4 * entry;   /* first float of this entry */
+		float c = cos(2.0f * M_PI * (8*entry+1)/(8*N)) * scale;
+		float s = sin(2.0f * M_PI * (8*entry+1)/(8*N)) * scale;
+
+		p_imdct->xcos_sin_sse[base + 3] = -c;
+		p_imdct->xcos_sin_sse[base + 2] = -s;
+		p_imdct->xcos_sin_sse[base + 1] = -s;
+		p_imdct->xcos_sin_sse[base]     = c;
+	}
+
+	return 0;
+}
+
+/*
+ * imdct_do_512_sse: run one 512-point IMDCT with the SSE routines.
+ * Pre-twiddles data[] into p_imdct->buf (ordered by pm128), runs the
+ * 128-point FFT, post-twiddles in place, then windows the result into
+ * data[] with delay[] added and rewrites delay[].
+ */
+void imdct_do_512_sse (imdct_t * p_imdct, float data[], float delay[])
+{
+	complex_t *work = p_imdct->buf;
+	float *twiddle = p_imdct->xcos_sin_sse;
+
+	imdct512_pre_ifft_twiddle_sse (pm128, work, data, twiddle);
+	fft_128p_sse (work);
+	imdct512_post_ifft_twiddle_sse (work, twiddle);
+	imdct512_window_delay_sse (work, data, window, delay);
+}
+
+
+/*
+ * imdct_do_512_nol_sse: same pipeline as imdct_do_512_sse, but the final
+ * step uses imdct512_window_delay_nol_sse, which windows into data[]
+ * without adding the previous delay[] contents (delay[] is still rewritten).
+ */
+void imdct_do_512_nol_sse (imdct_t * p_imdct, float data[], float delay[])
+{
+	complex_t *work = p_imdct->buf;
+	float *twiddle = p_imdct->xcos_sin_sse;
+
+	imdct512_pre_ifft_twiddle_sse (pm128, work, data, twiddle);
+	fft_128p_sse (work);
+	imdct512_post_ifft_twiddle_sse (work, twiddle);
+	imdct512_window_delay_nol_sse (work, data, window, delay);
+}
+
+static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
+{   /* Pre-IFFT twiddle.  The loop runs 64 times and consumes two pmt[]
+     * entries per pass, so 128 complex values are stored to buf.  For
+     * j = pmt[k], with t[0..3] the four floats of table entry j
+     * (xcos_sin_sse + 4*j) and assuming complex_t stores re first:
+     *     buf[k].re = data[255-2j]*t[0] + data[2j]*t[2]
+     *     buf[k].im = data[255-2j]*t[1] + data[2j]*t[3]
+     *
+     * NOTE(review): the asm declares no operands and no clobbers.  It
+     * builds its own frame and fetches the arguments from 8..20(%ebp),
+     * which presumably only works when this function is emitted as a
+     * real stack-argument call (not inlined) and the compiler has pushed
+     * nothing before the asm - confirm against the build flags.
+     * NOTE(review): ".loop" is a plain named assembler label; it would
+     * clash if this asm body were ever emitted twice - confirm.
+     */
+    __asm__ __volatile__ (	
+	"pushl %%ebp\n"
+	"movl  %%esp, %%ebp\n"
+	"addl  $-4, %%esp\n" /* local variable, loop counter */
+	
+	"pushl %%eax\n"      /* save every general register used below */
+	"pushl %%ebx\n"
+	"pushl %%ecx\n"
+	"pushl %%edx\n"
+	"pushl %%edi\n"
+	"pushl %%esi\n"
+
+	"movl  8(%%ebp), %%eax\n" 	/* pmt */
+	"movl 12(%%ebp), %%ebx\n"	/* buf */
+	"movl 16(%%ebp), %%ecx\n"	/* data */
+	"movl 20(%%ebp), %%edx\n" 	/* xcos_sin_sse */
+	"movl $64, -4(%%ebp)\n"      /* 64 passes, two outputs each */
+	
+".loop:\n"
+	"movl  (%%eax), %%esi\n"     /* j   = pmt[k]   */
+	"movl 4(%%eax), %%edi\n"     /* j+1 = pmt[k+1] (the next table index) */
+	"movss (%%ecx, %%esi, 8), %%xmm1\n" /* 2j */
+	"movss (%%ecx, %%edi, 8), %%xmm3\n" /* 2(j+1) */
+
+	"shll $1, %%esi\n"           /* index*2, so scale 8 below = 16 bytes/entry */
+	"shll $1, %%edi\n"
+
+	"movups (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */
+	"movups (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
+
+	"negl %%esi\n"               /* negative index to address data[255-2j] */
+	"negl %%edi\n"
+
+	"movss 1020(%%ecx, %%esi, 4), %%xmm4\n" /* 255-2j */
+	"addl $8, %%eax\n"           /* advance pmt by two entries */
+	"movss 1020(%%ecx, %%edi, 4), %%xmm5\n" /* 255-2(j+1) */
+
+	"shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */
+	"shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
+	"mulps   %%xmm4, %%xmm0\n"
+	"mulps   %%xmm5, %%xmm2\n"
+	"movhlps %%xmm0, %%xmm1\n"   /* bring the upper two products down ... */
+	"movhlps %%xmm2, %%xmm3\n"
+	"addl    $16, %%ebx\n"       /* advance buf by two complex values */
+	"addps   %%xmm1, %%xmm0\n"   /* ... and sum them into the low two lanes */
+	"addps   %%xmm3, %%xmm2\n"
+	"movlhps %%xmm2, %%xmm0\n"   /* pack both results into one register */
+    
+	"movups  %%xmm0, -16(%%ebx)\n"
+	"decl -4(%%ebp)\n"
+   	"jnz .loop\n"
+
+	"popl %%esi\n"               /* restore in reverse push order */
+	"popl %%edi\n"
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"popl %%ebx\n"
+	"popl %%eax\n"
+
+	"addl $4, %%esp\n"           /* drop the loop counter */
+	"popl %%ebp\n"
+    ::);
+}
+
+static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
+{   /* Post-IFFT twiddle, performed in place on buf.  32 passes; each pass
+     * handles four complex values (32 bytes of buf) and four table entries
+     * (64 bytes of xcos_sin_sse), so buf[k] is paired with table entry k.
+     * With t[0..3] the entry's four floats and (re, im) the input value:
+     *     re' = re*t[3] + im*t[2]
+     *     im' = re*t[1] + im*t[0]
+     * buf arrives in %eax and the table in %ebx through the constraints.
+     *
+     * NOTE(review): %ebx is advanced inside the asm although it is only
+     * declared as an input, and no "memory" clobber is listed even though
+     * buf is rewritten - confirm this is safe with the compiler in use.
+     * NOTE(review): ".loop1" is a plain named assembler label - confirm
+     * the body can never be emitted twice.
+     */
+    __asm__ __volatile__ ( 
+	"pushl %%ecx\n"
+	"movl $32, %%ecx\n"                 /* loop counter */
+
+".loop1:\n"
+	"movups	(%%eax), %%xmm0\n"          /*  im1 | re1 | im0 | re0 */
+
+	"movups  (%%ebx), %%xmm2\n"         /* -c | -s | -s | c */
+	"movhlps  %%xmm0, %%xmm1\n"         /* im1 | re1 */
+	"movups  16(%%ebx), %%xmm3\n"       /* -c1 | -s1 | -s1 | c1 */
+
+	"shufps $0x50, %%xmm0, %%xmm0\n"    /* im0 | im0 | re0 | re0 */
+	"shufps $0x50, %%xmm1, %%xmm1\n"    /* im1 | im1 | re1 | re1 */
+
+	"movups  16(%%eax), %%xmm4\n"       /* im3 | re3 | im2 | re2 */
+
+    "shufps $0x27, %%xmm2, %%xmm2\n"    /* c | -s | -s | -c */
+	"movhlps  %%xmm4, %%xmm5\n"         /* im3 | re3 */
+    "shufps $0x27, %%xmm3, %%xmm3\n"    /* c1 | -s1 | -s1 | -c1 */
+
+	"movups  32(%%ebx), %%xmm6\n"       /* -c2 | -s2 | -s2 | c2 */
+	"movups  48(%%ebx), %%xmm7\n"       /* -c3 | -s3 | -s3 | c3 */
+
+	"shufps $0x50, %%xmm4, %%xmm4\n"    /* im2 | im2 | re2 | re2 */
+	"shufps $0x50, %%xmm5, %%xmm5\n"    /* im3 | im3 | re3 | re3 */
+
+	"mulps %%xmm2, %%xmm0\n"
+	"mulps %%xmm3, %%xmm1\n"
+
+	"shufps $0x27, %%xmm6, %%xmm6\n"    /* c2 | -s2 | -s2 | -c2 */
+	"shufps $0x27, %%xmm7, %%xmm7\n"    /* c3 | -s3 | -s3 | -c3 */
+
+	"movhlps %%xmm0, %%xmm2\n"          /* upper two products -> low lanes */
+	"movhlps %%xmm1, %%xmm3\n"
+
+	"mulps %%xmm6, %%xmm4\n"
+	"mulps %%xmm7, %%xmm5\n"
+
+	"addps %%xmm2, %%xmm0\n"            /* low lanes now hold re' , im' (0) */
+	"addps %%xmm3, %%xmm1\n"            /* low lanes now hold re' , im' (1) */
+
+	"movhlps %%xmm4, %%xmm6\n"
+	"movhlps %%xmm5, %%xmm7\n"
+
+	"addps %%xmm6, %%xmm4\n"            /* result 2 in the low lanes */
+	"addps %%xmm7, %%xmm5\n"            /* result 3 in the low lanes */
+
+	"movlhps %%xmm1, %%xmm0\n"          /* pack results 0 and 1 */
+	"movlhps %%xmm5, %%xmm4\n"          /* pack results 2 and 3 */
+
+	"movups %%xmm0, (%%eax)\n"
+	"movups %%xmm4, 16(%%eax)\n"
+	"addl $64, %%ebx\n"                 /* four table entries consumed */
+	"addl $32, %%eax\n"                 /* four complex values written */
+	"decl %%ecx\n"
+	"jnz .loop1\n"
+
+	"popl %%ecx\n"
+    : "=a" (buf)
+    : "a" (buf), "b" (xcos_sin_sse) );
+}
+
+static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{   /* Windowing with overlap-add.  Four loops of 16 passes, each pass
+     * producing 8 floats (k = 0..63 within a loop, w = window coefficient):
+     *   .first_128_samples   data[2k]       =  buf[64+k].im  * w + delay
+     *                        data[2k+1]     = -buf[63-k].re  * w + delay
+     *                        (window[0..127], delay[0..127])
+     *   .second_128_samples  data[128+2k]   =  buf[k].re     * w + delay
+     *                        data[128+2k+1] = -buf[127-k].im * w + delay
+     *                        (window[128..255], delay[128..255])
+     *   .first_128_delay     delay[2k]      =  buf[64+k].re  * w
+     *                        delay[2k+1]    = -buf[63-k].im  * w
+     *   .second_128_delay    delay[128+2k]  = -buf[k].im     * w
+     *                        delay[128+2k+1]=  buf[127-k].re * w
+     * The two delay loops walk the window pointer downward in 16-byte
+     * steps, starting with the four floats that end at window[255].
+     *
+     * NOTE(review): in the delay loops each 4-float window load is in
+     * ascending address order while successive loads step downward -
+     * confirm this ordering matches the reference C windowing.
+     * NOTE(review): no operands or clobbers are declared; arguments are
+     * read from 8..20(%ebp) after the asm's own "pushl %ebp", which
+     * presumably requires a real (non-inlined) stack-argument call with
+     * nothing pushed by the compiler beforehand - confirm build flags.
+     * The labels are plain named assembler labels.
+     */
+    __asm__ __volatile__ (
+	"pushl %%ebp\n"
+	"movl  %%esp, %%ebp\n"
+	
+	"pushl %%eax\n"
+	"pushl %%ebx\n"
+	"pushl %%ecx\n"
+	"pushl %%edx\n"
+	"pushl %%esi\n"
+	"pushl %%edi\n"
+
+	"movl 20(%%ebp), %%ebx\n"   /* delay */
+	"movl 16(%%ebp), %%edx\n"   /* window */
+
+	"movl 8(%%ebp), %%eax\n"    /* buf */
+	"movl $16, %%ecx\n"         /* loop count */
+	"leal 516(%%eax), %%esi\n"  /* buf[64].im */
+	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
+	"movl  12(%%ebp), %%eax\n"  /* data */
+
+".first_128_samples:\n"
+	"movss   (%%esi), %%xmm0\n"
+	"movss  8(%%esi), %%xmm2\n"
+	"movss   (%%edi), %%xmm1\n"
+	"movss -8(%%edi), %%xmm3\n"
+
+	"movlhps %%xmm2, %%xmm0\n"      /* 0.0 | im1 | 0.0 | im0 */
+	"movlhps %%xmm3, %%xmm1\n"      /* 0.0 | re1 | 0.0 | re0 */
+
+	"movups (%%edx), %%xmm4\n"      /* w3 | w2 | w1 | w0 */
+	"movups (%%ebx), %%xmm5\n"      /* d3 | d2 | d1 | d0 */
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+
+	"movss  16(%%esi), %%xmm6\n"    /* im2 */
+	"movss  24(%%esi), %%xmm7\n"    /* im3 */
+	"subps     %%xmm1, %%xmm0\n"    /* -re1 | im1 | -re0 | im0 */
+	"movss -16(%%edi), %%xmm2\n"    /* re2 */
+	"movss -24(%%edi), %%xmm3\n"    /* re3 */
+	"mulps     %%xmm4, %%xmm0\n"
+	"movlhps   %%xmm7, %%xmm6\n"    /* 0.0 | im3 | 0.0 | im2 */
+	"movlhps   %%xmm3, %%xmm2\n"    /* 0.0 | re3 | 0.0 | re2 */
+	"addps %%xmm5, %%xmm0\n"
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+	"movups 16(%%edx), %%xmm4\n"    /* w7 | w6 | w5 | w4 */
+	"movups 16(%%ebx), %%xmm5\n"    /* d7 | d6 | d5 | d4 */
+	"subps %%xmm2, %%xmm6\n"        /* -re3 | im3 | -re2 | im2 */
+	"addl $32, %%edx\n"
+	"movups %%xmm0, (%%eax)\n"
+	"addl $32, %%ebx\n"
+	"mulps %%xmm4, %%xmm6\n"
+	"addl $32, %%esi\n"
+	"addl $32, %%eax\n"
+	"addps %%xmm5, %%xmm6\n"
+    "addl $-32, %%edi\n"
+	"movups %%xmm6, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .first_128_samples\n"
+
+	"movl 8(%%ebp), %%esi\n"    /* buf[0].re */
+	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
+	"movl $16, %%ecx\n"         /* loop count */
+    
+".second_128_samples:\n"
+	"movss   (%%esi), %%xmm0\n" /* buf[i].re */
+	"movss  8(%%esi), %%xmm2\n" /* re1 */
+	"movss   (%%edi), %%xmm1\n" /* buf[127-i].im */
+	"movss -8(%%edi), %%xmm3\n" /* im1 */
+
+	"movlhps %%xmm2, %%xmm0\n"  /* 0.0 | re1 | 0.0 | re0 */
+	"movlhps %%xmm3, %%xmm1\n"  /* 0.0 | im1 | 0.0 | im0 */
+
+	"movups (%%edx), %%xmm4\n"  /* w3 | w2 | w1 | w0 */
+	"movups (%%ebx), %%xmm5\n"  /* d3 | d2 | d1 | d0 */
+
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+	"movss  16(%%esi), %%xmm6\n"    /* re2 */
+	"movss  24(%%esi), %%xmm7\n"    /* re3 */
+	"movss -16(%%edi), %%xmm2\n"    /* im2 */
+	"movss -24(%%edi), %%xmm3\n"    /* im3 */
+	"subps   %%xmm1, %%xmm0\n"      /* -im1 | re1 | -im0 | re0 */
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | re3 | 0.0 | re2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | im3 | 0.0 | im2 */
+	"mulps   %%xmm4, %%xmm0\n"
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+	"movups 16(%%edx), %%xmm4\n"    /* w7 | w6 | w5 | w4 */
+	"addl $32, %%esi\n"
+	"subps %%xmm2, %%xmm6\n"        /* -im3 | re3 | -im2 | re2 */
+	"addps %%xmm5, %%xmm0\n"
+	"mulps %%xmm4, %%xmm6\n"
+	"addl $-32, %%edi\n"
+	"movups 16(%%ebx), %%xmm5\n"    /* d7 | d6 | d5 | d4 */
+	"movups %%xmm0, (%%eax)\n"
+	"addps %%xmm5, %%xmm6\n"
+	"addl $32, %%edx\n"
+	"addl $32, %%eax\n"
+	"addl $32, %%ebx\n"
+	"movups %%xmm6, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .second_128_samples\n"
+
+	"movl   8(%%ebp), %%eax\n"
+	"leal 512(%%eax), %%esi\n"  /* buf[64].re */
+	"leal 508(%%eax), %%edi\n"  /* buf[63].im */
+	"movl $16, %%ecx\n"         /* loop count */
+	"movl  20(%%ebp), %%eax\n"  /* delay */
+
+".first_128_delay:\n"
+	"movss   (%%esi), %%xmm0\n"
+	"movss  8(%%esi), %%xmm2\n"
+	"movss   (%%edi), %%xmm1\n"
+	"movss -8(%%edi), %%xmm3\n"
+
+	"movlhps %%xmm2, %%xmm0\n"      /* 0.0 | re1 | 0.0 | re0 */
+	"movlhps %%xmm3, %%xmm1\n"      /* 0.0 | im1 | 0.0 | im0 */
+
+	"movups -16(%%edx), %%xmm4\n"   /* w3 | w2 | w1 | w0 */
+    "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+	"movss  16(%%esi), %%xmm6\n"    /* re2 */
+	"movss  24(%%esi), %%xmm7\n"    /* re3 */
+	"movss -16(%%edi), %%xmm2\n"    /* im2 */
+	"movss -24(%%edi), %%xmm3\n"    /* im3 */
+	"subps     %%xmm1, %%xmm0\n"    /* -im1 | re1 | -im0 | re0 */
+	"addl $-32, %%edx\n"
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | re3 | 0.0 | re2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | im3 | 0.0 | im2 */
+    "mulps   %%xmm4, %%xmm0\n"
+	"movups (%%edx), %%xmm5\n"      /* w7 | w6 | w5 | w4 */
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+	"movups %%xmm0, (%%eax)\n"
+	"addl $32, %%esi\n"
+	"subps %%xmm2, %%xmm6\n"        /* -im3 | re3 | -im2 | re2 */
+	"addl $-32, %%edi\n"
+	"mulps %%xmm5, %%xmm6\n"
+	"addl $32, %%eax\n"
+	"movups %%xmm6, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .first_128_delay\n"
+
+	"movl    8(%%ebp), %%ebx\n"
+	"leal    4(%%ebx), %%esi\n" /* buf[0].im */
+	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+	"movl $16, %%ecx\n"         /* loop count */
+    
+".second_128_delay:\n"
+	"movss   (%%esi), %%xmm0\n"
+	"movss  8(%%esi), %%xmm2\n"
+	"movss   (%%edi), %%xmm1\n"
+	"movss -8(%%edi), %%xmm3\n"
+
+	"movlhps %%xmm2, %%xmm0\n"      /* 0.0 | im1 | 0.0 | im0 */
+	"movlhps %%xmm3, %%xmm1\n"      /* 0.0 | re1 | 0.0 | re0 */
+
+	"movups -16(%%edx), %%xmm4\n"   /* w3 | w2 | w1 | w0 */
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+	"movss  16(%%esi), %%xmm6\n"    /* im2 */
+	"movss  24(%%esi), %%xmm7\n"    /* im3 */
+	"movss -16(%%edi), %%xmm2\n"    /* re2 */
+	"movss -24(%%edi), %%xmm3\n"    /* re3 */
+	"subps %%xmm0, %%xmm1\n"        /* re1 | -im1 | re0 | -im0 */
+	"addl $-32, %%edx\n"
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | im3 | 0.0 | im2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | re3 | 0.0 | re2 */
+	"mulps   %%xmm4, %%xmm1\n"
+	"movups (%%edx), %%xmm5\n"      /* w7 | w6 | w5 | w4 */
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+	"movups %%xmm1, (%%eax)\n"
+	"addl $32, %%esi\n"
+	"subps %%xmm6, %%xmm2\n"        /* re3 | -im3 | re2 | -im2 */
+	"addl $-32, %%edi\n"
+	"mulps %%xmm5, %%xmm2\n"
+	"addl $32, %%eax\n"
+	"movups %%xmm2, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .second_128_delay\n"
+
+	"popl %%edi\n"
+	"popl %%esi\n"
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"popl %%ebx\n"
+	"popl %%eax\n"
+	
+	"leave\n"
+    ::);
+}
+
+static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{   /* Windowing without overlap-add: the instructions that would load and
+     * add the old delay[] values are commented out below, so data[] gets
+     * only the windowed samples.  delay[] is still rewritten by the last
+     * two loops.  Four loops of 16 passes, 8 floats per pass (k = 0..63
+     * within a loop, w = window coefficient):
+     *   .first_128_sample    data[2k]       =  buf[64+k].im  * w
+     *                        data[2k+1]     = -buf[63-k].re  * w
+     *   .second_128_sample   data[128+2k]   =  buf[k].re     * w
+     *                        data[128+2k+1] = -buf[127-k].im * w
+     *   .first_128_delays    delay[2k]      =  buf[64+k].re  * w
+     *                        delay[2k+1]    = -buf[63-k].im  * w
+     *   .second_128_delays   delay[128+2k]  = -buf[k].im     * w
+     *                        delay[128+2k+1]=  buf[127-k].re * w
+     * The sample loops walk the window upward from window[0]; the delay
+     * loops walk it downward in 16-byte steps, starting with the four
+     * floats that end at window[255].
+     *
+     * NOTE(review): in the delay loops each 4-float window load is in
+     * ascending address order while successive loads step downward -
+     * confirm this ordering matches the reference C windowing.
+     * NOTE(review): no operands or clobbers are declared; arguments are
+     * read from 8..20(%ebp) after the asm's own "pushl %ebp", which
+     * presumably requires a real (non-inlined) stack-argument call with
+     * nothing pushed by the compiler beforehand - confirm build flags.
+     * The labels are plain named assembler labels.
+     */
+    __asm__ __volatile__ (
+	"pushl %%ebp\n"
+	"movl  %%esp, %%ebp\n"
+	
+	"pushl %%eax\n"
+	"pushl %%ebx\n"
+	"pushl %%ecx\n"
+	"pushl %%edx\n"
+	"pushl %%esi\n"
+	"pushl %%edi\n"
+
+	/* movl 20(%%ebp), %%ebx delay */
+	"movl 16(%%ebp), %%edx\n"   /* window */
+
+	"movl   8(%%ebp), %%eax\n"  /* buf */
+	"movl $16, %%ecx\n"         /* loop count */
+	"leal 516(%%eax), %%esi\n"  /* buf[64].im */
+	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
+	"movl  12(%%ebp), %%eax\n"  /* data */
+    
+".first_128_sample:\n"
+	"movss   (%%esi), %%xmm0\n"
+	"movss  8(%%esi), %%xmm2\n"
+	"movss   (%%edi), %%xmm1\n"
+	"movss -8(%%edi), %%xmm3\n"
+
+	"movlhps %%xmm2, %%xmm0\n"      /* 0.0 | im1 | 0.0 | im0 */
+	"movlhps %%xmm3, %%xmm1\n"      /* 0.0 | re1 | 0.0 | re0 */
+
+	"movups (%%edx), %%xmm4\n"      /* w3 | w2 | w1 | w0 */
+    /* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+
+	"movss  16(%%esi), %%xmm6\n"    /* im2 */
+	"movss  24(%%esi), %%xmm7\n"    /* im3 */
+	"subps     %%xmm1, %%xmm0\n"    /* -re1 | im1 | -re0 | im0 */
+	"movss -16(%%edi), %%xmm2\n"    /* re2 */
+	"movss -24(%%edi), %%xmm3\n"    /* re3 */
+	"mulps %%xmm4, %%xmm0\n"
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | im3 | 0.0 | im2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | re3 | 0.0 | re2 */
+	/* addps %%xmm5, %%xmm0 */
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+	"movups 16(%%edx), %%xmm4\n"    /* w7 | w6 | w5 | w4 */
+	/* movups 16(%%ebx), %%xmm5  d7 | d6 | d5 | d4 */
+	"subps %%xmm2, %%xmm6\n"        /* -re3 | im3 | -re2 | im2 */
+    "addl $32, %%edx\n"
+	"movups %%xmm0, (%%eax)\n"
+	/* addl $32, %%ebx */
+	"mulps %%xmm4, %%xmm6\n"
+	"addl $32, %%esi\n"
+	"addl $32, %%eax\n"
+	/* addps %%xmm5, %%xmm6 */
+	"addl $-32, %%edi\n"
+	"movups %%xmm6, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .first_128_sample\n"
+
+	"movl    8(%%ebp), %%esi\n"     /* buf[0].re */
+	"leal 1020(%%esi), %%edi\n"     /* buf[127].im */
+	"movl $16, %%ecx\n"             /* loop count */
+    
+".second_128_sample:\n"
+	"movss   (%%esi), %%xmm0\n"     /* buf[i].re */
+	"movss  8(%%esi), %%xmm2\n"     /* re1 */
+	"movss   (%%edi), %%xmm1\n"     /* buf[127-i].im */
+	"movss -8(%%edi), %%xmm3\n"     /* im1 */
+
+	"movlhps %%xmm2, %%xmm0\n"      /* 0.0 | re1 | 0.0 | re0 */
+	"movlhps %%xmm3, %%xmm1\n"      /* 0.0 | im1 | 0.0 | im0 */
+	
+	"movups (%%edx), %%xmm4\n"      /* w3 | w2 | w1 | w0 */
+	/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
+
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+	"movss  16(%%esi), %%xmm6\n"    /* re2 */
+	"movss  24(%%esi), %%xmm7\n"    /* re3 */
+	"movss -16(%%edi), %%xmm2\n"    /* im2 */
+	"movss -24(%%edi), %%xmm3\n"    /* im3 */
+	"subps %%xmm1, %%xmm0\n"        /* -im1 | re1 | -im0 | re0 */
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | re3 | 0.0 | re2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | im3 | 0.0 | im2 */
+	"mulps %%xmm4, %%xmm0\n"
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+	"movups 16(%%edx), %%xmm4\n"    /* w7 | w6 | w5 | w4 */
+	"addl $32, %%esi\n"
+	"subps %%xmm2, %%xmm6\n"        /* -im3 | re3 | -im2 | re2 */
+	/* addps %%xmm5, %%xmm0 */
+	"mulps %%xmm4, %%xmm6\n"
+	"addl $-32, %%edi\n"
+	/* movups 16(%%ebx), %%xmm5  d7 | d6 | d5 | d4 */
+	"movups %%xmm0, (%%eax)\n"
+	/* addps %%xmm5, %%xmm6 */
+	"addl $32, %%edx\n"
+	"addl $32, %%eax\n"
+	/* addl $32, %%ebx */
+	"movups %%xmm6, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .second_128_sample\n"
+
+	"movl   8(%%ebp), %%eax\n"
+	"leal 512(%%eax), %%esi\n"  /* buf[64].re */
+	"leal 508(%%eax), %%edi\n"  /* buf[63].im */
+	"movl $16, %%ecx\n"         /* loop count */
+	"movl  20(%%ebp), %%eax\n"  /* delay */
+    
+".first_128_delays:\n"
+	"movss   (%%esi), %%xmm0\n"
+	"movss  8(%%esi), %%xmm2\n"
+	"movss   (%%edi), %%xmm1\n"
+	"movss -8(%%edi), %%xmm3\n"
+
+	"movlhps %%xmm2, %%xmm0\n"  /* 0.0 | re1 | 0.0 | re0 */
+	"movlhps %%xmm3, %%xmm1\n"  /* 0.0 | im1 | 0.0 | im0 */
+
+	"movups -16(%%edx), %%xmm4\n"   /* w3 | w2 | w1 | w0 */
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+	"movss  16(%%esi), %%xmm6\n"    /* re2 */
+	"movss  24(%%esi), %%xmm7\n"    /* re3 */
+	"movss -16(%%edi), %%xmm2\n"    /* im2 */
+	"movss -24(%%edi), %%xmm3\n"    /* im3 */
+	"subps %%xmm1, %%xmm0\n"        /* -im1 | re1 | -im0 | re0 */
+	"addl $-32, %%edx\n"
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | re3 | 0.0 | re2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | im3 | 0.0 | im2 */
+	"mulps %%xmm4, %%xmm0\n"
+	"movups (%%edx), %%xmm5\n"      /* w7 | w6 | w5 | w4 */
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+	"movups %%xmm0, (%%eax)\n"
+	"addl $32, %%esi\n"
+	"subps %%xmm2, %%xmm6\n"        /* -im3 | re3 | -im2 | re2 */
+	"addl $-32, %%edi\n"
+	"mulps %%xmm5, %%xmm6\n"
+	"addl $32, %%eax\n"
+	"movups %%xmm6, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .first_128_delays\n"
+
+	"movl    8(%%ebp), %%ebx\n"
+	"leal    4(%%ebx), %%esi\n" /* buf[0].im */
+	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+	"movl $16, %%ecx\n"         /* loop count */
+    
+".second_128_delays:\n"
+	"movss   (%%esi), %%xmm0\n"
+	"movss  8(%%esi), %%xmm2\n"
+	"movss   (%%edi), %%xmm1\n"
+	"movss -8(%%edi), %%xmm3\n"
+
+	"movlhps %%xmm2, %%xmm0\n"  /* 0.0 | im1 | 0.0 | im0 */
+	"movlhps %%xmm3, %%xmm1\n"  /* 0.0 | re1 | 0.0 | re0 */
+
+	"movups -16(%%edx), %%xmm4\n"   /* w3 | w2 | w1 | w0 */
+	"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+	"movss  16(%%esi), %%xmm6\n"    /* im2 */
+	"movss  24(%%esi), %%xmm7\n"    /* im3 */
+	"movss -16(%%edi), %%xmm2\n"    /* re2 */
+	"movss -24(%%edi), %%xmm3\n"    /* re3 */
+	"subps %%xmm0, %%xmm1\n"        /* re1 | -im1 | re0 | -im0 */
+	"addl $-32, %%edx\n"
+	"movlhps %%xmm7, %%xmm6\n"      /* 0.0 | im3 | 0.0 | im2 */
+	"movlhps %%xmm3, %%xmm2\n"      /* 0.0 | re3 | 0.0 | re2 */
+	"mulps %%xmm4, %%xmm1\n"
+	"movups (%%edx), %%xmm5\n"      /* w7 | w6 | w5 | w4 */
+	"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+	"movups %%xmm1, (%%eax)\n"
+	"addl $32, %%esi\n"
+	"subps %%xmm6, %%xmm2\n"        /* re3 | -im3 | re2 | -im2 */
+	"addl $-32, %%edi\n"
+	"mulps %%xmm5, %%xmm2\n"
+	"addl $32, %%eax\n"
+	"movups %%xmm2, -16(%%eax)\n"
+	"decl %%ecx\n"
+	"jnz .second_128_delays\n"
+
+	"popl %%edi\n"
+	"popl %%esi\n"
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"popl %%ebx\n"
+	"popl %%eax\n"
+	
+	"leave\n"
+    ::);
+}
--- a/src/ac3_decoder/ac3_imdct_sse.h
+++ b/src/ac3_decoder/ac3_imdct_sse.h
+int  imdct_init_sse (imdct_t * p_imdct);
+void imdct_do_512_sse(imdct_t * p_imdct, float data[], float delay[]);
+void imdct_do_512_nol_sse(imdct_t * p_imdct, float data[], float delay[]);
--- a/src/ac3_decoder/ac3_internal.h
+++ b/src/ac3_decoder/ac3_internal.h
@@ -2,7 +2,7 @@
 * ac3_internals.h: needed by the ac3 decoder
 *****************************************************************************
 * Copyright (C) 2000 VideoLAN
- * $Id: ac3_internal.h,v 1.8 2001/03/21 13:42:34 sam Exp $
+ * $Id: ac3_internal.h,v 1.9 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Michel Lespinasse <walken@zoy.org>
 *
@@ -37,12 +37,13 @@
 void bit_allocate (ac3dec_t *);

 /* ac3_downmix.c */
-int downmix (ac3dec_t *, float *, s16 *);
+void downmix_init (downmix_t * p_downmix);

 /* ac3_exponent.c */
 int exponent_unpack (ac3dec_t *);

 /* ac3_imdct.c */
+void imdct_init (imdct_t * p_imdct);
 void imdct (ac3dec_t * p_ac3dec, s16 * buffer);

 /* ac3_mantissa.c */

--- a/src/ac3_decoder/ac3_mantissa.c
+++ b/src/ac3_decoder/ac3_mantissa.c
@@ -2,7 +2,7 @@
 * ac3_mantissa.c: ac3 mantissa computation
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_mantissa.c,v 1.27 2001/05/07 03:14:09 stef Exp $
+ * $Id: ac3_mantissa.c,v 1.28 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -38,9 +38,6 @@
 #include "audio_output.h"

 #include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
-
-#include "ac3_internal.h"

 #include "intf_msg.h"

@@ -291,7 +288,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
            p_ac3dec->total_bits_read += 5;
            if ((group_code = GetBits (&p_ac3dec->bit_stream,5)) > 26)
            {
-                intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (1)" );
+                intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (1)" );
                return 0;
            }
    
@@ -312,7 +309,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
            p_ac3dec->total_bits_read += 7;
            if ((group_code = GetBits (&p_ac3dec->bit_stream,7)) > 124)
            {
-                intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (2)" );
+                intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (2)" );
                return 0;
            }

@@ -327,7 +324,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
            p_ac3dec->total_bits_read += 3;
            if ((group_code = GetBits (&p_ac3dec->bit_stream,3)) > 6)
            {
-                intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (3)" );
+                intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (3)" );
                return 0;
            }

@@ -343,7 +340,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
            p_ac3dec->total_bits_read += 7;
            if ((group_code = GetBits (&p_ac3dec->bit_stream,7)) > 120)
            {
-                intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (4)" );
+                intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (4)" );
                return 0;
            }

@@ -357,7 +354,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
            p_ac3dec->total_bits_read += 4;
            if ((group_code = GetBits (&p_ac3dec->bit_stream,4)) > 14)
            {
-                intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (5)" );
+                intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (5)" );
                return 0;
            }


--- a/src/ac3_decoder/ac3_parse.c
+++ b/src/ac3_decoder/ac3_parse.c
@@ -2,7 +2,7 @@
 * ac3_parse.c: ac3 parsing procedures
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_parse.c,v 1.21 2001/05/07 04:42:42 sam Exp $
+ * $Id: ac3_parse.c,v 1.22 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -40,9 +40,9 @@

 #include "intf_msg.h"
 #include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
+#include "ac3_decoder_thread.h"                           /* ac3dec_thread_t */

-#include "ac3_internal.h"
+#include "ac3_internal.h"                                       /* EXP_REUSE */

 /* Misc LUT */
 static const u16 nfchans[] = { 2, 1, 2, 3, 3, 4, 4, 5 };
@@ -97,8 +97,10 @@ static const struct frmsize_s frmsizecod_tbl[] =
 static const int fscod_tbl[] = {48000, 44100, 32000};

 /* Some internal functions */
-void parse_bsi_stats (ac3dec_t * p_ac3dec);
-void parse_audblk_stats (ac3dec_t * p_ac3dec);
+#ifdef STATS
+static void parse_bsi_stats (ac3dec_t * p_ac3dec);
+static void parse_audblk_stats (ac3dec_t * p_ac3dec);
+#endif

 /* Parse a syncinfo structure */
 int ac3_sync_frame (ac3dec_t * p_ac3dec, ac3_sync_info_t * p_sync_info) 
@@ -778,7 +780,7 @@ int parse_audblk (ac3dec_t * p_ac3dec, int blknum)
    }
    
 #ifdef STATS
-//    parse_audblk_stats(p_ac3dec);
+    parse_audblk_stats(p_ac3dec);
 #endif
    
    return 0;
@@ -806,7 +808,8 @@ void parse_auxdata (ac3dec_t * p_ac3dec)
    RemoveBits (&p_ac3dec->bit_stream,16);
 }

-void parse_bsi_stats (ac3dec_t * p_ac3dec) /*Some stats */
+#ifdef STATS
+static void parse_bsi_stats (ac3dec_t * p_ac3dec) /* Some stats */
 {  
    struct mixlev_s
    {
@@ -850,7 +853,7 @@ void parse_bsi_stats (ac3dec_t * p_ac3dec) /*Some stats */
        i = 0;
 }

-void parse_audblk_stats (ac3dec_t * p_ac3dec)
+static void parse_audblk_stats (ac3dec_t * p_ac3dec)
 {
    char *exp_strat_tbl[4] = {"R   ","D15 ","D25 ","D45 "};
    u32 i;
@@ -871,3 +874,4 @@ void parse_audblk_stats (ac3dec_t * p_ac3dec)

    intf_ErrMsg ("\n");
 }
+#endif
--- a/src/ac3_decoder/ac3_rematrix.c
+++ b/src/ac3_decoder/ac3_rematrix.c
@@ -2,7 +2,7 @@
 * ac3_rematrix.c: ac3 audio rematrixing
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_rematrix.c,v 1.16 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_rematrix.c,v 1.17 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -34,7 +34,6 @@
 #include "input_ext-dec.h"

 #include "ac3_decoder.h"
-#include "ac3_internal.h"

 struct rematrix_band_s {
    u32 start;

--- a/src/ac3_decoder/ac3_srfft.c
+++ b/src/ac3_decoder/ac3_srfft.c
@@ -2,7 +2,7 @@
 * ac3_srfft.c: ac3 FFT
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_srfft.c,v 1.3 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_srfft.c,v 1.4 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -40,9 +40,9 @@
 #include "ac3_decoder.h"
 #include "ac3_srfft.h"

-void fft_8 (complex_t *x);
+static void fft_8 (complex_t *x);

-void fft_4(complex_t *x)
+static void fft_4(complex_t *x)
 {
  /* delta_p = 1 here */
  /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} 
@@ -90,7 +90,7 @@ void fft_4(complex_t *x)
 }


-void fft_8 (complex_t *x)
+static void fft_8 (complex_t *x)
 {
  /* delta_p = diag{1, sqrt(i)} here */
  /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} 
@@ -205,7 +205,7 @@ void fft_8 (complex_t *x)
 }


-void fft_asmb(int k, complex_t *x, complex_t *wTB,
+static void fft_asmb(int k, complex_t *x, complex_t *wTB,
 	     const complex_t *d, const complex_t *d_3)
 {
  register complex_t  *x2k, *x3k, *x4k, *wB;
@@ -236,7 +236,7 @@ void fft_asmb(int k, complex_t *x, complex_t *wTB,
 
 }

-void fft_asmb16(complex_t *x, complex_t *wTB)
+static void fft_asmb16(complex_t *x, complex_t *wTB)
 {
  register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
  int k = 2;

--- a/src/ac3_decoder/ac3_srfft.h
+++ b/src/ac3_decoder/ac3_srfft.h
@@ -2,7 +2,7 @@
 * ac3_srfft.h: ac3 FFT
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_srfft.h,v 1.2 2001/04/30 21:10:25 reno Exp $
+ * $Id: ac3_srfft.h,v 1.3 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -22,19 +22,19 @@
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/

-static complex_t delta16[4] = 
+static const complex_t delta16[4] = 
 { {1.00000000000000,  0.00000000000000},
   {0.92387953251129, -0.38268343236509},
   {0.70710678118655, -0.70710678118655},
   {0.38268343236509, -0.92387953251129}};

-static complex_t delta16_3[4] = 
+static const complex_t delta16_3[4] = 
 { {1.00000000000000,  0.00000000000000},
   {0.38268343236509, -0.92387953251129},
   {-0.70710678118655, -0.70710678118655},
   {-0.92387953251129, 0.38268343236509}};

-static complex_t delta32[8] = 
+static const complex_t delta32[8] = 
 { {1.00000000000000,  0.00000000000000},
   {0.98078528040323, -0.19509032201613},
   {0.92387953251129, -0.38268343236509},
@@ -44,7 +44,7 @@ static complex_t delta32[8] =
   {0.38268343236509, -0.92387953251129},
   {0.19509032201613, -0.98078528040323}};

-static complex_t delta32_3[8] = 
+static const complex_t delta32_3[8] = 
 { {1.00000000000000,  0.00000000000000},
   {0.83146961230255, -0.55557023301960},
   {0.38268343236509, -0.92387953251129},
@@ -54,7 +54,7 @@ static complex_t delta32_3[8] =
   {-0.92387953251129, 0.38268343236509},
   {-0.55557023301960, 0.83146961230255}};

-static complex_t delta64[16] = 
+static const complex_t delta64[16] = 
 { {1.00000000000000,  0.00000000000000},
   {0.99518472667220, -0.09801714032956},
   {0.98078528040323, -0.19509032201613},
@@ -72,7 +72,7 @@ static complex_t delta64[16] =
   {0.19509032201613, -0.98078528040323},
   {0.09801714032956, -0.99518472667220}};

-static complex_t delta64_3[16] = 
+static const complex_t delta64_3[16] = 
 { {1.00000000000000,  0.00000000000000},
   {0.95694033573221, -0.29028467725446},
   {0.83146961230255, -0.55557023301960},
@@ -90,7 +90,7 @@ static complex_t delta64_3[16] =
   {-0.55557023301960, 0.83146961230255},
   {-0.29028467725446, 0.95694033573221}};

-static complex_t delta128[32] = 
+static const complex_t delta128[32] = 
 { {1.00000000000000,  0.00000000000000},
   {0.99879545620517, -0.04906767432742},
   {0.99518472667220, -0.09801714032956},
@@ -124,7 +124,7 @@ static complex_t delta128[32] =
   {0.09801714032956, -0.99518472667220},
   {0.04906767432742, -0.99879545620517}};

-static complex_t delta128_3[32] = 
+static const complex_t delta128_3[32] = 
 { {1.00000000000000,  0.00000000000000},
   {0.98917650996478, -0.14673047445536},
   {0.95694033573221, -0.29028467725446},

--- a/src/ac3_decoder/ac3_srfft_sse.c
+++ b/src/ac3_decoder/ac3_srfft_sse.c
+/*****************************************************************************
+ * ac3_srfft_sse.c: ac3 fft functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_srfft_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *          Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include <stdio.h>
+
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+
+#include "ac3_decoder.h"
+#include "ac3_srfft.h"
+
+void hsqrt2 (void);    /* body only emits float data; referenced by address from fft_8_sse */
+void C_1 (void);       /* body only emits float data; referenced by address from the asm below */
+static void fft_4_sse (complex_t *x);   /* in-place 4-point transform */
+static void fft_8_sse (complex_t *x);   /* in-place 8-point transform */
+static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
+	     const complex_t *d, const complex_t *d_3);    /* pass that merges sub-transforms into a larger one */
+
+/*
+ * fft_64p_sse: SSE transform over the 64 complex points a[0..63], in place.
+ *
+ * Built from 4- and 8-point leaf transforms whose results are merged by
+ * fft_asmb_sse into 16-, 32- and finally 64-point blocks, using the
+ * delta16/delta32/delta64 tables (and their _3 counterparts).
+ */
+void fft_64p_sse(complex_t *a)
+{
+	/* a[0..15]: 8 + 4 + 4 points merged into 16 */
+	fft_8_sse(a);
+	fft_4_sse(a + 8);
+	fft_4_sse(a + 12);
+	fft_asmb_sse(2, a, a + 8, delta16, delta16_3);
+
+	/* a[0..31]: add two 8-point blocks and merge into 32 */
+	fft_8_sse(a + 16);
+	fft_8_sse(a + 24);
+	fft_asmb_sse(4, a, a + 16, delta32, delta32_3);
+
+	/* a[32..47]: 16 points */
+	fft_8_sse(a + 32);
+	fft_4_sse(a + 40);
+	fft_4_sse(a + 44);
+	fft_asmb_sse(2, a + 32, a + 40, delta16, delta16_3);
+
+	/* a[48..63]: 16 points */
+	fft_8_sse(a + 48);
+	fft_4_sse(a + 56);
+	fft_4_sse(a + 60);
+	fft_asmb_sse(2, a + 48, a + 56, delta16, delta16_3);
+
+	/* a[0..63]: final merge */
+	fft_asmb_sse(8, a, a + 32, delta64, delta64_3);
+}
+
+
+/*
+ * fft_128p_sse: SSE transform over the 128 complex points a[0..127], in place.
+ *
+ * The lower 64 points go through exactly the sequence implemented by
+ * fft_64p_sse; the upper 64 points are built as 32 + 16 + 16 (the last two
+ * merged into a second 32) and everything is joined by the final
+ * 128-point merge.
+ */
+void fft_128p_sse(complex_t *a)
+{
+	/* a[0..63] */
+	fft_64p_sse(a);
+
+	/* a[64..79]: 16 points */
+	fft_8_sse(a + 64);
+	fft_4_sse(a + 72);
+	fft_4_sse(a + 76);
+	fft_asmb_sse(2, a + 64, a + 72, delta16, delta16_3);
+
+	/* a[64..95]: 32 points */
+	fft_8_sse(a + 80);
+	fft_8_sse(a + 88);
+	fft_asmb_sse(4, a + 64, a + 80, delta32, delta32_3);
+
+	/* a[96..111]: 16 points */
+	fft_8_sse(a + 96);
+	fft_4_sse(a + 104);
+	fft_4_sse(a + 108);
+	fft_asmb_sse(2, a + 96, a + 104, delta16, delta16_3);
+
+	/* a[96..127]: 32 points */
+	fft_8_sse(a + 112);
+	fft_8_sse(a + 120);
+	fft_asmb_sse(4, a + 96, a + 112, delta32, delta32_3);
+
+	/* a[0..127]: final merge */
+	fft_asmb_sse(16, a, a + 64, delta128, delta128_3);
+}
+
+void hsqrt2 (void)
+{   /* Constant table written as a function body: emits four floats,
+     * lowest address first { 0.7071.., 0.7071.., -0.7071.., -0.7071.. }
+     * (+-1/sqrt(2)).  fft_8_sse takes the symbol's address ("movl $hsqrt2")
+     * and loads the 16 bytes with movups.
+     * NOTE(review): this only yields the floats if the compiler emits
+     * nothing before the asm (no prologue) -- confirm the build flags.
+     * Calling this function would execute the data. */
+    __asm__ (
+     ".float 0f0.707106781188\n"
+	 ".float 0f0.707106781188\n"
+	 ".float 0f-0.707106781188\n"
+	 ".float 0f-0.707106781188\n"
+     );
+}
+
+void C_1 (void)
+{   /* Constant table written as a function body: emits four floats,
+     * lowest address first { -1.0, 1.0, -1.0, 1.0 }.  The asm in
+     * fft_8_sse and fft_asmb_sse takes the symbol's address ("movl $C_1")
+     * and multiplies by this vector to flip the sign of every real lane.
+     * NOTE(review): this only yields the floats if the compiler emits
+     * nothing before the asm (no prologue) -- confirm the build flags.
+     * Calling this function would execute the data. */
+    __asm__ (
+     ".float 0f-1.0\n"
+	 ".float 0f1.0\n"
+	 ".float 0f-1.0\n"
+	 ".float 0f1.0\n"
+     );
+}
+
+/*
+ * fft_4_sse: 4-point transform of x[0..3], computed in place.
+ *
+ * Register comments read "high half | low half"; each half is one complex
+ * value (re in the lower lane).  Results written back:
+ *   x[0] = (x0 + x2) + (x1 + x3)      x[1] = (x0 - x2) - i*(x1 - x3)
+ *   x[2] = (x0 + x2) - (x1 + x3)      x[3] = (x0 - x2) + i*(x1 - x3)
+ *
+ * The asm reads and writes *x directly, so it declares a "memory" clobber
+ * to keep the compiler from caching or reordering accesses to the buffer
+ * around it.
+ * NOTE(review): xmm0-xmm4 and xmm6 are modified but not listed as clobbers;
+ * harmless only while the compiler itself never allocates SSE registers.
+ */
+static void fft_4_sse (complex_t *x)
+{
+    __asm__ __volatile__ (
+	"movups   (%%eax), %%xmm0\n"	/* x[1] | x[0] */
+	"movups 16(%%eax), %%xmm2\n"	/* x[3] | x[2] */
+	"movups  %%xmm0, %%xmm1\n"		/* x[1] | x[0] */
+	"addps   %%xmm2, %%xmm0\n"		/* x[1] + x[3] | x[0] + x[2] */
+	"subps   %%xmm2, %%xmm1\n"		/* x[1] - x[3] | x[0] - x[2] */
+	"xorps   %%xmm6, %%xmm6\n"
+	"movhlps %%xmm1, %%xmm4\n"		/* ? | x[1] - x[3] */
+	"movhlps %%xmm0, %%xmm3\n"		/* ? | x[1] + x[3] */
+	"subss   %%xmm4, %%xmm6\n"		/* 0 | -(x[1] - x[3]).re */
+	"movlhps %%xmm1, %%xmm0\n"		/* x[0] - x[2] | x[0] + x[2] */
+    "movlhps %%xmm6, %%xmm4\n"		/* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[1] - x[3]).re */
+	"movups  %%xmm0, %%xmm2\n"		/* x[0] - x[2] | x[0] + x[2] */
+	"shufps   $0x94, %%xmm4, %%xmm3\n" /* -i*(x[1] - x[3]) | x[1] + x[3] */
+    "addps   %%xmm3, %%xmm0\n"		/* new x[1] | new x[0] */
+	"subps   %%xmm3, %%xmm2\n"		/* new x[3] | new x[2] */
+	"movups  %%xmm0,   (%%eax)\n"
+	"movups  %%xmm2, 16(%%eax)\n"
+    : "=a" (x)
+    : "a" (x)
+    : "memory" );
+}
+
+/*
+ * fft_8_sse: 8-point transform of x[0..7], computed in place.
+ *
+ * Register comments read "high half | low half"; each half is one complex
+ * value (re in the lower lane).  The even inputs give yt/yb, the odd
+ * inputs give u/v, and the outputs are
+ *   x[0],x[1] = yt + u      x[4],x[5] = yt - u
+ *   x[2],x[3] = yb - i*v    x[6],x[7] = yb + i*v
+ *
+ * ebx is used as scratch for the addresses of the hsqrt2 and C_1 tables
+ * and is saved/restored by hand.  The asm reads and writes *x directly, so
+ * it declares a "memory" clobber.
+ * NOTE(review): xmm0-xmm7 are modified but not listed as clobbers; harmless
+ * only while the compiler itself never allocates SSE registers.
+ */
+static void fft_8_sse (complex_t *x)
+{
+    __asm__ __volatile__ (
+	"pushl   %%ebx\n"
+    
+	"movlps   (%%eax), %%xmm0\n"	/* x[0] */
+	"movlps 32(%%eax), %%xmm1\n"	/* x[4] */
+	"movhps 16(%%eax), %%xmm0\n"	/* x[2] | x[0] */
+	"movhps 48(%%eax), %%xmm1\n"	/* x[6] | x[4] */
+	"movups  %%xmm0, %%xmm2\n"	    /* x[2] | x[0] */
+	"xorps   %%xmm3, %%xmm3\n"
+    "addps   %%xmm1, %%xmm0\n"	    /* x[2] + x[6] | x[0] + x[4] */
+	"subps   %%xmm1, %%xmm2\n"    	/* x[2] - x[6] | x[0] - x[4] */
+	"movhlps %%xmm0, %%xmm5\n" 		/* x[2] + x[6] */
+	"movhlps %%xmm2, %%xmm4\n"      /* x[2] - x[6] */
+    "movlhps %%xmm2, %%xmm0\n"	    /* x[0] - x[4] | x[0] + x[4] */
+	"subss   %%xmm4, %%xmm3\n"	    /* 0 | -(x[2]-x[6]).re  (scalar op: only the lowest lane) */
+	"movups  %%xmm0, %%xmm7\n"	    /* x[0] - x[4] | x[0] + x[4] */
+	/* Replace only the lowest lane of xmm4 so that (x[2]-x[6]).im survives
+	 * in lane 1; a full-register move here would zero it and drop the
+	 * imaginary part from the -i*(x[2]-x[6]) term built by the shuffle. */
+	"movss   %%xmm3, %%xmm4\n"	    /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+	"movlps 8(%%eax), %%xmm1\n"	    /* x[1] */
+	"shufps   $0x14, %%xmm4, %%xmm5\n" /* -i*(x[2] - x[6]) | x[2] + x[6] */
+
+	"addps   %%xmm5, %%xmm0\n"		/* yt = -i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
+	"subps   %%xmm5, %%xmm7\n"		/* yb = -i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
+
+	"movhps 24(%%eax), %%xmm1\n"	/* x[3] | x[1] */
+    "movl   $hsqrt2, %%ebx\n"
+	"movlps 40(%%eax), %%xmm2\n"	/* x[5] */
+	"movhps 56(%%eax), %%xmm2\n"	/* x[7] | x[5] */
+	"movups  %%xmm1, %%xmm3\n"		/* x[3] | x[1] */
+	"addps   %%xmm2, %%xmm1\n"		/* x[3] + x[7] | x[1] + x[5] */
+	"subps   %%xmm2, %%xmm3\n"		/* x[3] - x[7] | x[1] - x[5] */
+	"movups (%%ebx), %%xmm4\n"		/* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
+	"movups  %%xmm3, %%xmm6\n"		/* x[3] - x[7] | x[1] - x[5] */
+	"mulps   %%xmm4, %%xmm3\n"      /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
+	"shufps   $0xc8, %%xmm4, %%xmm4\n" /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
+	"shufps   $0xb1, %%xmm6, %%xmm6\n" /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
+	"mulps   %%xmm4, %%xmm6\n"      /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
+	"addps   %%xmm3, %%xmm6\n"		/* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
+	"movhlps %%xmm1, %%xmm5\n"		/* x[3] + x[7] */
+	"movlhps %%xmm6, %%xmm1\n"		/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+	"shufps   $0xe4, %%xmm6, %%xmm5\n"	/* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
+	"movups  %%xmm1, %%xmm3\n"		/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+	"movl      $C_1, %%ebx\n"
+	"addps   %%xmm5, %%xmm1\n"		/* u */
+	"subps   %%xmm5, %%xmm3\n"		/* v */
+	"movups  %%xmm0, %%xmm2\n"		/* yt */
+	"movups  %%xmm7, %%xmm4\n"		/* yb */
+	"movups (%%ebx), %%xmm5\n"		/* 1.0 | -1.0 | 1.0 | -1.0 */
+	"mulps   %%xmm5, %%xmm3\n"
+	"addps   %%xmm1, %%xmm0\n"		/* yt + u */
+	"subps   %%xmm1, %%xmm2\n"		/* yt - u */
+	"shufps   $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
+	"movups  %%xmm0, (%%eax)\n"
+	"movups  %%xmm2, 32(%%eax)\n"
+	"addps   %%xmm3, %%xmm4\n"		/* yb - i*v */
+	"subps   %%xmm3, %%xmm7\n"		/* yb + i*v */
+	"movups  %%xmm4, 16(%%eax)\n"
+	"movups  %%xmm7, 48(%%eax)\n"
+
+	"popl    %%ebx\n"
+    : "=a" (x)
+    : "a" (x)
+    : "memory" );
+}
+
+    
+static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
+	     const complex_t *d, const complex_t *d_3)
+{   /* Merge pass: combines already-transformed sub-blocks into one larger
+     * transform, two complex points per step.
+     *
+     * The asm builds its own frame and fetches the arguments itself from
+     * 8..24(%ebp) instead of receiving them as asm operands:
+     *   eax = x, ebx = wT = wTB, edx = d, esi = d_3, ecx = 16*k (byte stride)
+     * wB is read at (ebx,ecx); the four output areas are (eax), (eax,ecx),
+     * (edi) and (edi,ecx) with edi = eax + 2*ecx.  Per point, with
+     * u = wT*d + wB*d3 and v = wT*d - wB*d3:
+     *   (eax)     = x  + u        (edi)     = x  - u
+     *   (eax,ecx) = xk - i*v      (edi,ecx) = xk + i*v
+     * The very first point uses wT[0]/wB[0] unmultiplied (d and d_3 are
+     * advanced past entry 0 before they are first read).
+     * Register comments read "high half | low half".
+     *
+     * NOTE(review): 8(%ebp) is k only if the compiler emitted no prologue
+     * of its own, passes all arguments on the stack and does not inline
+     * this function -- confirm against the build flags.
+     * NOTE(review): the byte offsets assume complex_t is 8 bytes (two
+     * floats); the type is declared elsewhere -- confirm.
+     * NOTE(review): no clobbers are declared although memory and
+     * xmm0-xmm7 are modified, and ".loop"/".end" are plain assembler
+     * labels that would collide if this asm were ever emitted twice. */
+    __asm__ __volatile__ (
+	"pushl %%ebp\n"
+	"movl %%esp, %%ebp\n"
+
+	"subl $4, %%esp\n"          /* one local slot at -4(%ebp): the loop counter */
+	
+	"pushl %%eax\n"
+	"pushl %%ebx\n"
+	"pushl %%ecx\n"
+	"pushl %%edx\n"
+	"pushl %%esi\n"
+	"pushl %%edi\n"
+
+	"movl  8(%%ebp), %%ecx\n"   /* k */
+	"movl 12(%%ebp), %%eax\n"   /* x */
+	"movl %%ecx, -4(%%ebp)\n"   /* k */
+	"movl 16(%%ebp), %%ebx\n"   /* wT */
+	"movl 20(%%ebp), %%edx\n"   /* d */
+	"movl 24(%%ebp), %%esi\n"   /* d3 */
+	"shll $4, %%ecx\n"          /* 16k */
+	"addl $8, %%edx\n"          /* skip d[0] */
+	"leal (%%eax, %%ecx, 2), %%edi\n"   /* overwritten by $C_1 below before any use; recomputed later */
+	"addl $8, %%esi\n"          /* skip d3[0] */
+	
+	/* TRANSZERO and TRANS */
+	"movups (%%eax), %%xmm0\n"      /* x[1] | x[0] */
+	"movups (%%ebx), %%xmm1\n"      /* wT[1] | wT[0] */
+	"movups (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
+	"movlps (%%edx), %%xmm3\n"      /* d */
+	"movlps (%%esi), %%xmm4\n"      /* d3 */
+	"movhlps %%xmm1, %%xmm5\n"      /* wT[1] */
+	"movhlps %%xmm2, %%xmm6\n"      /* wB[1] */
+	"shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
+	"shufps $0x50, %%xmm4, %%xmm4\n" /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
+	"movlhps %%xmm5, %%xmm5\n"      /* wT[1] | wT[1] */
+	"movlhps %%xmm6, %%xmm6\n"      /* wB[1] | wB[1] */
+	"mulps   %%xmm3, %%xmm5\n"
+	"mulps   %%xmm4, %%xmm6\n"
+	"movhlps %%xmm5, %%xmm7\n"      /* wT[1].im * d[1].im | wT[1].re * d[1].im */
+	"movlhps %%xmm6, %%xmm5\n"      /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
+	"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
+	"movl $C_1, %%edi\n"
+	"movups (%%edi), %%xmm4\n"      /* 1.0 | -1.0 | 1.0 | -1.0 */
+	"mulps   %%xmm4, %%xmm7\n"
+	"addps   %%xmm7, %%xmm5\n"      /* wB[1] * d3[1] | wT[1] * d[1] */
+	"movlhps %%xmm5, %%xmm1\n"      /* d[1] * wT[1] | wT[0] */
+	"shufps  $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
+	"movups  %%xmm1, %%xmm3\n"      /* d[1] * wT[1] | wT[0] */
+	"leal   (%%eax, %%ecx, 2), %%edi\n"
+	"addps  %%xmm2, %%xmm1\n"       /* u */
+	"subps  %%xmm2, %%xmm3\n"       /* v */
+	"mulps  %%xmm4, %%xmm3\n"
+	"movups (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
+	"shufps $0xb1, %%xmm3, %%xmm3\n"  /* -i * v */
+	"movups %%xmm0, %%xmm2\n"         /* x[1] | x[0] */
+	"movups %%xmm5, %%xmm6\n"         /* xk[1] | xk[0] */
+	"addps  %%xmm1, %%xmm0\n"
+	"subps  %%xmm1, %%xmm2\n"
+	"addps  %%xmm3, %%xmm5\n"
+	"subps  %%xmm3, %%xmm6\n"
+	"movups %%xmm0, (%%eax)\n"
+	"movups %%xmm2, (%%edi)\n"
+	"movups %%xmm5, (%%eax, %%ecx)\n"
+	"movups %%xmm6, (%%edi, %%ecx)\n"
+	"addl $16, %%eax\n"
+	"addl $16, %%ebx\n"
+	"addl  $8, %%edx\n"
+	"addl  $8, %%esi\n"
+	"decl -4(%%ebp)\n"          /* first pair of points done */
+
+".loop:\n"
+	"movups (%%ebx), %%xmm0\n"      /* wT[1] | wT[0] */
+	"movups (%%edx), %%xmm1\n"      /* d[1] | d[0] */
+
+	"movups (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
+	"movups (%%esi), %%xmm5\n"      /* d3[1] | d3[0] */
+
+	"movhlps %%xmm0, %%xmm2\n"      /* wT[1] */
+	"movhlps %%xmm1, %%xmm3\n"      /* d[1] */
+
+	"movhlps %%xmm4, %%xmm6\n"      /* wB[1] */
+	"movhlps %%xmm5, %%xmm7\n"      /* d3[1] */
+
+	"shufps $0x50, %%xmm1, %%xmm1\n" /* d[0].im | d[0].im | d[0].re | d[0].re */
+	"shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
+
+	"movlhps %%xmm0, %%xmm0\n"       /* wT[0] | wT[0] */
+	"shufps $0x50, %%xmm5, %%xmm5\n" /* d3[0].im | d3[0].im | d3[0].re | d3[0].re */
+	"movlhps %%xmm2, %%xmm2\n"       /* wT[1] | wT[1] */
+	"shufps $0x50, %%xmm7, %%xmm7\n" /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
+
+	"mulps   %%xmm1, %%xmm0\n"  /* d[0].im * wT[0].im | d[0].im * wT[0].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+	"mulps   %%xmm3, %%xmm2\n"  /* d[1].im * wT[1].im | d[1].im * wT[1].re | d[1].re * wT[1].im | d[1].re * wT[1].re */
+	"movlhps %%xmm4, %%xmm4\n"  /* wB[0] | wB[0] */
+	"movlhps %%xmm6, %%xmm6\n"  /* wB[1] | wB[1] */
+    
+	"movhlps %%xmm0, %%xmm1\n"  /* d[0].im * wT[0].im | d[0].im * wT[0].re */
+	"movlhps %%xmm2, %%xmm0\n"  /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+	"mulps   %%xmm5, %%xmm4\n"  /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
+	"mulps   %%xmm7, %%xmm6\n"  /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
+	"shufps $0xb1, %%xmm2, %%xmm1\n"    /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
+	"movl $C_1, %%edi\n"
+	"movups (%%edi), %%xmm3\n"  /* 1.0 | -1.0 | 1.0 | -1.0 */
+
+	"movhlps %%xmm4, %%xmm5\n"  /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
+	"mulps   %%xmm3, %%xmm1\n"  /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
+	"movlhps %%xmm6, %%xmm4\n"  /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
+	"addps   %%xmm1, %%xmm0\n"  /* wT[1] * d[1] | wT[0] * d[0] */
+
+	"shufps $0xb1, %%xmm6, %%xmm5\n"    /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
+	"mulps   %%xmm3, %%xmm5\n"  /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
+	"addps   %%xmm5, %%xmm4\n"  /* wB[1] * d3[1] | wB[0] * d3[0] */
+
+	"movups %%xmm0, %%xmm1\n"   /* wT[1] * d[1] | wT[0] * d[0] */
+	"addps  %%xmm4, %%xmm0\n"   /* u */
+	"subps  %%xmm4, %%xmm1\n"   /* v */
+	"movups (%%eax), %%xmm6\n"  /* x[1] | x[0] */
+	"leal   (%%eax, %%ecx, 2), %%edi\n"
+	"mulps  %%xmm3, %%xmm1\n"
+	"addl $16, %%ebx\n"
+	"addl $16, %%esi\n"
+	"shufps $0xb1, %%xmm1, %%xmm1\n"    /* -i * v */
+	"movups (%%eax, %%ecx), %%xmm7\n"   /* xk[1] | xk[0] */
+	"movups %%xmm6, %%xmm2\n"
+	"movups %%xmm7, %%xmm4\n"
+	"addps  %%xmm0, %%xmm6\n"
+	"subps  %%xmm0, %%xmm2\n"
+	"movups %%xmm6, (%%eax)\n"
+	"movups %%xmm2, (%%edi)\n"
+	"addps  %%xmm1, %%xmm7\n"
+	"subps  %%xmm1, %%xmm4\n"
+	"addl $16, %%edx\n"
+	"movups %%xmm7, (%%eax, %%ecx)\n"
+	"movups %%xmm4, (%%edi, %%ecx)\n"
+
+	"addl $16, %%eax\n"
+	"decl -4(%%ebp)\n"
+	"jnz .loop\n"               /* repeat until the counter at -4(%ebp) reaches zero */
+
+".end:\n"                       /* no jump in this asm statement targets this label */
+	"popl %%edi\n"
+	"popl %%esi\n"
+	"popl %%edx\n"
+	"popl %%ecx\n"
+	"popl %%ebx\n"
+	"popl %%eax\n"
+	
+	"addl $4, %%esp\n"
+
+    "leave\n"                   /* undoes the pushl/movl frame set up at the top */
+    ::);
+}
--- a/src/interface/main.c
+++ b/src/interface/main.c
@@ -4,7 +4,7 @@
 * and spawn threads.
 *****************************************************************************
 * Copyright (C) 1998, 1999, 2000 VideoLAN
- * $Id: main.c,v 1.93 2001/05/07 03:14:09 stef Exp $
+ * $Id: main.c,v 1.94 2001/05/14 15:58:04 reno Exp $
 *
 * Authors: Vincent Seguin <seguin@via.ecp.fr>
 *          Samuel Hocevar <sam@zoy.org>
@@ -974,6 +974,7 @@ static int CPUCapabilities( void )
    if( i_edx & 0x02000000 )
    {
        i_capabilities |= CPU_CAPABILITY_MMXEXT;
+        i_capabilities |= CPU_CAPABILITY_SSE;
    }
    
    /* test for additional capabilities */
@@ -996,7 +997,6 @@ static int CPUCapabilities( void )
    {
        i_capabilities |= CPU_CAPABILITY_MMXEXT;
    }
-
 #else
    /* default behaviour */