* Alignment in asm functions

* 16-byte alignment for data (needed for SSE) * Optimization in SSE

* Alignment in asm functions
* 16-byte alignment for data (needed for SSE) * Optimization in SSE
dee3179d · Renaud Dartus · 5b49dba8 · dee3179d · dee3179d · dee3179d
Commit dee3179d authored Jul 08, 2001 by Renaud Dartus
10 changed files
--- a/include/ac3_imdct.h
+++ b/include/ac3_imdct.h
@@ -2,7 +2,7 @@
 * ac3_imdct.h : AC3 IMDCT types
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct.h,v 1.4 2001/06/12 00:30:41 reno Exp $
+ * $Id: ac3_imdct.h,v 1.5 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Renaud Dartus <reno@videolan.org>
@@ -42,18 +42,19 @@ typedef struct imdct_s
    float xsin1[N/4] __attribute__ ((aligned(16)));
    float xcos2[N/8] __attribute__ ((aligned(16)));
    float xsin2[N/8] __attribute__ ((aligned(16)));
+    float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
    /* Twiddle factor LUT */
-    complex_t *w[7] __attribute__ ((aligned(16)));
    complex_t w_1[1] __attribute__ ((aligned(16)));
+    float used_for_alignement1;
+    float used_for_alignement2;
    complex_t w_2[2] __attribute__ ((aligned(16)));
    complex_t w_4[4] __attribute__ ((aligned(16)));
    complex_t w_8[8] __attribute__ ((aligned(16)));
    complex_t w_16[16] __attribute__ ((aligned(16)));
    complex_t w_32[32] __attribute__ ((aligned(16)));
    complex_t w_64[64] __attribute__ ((aligned(16)));
+    complex_t *w[7] __attribute__ ((aligned(16)));
-    float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
    /* Module used and shortcuts */
    struct module_s * p_module;

--- a/plugins/downmix/ac3_downmix_3dn.c
+++ b/plugins/downmix/ac3_downmix_3dn.c
@@ -2,7 +2,7 @@
 * ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_3dn.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
+ * $Id: ac3_downmix_3dn.c,v 1.4 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
 void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $128,  %%ebx\n"            /* loop counter */
@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"        /* slev */
    "punpckldq %%mm7, %%mm7\n"        /* slev | slev */
+    ".align 16\n"
 ".loop:\n"
    "movq    (%%eax),     %%mm0\n"   /* left */
    "movq    2048(%%eax), %%mm1\n"   /* right */
@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
 void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $128, %%ebx\n"       /* loop counter */
@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"    /* slev */
    "punpckldq %%mm7, %%mm7\n"    /* slev | slev */
+    ".align 16\n"
 ".loop3:\n"
    "movq   (%%eax), %%mm0\n"       /* left */
    "movq   1024(%%eax), %%mm1\n"   /* right */
@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl    %%ebx\n"
    "movl    $128, %%ebx\n"            /* loop counter */
@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"        /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */
+    ".align 16\n"
 ".loop4:\n"
    "movq    (%%eax), %%mm0\n"       /* left */
    "movq    2048(%%eax), %%mm1\n"   /* right */
@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl    %%ebx\n"
    "movl    $128, %%ebx\n"            /* loop counter */
@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"        /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */
+    ".align 16\n"
 ".loop5:\n"
    "movq    (%%eax), %%mm0\n"       /* left */
    "movq    1024(%%eax), %%mm1\n"   /* right */
@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl    %%ebx\n"
    "movl    $128, %%ebx\n"            /* loop counter */
@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    4(%%ecx), %%mm6\n"        /* clev */
    "punpckldq %%mm6, %%mm6\n"      /* clev | clev */
+    ".align 16\n"
 ".loop6:\n"
    "movq    (%%eax), %%mm0\n"       /*left */
    "movq    2048(%%eax), %%mm1\n"   /* right */
@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "pushl %%edx\n"
@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
    "punpckldq %%mm7, %%mm7\n"   /* sqrt2 | sqrt2 */
    "movl $128, %%ebx\n"
+    ".align 16\n"
 ".loop2:\n"
    "movq (%%ecx), %%mm0\n"        /* c1 | c0 */
    "pfmul   %%mm7, %%mm0\n"
@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl $128, %%ebx\n"
+    ".align 16\n"
 ".loop1:\n"
    "movq  (%%ecx), %%mm0\n"    /* l1 | l0 */
    "movq  (%%edx), %%mm1\n"    /* r1 | r0 */

--- a/plugins/downmix/ac3_downmix_sse.c
+++ b/plugins/downmix/ac3_downmix_sse.c
@@ -2,7 +2,7 @@
 * ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_sse.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
+ * $Id: ac3_downmix_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *          Aaron Holtzman <aholtzma@engr.uvic.ca>
@@ -41,12 +41,14 @@
 void sqrt2_sse (void) __asm__ ("sqrt2_sse");
 void sqrt2_sse (void)
 {
-    __asm__ (".float 0f0.7071068");
+    __asm__ (".align 16\n"
+             ".float 0f0.7071068");
 }
 void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $64, %%ebx\n"            /* loop counter */
@@ -59,12 +61,13 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
    "movss    8(%%ecx), %%xmm7\n"   /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */
+    ".align 16\n"
 ".loop:\n"
-    "movups    (%%eax),     %%xmm0\n"  /* left */
+    "movaps     (%%eax), %%xmm0\n"  /* left */
-    "movups    2048(%%eax), %%xmm1\n"  /* right */
+    "movaps 2048(%%eax), %%xmm1\n"  /* right */
-    "movups 1024(%%eax), %%xmm2\n"    /* center */
+    "movaps 1024(%%eax), %%xmm2\n"  /* center */
-    "movups    3072(%%eax), %%xmm3\n"    /* leftsur */
+    "movaps 3072(%%eax), %%xmm3\n"  /* leftsur */
-    "movups    4096(%%eax), %%xmm4\n"    /* rithgsur */
+    "movaps 4096(%%eax), %%xmm4\n"  /* rithgsur */
    "mulps %%xmm5, %%xmm0\n"
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm6, %%xmm2\n"
@@ -75,8 +78,8 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
    "addps %%xmm3, %%xmm0\n"
    "addps %%xmm4, %%xmm1\n"
-    "movups    %%xmm0, (%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
-    "movups    %%xmm1, 1024(%%eax)\n"
+    "movaps %%xmm1, 1024(%%eax)\n"
    "addl $16, %%eax\n"
    "decl %%ebx\n"
@@ -90,6 +93,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
 void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $64, %%ebx\n"            /* loop counter */
@@ -99,11 +103,12 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movss    8(%%ecx), %%xmm7\n"   /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */
+    ".align 16\n"
 ".loop3:\n"
-    "movups    (%%eax), %%xmm0\n"      /* left */
+    "movaps     (%%eax), %%xmm0\n"  /* left */
-    "movups    1024(%%eax), %%xmm1\n"  /* right */
+    "movaps 1024(%%eax), %%xmm1\n"  /* right */
-    "movups 2048(%%eax), %%xmm3\n"    /* leftsur */
+    "movaps 2048(%%eax), %%xmm3\n"  /* leftsur */
-    "movups    3072(%%eax), %%xmm4\n"    /* rightsur */
+    "movaps 3072(%%eax), %%xmm4\n"  /* rightsur */
    "mulps %%xmm5, %%xmm0\n"
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm7, %%xmm3\n"
@@ -111,8 +116,8 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "addps %%xmm3, %%xmm0\n"
    "addps %%xmm4, %%xmm1\n"
-    "movups    %%xmm0, (%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
-    "movups    %%xmm1, 1024(%%eax)\n"
+    "movaps %%xmm1, 1024(%%eax)\n"
    "addl $16, %%eax\n"
    "decl %%ebx\n"
@@ -126,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $64, %%ebx\n"            /* loop counter */
@@ -139,11 +144,12 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movss    8(%%ecx), %%xmm7\n"   /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */
+    ".align 16\n"
 ".loop4:\n"
-    "movups    (%%eax), %%xmm0\n"      /* left */
+    "movaps     (%%eax), %%xmm0\n"  /* left */
-    "movups    2048(%%eax), %%xmm1\n"  /* right */
+    "movaps 2048(%%eax), %%xmm1\n"  /* right */
-    "movups    1024(%%eax), %%xmm2\n"    /* center */
+    "movaps 1024(%%eax), %%xmm2\n"  /* center */
-    "movups    3072(%%eax), %%xmm3\n"    /* sur */
+    "movaps 3072(%%eax), %%xmm3\n"  /* sur */
    "mulps %%xmm5, %%xmm0\n"
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm6, %%xmm2\n"
@@ -153,8 +159,8 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "subps %%xmm3, %%xmm0\n"
    "addps %%xmm3, %%xmm1\n"
-    "movups    %%xmm0, (%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
-    "movups    %%xmm1, 1024(%%eax)\n"
+    "movaps %%xmm1, 1024(%%eax)\n"
    "addl $16, %%eax\n"
    "decl %%ebx\n"
@@ -163,12 +169,12 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
 }
 void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $64, %%ebx\n"            /* loop counter */
@@ -178,18 +184,19 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movss    8(%%ecx), %%xmm7\n"   /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */
+    ".align 16\n"
 ".loop5:\n"
-    "movups    (%%eax), %%xmm0\n"      /* left */
+    "movaps     (%%eax), %%xmm0\n"  /* left */
-    "movups    1024(%%eax), %%xmm1\n"  /* right */
+    "movaps 1024(%%eax), %%xmm1\n"  /* right */
-    "movups    2048(%%eax), %%xmm3\n"    /* sur */
+    "movaps 2048(%%eax), %%xmm3\n"  /* sur */
    "mulps %%xmm5, %%xmm0\n"
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm7, %%xmm3\n"
    "subps %%xmm3, %%xmm0\n"
    "addps %%xmm3, %%xmm1\n"
-    "movups    %%xmm0, (%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
-    "movups    %%xmm1, 1024(%%eax)\n"
+    "movaps %%xmm1, 1024(%%eax)\n"
    "addl $16, %%eax\n"
    "decl %%ebx\n"
@@ -198,13 +205,12 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
 }
 void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $64, %%ebx\n"           /* loop counter */
@@ -214,18 +220,19 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movss    4(%%ecx), %%xmm6\n"  /* clev */
    "shufps $0, %%xmm6, %%xmm6\n"  /* clev | clev | clev | clev */
+    ".align 16\n"
 ".loop6:\n"
-    "movups    (%%eax), %%xmm0\n"      /*left */
+    "movaps     (%%eax), %%xmm0\n"  /*left */
-    "movups    2048(%%eax), %%xmm1\n"  /* right */
+    "movaps 2048(%%eax), %%xmm1\n"  /* right */
-    "movups 1024(%%eax), %%xmm2\n"    /* center */
+    "movaps 1024(%%eax), %%xmm2\n"  /* center */
    "mulps %%xmm5, %%xmm0\n"
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm6, %%xmm2\n"
    "addps %%xmm2, %%xmm0\n"
    "addps %%xmm2, %%xmm1\n"
-    "movups    %%xmm0, (%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
-    "movups    %%xmm1, 1024(%%eax)\n"
+    "movaps %%xmm1, 1024(%%eax)\n"
    "addl $16, %%eax\n"
    "decl %%ebx\n"
@@ -239,6 +246,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "pushl %%edx\n"
@@ -247,8 +255,9 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
    "shufps $0, %%xmm7, %%xmm7\n"  /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
    "movl   $64, %%ebx\n"
+    ".align 16\n"
 ".loop2:\n"
-    "movups (%%ecx), %%xmm0\n"        /* c3 | c2 | c1 | c0 */
+    "movaps (%%ecx), %%xmm0\n"     /* c3 | c2 | c1 | c0 */
    "mulps   %%xmm7, %%xmm0\n"
    "movhlps %%xmm0, %%xmm2\n"     /* c3 | c2 */
@@ -275,14 +284,15 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
 void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $64, %%ebx\n"
+    ".align 16\n"
 ".loop1:\n"
-    "movups  (%%ecx), %%xmm0\n"    /* l3 | l2 | l1 | l0 */
+    "movaps  (%%ecx), %%xmm0\n"   /* l3 | l2 | l1 | l0 */
-    "movups  (%%edx), %%xmm1\n"    /* r3 | r2 | r1 | r0 */
+    "movaps  (%%edx), %%xmm1\n"   /* r3 | r2 | r1 | r0 */
    "movhlps  %%xmm0, %%xmm2\n"   /* l3 | l2 */
    "movhlps  %%xmm1, %%xmm3\n"   /* r3 | r2 */
    "unpcklps %%xmm1, %%xmm0\n"   /* r1 | l1 | r0 | l0 */

--- a/plugins/imdct/ac3_imdct_3dn.c
+++ b/plugins/imdct/ac3_imdct_3dn.c
@@ -2,7 +2,7 @@
 * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $
+ * $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
 {
    __asm__ __volatile__ (	
+    ".align 16\n"
 	"pushl %%ebp\n"
 	"movl  %%esp, %%ebp\n"
 	"addl  $-4, %%esp\n" /* local variable, loop counter */
@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
 	"movl 20(%%ebp), %%edx\n" 	/* xcos_sin_sse */
 	"movl $128, -4(%%ebp)\n"
+    ".align 16\n"
 ".loop:\n"
 	"movl  (%%eax), %%esi\n"
 	"movd (%%ecx, %%esi, 8), %%mm1\n"   /* 2j */
@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
 {
    __asm__ __volatile__ ( 
+    ".align 16\n"
 	"pushl %%ebx\n"
 	"movl $64, %%ebx\n"         /* loop counter */
+    ".align 16\n"
 ".loop1:\n"
 	"movq	(%%eax), %%mm0\n"   /* im0 | re0 */
 	"movq	  %%mm0, %%mm1\n"   /* im0 | re0 */
@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
 	"pushl %%ebp\n"
 	"movl  %%esp, %%ebp\n"
@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
 	"movl  12(%%ebp), %%eax\n"  /* data */
+    ".align 16\n"
 ".first_128_samples:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */
@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
 	"movl $32, %%ecx\n"         /* loop count */
+    ".align 16\n"
 ".second_128_samples:\n"
 	"movd   (%%esi), %%mm0\n" /* buf[i].re */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"movl $32, %%ecx\n"         /* loop count */
 	"movl  20(%%ebp), %%eax\n"  /* delay */
+    ".align 16\n"
 ".first_128_delay:\n"
 	"movd   (%%esi), %%mm0\n" /* re0 */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
 	"movl $32, %%ecx\n"         /* loop count */
+    ".align 16\n"
 ".second_128_delay:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */
@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
 	"pushl %%ebp\n"
 	"movl  %%esp, %%ebp\n"
@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
 	"movl  12(%%ebp), %%eax\n"  /* data */
+    ".align 16\n"
 ".first_128_samples2:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */
@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
 	"movl $32, %%ecx\n"         /* loop count */
+    ".align 16\n"
 ".second_128_samples2:\n"
 	"movd   (%%esi), %%mm0\n" /* buf[i].re */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"movl $32, %%ecx\n"         /* loop count */
 	"movl  20(%%ebp), %%eax\n"  /* delay */
+    ".align 16\n"
 ".first_128_delays:\n"
 	"movd   (%%esi), %%mm0\n" /* re0 */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
 	"movl $32, %%ecx\n"         /* loop count */
+    ".align 16\n"
 ".second_128_delays:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */

--- a/plugins/imdct/ac3_imdct_sse.c
+++ b/plugins/imdct/ac3_imdct_sse.c
--- a/plugins/imdct/ac3_srfft_3dn.c
+++ b/plugins/imdct/ac3_srfft_3dn.c
@@ -2,7 +2,7 @@
 * ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
+ * $Id: ac3_srfft_3dn.c,v 1.2 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -126,6 +126,7 @@ void C_1_3dn (void)
 static void fft_4_3dn (complex_t *x)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
 	"movq    (%%eax), %%mm0\n"      /* x[0] */
 	"movq   8(%%eax), %%mm1\n"      /* x[1] */
 	"movq  16(%%eax), %%mm2\n"      /* x[2] */

--- a/plugins/imdct/ac3_srfft_sse.c
+++ b/plugins/imdct/ac3_srfft_sse.c
--- a/src/ac3_decoder/ac3_decoder.h
+++ b/src/ac3_decoder/ac3_decoder.h
@@ -2,7 +2,7 @@
 * ac3_decoder.h : ac3 decoder interface
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.h,v 1.10 2001/06/12 00:30:41 reno Exp $
+ * $Id: ac3_decoder.h,v 1.11 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Renaud Dartus <reno@videolan.org>
@@ -354,6 +354,9 @@ typedef struct mantissa_s
 struct ac3dec_s
 {
+    float               samples[6][256] __attribute__ ((aligned(16)));
+    imdct_t             imdct __attribute__ ((aligned(16)));
    /*
     * Input properties
     */
@@ -370,12 +373,10 @@ struct ac3dec_s
    bsi_t               bsi;
    audblk_t            audblk;
-    float               samples[6][256] __attribute__ ((aligned(16)));
    dm_par_t            dm_par;
    bit_allocate_t      bit_allocate;
    mantissa_t          mantissa;
-    imdct_t             imdct;
    downmix_t           downmix;
 };

--- a/src/ac3_decoder/ac3_decoder_thread.c
+++ b/src/ac3_decoder/ac3_decoder_thread.c
@@ -2,7 +2,7 @@
 * ac3_decoder_thread.c: ac3 decoder thread
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.c,v 1.34 2001/05/31 01:37:08 sam Exp $
+ * $Id: ac3_decoder_thread.c,v 1.35 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Lespinasse <walken@zoy.org>
 *
@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
    intf_DbgMsg( "ac3dec debug: creating ac3 decoder thread" );
    /* Allocate the memory needed to store the thread's structure */
-    if((p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t)))==NULL)
+    p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t));
+    /* We need to be 16 bytes aligned */
+    p_ac3thread->ac3thread = (int)p_ac3thread & (-15);
+    p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
+    if(p_ac3thread == NULL)
    {
        intf_ErrMsg ( "ac3dec error: not enough memory "
                      "for ac3dec_CreateThread() to create the new thread");
@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)
    /* Destroy descriptor */
    free( p_ac3thread->p_config );
+    p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
    free( p_ac3thread );
    intf_DbgMsg ("ac3dec debug: ac3 decoder thread %p destroyed", p_ac3thread);

--- a/src/ac3_decoder/ac3_decoder_thread.h
+++ b/src/ac3_decoder/ac3_decoder_thread.h
@@ -2,7 +2,7 @@
 * ac3_decoder_thread.h : ac3 decoder thread interface
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
+ * $Id: ac3_decoder_thread.h,v 1.8 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *
@@ -24,8 +24,16 @@
 /*****************************************************************************
 * ac3dec_thread_t : ac3 decoder thread descriptor
 *****************************************************************************/
 typedef struct ac3dec_thread_s
 {
+    /*
+     * Decoder properties
+     */
+    float used_for_alignement1;
+    float used_for_alignement2;
+    ac3dec_t            ac3_decoder __attribute__ ((aligned(16)));
    /*
     * Thread properties
     */
@@ -38,15 +46,11 @@ typedef struct ac3dec_thread_s
    int                 sync_ptr;          /* sync ptr from ac3 magic header */
    adec_config_t *     p_config;
-    /*
-     * Decoder properties
-     */
-    ac3dec_t            ac3_decoder;
    /*
     * Output properties
     */
    aout_fifo_t *       p_aout_fifo; /* stores the decompressed audio frames */
+    int                 ac3thread;      /* save the old pointer */
 } ac3dec_thread_t;