* Alignment in asm functions

* 16-byte alignment for data (needed for SSE) * Optimization in SSE

* Alignment in asm functions
* 16-byte alignment for data (needed for SSE) * Optimization in SSE
dee3179d · Renaud Dartus · 5b49dba8 · dee3179d · dee3179d · dee3179d
Commit dee3179d authored Jul 08, 2001 by Renaud Dartus
10 changed files
--- a/include/ac3_imdct.h
+++ b/include/ac3_imdct.h
@@ -2,7 +2,7 @@
 * ac3_imdct.h : AC3 IMDCT types
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct.h,v 1.4 2001/06/12 00:30:41 reno Exp $
+ * $Id: ac3_imdct.h,v 1.5 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Renaud Dartus <reno@videolan.org>
@@ -42,18 +42,19 @@ typedef struct imdct_s
    float xsin1[N/4] __attribute__ ((aligned(16)));
    float xcos2[N/8] __attribute__ ((aligned(16)));
    float xsin2[N/8] __attribute__ ((aligned(16)));
+    float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
   
    /* Twiddle factor LUT */
-    complex_t *w[7] __attribute__ ((aligned(16)));
    complex_t w_1[1] __attribute__ ((aligned(16)));
+    float used_for_alignement1;
+    float used_for_alignement2;
    complex_t w_2[2] __attribute__ ((aligned(16)));
    complex_t w_4[4] __attribute__ ((aligned(16)));
    complex_t w_8[8] __attribute__ ((aligned(16)));
    complex_t w_16[16] __attribute__ ((aligned(16)));
    complex_t w_32[32] __attribute__ ((aligned(16)));
    complex_t w_64[64] __attribute__ ((aligned(16)));
-
-    float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
+    complex_t *w[7] __attribute__ ((aligned(16)));
    
    /* Module used and shortcuts */
    struct module_s * p_module;

--- a/plugins/downmix/ac3_downmix_3dn.c
+++ b/plugins/downmix/ac3_downmix_3dn.c
@@ -2,7 +2,7 @@
 * ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_3dn.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
+ * $Id: ac3_downmix_3dn.c,v 1.4 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
 void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $128,  %%ebx\n"            /* loop counter */

@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"        /* slev */
    "punpckldq %%mm7, %%mm7\n"        /* slev | slev */

+    ".align 16\n"
 ".loop:\n"
    "movq    (%%eax),     %%mm0\n"   /* left */
    "movq    2048(%%eax), %%mm1\n"   /* right */
@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
 void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl  $128, %%ebx\n"       /* loop counter */

@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"    /* slev */
    "punpckldq %%mm7, %%mm7\n"    /* slev | slev */

+    ".align 16\n"
 ".loop3:\n"
    "movq   (%%eax), %%mm0\n"       /* left */
    "movq   1024(%%eax), %%mm1\n"   /* right */
@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
-
+    ".align 16\n"
    "pushl    %%ebx\n"
    "movl    $128, %%ebx\n"            /* loop counter */

@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"        /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */

+    ".align 16\n"
 ".loop4:\n"
    "movq    (%%eax), %%mm0\n"       /* left */
    "movq    2048(%%eax), %%mm1\n"   /* right */
@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl    %%ebx\n"
    "movl    $128, %%ebx\n"            /* loop counter */

@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    8(%%ecx), %%mm7\n"        /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */

+    ".align 16\n"
 ".loop5:\n"
    "movq    (%%eax), %%mm0\n"       /* left */
    "movq    1024(%%eax), %%mm1\n"   /* right */
@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl    %%ebx\n"
    "movl    $128, %%ebx\n"            /* loop counter */

@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
    "movd    4(%%ecx), %%mm6\n"        /* clev */
    "punpckldq %%mm6, %%mm6\n"      /* clev | clev */

+    ".align 16\n"
 ".loop6:\n"
    "movq    (%%eax), %%mm0\n"       /*left */
    "movq    2048(%%eax), %%mm1\n"   /* right */
@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
 void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "pushl %%edx\n"

@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
    "punpckldq %%mm7, %%mm7\n"   /* sqrt2 | sqrt2 */
    "movl $128, %%ebx\n"

+    ".align 16\n"
 ".loop2:\n"
    "movq (%%ecx), %%mm0\n"        /* c1 | c0 */
    "pfmul   %%mm7, %%mm0\n"
@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
 {

    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movl $128, %%ebx\n"

+    ".align 16\n"
 ".loop1:\n"
    "movq  (%%ecx), %%mm0\n"    /* l1 | l0 */
    "movq  (%%edx), %%mm1\n"    /* r1 | r0 */

--- a/plugins/downmix/ac3_downmix_sse.c
+++ b/plugins/downmix/ac3_downmix_sse.c
--- a/plugins/imdct/ac3_imdct_3dn.c
+++ b/plugins/imdct/ac3_imdct_3dn.c
@@ -2,7 +2,7 @@
 * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $
+ * $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
 {
    __asm__ __volatile__ (	
+    ".align 16\n"
 	"pushl %%ebp\n"
 	"movl  %%esp, %%ebp\n"
 	"addl  $-4, %%esp\n" /* local variable, loop counter */
@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
 	"movl 20(%%ebp), %%edx\n" 	/* xcos_sin_sse */
 	"movl $128, -4(%%ebp)\n"
 	
+    ".align 16\n"
 ".loop:\n"
 	"movl  (%%eax), %%esi\n"
 	"movd (%%ecx, %%esi, 8), %%mm1\n"   /* 2j */
@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
 {
    __asm__ __volatile__ ( 
+    ".align 16\n"
 	"pushl %%ebx\n"
 	"movl $64, %%ebx\n"         /* loop counter */

+    ".align 16\n"
 ".loop1:\n"
 	"movq	(%%eax), %%mm0\n"   /* im0 | re0 */
 	"movq	  %%mm0, %%mm1\n"   /* im0 | re0 */
@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
 	"pushl %%ebp\n"
 	"movl  %%esp, %%ebp\n"

@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
 	"movl  12(%%ebp), %%eax\n"  /* data */

+    ".align 16\n"
 ".first_128_samples:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */
@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
 	"movl $32, %%ecx\n"         /* loop count */
    
+    ".align 16\n"
 ".second_128_samples:\n"
 	"movd   (%%esi), %%mm0\n" /* buf[i].re */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"movl $32, %%ecx\n"         /* loop count */
 	"movl  20(%%ebp), %%eax\n"  /* delay */

+    ".align 16\n"
 ".first_128_delay:\n"
 	"movd   (%%esi), %%mm0\n" /* re0 */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
 	"movl $32, %%ecx\n"         /* loop count */
    
+    ".align 16\n"
 ".second_128_delay:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */
@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
 	"pushl %%ebp\n"
 	"movl  %%esp, %%ebp\n"
 	
@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"leal 504(%%eax), %%edi\n"  /* buf[63].re */
 	"movl  12(%%ebp), %%eax\n"  /* data */

+    ".align 16\n"
 ".first_128_samples2:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */
@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"leal 1020(%%esi), %%edi\n" /* buf[127].im */
 	"movl $32, %%ecx\n"         /* loop count */
    
+    ".align 16\n"
 ".second_128_samples2:\n"
 	"movd   (%%esi), %%mm0\n" /* buf[i].re */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"movl $32, %%ecx\n"         /* loop count */
 	"movl  20(%%ebp), %%eax\n"  /* delay */

+    ".align 16\n"
 ".first_128_delays:\n"
 	"movd   (%%esi), %%mm0\n" /* re0 */
 	"movd  8(%%esi), %%mm2\n" /* re1 */
@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
 	"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
 	"movl $32, %%ecx\n"         /* loop count */
    
+    ".align 16\n"
 ".second_128_delays:\n"
 	"movd   (%%esi), %%mm0\n" /* im0 */
 	"movd  8(%%esi), %%mm2\n" /* im1 */

--- a/plugins/imdct/ac3_imdct_sse.c
+++ b/plugins/imdct/ac3_imdct_sse.c
--- a/plugins/imdct/ac3_srfft_3dn.c
+++ b/plugins/imdct/ac3_srfft_3dn.c
@@ -2,7 +2,7 @@
 * ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
 *****************************************************************************
 * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
+ * $Id: ac3_srfft_3dn.c,v 1.2 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Renaud Dartus <reno@videolan.org>
 *
@@ -126,6 +126,7 @@ void C_1_3dn (void)
 static void fft_4_3dn (complex_t *x)
 {
    __asm__ __volatile__ (
+    ".align 16\n"
 	"movq    (%%eax), %%mm0\n"      /* x[0] */
 	"movq   8(%%eax), %%mm1\n"      /* x[1] */
 	"movq  16(%%eax), %%mm2\n"      /* x[2] */

--- a/plugins/imdct/ac3_srfft_sse.c
+++ b/plugins/imdct/ac3_srfft_sse.c
--- a/src/ac3_decoder/ac3_decoder.h
+++ b/src/ac3_decoder/ac3_decoder.h
@@ -2,7 +2,7 @@
 * ac3_decoder.h : ac3 decoder interface
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.h,v 1.10 2001/06/12 00:30:41 reno Exp $
+ * $Id: ac3_decoder.h,v 1.11 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *          Renaud Dartus <reno@videolan.org>
@@ -354,6 +354,9 @@ typedef struct mantissa_s

 struct ac3dec_s
 {
+    float               samples[6][256] __attribute__ ((aligned(16)));
+    imdct_t             imdct __attribute__ ((aligned(16)));
+
    /*
     * Input properties
     */
@@ -370,12 +373,10 @@ struct ac3dec_s
    bsi_t               bsi;
    audblk_t            audblk;

-    float               samples[6][256] __attribute__ ((aligned(16)));
    dm_par_t            dm_par;

    bit_allocate_t      bit_allocate;
    mantissa_t          mantissa;
-    imdct_t             imdct;
    downmix_t           downmix;

 };

--- a/src/ac3_decoder/ac3_decoder_thread.c
+++ b/src/ac3_decoder/ac3_decoder_thread.c
@@ -2,7 +2,7 @@
 * ac3_decoder_thread.c: ac3 decoder thread
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.c,v 1.34 2001/05/31 01:37:08 sam Exp $
+ * $Id: ac3_decoder_thread.c,v 1.35 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Lespinasse <walken@zoy.org>
 *
@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
    intf_DbgMsg( "ac3dec debug: creating ac3 decoder thread" );

    /* Allocate the memory needed to store the thread's structure */
-    if((p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t)))==NULL)
+    p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t));
+
+    /* We need to be 16 bytes aligned */
+    p_ac3thread->ac3thread = (int)p_ac3thread & (-15);
+    p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
+    
+    if(p_ac3thread == NULL)
    {
        intf_ErrMsg ( "ac3dec error: not enough memory "
                      "for ac3dec_CreateThread() to create the new thread");
@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)

    /* Destroy descriptor */
    free( p_ac3thread->p_config );
+    p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
    free( p_ac3thread );

    intf_DbgMsg ("ac3dec debug: ac3 decoder thread %p destroyed", p_ac3thread);

--- a/src/ac3_decoder/ac3_decoder_thread.h
+++ b/src/ac3_decoder/ac3_decoder_thread.h
@@ -2,7 +2,7 @@
 * ac3_decoder_thread.h : ac3 decoder thread interface
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
+ * $Id: ac3_decoder_thread.h,v 1.8 2001/07/08 23:15:11 reno Exp $
 *
 * Authors: Michel Kaempf <maxx@via.ecp.fr>
 *
@@ -24,8 +24,16 @@
 /*****************************************************************************
 * ac3dec_thread_t : ac3 decoder thread descriptor
 *****************************************************************************/
+
 typedef struct ac3dec_thread_s
 {
+    /*
+     * Decoder properties
+     */
+    float used_for_alignement1;
+    float used_for_alignement2;
+    ac3dec_t            ac3_decoder __attribute__ ((aligned(16)));
+    
    /*
     * Thread properties
     */
@@ -38,16 +46,12 @@ typedef struct ac3dec_thread_s
    int                 sync_ptr;          /* sync ptr from ac3 magic header */
    adec_config_t *     p_config;

-    /*
-     * Decoder properties
-     */
-    ac3dec_t            ac3_decoder;
-
    /*
     * Output properties
     */
    aout_fifo_t *       p_aout_fifo; /* stores the decompressed audio frames */
-
+    int                 ac3thread;      /* save the old pointer */
+    
 } ac3dec_thread_t;

 /*****************************************************************************