Commit dee3179d authored by Renaud Dartus

* Alignment in asm functions

* 16-byte alignment for data (needed for SSE)
* Optimization in SSE
parent 5b49dba8
......@@ -2,7 +2,7 @@
* ac3_imdct.h : AC3 IMDCT types
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.h,v 1.4 2001/06/12 00:30:41 reno Exp $
* $Id: ac3_imdct.h,v 1.5 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
......@@ -42,18 +42,19 @@ typedef struct imdct_s
float xsin1[N/4] __attribute__ ((aligned(16)));
float xcos2[N/8] __attribute__ ((aligned(16)));
float xsin2[N/8] __attribute__ ((aligned(16)));
float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
/* Twiddle factor LUT */
complex_t *w[7] __attribute__ ((aligned(16)));
complex_t w_1[1] __attribute__ ((aligned(16)));
float used_for_alignement1;
float used_for_alignement2;
complex_t w_2[2] __attribute__ ((aligned(16)));
complex_t w_4[4] __attribute__ ((aligned(16)));
complex_t w_8[8] __attribute__ ((aligned(16)));
complex_t w_16[16] __attribute__ ((aligned(16)));
complex_t w_32[32] __attribute__ ((aligned(16)));
complex_t w_64[64] __attribute__ ((aligned(16)));
float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
complex_t *w[7] __attribute__ ((aligned(16)));
/* Module used and shortcuts */
struct module_s * p_module;
......
......@@ -2,7 +2,7 @@
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_3dn.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
* $Id: ac3_downmix_3dn.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */
......@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop3:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */
......@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop4:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */
......@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop5:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */
......@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 4(%%ecx), %%mm6\n" /* clev */
"punpckldq %%mm6, %%mm6\n" /* clev | clev */
".align 16\n"
".loop6:\n"
"movq (%%eax), %%mm0\n" /*left */
"movq 2048(%%eax), %%mm1\n" /* right */
......@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"pushl %%edx\n"
......@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
"punpckldq %%mm7, %%mm7\n" /* sqrt2 | sqrt2 */
"movl $128, %%ebx\n"
".align 16\n"
".loop2:\n"
"movq (%%ecx), %%mm0\n" /* c1 | c0 */
"pfmul %%mm7, %%mm0\n"
......@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n"
".align 16\n"
".loop1:\n"
"movq (%%ecx), %%mm0\n" /* l1 | l0 */
"movq (%%edx), %%mm1\n" /* r1 | r0 */
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $
* $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
......@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n"
".align 16\n"
".loop:\n"
"movl (%%eax), %%esi\n"
"movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
......@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
".align 16\n"
".loop1:\n"
"movq (%%eax), %%mm0\n" /* im0 | re0 */
"movq %%mm0, %%mm1\n" /* im0 | re0 */
......@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
......@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
......@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_samples:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
......@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delay:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
......@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_delay:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
......@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
......@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
......@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
......@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delays:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
......@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_delays:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
* $Id: ac3_srfft_3dn.c,v 1.2 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -126,6 +126,7 @@ void C_1_3dn (void)
static void fft_4_3dn (complex_t *x)
{
__asm__ __volatile__ (
".align 16\n"
"movq (%%eax), %%mm0\n" /* x[0] */
"movq 8(%%eax), %%mm1\n" /* x[1] */
"movq 16(%%eax), %%mm2\n" /* x[2] */
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
* ac3_decoder.h : ac3 decoder interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder.h,v 1.10 2001/06/12 00:30:41 reno Exp $
* $Id: ac3_decoder.h,v 1.11 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
......@@ -354,6 +354,9 @@ typedef struct mantissa_s
struct ac3dec_s
{
float samples[6][256] __attribute__ ((aligned(16)));
imdct_t imdct __attribute__ ((aligned(16)));
/*
* Input properties
*/
......@@ -370,12 +373,10 @@ struct ac3dec_s
bsi_t bsi;
audblk_t audblk;
float samples[6][256] __attribute__ ((aligned(16)));
dm_par_t dm_par;
bit_allocate_t bit_allocate;
mantissa_t mantissa;
imdct_t imdct;
downmix_t downmix;
};
......
......@@ -2,7 +2,7 @@
* ac3_decoder_thread.c: ac3 decoder thread
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.c,v 1.34 2001/05/31 01:37:08 sam Exp $
* $Id: ac3_decoder_thread.c,v 1.35 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Lespinasse <walken@zoy.org>
*
......@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
intf_DbgMsg( "ac3dec debug: creating ac3 decoder thread" );
/* Allocate the memory needed to store the thread's structure */
if((p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t)))==NULL)
p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t));
/* We need to be 16 bytes aligned */
p_ac3thread->ac3thread = (int)p_ac3thread & (-15);
p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
if(p_ac3thread == NULL)
{
intf_ErrMsg ( "ac3dec error: not enough memory "
"for ac3dec_CreateThread() to create the new thread");
......@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)
/* Destroy descriptor */
free( p_ac3thread->p_config );
p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
free( p_ac3thread );
intf_DbgMsg ("ac3dec debug: ac3 decoder thread %p destroyed", p_ac3thread);
......
......@@ -2,7 +2,7 @@
* ac3_decoder_thread.h : ac3 decoder thread interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
* $Id: ac3_decoder_thread.h,v 1.8 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
*
......@@ -24,8 +24,16 @@
/*****************************************************************************
* ac3dec_thread_t : ac3 decoder thread descriptor
*****************************************************************************/
typedef struct ac3dec_thread_s
{
/*
* Decoder properties
*/
float used_for_alignement1;
float used_for_alignement2;
ac3dec_t ac3_decoder __attribute__ ((aligned(16)));
/*
* Thread properties
*/
......@@ -38,16 +46,12 @@ typedef struct ac3dec_thread_s
int sync_ptr; /* sync ptr from ac3 magic header */
adec_config_t * p_config;
/*
* Decoder properties
*/
ac3dec_t ac3_decoder;
/*
* Output properties
*/
aout_fifo_t * p_aout_fifo; /* stores the decompressed audio frames */
int ac3thread; /* save the old pointer */
} ac3dec_thread_t;
/*****************************************************************************
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment