Commit dee3179d authored by Renaud Dartus's avatar Renaud Dartus

* Alignment in asm functions

* 16-byte alignment for data (needed for SSE)
* Optimization in SSE
parent 5b49dba8
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct.h : AC3 IMDCT types * ac3_imdct.h : AC3 IMDCT types
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.h,v 1.4 2001/06/12 00:30:41 reno Exp $ * $Id: ac3_imdct.h,v 1.5 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Michel Kaempf <maxx@via.ecp.fr> * Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org> * Renaud Dartus <reno@videolan.org>
...@@ -42,18 +42,19 @@ typedef struct imdct_s ...@@ -42,18 +42,19 @@ typedef struct imdct_s
float xsin1[N/4] __attribute__ ((aligned(16))); float xsin1[N/4] __attribute__ ((aligned(16)));
float xcos2[N/8] __attribute__ ((aligned(16))); float xcos2[N/8] __attribute__ ((aligned(16)));
float xsin2[N/8] __attribute__ ((aligned(16))); float xsin2[N/8] __attribute__ ((aligned(16)));
float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
/* Twiddle factor LUT */ /* Twiddle factor LUT */
complex_t *w[7] __attribute__ ((aligned(16)));
complex_t w_1[1] __attribute__ ((aligned(16))); complex_t w_1[1] __attribute__ ((aligned(16)));
float used_for_alignement1;
float used_for_alignement2;
complex_t w_2[2] __attribute__ ((aligned(16))); complex_t w_2[2] __attribute__ ((aligned(16)));
complex_t w_4[4] __attribute__ ((aligned(16))); complex_t w_4[4] __attribute__ ((aligned(16)));
complex_t w_8[8] __attribute__ ((aligned(16))); complex_t w_8[8] __attribute__ ((aligned(16)));
complex_t w_16[16] __attribute__ ((aligned(16))); complex_t w_16[16] __attribute__ ((aligned(16)));
complex_t w_32[32] __attribute__ ((aligned(16))); complex_t w_32[32] __attribute__ ((aligned(16)));
complex_t w_64[64] __attribute__ ((aligned(16))); complex_t w_64[64] __attribute__ ((aligned(16)));
complex_t *w[7] __attribute__ ((aligned(16)));
float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
/* Module used and shortcuts */ /* Module used and shortcuts */
struct module_s * p_module; struct module_s * p_module;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions * ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN * Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_3dn.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $ * $Id: ac3_downmix_3dn.c,v 1.4 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* *
...@@ -46,6 +46,7 @@ void sqrt2_3dn (void) ...@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par) void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */ "movl $128, %%ebx\n" /* loop counter */
...@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par) ...@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */ "movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop:\n" ".loop:\n"
"movq (%%eax), %%mm0\n" /* left */ "movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */ "movq 2048(%%eax), %%mm1\n" /* right */
...@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par) ...@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */ "movl $128, %%ebx\n" /* loop counter */
...@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */ "movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop3:\n" ".loop3:\n"
"movq (%%eax), %%mm0\n" /* left */ "movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */ "movq 1024(%%eax), %%mm1\n" /* right */
...@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */ "movl $128, %%ebx\n" /* loop counter */
...@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */ "movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop4:\n" ".loop4:\n"
"movq (%%eax), %%mm0\n" /* left */ "movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */ "movq 2048(%%eax), %%mm1\n" /* right */
...@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */ "movl $128, %%ebx\n" /* loop counter */
...@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */ "movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop5:\n" ".loop5:\n"
"movq (%%eax), %%mm0\n" /* left */ "movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */ "movq 1024(%%eax), %%mm1\n" /* right */
...@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */ "movl $128, %%ebx\n" /* loop counter */
...@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 4(%%ecx), %%mm6\n" /* clev */ "movd 4(%%ecx), %%mm6\n" /* clev */
"punpckldq %%mm6, %%mm6\n" /* clev | clev */ "punpckldq %%mm6, %%mm6\n" /* clev | clev */
".align 16\n"
".loop6:\n" ".loop6:\n"
"movq (%%eax), %%mm0\n" /*left */ "movq (%%eax), %%mm0\n" /*left */
"movq 2048(%%eax), %%mm1\n" /* right */ "movq 2048(%%eax), %%mm1\n" /* right */
...@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left) void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"pushl %%edx\n" "pushl %%edx\n"
...@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left) ...@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
"punpckldq %%mm7, %%mm7\n" /* sqrt2 | sqrt2 */ "punpckldq %%mm7, %%mm7\n" /* sqrt2 | sqrt2 */
"movl $128, %%ebx\n" "movl $128, %%ebx\n"
".align 16\n"
".loop2:\n" ".loop2:\n"
"movq (%%ecx), %%mm0\n" /* c1 | c0 */ "movq (%%ecx), %%mm0\n" /* c1 | c0 */
"pfmul %%mm7, %%mm0\n" "pfmul %%mm7, %%mm0\n"
...@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right ...@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $128, %%ebx\n" "movl $128, %%ebx\n"
".align 16\n"
".loop1:\n" ".loop1:\n"
"movq (%%ecx), %%mm0\n" /* l1 | l0 */ "movq (%%ecx), %%mm0\n" /* l1 | l0 */
"movq (%%edx), %%mm1\n" /* r1 | r0 */ "movq (%%edx), %%mm1\n" /* r1 | r0 */
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_downmix_sse.c: accelerated SSE ac3 downmix functions * ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN * Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_sse.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $ * $Id: ac3_downmix_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -41,48 +41,51 @@ ...@@ -41,48 +41,51 @@
void sqrt2_sse (void) __asm__ ("sqrt2_sse"); void sqrt2_sse (void) __asm__ ("sqrt2_sse");
void sqrt2_sse (void) void sqrt2_sse (void)
{ {
__asm__ (".float 0f0.7071068"); __asm__ (".align 16\n"
".float 0f0.7071068");
} }
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par) void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */ "movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */ "movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6\n" /* clev */ "movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */ "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 8(%%ecx), %%xmm7\n" /* slev */ "movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop:\n" ".loop:\n"
"movups (%%eax), %%xmm0\n" /* left */ "movaps (%%eax), %%xmm0\n" /* left */
"movups 2048(%%eax), %%xmm1\n" /* right */ "movaps 2048(%%eax), %%xmm1\n" /* right */
"movups 1024(%%eax), %%xmm2\n" /* center */ "movaps 1024(%%eax), %%xmm2\n" /* center */
"movups 3072(%%eax), %%xmm3\n" /* leftsur */ "movaps 3072(%%eax), %%xmm3\n" /* leftsur */
"movups 4096(%%eax), %%xmm4\n" /* rithgsur */ "movaps 4096(%%eax), %%xmm4\n" /* rithgsur */
"mulps %%xmm5, %%xmm0\n" "mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n" "mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n" "mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n" "addps %%xmm2, %%xmm0\n"
"addps %%xmm2, %%xmm1\n" "addps %%xmm2, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n" "mulps %%xmm7, %%xmm3\n"
"mulps %%xmm7, %%xmm4\n" "mulps %%xmm7, %%xmm4\n"
"addps %%xmm3, %%xmm0\n" "addps %%xmm3, %%xmm0\n"
"addps %%xmm4, %%xmm1\n" "addps %%xmm4, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm1, 1024(%%eax)\n" "movaps %%xmm1, 1024(%%eax)\n"
"addl $16, %%eax\n" "addl $16, %%eax\n"
"decl %%ebx\n" "decl %%ebx\n"
"jnz .loop\n" "jnz .loop\n"
"popl %%ebx\n" "popl %%ebx\n"
: "=a" (samples) : "=a" (samples)
: "a" (samples), "c" (dm_par)); : "a" (samples), "c" (dm_par));
} }
...@@ -90,35 +93,37 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par) ...@@ -90,35 +93,37 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */ "movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */ "movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 8(%%ecx), %%xmm7\n" /* slev */ "movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop3:\n" ".loop3:\n"
"movups (%%eax), %%xmm0\n" /* left */ "movaps (%%eax), %%xmm0\n" /* left */
"movups 1024(%%eax), %%xmm1\n" /* right */ "movaps 1024(%%eax), %%xmm1\n" /* right */
"movups 2048(%%eax), %%xmm3\n" /* leftsur */ "movaps 2048(%%eax), %%xmm3\n" /* leftsur */
"movups 3072(%%eax), %%xmm4\n" /* rightsur */ "movaps 3072(%%eax), %%xmm4\n" /* rightsur */
"mulps %%xmm5, %%xmm0\n" "mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n" "mulps %%xmm5, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n" "mulps %%xmm7, %%xmm3\n"
"mulps %%xmm7, %%xmm4\n" "mulps %%xmm7, %%xmm4\n"
"addps %%xmm3, %%xmm0\n" "addps %%xmm3, %%xmm0\n"
"addps %%xmm4, %%xmm1\n" "addps %%xmm4, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm1, 1024(%%eax)\n" "movaps %%xmm1, 1024(%%eax)\n"
"addl $16, %%eax\n" "addl $16, %%eax\n"
"decl %%ebx\n" "decl %%ebx\n"
"jnz .loop3\n" "jnz .loop3\n"
"popl %%ebx\n" "popl %%ebx\n"
: "=a" (samples) : "=a" (samples)
: "a" (samples), "c" (dm_par)); : "a" (samples), "c" (dm_par));
} }
...@@ -126,112 +131,114 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -126,112 +131,114 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"pushl %%ebx\n" "movss (%%ecx), %%xmm5\n" /* unit */
"movl $64, %%ebx\n" /* loop counter */ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6\n" /* clev */ "movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */ "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 8(%%ecx), %%xmm7\n" /* slev */ "movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop4:\n" ".loop4:\n"
"movups (%%eax), %%xmm0\n" /* left */ "movaps (%%eax), %%xmm0\n" /* left */
"movups 2048(%%eax), %%xmm1\n" /* right */ "movaps 2048(%%eax), %%xmm1\n" /* right */
"movups 1024(%%eax), %%xmm2\n" /* center */ "movaps 1024(%%eax), %%xmm2\n" /* center */
"movups 3072(%%eax), %%xmm3\n" /* sur */ "movaps 3072(%%eax), %%xmm3\n" /* sur */
"mulps %%xmm5, %%xmm0\n" "mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n" "mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n" "mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n" "addps %%xmm2, %%xmm0\n"
"mulps %%xmm7, %%xmm3\n" "mulps %%xmm7, %%xmm3\n"
"addps %%xmm2, %%xmm1\n" "addps %%xmm2, %%xmm1\n"
"subps %%xmm3, %%xmm0\n" "subps %%xmm3, %%xmm0\n"
"addps %%xmm3, %%xmm1\n" "addps %%xmm3, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm1, 1024(%%eax)\n" "movaps %%xmm1, 1024(%%eax)\n"
"addl $16, %%eax\n" "addl $16, %%eax\n"
"decl %%ebx\n" "decl %%ebx\n"
"jnz .loop4\n" "jnz .loop4\n"
"popl %%ebx\n" "popl %%ebx\n"
: "=a" (samples) : "=a" (samples)
: "a" (samples), "c" (dm_par)); : "a" (samples), "c" (dm_par));
} }
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx\n" ".align 16\n"
"movl $64, %%ebx\n" /* loop counter */ "pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */ "movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 8(%%ecx), %%xmm7\n" /* slev */ "movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop5:\n" ".loop5:\n"
"movups (%%eax), %%xmm0\n" /* left */ "movaps (%%eax), %%xmm0\n" /* left */
"movups 1024(%%eax), %%xmm1\n" /* right */ "movaps 1024(%%eax), %%xmm1\n" /* right */
"movups 2048(%%eax), %%xmm3\n" /* sur */ "movaps 2048(%%eax), %%xmm3\n" /* sur */
"mulps %%xmm5, %%xmm0\n" "mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n" "mulps %%xmm5, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n" "mulps %%xmm7, %%xmm3\n"
"subps %%xmm3, %%xmm0\n" "subps %%xmm3, %%xmm0\n"
"addps %%xmm3, %%xmm1\n" "addps %%xmm3, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm1, 1024(%%eax)\n" "movaps %%xmm1, 1024(%%eax)\n"
"addl $16, %%eax\n"
"decl %%ebx\n"
"jnz .loop5\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
"addl $16, %%eax\n"
"decl %%ebx\n"
"jnz .loop5\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
} }
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par) void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx\n" ".align 16\n"
"movl $64, %%ebx\n" /* loop counter */ "pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */ "movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6\n" /* clev */ "movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */ "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
".align 16\n"
".loop6:\n" ".loop6:\n"
"movups (%%eax), %%xmm0\n" /*left */ "movaps (%%eax), %%xmm0\n" /*left */
"movups 2048(%%eax), %%xmm1\n" /* right */ "movaps 2048(%%eax), %%xmm1\n" /* right */
"movups 1024(%%eax), %%xmm2\n" /* center */ "movaps 1024(%%eax), %%xmm2\n" /* center */
"mulps %%xmm5, %%xmm0\n" "mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n" "mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n" "mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n" "addps %%xmm2, %%xmm0\n"
"addps %%xmm2, %%xmm1\n" "addps %%xmm2, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm1, 1024(%%eax)\n" "movaps %%xmm1, 1024(%%eax)\n"
"addl $16, %%eax\n" "addl $16, %%eax\n"
"decl %%ebx\n" "decl %%ebx\n"
"jnz .loop6\n" "jnz .loop6\n"
"popl %%ebx\n" "popl %%ebx\n"
: "=a" (samples) : "=a" (samples)
: "a" (samples), "c" (dm_par)); : "a" (samples), "c" (dm_par));
} }
...@@ -239,24 +246,26 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par) ...@@ -239,24 +246,26 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left) void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"pushl %%edx\n" "pushl %%edx\n"
"movl $sqrt2_sse, %%edx\n" "movl $sqrt2_sse, %%edx\n"
"movss (%%edx), %%xmm7\n" "movss (%%edx), %%xmm7\n"
"shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */ "shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"movl $64, %%ebx\n" "movl $64, %%ebx\n"
".align 16\n"
".loop2:\n" ".loop2:\n"
"movups (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */ "movaps (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
"mulps %%xmm7, %%xmm0\n" "mulps %%xmm7, %%xmm0\n"
"movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */ "movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
"cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */ "cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
"cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */ "cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
"packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */ "packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
"packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */ "packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
"movq %%mm0, (%%eax)\n" "movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n" "movq %%mm1, 8(%%eax)\n"
...@@ -275,18 +284,19 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left) ...@@ -275,18 +284,19 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right) void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $64, %%ebx\n" "movl $64, %%ebx\n"
".align 16\n"
".loop1:\n" ".loop1:\n"
"movups (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */ "movaps (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
"movups (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */ "movaps (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
"movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */ "movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
"movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */ "movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
"unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */ "unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
"unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */ "unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
"cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */ "cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
"movhlps %%xmm0, %%xmm0\n" "movhlps %%xmm0, %%xmm0\n"
...@@ -295,8 +305,8 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right ...@@ -295,8 +305,8 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
"movhlps %%xmm2, %%xmm2\n" "movhlps %%xmm2, %%xmm2\n"
"cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */ "cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */
"packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */ "packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
"packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */ "packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
"movq %%mm0, (%%eax)\n" "movq %%mm0, (%%eax)\n"
"movq %%mm2, 8(%%eax)\n" "movq %%mm2, 8(%%eax)\n"
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $ * $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* *
...@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[]) ...@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse) static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */ "addl $-4, %%esp\n" /* local variable, loop counter */
...@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float ...@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */ "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n" "movl $128, -4(%%ebp)\n"
".align 16\n"
".loop:\n" ".loop:\n"
"movl (%%eax), %%esi\n" "movl (%%eax), %%esi\n"
"movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */ "movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
...@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float ...@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse) static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */ "movl $64, %%ebx\n" /* loop counter */
".align 16\n"
".loop1:\n" ".loop1:\n"
"movq (%%eax), %%mm0\n" /* im0 | re0 */ "movq (%%eax), %%mm0\n" /* im0 | re0 */
"movq %%mm0, %%mm1\n" /* im0 | re0 */ "movq %%mm0, %%mm1\n" /* im0 | re0 */
...@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse) ...@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
...@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w ...@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi\n" /* buf[63].re */ "leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */ "movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples:\n" ".first_128_samples:\n"
"movd (%%esi), %%mm0\n" /* im0 */ "movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */ "movd 8(%%esi), %%mm2\n" /* im1 */
...@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w ...@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1020(%%esi), %%edi\n" /* buf[127].im */ "leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */ "movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_samples:\n" ".second_128_samples:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */ "movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */ "movd 8(%%esi), %%mm2\n" /* re1 */
...@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w ...@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movl $32, %%ecx\n" /* loop count */ "movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */ "movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delay:\n" ".first_128_delay:\n"
"movd (%%esi), %%mm0\n" /* re0 */ "movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */ "movd 8(%%esi), %%mm2\n" /* re1 */
...@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w ...@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */ "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */ "movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_delay:\n" ".second_128_delay:\n"
"movd (%%esi), %%mm0\n" /* im0 */ "movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */ "movd 8(%%esi), %%mm2\n" /* im1 */
...@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w ...@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
...@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa ...@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi\n" /* buf[63].re */ "leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */ "movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples2:\n" ".first_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* im0 */ "movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */ "movd 8(%%esi), %%mm2\n" /* im1 */
...@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa ...@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1020(%%esi), %%edi\n" /* buf[127].im */ "leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */ "movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_samples2:\n" ".second_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */ "movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */ "movd 8(%%esi), %%mm2\n" /* re1 */
...@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa ...@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"movl $32, %%ecx\n" /* loop count */ "movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */ "movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delays:\n" ".first_128_delays:\n"
"movd (%%esi), %%mm0\n" /* re0 */ "movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */ "movd 8(%%esi), %%mm2\n" /* re1 */
...@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa ...@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */ "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */ "movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_delays:\n" ".second_128_delays:\n"
"movd (%%esi), %%mm0\n" /* im0 */ "movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */ "movd 8(%%esi), %%mm2\n" /* im1 */
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct_sse.c: accelerated SSE ac3 DCT * ac3_imdct_sse.c: accelerated SSE ac3 DCT
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.3 2001/05/28 02:38:48 sam Exp $ * $Id: ac3_imdct_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -91,6 +91,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[]) ...@@ -91,6 +91,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse) static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */ "addl $-4, %%esp\n" /* local variable, loop counter */
...@@ -103,11 +104,12 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float ...@@ -103,11 +104,12 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"pushl %%esi\n" "pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */ "movl 8(%%ebp), %%eax\n" /* pmt */
"movl 12(%%ebp), %%ebx\n" /* buf */ "movl 12(%%ebp), %%ebx\n" /* buf */
"movl 16(%%ebp), %%ecx\n" /* data */ "movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */ "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $64, -4(%%ebp)\n" "movl $64, -4(%%ebp)\n"
".align 16\n"
".loop:\n" ".loop:\n"
"movl (%%eax), %%esi\n" "movl (%%eax), %%esi\n"
"movl 4(%%eax), %%edi\n" "movl 4(%%eax), %%edi\n"
...@@ -117,18 +119,18 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float ...@@ -117,18 +119,18 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"shll $1, %%esi\n" "shll $1, %%esi\n"
"shll $1, %%edi\n" "shll $1, %%edi\n"
"movups (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */ "movaps (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */
"movups (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */ "movaps (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
"negl %%esi\n" "negl %%esi\n"
"negl %%edi\n" "negl %%edi\n"
"movss 1020(%%ecx, %%esi, 4), %%xmm4\n" /* 255-2j */ "movss 1020(%%ecx, %%esi, 4), %%xmm4\n" /* 255-2j */
"addl $8, %%eax\n" "addl $8, %%eax\n"
"movss 1020(%%ecx, %%edi, 4), %%xmm5\n" /* 255-2(j+1) */ "movss 1020(%%ecx, %%edi, 4), %%xmm5\n" /* 255-2(j+1) */
"shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */ "shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */
"shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */ "shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"mulps %%xmm5, %%xmm2\n" "mulps %%xmm5, %%xmm2\n"
"movhlps %%xmm0, %%xmm1\n" "movhlps %%xmm0, %%xmm1\n"
...@@ -138,9 +140,9 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float ...@@ -138,9 +140,9 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"addps %%xmm3, %%xmm2\n" "addps %%xmm3, %%xmm2\n"
"movlhps %%xmm2, %%xmm0\n" "movlhps %%xmm2, %%xmm0\n"
"movups %%xmm0, -16(%%ebx)\n" "movaps %%xmm0, -16(%%ebx)\n"
"decl -4(%%ebp)\n" "decl -4(%%ebp)\n"
"jnz .loop\n" "jnz .loop\n"
"popl %%esi\n" "popl %%esi\n"
"popl %%edi\n" "popl %%edi\n"
...@@ -157,36 +159,38 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float ...@@ -157,36 +159,38 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse) static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movl $32, %%ebx\n" /* loop counter */ "movl $32, %%ebx\n" /* loop counter */
".align 16\n"
".loop1:\n" ".loop1:\n"
"movups (%%eax), %%xmm0\n" /* im1 | re1 | im0 | re0 */ "movaps (%%eax), %%xmm0\n" /* im1 | re1 | im0 | re0 */
"movups (%%ecx), %%xmm2\n" /* -c | -s | -s | c */ "movaps (%%ecx), %%xmm2\n" /* -c | -s | -s | c */
"movhlps %%xmm0, %%xmm1\n" /* im1 | re1 */ "movhlps %%xmm0, %%xmm1\n" /* im1 | re1 */
"movups 16(%%ecx), %%xmm3\n" /* -c1 | -s1 | -s1 | c1 */ "movaps 16(%%ecx), %%xmm3\n" /* -c1 | -s1 | -s1 | c1 */
"shufps $0x50, %%xmm0, %%xmm0\n" /* im0 | im0 | re0 | re0 */ "shufps $0x50, %%xmm0, %%xmm0\n" /* im0 | im0 | re0 | re0 */
"shufps $0x50, %%xmm1, %%xmm1\n" /* im1 | im1 | re1 | re1 */ "shufps $0x50, %%xmm1, %%xmm1\n" /* im1 | im1 | re1 | re1 */
"movups 16(%%eax), %%xmm4\n" /* im3 | re3 | im2 | re2 */ "movaps 16(%%eax), %%xmm4\n" /* im3 | re3 | im2 | re2 */
"shufps $0x27, %%xmm2, %%xmm2\n" /* c | -s | -s | -c */ "shufps $0x27, %%xmm2, %%xmm2\n" /* c | -s | -s | -c */
"movhlps %%xmm4, %%xmm5\n" /* im3 | re3 */ "movhlps %%xmm4, %%xmm5\n" /* im3 | re3 */
"shufps $0x27, %%xmm3, %%xmm3\n" /* c1 | -s1 | -s1 | -c1 */ "shufps $0x27, %%xmm3, %%xmm3\n" /* c1 | -s1 | -s1 | -c1 */
"movups 32(%%ecx), %%xmm6\n" /* -c2 | -s2 | -s2 | c2 */ "movaps 32(%%ecx), %%xmm6\n" /* -c2 | -s2 | -s2 | c2 */
"movups 48(%%ecx), %%xmm7\n" /* -c3 | -s3 | -s3 | c3 */ "movaps 48(%%ecx), %%xmm7\n" /* -c3 | -s3 | -s3 | c3 */
"shufps $0x50, %%xmm4, %%xmm4\n" /* im2 | im2 | re2 | re2 */ "shufps $0x50, %%xmm4, %%xmm4\n" /* im2 | im2 | re2 | re2 */
"shufps $0x50, %%xmm5, %%xmm5\n" /* im3 | im3 | re3 | re3 */ "shufps $0x50, %%xmm5, %%xmm5\n" /* im3 | im3 | re3 | re3 */
"mulps %%xmm2, %%xmm0\n" "mulps %%xmm2, %%xmm0\n"
"mulps %%xmm3, %%xmm1\n" "mulps %%xmm3, %%xmm1\n"
"shufps $0x27, %%xmm6, %%xmm6\n" /* c2 | -s2 | -s2 | -c2 */ "shufps $0x27, %%xmm6, %%xmm6\n" /* c2 | -s2 | -s2 | -c2 */
"shufps $0x27, %%xmm7, %%xmm7\n" /* c3 | -s3 | -s3 | -c3 */ "shufps $0x27, %%xmm7, %%xmm7\n" /* c3 | -s3 | -s3 | -c3 */
"movhlps %%xmm0, %%xmm2\n" "movhlps %%xmm0, %%xmm2\n"
"movhlps %%xmm1, %%xmm3\n" "movhlps %%xmm1, %%xmm3\n"
...@@ -206,8 +210,8 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse) ...@@ -206,8 +210,8 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
"movlhps %%xmm1, %%xmm0\n" "movlhps %%xmm1, %%xmm0\n"
"movlhps %%xmm5, %%xmm4\n" "movlhps %%xmm5, %%xmm4\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm4, 16(%%eax)\n" "movaps %%xmm4, 16(%%eax)\n"
"addl $64, %%ecx\n" "addl $64, %%ecx\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"decl %%ebx\n" "decl %%ebx\n"
...@@ -221,6 +225,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse) ...@@ -221,6 +225,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
...@@ -240,6 +245,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -240,6 +245,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi\n" /* buf[63].re */ "leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */ "movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples:\n" ".first_128_samples:\n"
"movss (%%esi), %%xmm0\n" "movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n" "movss 8(%%esi), %%xmm2\n"
...@@ -250,7 +256,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -250,7 +256,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */ "movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */ "movss 16(%%esi), %%xmm6\n" /* im2 */
...@@ -261,27 +267,28 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -261,27 +267,28 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"addps %%xmm5, %%xmm0\n" "addps %%xmm5, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */ "movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx\n" "addl $32, %%edx\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"addl $32, %%ebx\n" "addl $32, %%ebx\n"
"mulps %%xmm4, %%xmm6\n" "mulps %%xmm4, %%xmm6\n"
"addl $32, %%esi\n" "addl $32, %%esi\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"addps %%xmm5, %%xmm6\n" "addps %%xmm5, %%xmm6\n"
"addl $-32, %%edi\n" "addl $-32, %%edi\n"
"movups %%xmm6, -16(%%eax)\n" "movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n" "decl %%ecx\n"
"jnz .first_128_samples\n" "jnz .first_128_samples\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */ "movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */ "leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */ "movl $16, %%ecx\n" /* loop count */
".align 16\n"
".second_128_samples:\n" ".second_128_samples:\n"
"movss (%%esi), %%xmm0\n" /* buf[i].re */ "movss (%%esi), %%xmm0\n" /* buf[i].re */
"movss 8(%%esi), %%xmm2\n" /* re1 */ "movss 8(%%esi), %%xmm2\n" /* re1 */
...@@ -292,7 +299,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -292,7 +299,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */ "movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */ "movss 16(%%esi), %%xmm6\n" /* re2 */
...@@ -305,19 +312,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -305,19 +312,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"addl $32, %%esi\n" "addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addps %%xmm5, %%xmm0\n" "addps %%xmm5, %%xmm0\n"
"mulps %%xmm4, %%xmm6\n" "mulps %%xmm4, %%xmm6\n"
"addl $-32, %%edi\n" "addl $-32, %%edi\n"
"movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */ "movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"addps %%xmm5, %%xmm6\n" "addps %%xmm5, %%xmm6\n"
"addl $32, %%edx\n" "addl $32, %%edx\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"addl $32, %%ebx\n" "addl $32, %%ebx\n"
"movups %%xmm6, -16(%%eax)\n" "movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n" "decl %%ecx\n"
"jnz .second_128_samples\n" "jnz .second_128_samples\n"
"movl 8(%%ebp), %%eax\n" "movl 8(%%ebp), %%eax\n"
...@@ -326,6 +333,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -326,6 +333,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movl $16, %%ecx\n" /* loop count */ "movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */ "movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delay:\n" ".first_128_delay:\n"
"movss (%%esi), %%xmm0\n" "movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n" "movss 8(%%esi), %%xmm2\n"
...@@ -341,21 +349,21 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -341,21 +349,21 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movss 24(%%esi), %%xmm7\n" /* re3 */ "movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */ "movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */ "movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"addl $-32, %%edx\n" "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"addl $32, %%esi\n" "addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi\n" "addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm6\n" "mulps %%xmm5, %%xmm6\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"movups %%xmm6, -16(%%eax)\n" "movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n" "decl %%ecx\n"
"jnz .first_128_delay\n" "jnz .first_128_delay\n"
"movl 8(%%ebp), %%ebx\n" "movl 8(%%ebp), %%ebx\n"
...@@ -363,6 +371,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -363,6 +371,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */ "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */ "movl $16, %%ecx\n" /* loop count */
".align 16\n"
".second_128_delay:\n" ".second_128_delay:\n"
"movss (%%esi), %%xmm0\n" "movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n" "movss 8(%%esi), %%xmm2\n"
...@@ -378,21 +387,21 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -378,21 +387,21 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movss 24(%%esi), %%xmm7\n" /* im3 */ "movss 24(%%esi), %%xmm7\n" /* im3 */
"movss -16(%%edi), %%xmm2\n" /* re2 */ "movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */ "movss -24(%%edi), %%xmm3\n" /* re3 */
"subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */ "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
"addl $-32, %%edx\n" "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n" "mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups %%xmm1, (%%eax)\n" "movaps %%xmm1, (%%eax)\n"
"addl $32, %%esi\n" "addl $32, %%esi\n"
"subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */ "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
"addl $-32, %%edi\n" "addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm2\n" "mulps %%xmm5, %%xmm2\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"movups %%xmm2, -16(%%eax)\n" "movaps %%xmm2, -16(%%eax)\n"
"decl %%ecx\n" "decl %%ecx\n"
"jnz .second_128_delay\n" "jnz .second_128_delay\n"
"popl %%edi\n" "popl %%edi\n"
...@@ -409,6 +418,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w ...@@ -409,6 +418,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
...@@ -428,6 +438,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -428,6 +438,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi\n" /* buf[63].re */ "leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */ "movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_sample:\n" ".first_128_sample:\n"
"movss (%%esi), %%xmm0\n" "movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n" "movss 8(%%esi), %%xmm2\n"
...@@ -438,7 +449,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -438,7 +449,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */ "movss 16(%%esi), %%xmm6\n" /* im2 */
...@@ -446,30 +456,27 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -446,30 +456,27 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */ "subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */
"movss -16(%%edi), %%xmm2\n" /* re2 */ "movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */ "movss -24(%%edi), %%xmm3\n" /* re3 */
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
/* addps %%xmm5, %%xmm0 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */ "addl $32, %%edx\n"
"addl $32, %%edx\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm0, (%%eax)\n" "mulps %%xmm4, %%xmm6\n"
/* addl $32, %%ebx */ "addl $32, %%esi\n"
"mulps %%xmm4, %%xmm6\n" "addl $32, %%eax\n"
"addl $32, %%esi\n" "addl $-32, %%edi\n"
"addl $32, %%eax\n" "movaps %%xmm6, -16(%%eax)\n"
/* addps %%xmm5, %%xmm6 */ "decl %%ecx\n"
"addl $-32, %%edi\n"
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .first_128_sample\n" "jnz .first_128_sample\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */ "movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */ "leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */ "movl $16, %%ecx\n" /* loop count */
".align 16\n"
".second_128_sample:\n" ".second_128_sample:\n"
"movss (%%esi), %%xmm0\n" /* buf[i].re */ "movss (%%esi), %%xmm0\n" /* buf[i].re */
"movss 8(%%esi), %%xmm2\n" /* re1 */ "movss 8(%%esi), %%xmm2\n" /* re1 */
...@@ -480,32 +487,27 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -480,32 +487,27 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */ "movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */ "movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */ "movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */ "movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"addl $32, %%esi\n" "addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
/* addps %%xmm5, %%xmm0 */ "mulps %%xmm4, %%xmm6\n"
"mulps %%xmm4, %%xmm6\n" "addl $-32, %%edi\n"
"addl $-32, %%edi\n" "movaps %%xmm0, (%%eax)\n"
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */ "addl $32, %%edx\n"
"movups %%xmm0, (%%eax)\n" "addl $32, %%eax\n"
/* addps %%xmm5, %%xmm6 */ "movaps %%xmm6, -16(%%eax)\n"
"addl $32, %%edx\n" "decl %%ecx\n"
"addl $32, %%eax\n"
/* addl $32, %%ebx */
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .second_128_sample\n" "jnz .second_128_sample\n"
"movl 8(%%ebp), %%eax\n" "movl 8(%%ebp), %%eax\n"
...@@ -514,6 +516,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -514,6 +516,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movl $16, %%ecx\n" /* loop count */ "movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */ "movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delays:\n" ".first_128_delays:\n"
"movss (%%esi), %%xmm0\n" "movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n" "movss 8(%%esi), %%xmm2\n"
...@@ -530,20 +533,20 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -530,20 +533,20 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movss -16(%%edi), %%xmm2\n" /* im2 */ "movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */ "movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"addl $-32, %%edx\n" "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n" "mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"addl $32, %%esi\n" "addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi\n" "addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm6\n" "mulps %%xmm5, %%xmm6\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"movups %%xmm6, -16(%%eax)\n" "movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n" "decl %%ecx\n"
"jnz .first_128_delays\n" "jnz .first_128_delays\n"
"movl 8(%%ebp), %%ebx\n" "movl 8(%%ebp), %%ebx\n"
...@@ -551,6 +554,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -551,6 +554,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */ "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */ "movl $16, %%ecx\n" /* loop count */
".align 16\n"
".second_128_delays:\n" ".second_128_delays:\n"
"movss (%%esi), %%xmm0\n" "movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n" "movss 8(%%esi), %%xmm2\n"
...@@ -566,21 +570,21 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa ...@@ -566,21 +570,21 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movss 24(%%esi), %%xmm7\n" /* im3 */ "movss 24(%%esi), %%xmm7\n" /* im3 */
"movss -16(%%edi), %%xmm2\n" /* re2 */ "movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */ "movss -24(%%edi), %%xmm3\n" /* re3 */
"subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */ "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
"addl $-32, %%edx\n" "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n" "mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups %%xmm1, (%%eax)\n" "movaps %%xmm1, (%%eax)\n"
"addl $32, %%esi\n" "addl $32, %%esi\n"
"subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */ "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
"addl $-32, %%edi\n" "addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm2\n" "mulps %%xmm5, %%xmm2\n"
"addl $32, %%eax\n" "addl $32, %%eax\n"
"movups %%xmm2, -16(%%eax)\n" "movaps %%xmm2, -16(%%eax)\n"
"decl %%ecx\n" "decl %%ecx\n"
"jnz .second_128_delays\n" "jnz .second_128_delays\n"
"popl %%edi\n" "popl %%edi\n"
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions * ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN * Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $ * $Id: ac3_srfft_3dn.c,v 1.2 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* *
...@@ -126,6 +126,7 @@ void C_1_3dn (void) ...@@ -126,6 +126,7 @@ void C_1_3dn (void)
static void fft_4_3dn (complex_t *x) static void fft_4_3dn (complex_t *x)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"movq (%%eax), %%mm0\n" /* x[0] */ "movq (%%eax), %%mm0\n" /* x[0] */
"movq 8(%%eax), %%mm1\n" /* x[1] */ "movq 8(%%eax), %%mm1\n" /* x[1] */
"movq 16(%%eax), %%mm2\n" /* x[2] */ "movq 16(%%eax), %%mm2\n" /* x[2] */
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions * ac3_srfft_sse.c: accelerated SSE ac3 fft functions
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN * Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_sse.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $ * $Id: ac3_srfft_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -106,44 +106,45 @@ void _M( fft_128p ) ( complex_t *a ) ...@@ -106,44 +106,45 @@ void _M( fft_128p ) ( complex_t *a )
/*
 * hsqrt2_sse: constant data emitted through an asm statement.
 *
 * The body contains no instructions, only four `.float` assembler
 * directives: 0.707106781188 twice, then -0.707106781188 twice (16 bytes
 * of single-precision data; 0.7071... is sqrt(2)/2).
 *
 * Each line below shows the pre-change text followed by the post-change
 * text; the only difference is `__asm__` becoming `__asm__ __volatile__`.
 *
 * NOTE(review): presumably this symbol is referenced by address as a
 * 4-float constant from the SSE FFT code rather than being called - confirm
 * against the callers.
 * NOTE(review): no `.align` directive is given here, and the data sits
 * wherever the compiler places this asm statement inside the function body.
 * If any consumer loads it with an alignment-checking instruction such as
 * movaps, the 16-byte alignment has to be verified.
 */
void hsqrt2_sse (void) void hsqrt2_sse (void)
{ {
__asm__ ( __asm__ __volatile__ (
".float 0f0.707106781188\n" ".float 0f0.707106781188\n"
".float 0f0.707106781188\n" ".float 0f0.707106781188\n"
".float 0f-0.707106781188\n" ".float 0f-0.707106781188\n"
".float 0f-0.707106781188\n" ".float 0f-0.707106781188\n"
); );
} }
/*
 * C_1_sse: constant data emitted through an asm statement.
 *
 * The body contains no instructions, only four `.float` assembler
 * directives, in order: -1.0, 1.0, -1.0, 1.0 (16 bytes of single-precision
 * data).
 *
 * Each line below shows the pre-change text followed by the post-change
 * text; the only difference is `__asm__` becoming `__asm__ __volatile__`.
 *
 * NOTE(review): presumably used as an alternating sign pattern that SSE
 * code multiplies against packed values, referenced by address and never
 * called - confirm against the callers.
 * NOTE(review): no `.align` directive is given here; verify 16-byte
 * alignment if any consumer loads this with movaps.
 */
void C_1_sse (void) void C_1_sse (void)
{ {
__asm__ ( __asm__ __volatile__ (
".float 0f-1.0\n" ".float 0f-1.0\n"
".float 0f1.0\n" ".float 0f1.0\n"
".float 0f-1.0\n" ".float 0f-1.0\n"
".float 0f1.0\n" ".float 0f1.0\n"
); );
} }
/*
 * fft_4_sse: in-place 4-point butterfly on x[0..3] using SSE.
 *
 * x is bound to %eax by the "a" constraint. The asm reads 16 bytes at
 * (%eax) and 16 bytes at 16(%eax), combines them, and stores 16 bytes back
 * to each of the same two addresses. Per the lane comments, each 16-byte
 * register holds two complex values, written high | low.
 *
 * Each line below shows the pre-change text followed by the post-change
 * text. The post-change column adds a leading `.align 16` and replaces
 * every movups with movaps, which shifts the two columns by one line
 * in the first part of the body.
 *
 * NOTE(review): movaps raises a fault when its memory operand is not
 * 16-byte aligned, so the post-change version requires x to be 16-byte
 * aligned - confirm that every caller guarantees this.
 * NOTE(review): the asm modifies xmm0, xmm1, xmm2, xmm3, xmm4 and xmm6 and
 * writes through x, but declares no clobbers (no "memory", no xmm
 * registers) - confirm this is safe with the compiler options in use.
 */
static void fft_4_sse (complex_t *x) static void fft_4_sse (complex_t *x)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
"movups (%%eax), %%xmm0\n" /* x[1] | x[0] */ ".align 16\n"
"movups 16(%%eax), %%xmm2\n" /* x[3] | x[2] */ "movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */
"movups %%xmm0, %%xmm1\n" /* x[1] | x[0] */ "movaps 16(%%eax), %%xmm2\n" /* x[3] | x[2] */
"addps %%xmm2, %%xmm0\n" /* x[1] + x[3] | x[0] + x[2] */ "movaps %%xmm0, %%xmm1\n" /* x[1] | x[0] */
"subps %%xmm2, %%xmm1\n" /* x[1] - x[3] | x[0] - x[2] */ "addps %%xmm2, %%xmm0\n" /* x[1] + x[3] | x[0] + x[2] */
"subps %%xmm2, %%xmm1\n" /* x[1] - x[3] | x[0] - x[2] */
/* xmm6 is zeroed so the subss below yields the negated real part. */
"xorps %%xmm6, %%xmm6\n" "xorps %%xmm6, %%xmm6\n"
/* movhlps only writes the low half of its destination; the "?" marks the untouched high half. */
"movhlps %%xmm1, %%xmm4\n" /* ? | x[1] - x[3] */ "movhlps %%xmm1, %%xmm4\n" /* ? | x[1] - x[3] */
"movhlps %%xmm0, %%xmm3\n" /* ? | x[1] + x[3] */ "movhlps %%xmm0, %%xmm3\n" /* ? | x[1] + x[3] */
"subss %%xmm4, %%xmm6\n" /* 0 | -(x[1] - x[3]).re */ "subss %%xmm4, %%xmm6\n" /* 0 | -(x[1] - x[3]).re */
"movlhps %%xmm1, %%xmm0\n" /* x[0] - x[2] | x[0] + x[2] */ "movlhps %%xmm1, %%xmm0\n" /* x[0] - x[2] | x[0] + x[2] */
/* NOTE(review): the lowest lane of xmm4 still holds (x[1] - x[3]).re from the movhlps above, not (x[3]-x[1]).re as the comment on the next line says; the following shufps does not select that lane. */
"movlhps %%xmm6, %%xmm4\n" /* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */ "movlhps %%xmm6, %%xmm4\n" /* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
"movups %%xmm0, %%xmm2\n" /* x[0] - x[2] | x[0] + x[2] */ "movaps %%xmm0, %%xmm2\n" /* x[0] - x[2] | x[0] + x[2] */
/* Immediate 0x94 keeps xmm3 lanes 0-1 and fills lanes 2-3 from xmm4 lanes 1 and 2. */
"shufps $0x94, %%xmm4, %%xmm3\n" /* i*(x[1] - x[3]) | x[1] + x[3] */ "shufps $0x94, %%xmm4, %%xmm3\n" /* i*(x[1] - x[3]) | x[1] + x[3] */
/* Sum goes to x[0..1], difference to x[2..3]. */
"addps %%xmm3, %%xmm0\n" "addps %%xmm3, %%xmm0\n"
"subps %%xmm3, %%xmm2\n" "subps %%xmm3, %%xmm2\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm2, 16(%%eax)\n" "movaps %%xmm2, 16(%%eax)\n"
: "=a" (x) : "=a" (x)
: "a" (x) ); : "a" (x) );
} }
...@@ -151,62 +152,63 @@ static void fft_4_sse (complex_t *x) ...@@ -151,62 +152,63 @@ static void fft_4_sse (complex_t *x)
static void fft_8_sse (complex_t *x) static void fft_8_sse (complex_t *x)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"movlps (%%eax), %%xmm0\n" /* x[0] */ "movlps (%%eax), %%xmm0\n" /* x[0] */
"movlps 32(%%eax), %%xmm1\n" /* x[4] */ "movlps 32(%%eax), %%xmm1\n" /* x[4] */
"movhps 16(%%eax), %%xmm0\n" /* x[2] | x[0] */ "movhps 16(%%eax), %%xmm0\n" /* x[2] | x[0] */
"movhps 48(%%eax), %%xmm1\n" /* x[6] | x[4] */ "movhps 48(%%eax), %%xmm1\n" /* x[6] | x[4] */
"movups %%xmm0, %%xmm2\n" /* x[2] | x[0] */ "movaps %%xmm0, %%xmm2\n" /* x[2] | x[0] */
"xorps %%xmm3, %%xmm3\n" "xorps %%xmm3, %%xmm3\n"
"addps %%xmm1, %%xmm0\n" /* x[2] + x[6] | x[0] + x[4] */ "addps %%xmm1, %%xmm0\n" /* x[2] + x[6] | x[0] + x[4] */
"subps %%xmm1, %%xmm2\n" /* x[2] - x[6] | x[0] - x[4] */ "subps %%xmm1, %%xmm2\n" /* x[2] - x[6] | x[0] - x[4] */
"movhlps %%xmm0, %%xmm5\n" /* x[2] + x[6] */ "movhlps %%xmm0, %%xmm5\n" /* x[2] + x[6] */
"movhlps %%xmm2, %%xmm4\n" /* x[2] - x[6] */ "movhlps %%xmm2, %%xmm4\n" /* x[2] - x[6] */
"movlhps %%xmm2, %%xmm0\n" /* x[0] - x[4] | x[0] + x[4] */ "movlhps %%xmm2, %%xmm0\n" /* x[0] - x[4] | x[0] + x[4] */
"subss %%xmm4, %%xmm3\n" /* (x[2]-x[6]).im | -(x[2]-x[6]).re */ "subss %%xmm4, %%xmm3\n" /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"movups %%xmm0, %%xmm7\n" /* x[0] - x[4] | x[0] + x[4] */ "movaps %%xmm0, %%xmm7\n" /* x[0] - x[4] | x[0] + x[4] */
"movups %%xmm3, %%xmm4\n" /* (x[2]-x[6]).im | -(x[2]-x[6]).re */ "movaps %%xmm3, %%xmm4\n" /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"movlps 8(%%eax), %%xmm1\n" /* x[1] */ "movlps 8(%%eax), %%xmm1\n" /* x[1] */
"shufps $0x14, %%xmm4, %%xmm5\n" /* i*(x[2] - x[6]) | x[2] + x[6] */ "shufps $0x14, %%xmm4, %%xmm5\n" /* i*(x[2] - x[6]) | x[2] + x[6] */
"addps %%xmm5, %%xmm0\n" /* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */ "addps %%xmm5, %%xmm0\n" /* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
"subps %%xmm5, %%xmm7\n" /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */ "subps %%xmm5, %%xmm7\n" /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
"movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */ "movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */
"movl $hsqrt2_sse, %%ebx\n" "movl $hsqrt2_sse, %%ebx\n"
"movlps 40(%%eax), %%xmm2\n" /* x[5] */ "movlps 40(%%eax), %%xmm2\n" /* x[5] */
"movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */ "movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */
"movups %%xmm1, %%xmm3\n" /* x[3] | x[1] */ "movaps %%xmm1, %%xmm3\n" /* x[3] | x[1] */
"addps %%xmm2, %%xmm1\n" /* x[3] + x[7] | x[1] + x[5] */ "addps %%xmm2, %%xmm1\n" /* x[3] + x[7] | x[1] + x[5] */
"subps %%xmm2, %%xmm3\n" /* x[3] - x[7] | x[1] - x[5] */ "subps %%xmm2, %%xmm3\n" /* x[3] - x[7] | x[1] - x[5] */
"movups (%%ebx), %%xmm4\n" /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */ "movups (%%ebx), %%xmm4\n" /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
"movups %%xmm3, %%xmm6\n" /* x[3] - x[7] | x[1] - x[5] */ "movaps %%xmm3, %%xmm6\n" /* x[3] - x[7] | x[1] - x[5] */
"mulps %%xmm4, %%xmm3\n" /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */ "mulps %%xmm4, %%xmm3\n" /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
"shufps $0xc8, %%xmm4, %%xmm4\n" /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */ "shufps $0xc8, %%xmm4, %%xmm4\n" /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
"shufps $0xb1, %%xmm6, %%xmm6\n" /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */ "shufps $0xb1, %%xmm6, %%xmm6\n" /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
"mulps %%xmm4, %%xmm6\n" /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */ "mulps %%xmm4, %%xmm6\n" /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
"addps %%xmm3, %%xmm6\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */ "addps %%xmm3, %%xmm6\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
"movhlps %%xmm1, %%xmm5\n" /* x[3] + x[7] */ "movhlps %%xmm1, %%xmm5\n" /* x[3] + x[7] */
"movlhps %%xmm6, %%xmm1\n" /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */ "movlhps %%xmm6, %%xmm1\n" /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */ "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
"movups %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */ "movaps %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"movl $C_1_sse, %%ebx\n" "movl $C_1_sse, %%ebx\n"
"addps %%xmm5, %%xmm1\n" /* u */ "addps %%xmm5, %%xmm1\n" /* u */
"subps %%xmm5, %%xmm3\n" /* v */ "subps %%xmm5, %%xmm3\n" /* v */
"movups %%xmm0, %%xmm2\n" /* yb */ "movaps %%xmm0, %%xmm2\n" /* yb */
"movups %%xmm7, %%xmm4\n" /* yt */ "movaps %%xmm7, %%xmm4\n" /* yt */
"movups (%%ebx), %%xmm5\n" "movups (%%ebx), %%xmm5\n"
"mulps %%xmm5, %%xmm3\n" "mulps %%xmm5, %%xmm3\n"
"addps %%xmm1, %%xmm0\n" /* yt + u */ "addps %%xmm1, %%xmm0\n" /* yt + u */
"subps %%xmm1, %%xmm2\n" /* yt - u */ "subps %%xmm1, %%xmm2\n" /* yt - u */
"shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */ "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm2, 32(%%eax)\n" "movaps %%xmm2, 32(%%eax)\n"
"addps %%xmm3, %%xmm4\n" /* yb - i*v */ "addps %%xmm3, %%xmm4\n" /* yb - i*v */
"subps %%xmm3, %%xmm7\n" /* yb + i*v */ "subps %%xmm3, %%xmm7\n" /* yb + i*v */
"movups %%xmm4, 16(%%eax)\n" "movaps %%xmm4, 16(%%eax)\n"
"movups %%xmm7, 48(%%eax)\n" "movaps %%xmm7, 48(%%eax)\n"
"popl %%ebx\n" "popl %%ebx\n"
: "=a" (x) : "=a" (x)
...@@ -218,6 +220,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -218,6 +220,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3) const complex_t *d, const complex_t *d_3)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n" "pushl %%ebp\n"
"movl %%esp, %%ebp\n" "movl %%esp, %%ebp\n"
...@@ -225,10 +228,11 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -225,10 +228,11 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"pushl %%eax\n" "pushl %%eax\n"
"pushl %%ebx\n" "pushl %%ebx\n"
"pushl %%ecx\n" "pushl %%ecx\n" //
"pushl %%edx\n" "pushl %%edx\n"
"pushl %%esi\n" "pushl %%esi\n"
"pushl %%edi\n" // "movl %%edi, %%ecx\n" /* k */
"pushl %%edi\n" //
"movl 8(%%ebp), %%ecx\n" /* k */ "movl 8(%%ebp), %%ecx\n" /* k */
"movl 12(%%ebp), %%eax\n" /* x */ "movl 12(%%ebp), %%eax\n" /* x */
...@@ -236,19 +240,20 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -236,19 +240,20 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movl 16(%%ebp), %%ebx\n" /* wT */ "movl 16(%%ebp), %%ebx\n" /* wT */
"movl 20(%%ebp), %%edx\n" /* d */ "movl 20(%%ebp), %%edx\n" /* d */
"movl 24(%%ebp), %%esi\n" /* d3 */ "movl 24(%%ebp), %%esi\n" /* d3 */
"shll $4, %%ecx\n" /* 16k */ "shll $4, %%ecx\n" /* 16k */ ///
"addl $8, %%edx\n" "addl $8, %%edx\n"
"leal (%%eax, %%ecx, 2), %%edi\n" "leal (%%eax, %%ecx, 2), %%edi\n"
"addl $8, %%esi\n" "addl $8, %%esi\n"
/* TRANSZERO and TRANS */ /* TRANSZERO and TRANS */
"movups (%%eax), %%xmm0\n" /* x[1] | x[0] */ ".align 16\n"
"movups (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */ "movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */
"movups (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */ "movaps (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */
"movlps (%%edx), %%xmm3\n" /* d */ "movaps (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
"movlps (%%esi), %%xmm4\n" /* d3 */ "movlps (%%edx), %%xmm3\n" /* d */
"movhlps %%xmm1, %%xmm5\n" /* wT[1] */ "movlps (%%esi), %%xmm4\n" /* d3 */
"movhlps %%xmm2, %%xmm6\n" /* wB[1] */ "movhlps %%xmm1, %%xmm5\n" /* wT[1] */
"movhlps %%xmm2, %%xmm6\n" /* wB[1] */
"shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */ "shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
"shufps $0x50, %%xmm4, %%xmm4\n" /* d3[1].im | d3[1].im | d3[i].re | d3[i].re */ "shufps $0x50, %%xmm4, %%xmm4\n" /* d3[1].im | d3[1].im | d3[i].re | d3[i].re */
"movlhps %%xmm5, %%xmm5\n" /* wT[1] | wT[1] */ "movlhps %%xmm5, %%xmm5\n" /* wT[1] | wT[1] */
...@@ -259,40 +264,41 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -259,40 +264,41 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */ "movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */ "shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movl $C_1_sse, %%edi\n" "movl $C_1_sse, %%edi\n"
"movups (%%edi), %%xmm4\n" "movaps (%%edi), %%xmm4\n"
"mulps %%xmm4, %%xmm7\n" "mulps %%xmm4, %%xmm7\n"
"addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */ "addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */ "movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */ "shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
"movups %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */ "movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
"leal (%%eax, %%ecx, 2), %%edi\n" "leal (%%eax, %%ecx, 2), %%edi\n"
"addps %%xmm2, %%xmm1\n" /* u */ "addps %%xmm2, %%xmm1\n" /* u */
"subps %%xmm2, %%xmm3\n" /* v */ "subps %%xmm2, %%xmm3\n" /* v */
"mulps %%xmm4, %%xmm3\n" "mulps %%xmm4, %%xmm3\n"
"movups (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */ "movaps (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
"shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */ "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
"movups %%xmm0, %%xmm2\n" /* x[1] | x[0] */ "movaps %%xmm0, %%xmm2\n" /* x[1] | x[0] */
"movups %%xmm5, %%xmm6\n" /* xk[1] | xk[0] */ "movaps %%xmm5, %%xmm6\n" /* xk[1] | xk[0] */
"addps %%xmm1, %%xmm0\n" "addps %%xmm1, %%xmm0\n"
"subps %%xmm1, %%xmm2\n" "subps %%xmm1, %%xmm2\n"
"addps %%xmm3, %%xmm5\n" "addps %%xmm3, %%xmm5\n"
"subps %%xmm3, %%xmm6\n" "subps %%xmm3, %%xmm6\n"
"movups %%xmm0, (%%eax)\n" "movaps %%xmm0, (%%eax)\n"
"movups %%xmm2, (%%edi)\n" "movaps %%xmm2, (%%edi)\n"
"movups %%xmm5, (%%eax, %%ecx)\n" "movaps %%xmm5, (%%eax, %%ecx)\n"
"movups %%xmm6, (%%edi, %%ecx)\n" "movaps %%xmm6, (%%edi, %%ecx)\n"
"addl $16, %%eax\n" "addl $16, %%eax\n"
"addl $16, %%ebx\n" "addl $16, %%ebx\n"
"addl $8, %%edx\n" "addl $8, %%edx\n"
"addl $8, %%esi\n" "addl $8, %%esi\n"
"decl -4(%%ebp)\n" "decl -4(%%ebp)\n"
".align 16\n"
".loop:\n" ".loop:\n"
"movups (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */ "movaps (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */
"movups (%%edx), %%xmm1\n" /* d[1] | d[0] */ "movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */
"movups (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */ "movaps (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
"movups (%%esi), %%xmm5\n" /* d3[1] | d3[0] */ "movaps (%%esi), %%xmm5\n" /* d3[1] | d3[0] */
"movhlps %%xmm0, %%xmm2\n" /* wT[1] */ "movhlps %%xmm0, %%xmm2\n" /* wT[1] */
"movhlps %%xmm1, %%xmm3\n" /* d[1] */ "movhlps %%xmm1, %%xmm3\n" /* d[1] */
...@@ -317,50 +323,51 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -317,50 +323,51 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movlhps %%xmm2, %%xmm0\n" /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */ "movlhps %%xmm2, %%xmm0\n" /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
"mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */ "mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */ "mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */ "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movl $C_1_sse, %%edi\n" "movl $C_1_sse, %%edi\n"
"movups (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */ "movaps (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */ "movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */ "mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
"movlhps %%xmm6, %%xmm4\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].im * d3[0].re */ "movlhps %%xmm6, %%xmm4\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].im * d3[0].re */
"addps %%xmm1, %%xmm0\n" /* wT[1] * d[1] | wT[0] * d[0] */ "addps %%xmm1, %%xmm0\n" /* wT[1] * d[1] | wT[0] * d[0] */
"shufps $0xb1, %%xmm6, %%xmm5\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */ "shufps $0xb1, %%xmm6, %%xmm5\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
"mulps %%xmm3, %%xmm5\n" /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */ "mulps %%xmm3, %%xmm5\n" /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
"addps %%xmm5, %%xmm4\n" /* wB[1] * d3[1] | wB[0] * d3[0] */ "addps %%xmm5, %%xmm4\n" /* wB[1] * d3[1] | wB[0] * d3[0] */
"movups %%xmm0, %%xmm1\n" /* wT[1] * d[1] | wT[0] * d[0] */ "movaps %%xmm0, %%xmm1\n" /* wT[1] * d[1] | wT[0] * d[0] */
"addps %%xmm4, %%xmm0\n" /* u */ "addps %%xmm4, %%xmm0\n" /* u */
"subps %%xmm4, %%xmm1\n" /* v */ "subps %%xmm4, %%xmm1\n" /* v */
"movups (%%eax), %%xmm6\n" /* x[1] | x[0] */ "movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */
"leal (%%eax, %%ecx, 2), %%edi\n" "leal (%%eax, %%ecx, 2), %%edi\n"
"mulps %%xmm3, %%xmm1\n" "mulps %%xmm3, %%xmm1\n"
"addl $16, %%ebx\n" "addl $16, %%ebx\n"
"addl $16, %%esi\n" "addl $16, %%esi\n"
"shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */ "shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */
"movups (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */ "movaps (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
"movups %%xmm6, %%xmm2\n" "movaps %%xmm6, %%xmm2\n"
"movups %%xmm7, %%xmm4\n" "movaps %%xmm7, %%xmm4\n"
"addps %%xmm0, %%xmm6\n" "addps %%xmm0, %%xmm6\n"
"subps %%xmm0, %%xmm2\n" "subps %%xmm0, %%xmm2\n"
"movups %%xmm6, (%%eax)\n" "movaps %%xmm6, (%%eax)\n"
"movups %%xmm2, (%%edi)\n" "movaps %%xmm2, (%%edi)\n"
"addps %%xmm1, %%xmm7\n" "addps %%xmm1, %%xmm7\n"
"subps %%xmm1, %%xmm4\n" "subps %%xmm1, %%xmm4\n"
"addl $16, %%edx\n" "addl $16, %%edx\n"
"movups %%xmm7, (%%eax, %%ecx)\n" "movaps %%xmm7, (%%eax, %%ecx)\n"
"movups %%xmm4, (%%edi, %%ecx)\n" "movaps %%xmm4, (%%edi, %%ecx)\n"
"addl $16, %%eax\n" "addl $16, %%eax\n"
"decl -4(%%ebp)\n" "decl -4(%%ebp)\n"
"jnz .loop\n" "jnz .loop\n"
".align 16\n"
".end:\n" ".end:\n"
"popl %%edi\n" "popl %%edi\n" //
"popl %%esi\n" "popl %%esi\n"
"popl %%edx\n" "popl %%edx\n"
"popl %%ecx\n" "popl %%ecx\n" //
"popl %%ebx\n" "popl %%ebx\n"
"popl %%eax\n" "popl %%eax\n"
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_decoder.h : ac3 decoder interface * ac3_decoder.h : ac3 decoder interface
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder.h,v 1.10 2001/06/12 00:30:41 reno Exp $ * $Id: ac3_decoder.h,v 1.11 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Michel Kaempf <maxx@via.ecp.fr> * Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org> * Renaud Dartus <reno@videolan.org>
...@@ -354,6 +354,9 @@ typedef struct mantissa_s ...@@ -354,6 +354,9 @@ typedef struct mantissa_s
struct ac3dec_s struct ac3dec_s
{ {
float samples[6][256] __attribute__ ((aligned(16)));
imdct_t imdct __attribute__ ((aligned(16)));
/* /*
* Input properties * Input properties
*/ */
...@@ -370,12 +373,10 @@ struct ac3dec_s ...@@ -370,12 +373,10 @@ struct ac3dec_s
bsi_t bsi; bsi_t bsi;
audblk_t audblk; audblk_t audblk;
float samples[6][256] __attribute__ ((aligned(16)));
dm_par_t dm_par; dm_par_t dm_par;
bit_allocate_t bit_allocate; bit_allocate_t bit_allocate;
mantissa_t mantissa; mantissa_t mantissa;
imdct_t imdct;
downmix_t downmix; downmix_t downmix;
}; };
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_decoder_thread.c: ac3 decoder thread * ac3_decoder_thread.c: ac3 decoder thread
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.c,v 1.34 2001/05/31 01:37:08 sam Exp $ * $Id: ac3_decoder_thread.c,v 1.35 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Michel Lespinasse <walken@zoy.org> * Authors: Michel Lespinasse <walken@zoy.org>
* *
...@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config ) ...@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
intf_DbgMsg( "ac3dec debug: creating ac3 decoder thread" ); intf_DbgMsg( "ac3dec debug: creating ac3 decoder thread" );
/* Allocate the memory needed to store the thread's structure */ /* Allocate the memory needed to store the thread's structure */
if((p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t)))==NULL) p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t));
/* We need to be 16 bytes aligned */
p_ac3thread->ac3thread = (int)p_ac3thread & (-15);
p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
if(p_ac3thread == NULL)
{ {
intf_ErrMsg ( "ac3dec error: not enough memory " intf_ErrMsg ( "ac3dec error: not enough memory "
"for ac3dec_CreateThread() to create the new thread"); "for ac3dec_CreateThread() to create the new thread");
...@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread) ...@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)
/* Destroy descriptor */ /* Destroy descriptor */
free( p_ac3thread->p_config ); free( p_ac3thread->p_config );
p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
free( p_ac3thread ); free( p_ac3thread );
intf_DbgMsg ("ac3dec debug: ac3 decoder thread %p destroyed", p_ac3thread); intf_DbgMsg ("ac3dec debug: ac3 decoder thread %p destroyed", p_ac3thread);
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_decoder_thread.h : ac3 decoder thread interface * ac3_decoder_thread.h : ac3 decoder thread interface
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $ * $Id: ac3_decoder_thread.h,v 1.8 2001/07/08 23:15:11 reno Exp $
* *
* Authors: Michel Kaempf <maxx@via.ecp.fr> * Authors: Michel Kaempf <maxx@via.ecp.fr>
* *
...@@ -24,8 +24,16 @@ ...@@ -24,8 +24,16 @@
/***************************************************************************** /*****************************************************************************
* ac3dec_thread_t : ac3 decoder thread descriptor * ac3dec_thread_t : ac3 decoder thread descriptor
*****************************************************************************/ *****************************************************************************/
typedef struct ac3dec_thread_s typedef struct ac3dec_thread_s
{ {
/*
* Decoder properties
*/
float used_for_alignement1;
float used_for_alignement2;
ac3dec_t ac3_decoder __attribute__ ((aligned(16)));
/* /*
* Thread properties * Thread properties
*/ */
...@@ -38,16 +46,12 @@ typedef struct ac3dec_thread_s ...@@ -38,16 +46,12 @@ typedef struct ac3dec_thread_s
int sync_ptr; /* sync ptr from ac3 magic header */ int sync_ptr; /* sync ptr from ac3 magic header */
adec_config_t * p_config; adec_config_t * p_config;
/*
* Decoder properties
*/
ac3dec_t ac3_decoder;
/* /*
* Output properties * Output properties
*/ */
aout_fifo_t * p_aout_fifo; /* stores the decompressed audio frames */ aout_fifo_t * p_aout_fifo; /* stores the decompressed audio frames */
int ac3thread; /* save the old pointer */
} ac3dec_thread_t; } ac3dec_thread_t;
/***************************************************************************** /*****************************************************************************
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment