Commit bad0a366 authored by Renaud Dartus

* Format asm functions for gcc

  -> fixed the segfaults with imdct_sse
  -> sound is ugly with imdct_sse in debug mode
parent 332c81bb
......@@ -53,10 +53,10 @@ Choose stereo or mono audio output.
Activate hardware AC3 pass-through mode.
.TP
.B \-\-downmix <module>
Specify a module for AC3 downmix: "downmix", "downmixsse", for instance.
Specify a module for AC3 downmix: "downmix", "sse" or "3dn" for instance.
.TP
.B \-\-imdct <module>
Specify a module for AC3 IMDCT: "imdct", "imdctsse", for instance.
Specify a module for AC3 IMDCT: "imdct", "sse" or "3dn" for instance.
.TP
.B \-\-novideo
Disable video output.
......
......@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
* $Id: ac3_imdct_3dn.c,v 1.6 2001/07/26 20:00:33 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -90,22 +90,10 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%edi\n"
"pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */
"movl 12(%%ebp), %%ebx\n" /* buf */
"movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n"
"movl $128, %%ebx\n" /* loop counter */
".align 16\n"
".loop:\n"
......@@ -126,24 +114,19 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
"pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
"addl $8, %%ebx\n"
"addl $8, %%edi\n"
"pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
"movq %%mm0, -8(%%ebx)\n"
"decl -4(%%ebp)\n"
"movq %%mm0, -8(%%edi)\n"
"decl %%ebx\n"
"jnz .loop\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"addl $4, %%esp\n"
"popl %%ebp\n"
"femms\n"
::);
: "=D" (buf)
: "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf));
}
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
......@@ -205,24 +188,20 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"pushl %%esi\n"
"pushl %%ebp\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl %%esi, %%ebp\n" /* buf */
"movl $32, %%ebx\n" /* loop count */
"leal 516(%%ebp), %%esi\n" /* buf[64].im */
"leal 504(%%ebp), %%edi\n" /* buf[63].re */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $32, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples:\n"
......@@ -241,8 +220,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"movq (%%ebx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
"movq (%%ecx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ecx), %%mm3\n" /* d3 | d2 */
"pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
"pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
......@@ -253,16 +232,16 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"addl $16, %%edx\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%ebx\n"
"addl $16, %%ecx\n"
"addl $16, %%esi\n"
"addl $16, %%eax\n"
"addl $-16, %%edi\n"
"decl %%ecx\n"
"decl %%ebx\n"
"jnz .first_128_samples\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
"movl %%ebp, %%esi\n" /* buf[0].re */
"movl $32, %%ebx\n" /* loop count */
"leal 1020(%%ebp), %%edi\n" /* buf[127].im */
".align 16\n"
".second_128_samples:\n"
......@@ -281,8 +260,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"movq (%%ebx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
"movq (%%ecx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ecx), %%mm3\n" /* d3 | d2 */
"addl $16, %%esi\n"
......@@ -299,15 +278,14 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"addl $16, %%edx\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"decl %%ecx\n"
"addl $16, %%ecx\n"
"decl %%ebx\n"
"jnz .second_128_samples\n"
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
"leal 512(%%ebp), %%esi\n" /* buf[64].re */
"leal 508(%%ebp), %%edi\n" /* buf[63].im */
"movl $32, %%ebx\n" /* loop count */
"addl $-1024, %%ecx\n" /* delay */
".align 16\n"
".first_128_delay:\n"
......@@ -333,19 +311,17 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"movq %%mm0, (%%ecx)\n"
"movq %%mm1, 8(%%ecx)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"addl $16, %%ecx\n"
"decl %%ebx\n"
"jnz .first_128_delay\n"
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
"leal 4(%%ebp), %%esi\n" /* buf[0].im */
"leal 1016(%%ebp), %%edi\n" /* buf[127].re */
"movl $32, %%ebx\n" /* loop count */
".align 16\n"
".second_128_delay:\n"
......@@ -372,48 +348,44 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
"movq %%mm1, (%%eax)\n"
"movq %%mm3, 8(%%eax)\n"
"movq %%mm1, (%%ecx)\n"
"movq %%mm3, 8(%%ecx)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"addl $16, %%ecx\n"
"decl %%ebx\n"
"jnz .second_128_delay\n"
"popl %%edi\n"
"popl %%ebp\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"leave\n"
"femms\n"
::);
: "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
: "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));
}
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"pushl %%esi\n"
"pushl %%ebp\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $32, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
"movl %%esi, %%ebp\n" /* buf */
"movl $32, %%ebx\n" /* loop count */
"leal 516(%%ebp), %%esi\n" /* buf[64].im */
"leal 504(%%ebp), %%edi\n" /* buf[63].re */
".align 16\n"
".first_128_samples2:\n"
......@@ -439,16 +411,16 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"addl $16, %%edx\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%ebx\n"
"addl $16, %%ecx\n"
"addl $16, %%esi\n"
"addl $16, %%eax\n"
"addl $-16, %%edi\n"
"decl %%ecx\n"
"decl %%ebx\n"
"jnz .first_128_samples2\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
"movl %%ebp, %%esi\n" /* buf[0].re */
"movl $32, %%ebx\n" /* loop count */
"leal 1020(%%ebp), %%edi\n" /* buf[127].im */
".align 16\n"
".second_128_samples2:\n"
......@@ -480,15 +452,14 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"addl $16, %%edx\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"decl %%ecx\n"
"addl $16, %%ecx\n"
"decl %%ebx\n"
"jnz .second_128_samples2\n"
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
"leal 512(%%ebp), %%esi\n" /* buf[64].re */
"leal 508(%%ebp), %%edi\n" /* buf[63].im */
"movl $32, %%ebx\n" /* loop count */
"addl $-1024, %%ecx\n" /* delay */
".align 16\n"
".first_128_delays:\n"
......@@ -515,18 +486,17 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"movq %%mm0, (%%ecx)\n"
"movq %%mm1, 8(%%ecx)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"addl $16, %%ecx\n"
"decl %%ebx\n"
"jnz .first_128_delays\n"
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
"leal 4(%%ebp), %%esi\n" /* buf[0].im */
"leal 1016(%%ebp), %%edi\n" /* buf[127].re */
"movl $32, %%ebx\n" /* loop count */
".align 16\n"
".second_128_delays:\n"
......@@ -553,23 +523,24 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
"movq %%mm1, (%%eax)\n"
"movq %%mm3, 8(%%eax)\n"
"movq %%mm1, (%%ecx)\n"
"movq %%mm3, 8(%%ecx)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"addl $16, %%ecx\n"
"decl %%ebx\n"
"jnz .second_128_delays\n"
"popl %%edi\n"
"popl %%ebp\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"leave\n"
"femms\n"
::);
: "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
: "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));
}
......@@ -2,7 +2,7 @@
* ac3_imdct_sse.c: accelerated SSE ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
* $Id: ac3_imdct_sse.c,v 1.5 2001/07/26 20:00:33 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -103,10 +103,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"pushl %%edi\n"
"pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */
"movl 12(%%ebp), %%ebx\n" /* buf */
"movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl %%edi, %%ebx\n" /* buf */
"movl $64, -4(%%ebp)\n"
".align 16\n"
......@@ -153,7 +150,9 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"addl $4, %%esp\n"
"popl %%ebp\n"
::);
: "=D" (buf)
: "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf));
}
static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
......@@ -226,24 +225,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"pushl %%esi\n"
"pushl %%ebp\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $16, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
"movl %%esi, %%ebp\n" /* buf */
"movl $16, %%ebx\n" /* loop count */
"leal 516(%%ebp), %%esi\n" /* buf[64].im */
"leal 504(%%ebp), %%edi\n" /* buf[63].re */
".align 16\n"
".first_128_samples:\n"
......@@ -256,7 +250,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"movaps (%%ecx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
......@@ -270,23 +264,23 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"addps %%xmm5, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"movaps 16(%%ecx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx\n"
"movaps %%xmm0, (%%eax)\n"
"addl $32, %%ebx\n"
"addl $32, %%ecx\n"
"mulps %%xmm4, %%xmm6\n"
"addl $32, %%esi\n"
"addl $32, %%eax\n"
"addps %%xmm5, %%xmm6\n"
"addl $-32, %%edi\n"
"movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"decl %%ebx\n"
"jnz .first_128_samples\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */
"movl %%ebp, %%esi\n" /* buf[0].re */
"movl $16, %%ebx\n" /* loop count */
"leal 1020(%%ebp), %%edi\n" /* buf[127].im */
".align 16\n"
".second_128_samples:\n"
......@@ -299,7 +293,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"movaps (%%ecx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
......@@ -317,21 +311,20 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"addps %%xmm5, %%xmm0\n"
"mulps %%xmm4, %%xmm6\n"
"addl $-32, %%edi\n"
"movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"movaps 16(%%ecx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"movaps %%xmm0, (%%eax)\n"
"addps %%xmm5, %%xmm6\n"
"addl $32, %%edx\n"
"addl $32, %%eax\n"
"addl $32, %%ebx\n"
"addl $32, %%ecx\n"
"movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"decl %%ebx\n"
"jnz .second_128_samples\n"
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
"leal 512(%%ebp), %%esi\n" /* buf[64].re */
"leal 508(%%ebp), %%edi\n" /* buf[63].im */
"movl $16, %%ebx\n" /* loop count */
"addl $-1024, %%ecx\n" /* delay */
".align 16\n"
".first_128_delay:\n"
......@@ -356,20 +349,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movaps %%xmm0, (%%eax)\n"
"movaps %%xmm0, (%%ecx)\n"
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm6\n"
"addl $32, %%eax\n"
"movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"addl $32, %%ecx\n"
"movaps %%xmm6, -16(%%ecx)\n"
"decl %%ebx\n"
"jnz .first_128_delay\n"
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */
"leal 4(%%ebp), %%esi\n" /* buf[0].im */
"leal 1016(%%ebp), %%edi\n" /* buf[127].re */
"movl $16, %%ebx\n" /* loop count */
".align 16\n"
".second_128_delay:\n"
......@@ -394,49 +386,45 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movaps %%xmm1, (%%eax)\n"
"movaps %%xmm1, (%%ecx)\n"
"addl $32, %%esi\n"
"subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm2\n"
"addl $32, %%eax\n"
"movaps %%xmm2, -16(%%eax)\n"
"decl %%ecx\n"
"addl $32, %%ecx\n"
"movaps %%xmm2, -16(%%ecx)\n"
"decl %%ebx\n"
"jnz .second_128_delay\n"
"popl %%edi\n"
"popl %%ebp\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
: "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
: "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));
"leave\n"
::);
}
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"pushl %%esi\n"
"pushl %%ebp\n"
/* movl 20(%%ebp), %%ebx delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $16, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
"movl %%esi, %%ebp\n" /* buf */
"movl $16, %%ebx\n" /* loop count */
"leal 516(%%ebp), %%esi\n" /* buf[64].im */
"leal 504(%%ebp), %%edi\n" /* buf[63].re */
".align 16\n"
".first_128_sample:\n"
......@@ -469,12 +457,12 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"addl $32, %%eax\n"
"addl $-32, %%edi\n"
"movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"decl %%ebx\n"
"jnz .first_128_sample\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */
"movl %%ebp, %%esi\n" /* buf[0].re */
"movl $16, %%ebx\n" /* loop count */
"leal 1020(%%ebp), %%edi\n" /* buf[127].im */
".align 16\n"
".second_128_sample:\n"
......@@ -507,14 +495,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"addl $32, %%edx\n"
"addl $32, %%eax\n"
"movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"decl %%ebx\n"
"jnz .second_128_sample\n"
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
"leal 512(%%ebp), %%esi\n" /* buf[64].re */
"leal 508(%%ebp), %%edi\n" /* buf[63].im */
"movl $16, %%ebx\n" /* loop count */
"addl $-1024, %%ecx\n" /* delay */
".align 16\n"
".first_128_delays:\n"
......@@ -539,20 +526,19 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movaps %%xmm0, (%%eax)\n"
"movaps %%xmm0, (%%ecx)\n"
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm6\n"
"addl $32, %%eax\n"
"movaps %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"addl $32, %%ecx\n"
"movaps %%xmm6, -16(%%ecx)\n"
"decl %%ebx\n"
"jnz .first_128_delays\n"
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */
"leal 4(%%ebp), %%esi\n" /* buf[0].im */
"leal 1016(%%ebp), %%edi\n" /* buf[127].re */
"movl $16, %%ebx\n" /* loop count */
".align 16\n"
".second_128_delays:\n"
......@@ -577,23 +563,24 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movaps %%xmm1, (%%eax)\n"
"movaps %%xmm1, (%%ecx)\n"
"addl $32, %%esi\n"
"subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm2\n"
"addl $32, %%eax\n"
"movaps %%xmm2, -16(%%eax)\n"
"decl %%ecx\n"
"addl $32, %%ecx\n"
"movaps %%xmm2, -16(%%ecx)\n"
"decl %%ebx\n"
"jnz .second_128_delays\n"
"popl %%edi\n"
"popl %%ebp\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
: "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
: "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));
"leave\n"
::);
}
......@@ -2,7 +2,7 @@
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
* $Id: ac3_srfft_sse.c,v 1.5 2001/07/26 20:00:33 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -228,28 +228,21 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n" //
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
// "movl %%edi, %%ecx\n" /* k */
"pushl %%edi\n" //
"pushl %%edi\n"
"movl 8(%%ebp), %%ecx\n" /* k */
"movl 12(%%ebp), %%eax\n" /* x */
"movl %%ecx, -4(%%ebp)\n" /* k */
"movl 16(%%ebp), %%ebx\n" /* wT */
"movl 20(%%ebp), %%edx\n" /* d */
"movl 24(%%ebp), %%esi\n" /* d3 */
"shll $4, %%ecx\n" /* 16k */ ///
"addl $8, %%edx\n"
"leal (%%eax, %%ecx, 2), %%edi\n"
"addl $8, %%esi\n"
/* TRANSZERO and TRANS */
".align 16\n"
"movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */
"movaps (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */
"movaps (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
"movaps (%%edi), %%xmm1\n" /* wT[1] | wT[0] */
"movaps (%%edi, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
"movlps (%%edx), %%xmm3\n" /* d */
"movlps (%%esi), %%xmm4\n" /* d3 */
"movhlps %%xmm1, %%xmm5\n" /* wT[1] */
......@@ -263,14 +256,14 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */
"movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movl $C_1_sse, %%edi\n"
"movaps (%%edi), %%xmm4\n"
"movl $C_1_sse, %%ebx\n"
"movaps (%%ebx), %%xmm4\n"
"mulps %%xmm4, %%xmm7\n"
"addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
"movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
"leal (%%eax, %%ecx, 2), %%edi\n"
"leal (%%eax, %%ecx, 2), %%ebx\n"
"addps %%xmm2, %%xmm1\n" /* u */
"subps %%xmm2, %%xmm3\n" /* v */
"mulps %%xmm4, %%xmm3\n"
......@@ -283,21 +276,21 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"addps %%xmm3, %%xmm5\n"
"subps %%xmm3, %%xmm6\n"
"movaps %%xmm0, (%%eax)\n"
"movaps %%xmm2, (%%edi)\n"
"movaps %%xmm2, (%%ebx)\n"
"movaps %%xmm5, (%%eax, %%ecx)\n"
"movaps %%xmm6, (%%edi, %%ecx)\n"
"movaps %%xmm6, (%%ebx, %%ecx)\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"addl $16, %%edi\n"
"addl $8, %%edx\n"
"addl $8, %%esi\n"
"decl -4(%%ebp)\n"
".align 16\n"
".loop:\n"
"movaps (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */
"movaps (%%edi), %%xmm0\n" /* wT[1] | wT[0] */
"movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */
"movaps (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
"movaps (%%edi, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
"movaps (%%esi), %%xmm5\n" /* d3[1] | d3[0] */
"movhlps %%xmm0, %%xmm2\n" /* wT[1] */
......@@ -324,8 +317,8 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movl $C_1_sse, %%edi\n"
"movaps (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movl $C_1_sse, %%ebx\n"
"movaps (%%ebx), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
......@@ -340,9 +333,9 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"addps %%xmm4, %%xmm0\n" /* u */
"subps %%xmm4, %%xmm1\n" /* v */
"movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */
"leal (%%eax, %%ecx, 2), %%edi\n"
"leal (%%eax, %%ecx, 2), %%ebx\n"
"mulps %%xmm3, %%xmm1\n"
"addl $16, %%ebx\n"
"addl $16, %%edi\n"
"addl $16, %%esi\n"
"shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */
"movaps (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
......@@ -351,12 +344,12 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"addps %%xmm0, %%xmm6\n"
"subps %%xmm0, %%xmm2\n"
"movaps %%xmm6, (%%eax)\n"
"movaps %%xmm2, (%%edi)\n"
"movaps %%xmm2, (%%ebx)\n"
"addps %%xmm1, %%xmm7\n"
"subps %%xmm1, %%xmm4\n"
"addl $16, %%edx\n"
"movaps %%xmm7, (%%eax, %%ecx)\n"
"movaps %%xmm4, (%%edi, %%ecx)\n"
"movaps %%xmm4, (%%ebx, %%ecx)\n"
"addl $16, %%eax\n"
"decl -4(%%ebp)\n"
......@@ -364,16 +357,17 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
".align 16\n"
".end:\n"
"popl %%edi\n" //
"popl %%edi\n"
"popl %%esi\n"
"popl %%edx\n"
"popl %%ecx\n" //
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"addl $4, %%esp\n"
"leave\n"
::);
: "=c" (k), "=a" (x), "=D" (wTB)
: "c" (k), "a" (x), "D" (wTB), "d" (d), "S" (d_3));
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment