From 5f232025efa9d525fb709f6373d685db60e43316 Mon Sep 17 00:00:00 2001 From: Renaud Dartus <reno@videolan.org> Date: Wed, 16 May 2001 14:51:29 +0000 Subject: [PATCH] * Add 3D Now! imdct * Remove kmudge for ac3 on MacOS X --- Makefile | 7 +- configure | 2 +- configure.in | 2 +- plugins/imdct/Makefile | 18 +- plugins/imdct/ac3_imdct_3dn.c | 559 +++++++++++++++++++++++++++++++ plugins/imdct/ac3_imdct_c.c | 62 +--- plugins/imdct/ac3_imdct_common.c | 65 +--- plugins/imdct/ac3_imdct_common.h | 3 +- plugins/imdct/ac3_imdct_sse.c | 58 +--- plugins/imdct/ac3_retables.h | 83 +++++ plugins/imdct/ac3_srfft_3dn.c | 344 +++++++++++++++++++ plugins/imdct/ac3_srfft_sse.c | 18 +- plugins/imdct/imdct3dn.c | 152 +++++++++ src/ac3_decoder/ac3_imdct.c | 4 +- 14 files changed, 1180 insertions(+), 197 deletions(-) create mode 100644 plugins/imdct/ac3_imdct_3dn.c create mode 100644 plugins/imdct/ac3_retables.h create mode 100644 plugins/imdct/ac3_srfft_3dn.c create mode 100644 plugins/imdct/imdct3dn.c diff --git a/Makefile b/Makefile index 9a868def9f..4aa0e32453 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin dsp/dsp dummy/dummy \ dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gnome/gnome gtk/gtk \ downmix/downmix downmix/downmixsse downmix/downmix3dn \ idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext \ - imdct/imdct imdct/imdctsse \ + imdct/imdct imdct/imdct3dn imdct/imdctsse \ macosx/macosx mga/mga \ motion/motion motion/motionmmx motion/motionmmxext \ mpeg/es mpeg/ps mpeg/ts null/null qt/qt sdl/sdl \ @@ -317,12 +317,7 @@ endif $(C_OBJ): %.o: Makefile.opts Makefile.dep Makefile $(C_OBJ): %.o: .dep/%.d $(C_OBJ): %.o: %.c -ifneq (,$(findstring darwin,$(SYS))) -#this is uglier of all - @if test "src/ac3_decoder/ac3_imdct.c" = "$<"; then $(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<; echo "(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<"; else $(CC) $(CFLAGS) -c -o $@ $<; echo "$(CC) $(CFLAGS) -c -o $@ $<"; fi 
-else $(CC) $(CFLAGS) -c -o $@ $< -endif $(CPP_OBJ): %.o: Makefile.opts Makefile.dep Makefile $(CPP_OBJ): %.o: .dep/%.dpp diff --git a/configure b/configure index b75b5741a5..a03a4d4cad 100755 --- a/configure +++ b/configure @@ -3285,7 +3285,7 @@ int main() { EOF if { (eval echo configure:3287: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then rm -rf conftest* - ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse" + ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse" echo "$ac_t""yes" 1>&6 else echo "configure: failed program was:" >&5 diff --git a/configure.in b/configure.in index 4ff3bade95..23766c7b0b 100644 --- a/configure.in +++ b/configure.in @@ -162,7 +162,7 @@ AC_TRY_COMPILE([void quux(){void *p;asm("packuswb %%mm1,%%mm2"::"r"(p));}],, AC_MSG_CHECKING([if \$CC groks MMX EXT or SSE inline assembly]) AC_TRY_COMPILE([void quux(){void *p;asm("maskmovq %%mm1,%%mm2"::"r"(p));}],, - ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse" + ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse" AC_MSG_RESULT(yes), AC_MSG_RESULT(no)) dnl diff --git a/plugins/imdct/Makefile b/plugins/imdct/Makefile index 330287c9b6..b4dad20ac9 100644 --- a/plugins/imdct/Makefile +++ b/plugins/imdct/Makefile @@ -9,15 +9,18 @@ PLUGIN_IMDCT = imdct.o ac3_imdct_c.o ac3_srfft_c.o PLUGIN_IMDCTSSE = imdctsse.o ac3_imdct_sse.o ac3_srfft_sse.o +PLUGIN_IMDCT3DN = imdct3dn.o ac3_imdct_3dn.o ac3_srfft_3dn.o PLUGIN_IMDCTCOMMON = ac3_imdct_common.o BUILTIN_IMDCT = $(PLUGIN_IMDCT:%.o=BUILTIN_IMDCT_%.o) \ $(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT_%.o) BUILTIN_IMDCTSSE = $(PLUGIN_IMDCTSSE:%.o=BUILTIN_IMDCTSSE_%.o) \ $(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCTSSE_%.o) +BUILTIN_IMDCT3DN = $(PLUGIN_IMDCT3DN:%.o=BUILTIN_IMDCT3DN_%.o) \ + $(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT3DN_%.o) -PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCTCOMMON) 
-ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE) +PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON) +ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE) $(BUILTIN_IMDCT3DN) # # Virtual targets @@ -33,6 +36,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: .dep/%.d $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c $(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdctsse -c -o $@ $< +$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: .dep/%.d +$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: %.c + $(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdct3dn -c -o $@ $< + # # Real targets # @@ -51,3 +58,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c ar r $@ $^ $(RANLIB) $@ +../../lib/imdct3dn.so: $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON) + $(CC) $(PCFLAGS) -o $@ $^ $(PLCFLAGS) + +../../lib/imdct3dn.a: $(BUILTIN_IMDCT3DN) + ar r $@ $^ + $(RANLIB) $@ + diff --git a/plugins/imdct/ac3_imdct_3dn.c b/plugins/imdct/ac3_imdct_3dn.c new file mode 100644 index 0000000000..5bb5e5d9a9 --- /dev/null +++ b/plugins/imdct/ac3_imdct_3dn.c @@ -0,0 +1,559 @@ +/***************************************************************************** + * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT + ***************************************************************************** + * Copyright (C) 1999, 2000 VideoLAN + * $Id: ac3_imdct_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $ + * + * Authors: Renaud Dartus <reno@videolan.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#define MODULE_NAME imdct3dn
+#include "modules_inner.h"
+
+/*****************************************************************************
+ * Preamble
+ *****************************************************************************/
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "ac3_imdct.h"
+#include "ac3_imdct_common.h"
+#include "ac3_retables.h"
+
+void _M( fft_64p ) ( complex_t *x );
+void _M( fft_128p ) ( complex_t *a );
+
+static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
+static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
+static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+
+/* Fill p_imdct->xcos_sin_sse with 128 groups of four floats { c, -s, -s, -c },
+ * where c and s are scale * cos / sin of 2*PI*(8*i+1)/(8*N); N is defined outside this file.
+ * The function only writes that table; it produces no output on stdout or stderr. */
+void _M( imdct_init ) (imdct_t * p_imdct)
+{
+    int i;
+    float scale = 181.019; /* NOTE(review): numerically 128*sqrt(2) - confirm the intended normalisation */
+
+    for (i=0; i < 128; i++)
+    {
+        float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+        float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
+        p_imdct->xcos_sin_sse[i * 4]     = xcos_i;
+        p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
+        p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
+        p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
+    }
+}
+
+void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
+{
+    imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
+    _M( fft_128p ) (p_imdct->buf);
+    imdct512_post_ifft_twiddle_3dn (p_imdct->buf,
p_imdct->xcos_sin_sse);
+    imdct512_window_delay_3dn (p_imdct->buf, data, window, delay);
+}
+/* Same pipeline as imdct_do_512 (pre-twiddle, 128-point FFT, post-twiddle) but finishes with the _nol window step, which does not add delay[] into data[]. */
+void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
+{
+    imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
+    _M( fft_128p ) (p_imdct->buf);
+    imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
+    imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay);
+}
+/* Pre-FFT twiddle: for k in 0..127 with j = pmt[k], stores buf[k] = { data[255-2j]*c - data[2j]*s, -(data[2j]*c + data[255-2j]*s) }, reading { c, -s, -s, -c } from xcos_sin_sse[4j..4j+3]. */
+static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
+{ /* The asm declares no operands or clobbers and reads its arguments from 8..20(%ebp) right after its own pushl %ebp / movl %esp,%ebp, i.e. it assumes the return address is on top of the stack when the asm starts. NOTE(review): presumably breaks if the compiler inlines this function or emits a prologue of its own - confirm the build flags. */
+    __asm__ __volatile__ ( /* NOTE(review): ".loop" is a plain assembler label - presumably clashes if this asm is ever emitted twice in one object */
+    "pushl %%ebp\n"
+    "movl %%esp, %%ebp\n"
+    "addl $-4, %%esp\n" /* local variable, loop counter */
+
+    "pushl %%eax\n"
+    "pushl %%ebx\n"
+    "pushl %%ecx\n"
+    "pushl %%edx\n"
+    "pushl %%edi\n"
+    "pushl %%esi\n"
+
+    "movl 8(%%ebp), %%eax\n" /* pmt */
+    "movl 12(%%ebp), %%ebx\n" /* buf */
+    "movl 16(%%ebp), %%ecx\n" /* data */
+    "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
+    "movl $128, -4(%%ebp)\n"
+
+".loop:\n"
+    "movl (%%eax), %%esi\n"
+    "movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
+    "punpckldq %%mm1, %%mm1\n" /* 2j | 2j */
+
+    "shll $1, %%esi\n"
+
+    "movq (%%edx, %%esi, 8), %%mm0\n" /* -s_j | c_j */
+    "movq 8(%%edx, %%esi, 8), %%mm2\n" /* -c_j | -s_j */
+
+    "negl %%esi\n"
+
+    "movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
+    "punpckldq %%mm4, %%mm4\n" /* 255-2j | 255-2j */
+    "addl $4, %%eax\n"
+
+    "pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
+    "pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
+    "addl $8, %%ebx\n"
+    "pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
+
+    "movq %%mm0, -8(%%ebx)\n"
+    "decl -4(%%ebp)\n"
+    "jnz .loop\n"
+
+    "popl %%esi\n"
+    "popl %%edi\n"
+    "popl %%edx\n"
+    "popl %%ecx\n"
+    "popl %%ebx\n"
+    "popl %%eax\n"
+
+    "addl $4, %%esp\n"
+    "popl %%ebp\n"
+    "femms\n"
+    ::);
+}
+/* Post-FFT twiddle, in place: 64 iterations, each handling two buf entries and two table groups { c, -s, -s, -c }; every entry becomes re' = -(c*re + s*im), im' = c*im - s*re. */
+static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
+{
+    __asm__ __volatile__ (
+    "pushl %%ebx\n"
+    "movl $64, %%ebx\n" /* loop counter */
+
+".loop1:\n"
+    "movq (%%eax), %%mm0\n" /* im0 | re0 */
+    "movq %%mm0, %%mm1\n" /* im0 | re0 */
+    "punpckldq %%mm0, %%mm0\n" /* re0 | re0 */
+    "punpckhdq %%mm1, %%mm1\n" /* im0 | im0 */
+
+    "movq (%%ecx), %%mm2\n" /* -s | c */
+    "movq 8(%%ecx), %%mm3\n" /* -c | -s */
+    "movq %%mm3, %%mm4\n"
+
+    "punpckhdq %%mm2,%%mm3\n" /* -s | -c */
+    "punpckldq %%mm2,%%mm4\n" /* c | -s */
+
+    "movq 8(%%eax), %%mm2\n" /* im1 | re1 */
+    "movq %%mm2, %%mm5\n" /* im1 | re1 */
+    "punpckldq %%mm2, %%mm2\n" /* re1 | re1 */
+    "punpckhdq %%mm5, %%mm5\n" /* im1 | im1 */
+
+    "pfmul %%mm3, %%mm0\n" /* -s * re0 | -c * re0 */
+    "pfmul %%mm4, %%mm1\n" /* c * im0 | -s * im0 */
+
+    "movq 16(%%ecx), %%mm6\n" /* -s1 | c1 */
+    "movq 24(%%ecx), %%mm7\n" /* -c1 | -s1 */
+    "movq %%mm7, %%mm4\n"
+
+    "punpckhdq %%mm6,%%mm7\n" /* -s1 | -c1 */
+    "punpckldq %%mm6,%%mm4\n" /* c1 | -s1 */
+
+    "pfmul %%mm7, %%mm2\n" /* -s1*re1 | -c1*re1 */
+    "pfmul %%mm4, %%mm5\n" /* c1*im1 | -s1*im1 */
+
+    "pfadd %%mm1, %%mm0\n" /* -s * re0 + c * im0 | -c * re0 - s * im0 */
+    "pfadd %%mm5, %%mm2\n" /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm2, 8(%%eax)\n"
+    "addl $32, %%ecx\n"
+    "addl $16, %%eax\n"
+    "decl %%ebx\n"
+    "jnz .loop1\n"
+
+    "popl %%ebx\n"
+    "femms\n"
+    : "=a" (buf), "=c" (xcos_sin_sse) /* both registers are advanced by the loop, so they are outputs tied to the inputs below */
+    : "0" (buf), "1" (xcos_sin_sse) : "memory" ); /* the loop stores through buf */
+}
+/* Window + overlap-add: writes 256 floats to data (windowed buf values plus the delay values present on entry), then overwrites 256 floats of delay with windowed buf values, reading window backwards. Arguments are read from 8..20(%ebp) exactly as in imdct512_pre_ifft_twiddle_3dn, with the same review caveat. */
+static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{
+    __asm__ __volatile__ (
+    "pushl %%ebp\n"
+    "movl %%esp, %%ebp\n"
+
+    "pushl %%eax\n"
+    "pushl %%ebx\n"
+    "pushl %%ecx\n"
+    "pushl %%edx\n"
+    "pushl %%esi\n"
+    "pushl %%edi\n"
+
+    "movl 20(%%ebp), %%ebx\n" /* delay */
+    "movl 16(%%ebp), %%edx\n" /* window */
+
+    "movl 8(%%ebp), %%eax\n" /* buf */
+    "movl $32, %%ecx\n" /* loop count */
+    "leal 516(%%eax), %%esi\n" /* buf[64].im */
+    "leal 504(%%eax), %%edi\n" /* buf[63].re */
+    "movl 12(%%ebp), %%eax\n" /* data */
+
+".first_128_samples:\n"
+    "movd (%%esi), %%mm0\n" /* im0 */
+    "movd 8(%%esi), %%mm2\n" /* im1 */
+    "movd (%%edi), %%mm1\n" /* re0 */
+    "movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm0, %%mm4\n" /* -im0 */
+    "pfsub %%mm2, %%mm5\n" /* -im1 */
+
+    "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
+    "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
+
+    "movq (%%edx), %%mm0\n" /* w1 | w0 */
+    "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
+    "movq (%%ebx), %%mm2\n" /* d1 | d0 */
+    "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
+
+    "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
+    "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
+
+    "pfadd %%mm2, %%mm0\n" /* w1*re0+d1 | -w0*im0+d0 */
+    "pfadd %%mm3, %%mm1\n" /* w3*re1+d3 | -w2*im1+d2 */
+
+    "addl $16, %%edx\n"
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+    "addl $16, %%ebx\n"
+    "addl $16, %%esi\n"
+    "addl $16, %%eax\n"
+    "addl $-16, %%edi\n"
+    "decl %%ecx\n"
+    "jnz .first_128_samples\n"
+
+    "movl 8(%%ebp), %%esi\n" /* buf[0].re */
+    "leal 1020(%%esi), %%edi\n" /* buf[127].im */
+    "movl $32, %%ecx\n" /* loop count */
+
+".second_128_samples:\n"
+    "movd (%%esi), %%mm0\n" /* buf[i].re */
+    "movd 8(%%esi), %%mm2\n" /* re1 */
+    "movd (%%edi), %%mm1\n" /* buf[127-i].im */
+    "movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm0, %%mm4\n" /* -re0 */
+    "pfsub %%mm2, %%mm5\n" /* -re1 */
+
+    "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
+    "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
+
+    "movq (%%edx), %%mm0\n" /* w1 | w0 */
+    "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
+    "movq (%%ebx), %%mm2\n" /* d1 | d0 */
+    "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
+
+    "addl $16, %%esi\n"
+
+    "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
+    "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
+
+    "pfadd %%mm2, %%mm0\n" /* w1*im0+d1 | -w0*re0+d0 */
+    "pfadd %%mm3, %%mm1\n" /* w3*im1+d3 | -w2*re1+d2 */
+
+    "addl $-16, %%edi\n"
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+
+    "addl $16, %%edx\n"
+    "addl $16, %%eax\n"
+    "addl $16, %%ebx\n"
+    "decl %%ecx\n"
+    "jnz .second_128_samples\n"
+
+    "movl 8(%%ebp), %%eax\n"
+    "leal 512(%%eax), %%esi\n" /* buf[64].re */
+    "leal 508(%%eax), %%edi\n" /* buf[63].im */
+    "movl $32, %%ecx\n" /* loop count */
+    "movl 20(%%ebp), %%eax\n" /* delay */
+
+".first_128_delay:\n"
+    "movd (%%esi), %%mm0\n" /* re0 */
+    "movd 8(%%esi), %%mm2\n" /* re1 */
+    "movd (%%edi), %%mm1\n" /* im0 */
+    "movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm0, %%mm4\n" /* -re0 */
+    "pfsub %%mm2, %%mm5\n" /* -re1 */
+
+    "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
+    "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
+
+
+    "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
+    "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
+
+    "addl $-16, %%edx\n"
+
+    "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
+    "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
+
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+    "addl $16, %%esi\n"
+    "addl $-16, %%edi\n"
+    "addl $16, %%eax\n"
+    "decl %%ecx\n"
+    "jnz .first_128_delay\n"
+
+    "movl 8(%%ebp), %%ebx\n"
+    "leal 4(%%ebx), %%esi\n" /* buf[0].im */
+    "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+    "movl $32, %%ecx\n" /* loop count */
+
+".second_128_delay:\n"
+    "movd (%%esi), %%mm0\n" /* im0 */
+    "movd 8(%%esi), %%mm2\n" /* im1 */
+    "movd (%%edi), %%mm1\n" /* re0 */
+    "movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm1, %%mm4\n" /* -re0 */
+    "pfsub %%mm3, %%mm5\n" /* -re1 */
+
+    "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
+    "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
+
+
+    "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
+    "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
+
+    "addl $-16, %%edx\n"
+
+    "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
+    "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
+
+
+    "movq %%mm1, (%%eax)\n"
+    "movq %%mm3, 8(%%eax)\n"
+    "addl $16, %%esi\n"
+    "addl $-16, %%edi\n"
+    "addl $16, %%eax\n"
+    "decl %%ecx\n"
+    "jnz .second_128_delay\n"
+
+    "popl %%edi\n"
+    "popl %%esi\n"
+    "popl %%edx\n"
+    "popl %%ecx\n"
+    "popl %%ebx\n"
+    "popl %%eax\n"
+
+    "leave\n"
+    "femms\n"
+    ::);
+}
+/* Like imdct512_window_delay_3dn, except that the two sample loops store the windowed values to data without loading or adding delay; delay is then overwritten by the same two delay loops. Same stack-based argument access, same review caveat. */
+static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{
+    __asm__ __volatile__ (
+    "pushl %%ebp\n"
+    "movl %%esp, %%ebp\n"
+
+    "pushl %%eax\n"
+    "pushl %%ebx\n"
+    "pushl %%ecx\n"
+    "pushl %%edx\n"
+    "pushl %%esi\n"
+    "pushl %%edi\n"
+
+    "movl 20(%%ebp), %%ebx\n" /* delay */
+    "movl 16(%%ebp), %%edx\n" /* window */
+
+    "movl 8(%%ebp), %%eax\n" /* buf */
+    "movl $32, %%ecx\n" /* loop count */
+    "leal 516(%%eax), %%esi\n" /* buf[64].im */
+    "leal 504(%%eax), %%edi\n" /* buf[63].re */
+    "movl 12(%%ebp), %%eax\n" /* data */
+
+".first_128_samples2:\n"
+    "movd (%%esi), %%mm0\n" /* im0 */
+    "movd 8(%%esi), %%mm2\n" /* im1 */
+    "movd (%%edi), %%mm1\n" /* re0 */
+    "movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm0, %%mm4\n" /* -im0 */
+    "pfsub %%mm2, %%mm5\n" /* -im1 */
+
+    "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
+    "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
+
+    "movq (%%edx), %%mm0\n" /* w1 | w0 */
+    "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
+
+    "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
+    "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
+
+    "addl $16, %%edx\n"
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+    "addl $16, %%ebx\n" /* delay pointer is advanced but never dereferenced in this variant's sample loops */
+    "addl $16, %%esi\n"
+    "addl $16, %%eax\n"
+    "addl $-16, %%edi\n"
+    "decl %%ecx\n"
+    "jnz .first_128_samples2\n"
+
+    "movl 8(%%ebp), %%esi\n" /* buf[0].re */
+    "leal 1020(%%esi), %%edi\n" /* buf[127].im */
+    "movl $32, %%ecx\n" /* loop count */
+
+".second_128_samples2:\n"
+    "movd (%%esi), %%mm0\n" /* buf[i].re */
+    "movd 8(%%esi), %%mm2\n" /* re1 */
+    "movd (%%edi), %%mm1\n" /* buf[127-i].im */
+    "movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm0, %%mm4\n" /* -re0 */
+    "pfsub %%mm2, %%mm5\n" /* -re1 */
+
+    "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
+    "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
+
+    "movq (%%edx), %%mm0\n" /* w1 | w0 */
+    "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
+
+    "addl $16, %%esi\n"
+
+    "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
+    "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
+
+    "addl $-16, %%edi\n"
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+
+    "addl $16, %%edx\n"
+    "addl $16, %%eax\n"
+    "addl $16, %%ebx\n"
+    "decl %%ecx\n"
+    "jnz .second_128_samples2\n"
+
+    "movl 8(%%ebp), %%eax\n"
+    "leal 512(%%eax), %%esi\n" /* buf[64].re */
+    "leal 508(%%eax), %%edi\n" /* buf[63].im */
+    "movl $32, %%ecx\n" /* loop count */
+    "movl 20(%%ebp), %%eax\n" /* delay */
+
+".first_128_delays:\n"
+    "movd (%%esi), %%mm0\n" /* re0 */
+    "movd 8(%%esi), %%mm2\n" /* re1 */
+    "movd (%%edi), %%mm1\n" /* im0 */
+    "movd -8(%%edi), %%mm3\n" /* im1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm0, %%mm4\n" /* -re0 */
+    "pfsub %%mm2, %%mm5\n" /* -re1 */
+
+    "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
+    "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
+
+
+    "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
+    "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
+
+    "addl $-16, %%edx\n"
+
+    "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
+    "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
+
+
+    "movq %%mm0, (%%eax)\n"
+    "movq %%mm1, 8(%%eax)\n"
+    "addl $16, %%esi\n"
+    "addl $-16, %%edi\n"
+    "addl $16, %%eax\n"
+    "decl %%ecx\n"
+    "jnz .first_128_delays\n"
+
+    "movl 8(%%ebp), %%ebx\n"
+    "leal 4(%%ebx), %%esi\n" /* buf[0].im */
+    "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+    "movl $32, %%ecx\n" /* loop count */
+
+".second_128_delays:\n"
+    "movd (%%esi), %%mm0\n" /* im0 */
+    "movd 8(%%esi), %%mm2\n" /* im1 */
+    "movd (%%edi), %%mm1\n" /* re0 */
+    "movd -8(%%edi), %%mm3\n" /* re1 */
+
+    "pxor %%mm4, %%mm4\n"
+    "pxor %%mm5, %%mm5\n"
+    "pfsub %%mm1, %%mm4\n" /* -re0 */
+    "pfsub %%mm3, %%mm5\n" /* -re1 */
+
+    "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
+    "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
+
+
+    "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
+    "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
+
+    "addl $-16, %%edx\n"
+
+    "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
+    "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
+
+
+    "movq %%mm1, (%%eax)\n"
+    "movq %%mm3, 8(%%eax)\n"
+    "addl $16, %%esi\n"
+    "addl $-16, %%edi\n"
+    "addl $16, %%eax\n"
+    "decl %%ecx\n"
+    "jnz .second_128_delays\n"
+
+    "popl %%edi\n"
+    "popl %%esi\n"
+    "popl %%edx\n"
+    "popl %%ecx\n"
+    "popl %%ebx\n"
+    "popl %%eax\n"
+
+    "leave\n"
+    "femms\n"
+    ::);
+}
+
diff --git a/plugins/imdct/ac3_imdct_c.c b/plugins/imdct/ac3_imdct_c.c index 3ebf16c9b3..106d7faaaf 100644 --- a/plugins/imdct/ac3_imdct_c.c +++ b/plugins/imdct/ac3_imdct_c.c @@ -2,7 +2,7 @@ * ac3_imdct_c.c: ac3 DCT in C ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_c.c,v 1.1 2001/05/15 16:19:42 sam Exp $ + * $Id: ac3_imdct_c.c,v 1.2 2001/05/16 14:51:29 reno Exp $ * * Authors: Renaud Dartus <reno@videolan.org> * Aaron Holtzman <aholtzma@engr.uvic.ca> @@ -42,6 +42,7 @@ #include "ac3_imdct.h" #include "ac3_imdct_common.h" +#include "ac3_retables.h" #ifndef M_PI # define M_PI 3.14159265358979323846 @@ -50,65 +51,6 @@ void _M( fft_64p ) ( complex_t *x ); void _M( fft_128p ) ( complex_t *x ); -static float window[] = { - 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, - 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, - 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061, - 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121, - 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770, - 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153, - 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389, - 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563, - 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699, - 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757, - 0.25574, 0.26404,
0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626, - 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126, - 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019, - 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031, - 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873, - 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269, - 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981, - 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831, - 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716, - 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610, - 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560, - 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674, - 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099, - 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994, - 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513, - 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788, - 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919, - 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974, - 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993, - 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999, - 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, - 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 -}; - -static const int pm128[128] = -{ - 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, - 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, - 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, - 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, - 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, - 5, 21, 37, 
53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, - 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, - 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 -}; - -static const int pm64[64] = -{ - 0, 8, 16, 24, 32, 40, 48, 56, - 4, 20, 36, 52, 12, 28, 44, 60, - 2, 10, 18, 26, 34, 42, 50, 58, - 6, 14, 22, 30, 38, 46, 54, 62, - 1, 9, 17, 25, 33, 41, 49, 57, - 5, 21, 37, 53, 13, 29, 45, 61, - 3, 11, 19, 27, 35, 43, 51, 59, - 7, 23, 39, 55, 15, 31, 47, 63 -}; - void _M( imdct_init ) (imdct_t * p_imdct) { int i; diff --git a/plugins/imdct/ac3_imdct_common.c b/plugins/imdct/ac3_imdct_common.c index 493eca2d12..b5bdc6d64b 100644 --- a/plugins/imdct/ac3_imdct_common.c +++ b/plugins/imdct/ac3_imdct_common.c @@ -2,7 +2,7 @@ * ac3_imdct_common.c: common ac3 DCT functions ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_common.c,v 1.2 2001/05/15 19:36:27 sam Exp $ + * $Id: ac3_imdct_common.c,v 1.3 2001/05/16 14:51:29 reno Exp $ * * Authors: Renaud Dartus <reno@videolan.org> * Aaron Holtzman <aholtzma@engr.uvic.ca> @@ -45,71 +45,13 @@ #include "mtime.h" #include "ac3_imdct.h" +#include "ac3_retables.h" #ifndef M_PI # define M_PI 3.14159265358979323846 #endif -static float window[] = { - 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, - 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, - 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061, - 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121, - 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770, - 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153, - 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389, - 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563, - 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699, - 0.19407, 0.20130, 
0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757, - 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626, - 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126, - 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019, - 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031, - 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873, - 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269, - 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981, - 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831, - 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716, - 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610, - 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560, - 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674, - 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099, - 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994, - 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513, - 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788, - 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919, - 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974, - 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993, - 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999, - 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, - 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 -}; - -static const int pm128[128] = -{ - 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, - 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, - 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, - 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, - 1, 
17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, - 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, - 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, - 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 -}; - -static const int pm64[64] = -{ - 0, 8, 16, 24, 32, 40, 48, 56, - 4, 20, 36, 52, 12, 28, 44, 60, - 2, 10, 18, 26, 34, 42, 50, 58, - 6, 14, 22, 30, 38, 46, 54, 62, - 1, 9, 17, 25, 33, 41, 49, 57, - 5, 21, 37, 53, 13, 29, 45, 61, - 3, 11, 19, 27, 35, 43, 51, 59, - 7, 23, 39, 55, 15, 31, 47, 63 -}; - -void _M( fft_64p ) ( complex_t *a ); +void _M( fft_64p ) ( complex_t *x ); void _M( imdct_do_256 ) (imdct_t * p_imdct, float data[],float delay[]) { @@ -266,4 +208,3 @@ void _M( imdct_do_256_nol ) (imdct_t * p_imdct, float data[], float delay[]) *delay_ptr++ = -buf2[64-i-1].real * *--window_ptr; } } - diff --git a/plugins/imdct/ac3_imdct_common.h b/plugins/imdct/ac3_imdct_common.h index ce0a7ab6d8..3977b2a3ba 100644 --- a/plugins/imdct/ac3_imdct_common.h +++ b/plugins/imdct/ac3_imdct_common.h @@ -2,7 +2,7 @@ * ac3_imdct_common.h: common ac3 DCT headers ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_common.h,v 1.1 2001/05/15 16:19:42 sam Exp $ + * $Id: ac3_imdct_common.h,v 1.2 2001/05/16 14:51:29 reno Exp $ * * Authors: Renaud Dartus <reno@videolan.org> * Aaron Holtzman <aholtzma@engr.uvic.ca> @@ -27,4 +27,3 @@ void _M( imdct_do_256 ) ( imdct_t * p_imdct, float data[], float delay[] ); void _M( imdct_do_256_nol ) ( imdct_t * p_imdct, float data[], float delay[] ); void _M( imdct_do_512 ) ( imdct_t * p_imdct, float data[], float delay[] ); void _M( imdct_do_512_nol ) ( imdct_t * p_imdct, float data[], float delay[] ); - diff --git a/plugins/imdct/ac3_imdct_sse.c b/plugins/imdct/ac3_imdct_sse.c index d426f55a66..a9dad29166 100644 --- a/plugins/imdct/ac3_imdct_sse.c +++ b/plugins/imdct/ac3_imdct_sse.c @@ -2,7 +2,7 @@ * 
ac3_imdct_sse.c: accelerated SSE ac3 DCT ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_sse.c,v 1.1 2001/05/15 16:19:42 sam Exp $ + * $Id: ac3_imdct_sse.c,v 1.2 2001/05/16 14:51:29 reno Exp $ * * Authors: Renaud Dartus <reno@videolan.org> * Aaron Holtzman <aholtzma@engr.uvic.ca> @@ -40,61 +40,15 @@ #include "ac3_imdct.h" #include "ac3_imdct_common.h" - -static const float window[] = { - 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, - 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, - 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061, - 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121, - 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770, - 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153, - 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389, - 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563, - 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699, - 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757, - 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626, - 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126, - 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019, - 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031, - 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873, - 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269, - 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981, - 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831, - 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716, - 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610, - 0.92028, 0.92432, 0.92822, 0.93197, 
0.93558, 0.93906, 0.94240, 0.94560, - 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674, - 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099, - 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994, - 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513, - 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788, - 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919, - 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974, - 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993, - 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999, - 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, - 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 -}; - -static const int pm128[128] = -{ - 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, - 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, - 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, - 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, - 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, - 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, - 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, - 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 -}; +#include "ac3_retables.h" void _M( fft_64p ) ( complex_t *x ); void _M( fft_128p ) ( complex_t *a ); static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse); static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse); -static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt); -static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt); +static void 
imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt); +static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt); void _M( imdct_init ) (imdct_t * p_imdct) @@ -260,7 +214,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse) : "a" (buf), "c" (xcos_sin_sse) ); } -static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt) +static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) { __asm__ __volatile__ ( "pushl %%ebp\n" @@ -448,7 +402,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const fl ::); } -static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt) +static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) { __asm__ __volatile__ ( "pushl %%ebp\n" diff --git a/plugins/imdct/ac3_retables.h b/plugins/imdct/ac3_retables.h new file mode 100644 index 0000000000..50e1d1c55d --- /dev/null +++ b/plugins/imdct/ac3_retables.h @@ -0,0 +1,83 @@ +/***************************************************************************** + * ac3_retables.h: ac3 DCT tables + ***************************************************************************** + * Copyright (C) 1999, 2000 VideoLAN + * $Id: ac3_retables.h,v 1.1 2001/05/16 14:51:29 reno Exp $ + * + * Authors: Renaud Dartus <reno@videolan.org> + * Aaron Holtzman <aholtzma@engr.uvic.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +static float window[] = { + 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, + 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, + 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061, + 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121, + 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770, + 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153, + 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389, + 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563, + 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699, + 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757, + 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626, + 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126, + 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019, + 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031, + 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873, + 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269, + 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981, + 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831, + 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716, + 0.88257, 
0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610, + 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560, + 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674, + 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099, + 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994, + 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513, + 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788, + 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919, + 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974, + 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993, + 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999, + 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, + 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 +}; + +static const int pm128[128] = +{ + 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, + 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, + 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, + 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, + 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, + 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, + 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, + 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 +}; + +static const int pm64[64] = +{ + 0, 8, 16, 24, 32, 40, 48, 56, + 4, 20, 36, 52, 12, 28, 44, 60, + 2, 10, 18, 26, 34, 42, 50, 58, + 6, 14, 22, 30, 38, 46, 54, 62, + 1, 9, 17, 25, 33, 41, 49, 57, + 5, 21, 37, 53, 13, 29, 45, 61, + 3, 11, 19, 27, 35, 43, 51, 59, + 7, 23, 39, 55, 15, 31, 47, 63 +}; + diff --git a/plugins/imdct/ac3_srfft_3dn.c b/plugins/imdct/ac3_srfft_3dn.c new file mode 100644 index 0000000000..729f0981c8 --- /dev/null +++ 
b/plugins/imdct/ac3_srfft_3dn.c @@ -0,0 +1,344 @@ +/***************************************************************************** + * ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions + ***************************************************************************** + * Copyright (C) 1999, 2000, 2001 VideoLAN + * $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $ + * + * Authors: Renaud Dartus <reno@videolan.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
+ *****************************************************************************/ + +#define MODULE_NAME imdct3dn +#include "modules_inner.h" + +/***************************************************************************** + * Preamble + *****************************************************************************/ +#include <stdio.h> + +#include "defs.h" + +#include <math.h> +#include <stdio.h> + +#include "config.h" +#include "common.h" +#include "threads.h" +#include "mtime.h" + +#include "ac3_imdct.h" +#include "ac3_srfft.h" + +void hsqrt2_3dn (void); +void C_1_3dn (void); +static void fft_4_3dn (complex_t *x); +static void fft_8_3dn (complex_t *x); +static void fft_asmb_3dn (int k, complex_t *x, complex_t *wTB, + const complex_t *d, const complex_t *d_3); + +void _M( fft_64p ) ( complex_t *a ) +{ + fft_8_3dn(&a[0]); fft_4_3dn(&a[8]); fft_4_3dn(&a[12]); + fft_asmb_3dn(2, &a[0], &a[8], &delta16[0], &delta16_3[0]); + + fft_8_3dn(&a[16]); fft_8_3dn(&a[24]); + fft_asmb_3dn(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); + + fft_8_3dn(&a[32]); fft_4_3dn(&a[40]); fft_4_3dn(&a[44]); + fft_asmb_3dn(2, &a[32], &a[40], &delta16[0], &delta16_3[0]); + + fft_8_3dn(&a[48]); fft_4_3dn(&a[56]); fft_4_3dn(&a[60]); + fft_asmb_3dn(2, &a[48], &a[56], &delta16[0], &delta16_3[0]); + + fft_asmb_3dn(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); +} + +void _M( fft_128p ) ( complex_t *a ) +{ + fft_8_3dn(&a[0]); fft_4_3dn(&a[8]); fft_4_3dn(&a[12]); + fft_asmb_3dn(2, &a[0], &a[8], &delta16[0], &delta16_3[0]); + + fft_8_3dn(&a[16]); fft_8_3dn(&a[24]); + fft_asmb_3dn(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); + + fft_8_3dn(&a[32]); fft_4_3dn(&a[40]); fft_4_3dn(&a[44]); + fft_asmb_3dn(2, &a[32], &a[40], &delta16[0], &delta16_3[0]); + + fft_8_3dn(&a[48]); fft_4_3dn(&a[56]); fft_4_3dn(&a[60]); + fft_asmb_3dn(2, &a[48], &a[56], &delta16[0], &delta16_3[0]); + + fft_asmb_3dn(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); + + fft_8_3dn(&a[64]); fft_4_3dn(&a[72]); fft_4_3dn(&a[76]); + /*
fft_16(&a[64]); */ + fft_asmb_3dn(2, &a[64], &a[72], &delta16[0], &delta16_3[0]); + + fft_8_3dn(&a[80]); fft_8_3dn(&a[88]); + + /* fft_32(&a[64]); */ + fft_asmb_3dn(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); + + fft_8_3dn(&a[96]); fft_4_3dn(&a[104]); fft_4_3dn(&a[108]); + /* fft_16(&a[96]); */ + fft_asmb_3dn(2, &a[96], &a[104], &delta16[0], &delta16_3[0]); + + fft_8_3dn(&a[112]); fft_8_3dn(&a[120]); + /* fft_32(&a[96]); */ + fft_asmb_3dn(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); + + /* fft_128(&a[0]); */ + fft_asmb_3dn(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); +} + +void hsqrt2_3dn (void) +{ + __asm__ ( + ".float 0f0.707106781188\n" + ".float 0f0.707106781188\n" + ".float 0f-0.707106781188\n" + ".float 0f-0.707106781188\n" + ); +} + +void C_1_3dn (void) +{ + __asm__ ( + ".float 0f-1.0\n" + ".float 0f1.0\n" + ".float 0f-1.0\n" + ".float 0f1.0\n" + ); +} + +static void fft_4_3dn (complex_t *x) +{ + __asm__ __volatile__ ( + "movq (%%eax), %%mm0\n" /* x[0] */ + "movq 8(%%eax), %%mm1\n" /* x[1] */ + "movq 16(%%eax), %%mm2\n" /* x[2] */ + "movq 24(%%eax), %%mm3\n" /* x[3] */ + "movq %%mm0, %%mm4\n" /* x[0] */ + "movq %%mm1, %%mm5\n" /* x[1] */ + "pfadd %%mm2, %%mm0\n" /* x[0] + x[2] */ + "movq %%mm0, %%mm6\n" /* x[0] + x[2], copied after the add so that x[2] is included */ + "pfadd %%mm3, %%mm1\n" /* x[1] + x[3] */ + "pfsub %%mm2, %%mm4\n" /* x[0] - x[2] */ + "pfsub %%mm3, %%mm5\n" /* x[1] - x[3] */ + + "pfadd %%mm1, %%mm0\n" /* x[0] + x[2] + x[1] + x[3] */ + "pfsub %%mm1, %%mm6\n" /* x[0] + x[2] - x[1] - x[3] */ + + "movq %%mm0, (%%eax)\n" + "movq %%mm6, 16(%%eax)\n" + + "pxor %%mm6, %%mm6\n" + "movq %%mm5, %%mm2\n" /* x[1] - x[3] */ + "movq %%mm4, %%mm3\n" /* x[0] - x[2] */ + "pfsub %%mm5, %%mm6\n" /* x[3] - x[1] */ + + "punpckhdq %%mm2,%%mm2\n" /* x[1] - x[3].im */ + "punpckldq %%mm6,%%mm6\n" /* x[3] - x[1].re */ + "punpckhdq %%mm6,%%mm2\n" /* x[3] - x[1].re, x[1] - x[3].im */ + + "pfsub %%mm2, %%mm4\n" /* x0i-x2i-x3r+x1.r,x0r-x2r-x1i+x3i */ + "pfadd %%mm3, %%mm2\n" /* x0i-x2i+x3r-x1.r,
x0r-x2r+x1i-x3.i */ + + "movq %%mm2, 8(%%eax)\n" /* mm4_2 + mm6_1, mm4_1 + mm5_2 */ + "movq %%mm4, 24(%%eax)\n" /* mm4_2 - mm6_1, mm4_1 - mm5_2 */ + "femms\n" + : "=a" (x) + : "a" (x) : "memory" ); +} + +static void fft_8_3dn (complex_t *x) +{ + register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i; + + wT1_r = x[1].real; + wT1_i = x[1].imag; + wB1_r = x[3].real; + wB1_i = x[3].imag; + + x[1] = x[2]; + x[2] = x[4]; + x[3] = x[6]; + { /* fft_4 */ + register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i; + + yt_r = x[0].real; + yb_r = yt_r - x[2].real; + yt_r += x[2].real; + + u_r = x[1].real; + vi_i = x[3].real - u_r; + u_r += x[3].real; + + u_i = x[1].imag; + vi_r = u_i - x[3].imag; + u_i += x[3].imag; + + yt_i = yt_r; + yt_i += u_r; + x[0].real = yt_i; + yt_r -= u_r; + x[2].real = yt_r; + yt_i = yb_r; + yt_i += vi_r; + x[1].real = yt_i; + yb_r -= vi_r; + x[3].real = yb_r; + + yt_i = x[0].imag; + yb_i = yt_i - x[2].imag; + yt_i += x[2].imag; + + yt_r = yt_i; + yt_r += u_i; + x[0].imag = yt_r; + yt_i -= u_i; + x[2].imag = yt_i; + yt_r = yb_i; + yt_r += vi_i; + x[1].imag = yt_r; + yb_i -= vi_i; + x[3].imag = yb_i; + } + + /* x[0] x[4] */ + wT2_r = x[5].real; + wT2_r += x[7].real; + wT2_r += wT1_r; + wT2_r += wB1_r; + wT2_i = wT2_r; + wT2_r += x[0].real; + wT2_i = x[0].real - wT2_i; + x[0].real = wT2_r; + x[4].real = wT2_i; + + wT2_i = x[5].imag; + wT2_i += x[7].imag; + wT2_i += wT1_i; + wT2_i += wB1_i; + wT2_r = wT2_i; + wT2_r += x[0].imag; + wT2_i = x[0].imag - wT2_i; + x[0].imag = wT2_r; + x[4].imag = wT2_i; + + /* x[2] x[6] */ + wT2_r = x[5].imag; + wT2_r -= x[7].imag; + wT2_r += wT1_i; + wT2_r -= wB1_i; + wT2_i = wT2_r; + wT2_r += x[2].real; + wT2_i = x[2].real - wT2_i; + x[2].real = wT2_r; + x[6].real = wT2_i; + + wT2_i = x[5].real; + wT2_i -= x[7].real; + wT2_i += wT1_r; + wT2_i -= wB1_r; + wT2_r = wT2_i; + wT2_r += x[2].imag; + wT2_i = x[2].imag - wT2_i; + x[2].imag = wT2_i; + x[6].imag = wT2_r; + + /* x[1] x[5] */ + wT2_r = wT1_r; + wT2_r += wB1_i;
+ wT2_r -= x[5].real; + wT2_r -= x[7].imag; + wT2_i = wT1_i; + wT2_i -= wB1_r; + wT2_i -= x[5].imag; + wT2_i += x[7].real; + + wB2_r = wT2_r; + wB2_r += wT2_i; + wT2_i -= wT2_r; + wB2_r *= HSQRT2; + wT2_i *= HSQRT2; + wT2_r = wB2_r; + wB2_r += x[1].real; + wT2_r = x[1].real - wT2_r; + + wB2_i = x[5].real; + x[1].real = wB2_r; + x[5].real = wT2_r; + + wT2_r = wT2_i; + wT2_r += x[1].imag; + wT2_i = x[1].imag - wT2_i; + wB2_r = x[5].imag; + x[1].imag = wT2_r; + x[5].imag = wT2_i; + + /* x[3] x[7] */ + wT1_r -= wB1_i; + wT1_i += wB1_r; + wB1_r = wB2_i - x[7].imag; + wB1_i = wB2_r + x[7].real; + wT1_r -= wB1_r; + wT1_i -= wB1_i; + wB1_r = wT1_r + wT1_i; + wB1_r *= HSQRT2; + wT1_i -= wT1_r; + wT1_i *= HSQRT2; + wB2_r = x[3].real; + wB2_i = wB2_r + wT1_i; + wB2_r -= wT1_i; + x[3].real = wB2_i; + x[7].real = wB2_r; + wB2_i = x[3].imag; + wB2_r = wB2_i + wB1_r; + wB2_i -= wB1_r; + x[3].imag = wB2_i; + x[7].imag = wB2_r; +} + + +static void fft_asmb_3dn (int k, complex_t *x, complex_t *wTB, + const complex_t *d, const complex_t *d_3) +{ + register complex_t *x2k, *x3k, *x4k, *wB; + register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; + + x2k = x + 2 * k; + x3k = x2k + 2 * k; + x4k = x3k + 2 * k; + wB = wTB + 2 * k; + + TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]); + TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); + + --k; + for(;;) { + TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); + TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); + if (!--k) break; + x += 2; + x2k += 2; + x3k += 2; + x4k += 2; + d += 2; + d_3 += 2; + wTB += 2; + wB += 2; + } +} diff --git a/plugins/imdct/ac3_srfft_sse.c b/plugins/imdct/ac3_srfft_sse.c index 2de563b57b..741b2a255d 100644 --- a/plugins/imdct/ac3_srfft_sse.c +++ b/plugins/imdct/ac3_srfft_sse.c @@ -2,7 +2,7 @@ * ac3_srfft_sse.c: accelerated SSE ac3 fft functions ***************************************************************************** * Copyright (C) 1999, 2000, 2001 VideoLAN - * $Id: ac3_srfft_sse.c,v 1.1
2001/05/15 16:19:42 sam Exp $ + * $Id: ac3_srfft_sse.c,v 1.2 2001/05/16 14:51:29 reno Exp $ * * Authors: Renaud Dartus <reno@videolan.org> * Aaron Holtzman <aholtzma@engr.uvic.ca> @@ -43,8 +43,8 @@ #include "ac3_imdct.h" #include "ac3_srfft.h" -void hsqrt2 (void); -void C_1 (void); +void hsqrt2_sse (void); +void C_1_sse (void); static void fft_4_sse (complex_t *x); static void fft_8_sse (complex_t *x); static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, @@ -104,7 +104,7 @@ void _M( fft_128p ) ( complex_t *a ) fft_asmb_sse(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); } -void hsqrt2 (void) +void hsqrt2_sse (void) { __asm__ ( ".float 0f0.707106781188\n" @@ -114,7 +114,7 @@ void hsqrt2 (void) ); } -void C_1 (void) +void C_1_sse (void) { __asm__ ( ".float 0f-1.0\n" @@ -174,7 +174,7 @@ static void fft_8_sse (complex_t *x) "subps %%xmm5, %%xmm7\n" /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */ "movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */ - "movl $hsqrt2, %%ebx\n" + "movl $hsqrt2_sse, %%ebx\n" "movlps 40(%%eax), %%xmm2\n" /* x[5] */ "movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */ "movups %%xmm1, %%xmm3\n" /* x[3] | x[1] */ @@ -191,7 +191,7 @@ static void fft_8_sse (complex_t *x) "movlhps %%xmm6, %%xmm1\n" /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */ "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */ "movups %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */ - "movl $C_1, %%ebx\n" + "movl $C_1_sse, %%ebx\n" "addps %%xmm5, %%xmm1\n" /* u */ "subps %%xmm5, %%xmm3\n" /* v */ "movups %%xmm0, %%xmm2\n" /* yb */ @@ -258,7 +258,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */ "movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */ "shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */ - "movl $C_1, %%edi\n" + "movl 
$C_1_sse, %%edi\n" "movups (%%edi), %%xmm4\n" "mulps %%xmm4, %%xmm7\n" "addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */ @@ -318,7 +318,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */ "mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */ "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */ - "movl $C_1, %%edi\n" + "movl $C_1_sse, %%edi\n" "movups (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */ "movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */ diff --git a/plugins/imdct/imdct3dn.c b/plugins/imdct/imdct3dn.c new file mode 100644 index 0000000000..7432b7164c --- /dev/null +++ b/plugins/imdct/imdct3dn.c @@ -0,0 +1,152 @@ +/***************************************************************************** + * imdct3dn.c : accelerated 3D Now! IMDCT module + ***************************************************************************** + * Copyright (C) 1999, 2000 VideoLAN + * $Id: imdct3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $ + * + * Authors: Gaël Hendryckx <jimmy@via.ecp.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +#define MODULE_NAME imdct3dn +#include "modules_inner.h" + +/***************************************************************************** + * Preamble + *****************************************************************************/ +#include "defs.h" + +#include <stdlib.h> + +#include "config.h" +#include "common.h" +#include "threads.h" +#include "mtime.h" +#include "tests.h" + +#include "ac3_imdct.h" +#include "ac3_imdct_common.h" + +#include "modules.h" + +/***************************************************************************** + * Local and extern prototypes. + *****************************************************************************/ +static void imdct_getfunctions( function_list_t * p_function_list ); +static int imdct_Probe ( probedata_t *p_data ); + +/***************************************************************************** + * Build configuration tree. + *****************************************************************************/ +MODULE_CONFIG_START +ADD_WINDOW( "Configuration for IMDCT module" ) + ADD_COMMENT( "Ha, ha -- nothing to configure yet" ) +MODULE_CONFIG_END + +/***************************************************************************** + * InitModule: get the module structure and configuration. + ***************************************************************************** + * We have to fill psz_name, psz_longname and psz_version. These variables + * will be strdup()ed later by the main application because the module can + * be unloaded later to save memory, and we want to be able to access this + * data even after the module has been unloaded. 
+ *****************************************************************************/ +MODULE_INIT +{ + p_module->psz_name = MODULE_STRING; + p_module->psz_longname = "AC3 IMDCT module"; + p_module->psz_version = VERSION; + + p_module->i_capabilities = MODULE_CAPABILITY_NULL + | MODULE_CAPABILITY_IMDCT; + + return( 0 ); +} + +/***************************************************************************** + * ActivateModule: set the module to a usable state. + ***************************************************************************** + * This function fills the capability functions and the configuration + * structure. Once ActivateModule() has been called, i_usage can + * be set to 0 and calls to NeedModule() can be made to increment it. To unload + * the module, one has to wait until i_usage == 0 and call DeactivateModule(). + *****************************************************************************/ +MODULE_ACTIVATE +{ + p_module->p_functions = malloc( sizeof( module_functions_t ) ); + if( p_module->p_functions == NULL ) + { + return( -1 ); + } + + imdct_getfunctions( &p_module->p_functions->imdct ); + + p_module->p_config = p_config; + + return( 0 ); +} + +/***************************************************************************** + * DeactivateModule: make sure the module can be unloaded. + ***************************************************************************** + * This function must only be called when i_usage == 0. If it successfully + * returns, i_usage can be set to -1 and the module unloaded. Be careful to + * lock usage_lock during the whole process. + *****************************************************************************/ +MODULE_DEACTIVATE +{ + free( p_module->p_functions ); + + return( 0 ); +} + +/* Following functions are local */ + +/***************************************************************************** + * Functions exported as capabilities. They are declared as static so that + * we don't pollute the namespace too much.
+ *****************************************************************************/ +static void imdct_getfunctions( function_list_t * p_function_list ) +{ + p_function_list->pf_probe = imdct_Probe; +#define F p_function_list->functions.imdct + F.pf_imdct_init = _M( imdct_init ); + F.pf_imdct_256 = _M( imdct_do_256 ); + F.pf_imdct_256_nol = _M( imdct_do_256_nol ); + F.pf_imdct_512 = _M( imdct_do_512 ); + F.pf_imdct_512_nol = _M( imdct_do_512_nol ); +#undef F +} + +/***************************************************************************** + * imdct_Probe: returns a preference score (0 without 3D Now!, 999 when TestMethod() matches "imdct3dn", 200 otherwise) + *****************************************************************************/ +static int imdct_Probe( probedata_t *p_data ) +{ + if( !TestCPU( CPU_CAPABILITY_3DNOW ) ) + { + return( 0 ); + } + + if( TestMethod( DOWNMIX_METHOD_VAR, "imdct3dn" ) ) + { + return( 999 ); + } + + /* 3D Now! is available: default score. NOTE(review): the TestMethod() call above reads DOWNMIX_METHOD_VAR, which looks copy-pasted from the downmix plugin; presumably an IMDCT-specific method variable was intended, to be confirmed */ + return( 200 ); +} + diff --git a/src/ac3_decoder/ac3_imdct.c b/src/ac3_decoder/ac3_imdct.c index 639ac439f3..6ad5fbc724 100644 --- a/src/ac3_decoder/ac3_imdct.c +++ b/src/ac3_decoder/ac3_imdct.c @@ -2,7 +2,7 @@ * ac3_imdct.c: ac3 DCT ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct.c,v 1.20 2001/05/15 16:19:42 sam Exp $ + * $Id: ac3_imdct.c,v 1.21 2001/05/16 14:51:29 reno Exp $ * * Authors: Michel Kaempf <maxx@via.ecp.fr> * Aaron Holtzman <aholtzma@engr.uvic.ca> @@ -54,7 +54,7 @@ void imdct_init(imdct_t * p_imdct) int i; float scale = 181.019; - p_imdct->pf_imdct_init( p_imdct ); + p_imdct->pf_imdct_init( p_imdct ); /* More twiddle factors to turn IFFT into IMDCT */ for (i=0; i < 64; i++) { -- 2.25.4