Commit 5f232025 authored by Renaud Dartus's avatar Renaud Dartus

* Add 3D Now! imdct

* Remove kludge for ac3 on MacOS X
parent 1ac785a2
...@@ -26,7 +26,7 @@ PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin dsp/dsp dummy/dummy \ ...@@ -26,7 +26,7 @@ PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin dsp/dsp dummy/dummy \
dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gnome/gnome gtk/gtk \ dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gnome/gnome gtk/gtk \
downmix/downmix downmix/downmixsse downmix/downmix3dn \ downmix/downmix downmix/downmixsse downmix/downmix3dn \
idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext \ idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext \
imdct/imdct imdct/imdctsse \ imdct/imdct imdct/imdct3dn imdct/imdctsse \
macosx/macosx mga/mga \ macosx/macosx mga/mga \
motion/motion motion/motionmmx motion/motionmmxext \ motion/motion motion/motionmmx motion/motionmmxext \
mpeg/es mpeg/ps mpeg/ts null/null qt/qt sdl/sdl \ mpeg/es mpeg/ps mpeg/ts null/null qt/qt sdl/sdl \
...@@ -317,12 +317,7 @@ endif ...@@ -317,12 +317,7 @@ endif
$(C_OBJ): %.o: Makefile.opts Makefile.dep Makefile $(C_OBJ): %.o: Makefile.opts Makefile.dep Makefile
$(C_OBJ): %.o: .dep/%.d $(C_OBJ): %.o: .dep/%.d
$(C_OBJ): %.o: %.c $(C_OBJ): %.o: %.c
ifneq (,$(findstring darwin,$(SYS)))
#this is uglier of all
@if test "src/ac3_decoder/ac3_imdct.c" = "$<"; then $(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<; echo "(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<"; else $(CC) $(CFLAGS) -c -o $@ $<; echo "$(CC) $(CFLAGS) -c -o $@ $<"; fi
else
$(CC) $(CFLAGS) -c -o $@ $< $(CC) $(CFLAGS) -c -o $@ $<
endif
$(CPP_OBJ): %.o: Makefile.opts Makefile.dep Makefile $(CPP_OBJ): %.o: Makefile.opts Makefile.dep Makefile
$(CPP_OBJ): %.o: .dep/%.dpp $(CPP_OBJ): %.o: .dep/%.dpp
......
...@@ -3285,7 +3285,7 @@ int main() { ...@@ -3285,7 +3285,7 @@ int main() {
EOF EOF
if { (eval echo configure:3287: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then if { (eval echo configure:3287: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest* rm -rf conftest*
ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse" ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse"
echo "$ac_t""yes" 1>&6 echo "$ac_t""yes" 1>&6
else else
echo "configure: failed program was:" >&5 echo "configure: failed program was:" >&5
......
...@@ -162,7 +162,7 @@ AC_TRY_COMPILE([void quux(){void *p;asm("packuswb %%mm1,%%mm2"::"r"(p));}],, ...@@ -162,7 +162,7 @@ AC_TRY_COMPILE([void quux(){void *p;asm("packuswb %%mm1,%%mm2"::"r"(p));}],,
AC_MSG_CHECKING([if \$CC groks MMX EXT or SSE inline assembly]) AC_MSG_CHECKING([if \$CC groks MMX EXT or SSE inline assembly])
AC_TRY_COMPILE([void quux(){void *p;asm("maskmovq %%mm1,%%mm2"::"r"(p));}],, AC_TRY_COMPILE([void quux(){void *p;asm("maskmovq %%mm1,%%mm2"::"r"(p));}],,
ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse" ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse"
AC_MSG_RESULT(yes), AC_MSG_RESULT(no)) AC_MSG_RESULT(yes), AC_MSG_RESULT(no))
dnl dnl
......
...@@ -9,15 +9,18 @@ ...@@ -9,15 +9,18 @@
PLUGIN_IMDCT = imdct.o ac3_imdct_c.o ac3_srfft_c.o PLUGIN_IMDCT = imdct.o ac3_imdct_c.o ac3_srfft_c.o
PLUGIN_IMDCTSSE = imdctsse.o ac3_imdct_sse.o ac3_srfft_sse.o PLUGIN_IMDCTSSE = imdctsse.o ac3_imdct_sse.o ac3_srfft_sse.o
PLUGIN_IMDCT3DN = imdct3dn.o ac3_imdct_3dn.o ac3_srfft_3dn.o
PLUGIN_IMDCTCOMMON = ac3_imdct_common.o PLUGIN_IMDCTCOMMON = ac3_imdct_common.o
BUILTIN_IMDCT = $(PLUGIN_IMDCT:%.o=BUILTIN_IMDCT_%.o) \ BUILTIN_IMDCT = $(PLUGIN_IMDCT:%.o=BUILTIN_IMDCT_%.o) \
$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT_%.o) $(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT_%.o)
BUILTIN_IMDCTSSE = $(PLUGIN_IMDCTSSE:%.o=BUILTIN_IMDCTSSE_%.o) \ BUILTIN_IMDCTSSE = $(PLUGIN_IMDCTSSE:%.o=BUILTIN_IMDCTSSE_%.o) \
$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCTSSE_%.o) $(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCTSSE_%.o)
BUILTIN_IMDCT3DN = $(PLUGIN_IMDCT3DN:%.o=BUILTIN_IMDCT3DN_%.o) \
$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT3DN_%.o)
PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCTCOMMON) PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON)
ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE) ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE) $(BUILTIN_IMDCT3DN)
# #
# Virtual targets # Virtual targets
...@@ -33,6 +36,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: .dep/%.d ...@@ -33,6 +36,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: .dep/%.d
$(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdctsse -c -o $@ $< $(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdctsse -c -o $@ $<
$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: .dep/%.d
$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: %.c
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdct3dn -c -o $@ $<
# #
# Real targets # Real targets
# #
...@@ -51,3 +58,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c ...@@ -51,3 +58,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c
ar r $@ $^ ar r $@ $^
$(RANLIB) $@ $(RANLIB) $@
../../lib/imdct3dn.so: $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON)
$(CC) $(PCFLAGS) -o $@ $^ $(PLCFLAGS)
../../lib/imdct3dn.a: $(BUILTIN_IMDCT3DN)
ar r $@ $^
$(RANLIB) $@
/*****************************************************************************
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#define MODULE_NAME imdct3dn
#include "modules_inner.h"
/*****************************************************************************
* Preamble
*****************************************************************************/
#include "defs.h"
#include <math.h>
#include <stdio.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "ac3_imdct.h"
#include "ac3_imdct_common.h"
#include "ac3_retables.h"
void _M( fft_64p ) ( complex_t *x );
void _M( fft_128p ) ( complex_t *a );
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
#ifndef M_PI
#   define M_PI 3.14159265358979323846
#endif

/*****************************************************************************
 * imdct_init: build the twiddle table used by the 3D Now! 512-sample IMDCT
 *****************************************************************************
 * Fills p_imdct->xcos_sin_sse with 128 groups of four floats. For group i,
 * with c = cos(a) * scale, s = sin(a) * scale and a = 2*pi*(8i+1)/(8N):
 *
 *     [4i] = c    [4i+1] = -s    [4i+2] = -s    [4i+3] = -c
 *
 * This is the layout the pre/post IFFT twiddle routines of this file read
 * with 8-byte loads.
 *
 * N comes from the included headers (not visible in this file).
 * NOTE(review): scale is presumably 128 * sqrt(2) (~181.019) - confirm.
 *****************************************************************************/
void _M( imdct_init ) (imdct_t * p_imdct)
{
    int i;
    float scale = 181.019;

    /* No diagnostic output here: this runs inside a decoder plugin and must
     * not write to stderr on every initialisation. */
    for (i = 0; i < 128; i++)
    {
        float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
        float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;

        p_imdct->xcos_sin_sse[i * 4]     =  xcos_i;
        p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
        p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
        p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
    }
}
/*****************************************************************************
 * imdct_do_512: 512-sample transform, output combined with the delay buffer
 *****************************************************************************
 * Runs the four stages in order on p_imdct->buf: pre-IFFT twiddle (input
 * taken from data through the pm128 index table), 128-point FFT, post-IFFT
 * twiddle, then the windowing stage that writes data[] and delay[].
 *****************************************************************************/
void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
{
    complex_t *p_work    = p_imdct->buf;
    float     *p_twiddle = p_imdct->xcos_sin_sse;

    /* Stage 1: permuted load + pre-twiddle into the work buffer */
    imdct512_pre_ifft_twiddle_3dn (pm128, p_work, data, p_twiddle);

    /* Stage 2: in-place 128-point FFT */
    _M( fft_128p ) (p_work);

    /* Stage 3: post-twiddle, in place */
    imdct512_post_ifft_twiddle_3dn (p_work, p_twiddle);

    /* Stage 4: window, add the previous delay, store the new delay */
    imdct512_window_delay_3dn (p_work, data, window, delay);
}
/*****************************************************************************
 * imdct_do_512_nol: 512-sample transform, delay buffer not added to output
 *****************************************************************************
 * Same pipeline as imdct_do_512, except that the last stage uses the "nol"
 * windowing routine, which stores the windowed samples to data[] without
 * adding the previous delay[] contents.
 *****************************************************************************/
void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
{
    complex_t *p_work    = p_imdct->buf;
    float     *p_twiddle = p_imdct->xcos_sin_sse;

    /* Stage 1: permuted load + pre-twiddle into the work buffer */
    imdct512_pre_ifft_twiddle_3dn (pm128, p_work, data, p_twiddle);

    /* Stage 2: in-place 128-point FFT */
    _M( fft_128p ) (p_work);

    /* Stage 3: post-twiddle, in place */
    imdct512_post_ifft_twiddle_3dn (p_work, p_twiddle);

    /* Stage 4: window only, then store the new delay */
    imdct512_window_delay_nol_3dn (p_work, data, window, delay);
}
/*****************************************************************************
 * imdct512_pre_ifft_twiddle_3dn: pre-IFFT twiddle for the 512-sample IMDCT
 *****************************************************************************
 * Runs 128 iterations. Each iteration:
 *  - reads the next index j from pmt,
 *  - loads data[2j] and data[255 - 2j],
 *  - multiplies them by the four table floats xcos_sin_sse[4j .. 4j+3]
 *    (pfmul) and sums the two products (pfadd),
 *  - stores the resulting 8 bytes to the next slot of buf.
 * The block ends with femms.
 *
 * NOTE(review): the asm statement declares no operands and no clobbers. It
 * builds its own %ebp frame and fetches the four arguments from 8, 12, 16
 * and 20(%ebp). That appears to require that the compiler has neither set up
 * a frame pointer nor moved %esp before the asm, and that this function is
 * never inlined - confirm against the build flags.
 * NOTE(review): ".loop" is an ordinary assembler label, so a second copy of
 * this asm in the same object would presumably clash - verify.
 * %edi is saved and restored but not otherwise used in this routine.
 *****************************************************************************/
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
/* hand-made frame: one 4-byte local at -4(%ebp) holds the loop counter */
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%edi\n"
"pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */
"movl 12(%%ebp), %%ebx\n" /* buf */
"movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n"
".loop:\n"
/* %esi = j; scaled by 8 bytes it addresses data[2j] */
"movl (%%eax), %%esi\n"
"movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
"punpckldq %%mm1, %%mm1\n" /* 2j | 2j */
/* %esi = 2j; scaled by 8 bytes it addresses the 16-byte table group j */
"shll $1, %%esi\n"
"movq (%%edx, %%esi, 8), %%mm0\n" /* -s_j | c_j */
"movq 8(%%edx, %%esi, 8), %%mm2\n" /* -c_j | -s_j */
/* %esi = -2j; 1020 - 8j bytes addresses data[255 - 2j] */
"negl %%esi\n"
"movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
"punpckldq %%mm4, %%mm4\n" /* 255-2j | 255-2j */
"addl $4, %%eax\n"
"pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
"pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
"addl $8, %%ebx\n"
"pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
/* %ebx was already advanced, hence the -8 displacement */
"movq %%mm0, -8(%%ebx)\n"
"decl -4(%%ebp)\n"
"jnz .loop\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"addl $4, %%esp\n"
"popl %%ebp\n"
"femms\n"
::);
}
/*****************************************************************************
 * imdct512_post_ifft_twiddle_3dn: post-IFFT twiddle for the 512-sample IMDCT
 *****************************************************************************
 * Rewrites buf in place. The loop runs 64 times and handles two complex
 * values (16 bytes of buf) and two table groups (32 bytes of xcos_sin_sse)
 * per pass. For each value (re, im) and table group (c, -s, -s, -c) it
 * stores:
 *
 *     low  float = -c * re - s * im
 *     high float = -s * re + c * im
 *
 * Operand notes:
 *  - buf (%eax) and xcos_sin_sse (%ecx) are both advanced by the loop, so
 *    both are declared read-write ("+"); an input-only operand must not be
 *    modified by the asm.
 *  - "memory" tells the compiler that buf is read and written through the
 *    pointer; "cc" covers the flags changed by addl/decl.
 *  - %ebx is saved and restored by hand rather than listed as a clobber.
 *  - Numeric local labels (1: / 1b) keep the asm valid if the compiler emits
 *    this static function more than once in the same object.
 * The block ends with femms, as the other 3D Now! routines of this file do.
 *****************************************************************************/
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
{
    __asm__ __volatile__ (
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"         /* loop counter */

    "1:\n"
    "movq (%%eax), %%mm0\n"     /* im0 | re0 */
    "movq %%mm0, %%mm1\n"       /* im0 | re0 */
    "punpckldq %%mm0, %%mm0\n"  /* re0 | re0 */
    "punpckhdq %%mm1, %%mm1\n"  /* im0 | im0 */

    "movq (%%ecx), %%mm2\n"     /* -s | c */
    "movq 8(%%ecx), %%mm3\n"    /* -c | -s */
    "movq %%mm3, %%mm4\n"

    "punpckhdq %%mm2,%%mm3\n"   /* -s | -c */
    "punpckldq %%mm2,%%mm4\n"   /* c | -s */

    "movq 8(%%eax), %%mm2\n"    /* im1 | re1 */
    "movq %%mm2, %%mm5\n"       /* im1 | re1 */
    "punpckldq %%mm2, %%mm2\n"  /* re1 | re1 */
    "punpckhdq %%mm5, %%mm5\n"  /* im1 | im1 */

    "pfmul %%mm3, %%mm0\n"      /* -s * re0 | -c * re0 */
    "pfmul %%mm4, %%mm1\n"      /* c * im0 | -s * im0 */

    "movq 16(%%ecx), %%mm6\n"   /* -s1 | c1 */
    "movq 24(%%ecx), %%mm7\n"   /* -c1 | -s1 */
    "movq %%mm7, %%mm4\n"

    "punpckhdq %%mm6,%%mm7\n"   /* -s1 | -c1 */
    "punpckldq %%mm6,%%mm4\n"   /* c1 | -s1 */

    "pfmul %%mm7, %%mm2\n"      /* -s1*re1 | -c1*re1 */
    "pfmul %%mm4, %%mm5\n"      /* c1*im1 | -s1*im1 */

    "pfadd %%mm1, %%mm0\n"      /* -s * re0 + c * im0 | -c * re0 - s * im0 */
    "pfadd %%mm5, %%mm2\n"      /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */

    "movq %%mm0, (%%eax)\n"
    "movq %%mm2, 8(%%eax)\n"
    "addl $32, %%ecx\n"         /* two table groups consumed */
    "addl $16, %%eax\n"         /* two complex values consumed */
    "decl %%ebx\n"
    "jnz 1b\n"

    "popl %%ebx\n"
    "femms\n"
    : "+a" (buf), "+c" (xcos_sin_sse)
    :
    : "memory", "cc" );
}
/*****************************************************************************
 * imdct512_window_delay_3dn: window the IFFT output, add and refresh delay
 *****************************************************************************
 * Four loops of 32 iterations; every iteration handles four floats.
 *  - Loops 1 and 2 write data[0..255]: each output is a (possibly negated)
 *    component of buf multiplied by the window and summed with the matching
 *    delay float. The window and delay pointers walk forwards.
 *  - Loops 3 and 4 write delay[0..255]: each output is a (possibly negated)
 *    component of buf multiplied by the window. The window pointer walks
 *    backwards from where loop 2 left it; nothing is added.
 * buf is read from both ends in every loop (one pointer steps +16 bytes, the
 * other -16 bytes per iteration). The block ends with femms.
 *
 * NOTE(review): the asm statement declares no operands and no clobbers. It
 * builds its own %ebp frame and fetches the four arguments from 8, 12, 16
 * and 20(%ebp). That appears to require that the compiler has neither set up
 * a frame pointer nor moved %esp before the asm, and that this function is
 * never inlined - confirm against the build flags.
 * NOTE(review): the labels are ordinary assembler labels, so a second copy
 * of this asm in the same object would presumably clash - verify.
 *****************************************************************************/
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $32, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
/* loop 1: data[0..127] = window * (-im going up, re going down) + delay */
".first_128_samples:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
/* negate by subtracting from zero */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -im0 */
"pfsub %%mm2, %%mm5\n" /* -im1 */
"punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
"punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"movq (%%ebx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
"pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
"pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
"pfadd %%mm2, %%mm0\n" /* w1*re0+d1 | -w0*im0+d0 */
"pfadd %%mm3, %%mm1\n" /* w3*re1+d3 | -w2*im1+d2 */
"addl $16, %%edx\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%ebx\n"
"addl $16, %%esi\n"
"addl $16, %%eax\n"
"addl $-16, %%edi\n"
"decl %%ecx\n"
"jnz .first_128_samples\n"
/* loop 2: data[128..255] = window * (-re going up, im going down) + delay */
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".second_128_samples:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* buf[127-i].im */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"movq (%%ebx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
"addl $16, %%esi\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"pfadd %%mm2, %%mm0\n" /* w1*im0+d1 | -w0*re0+d0 */
"pfadd %%mm3, %%mm1\n" /* w3*im1+d3 | -w2*re1+d2 */
"addl $-16, %%edi\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%edx\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"decl %%ecx\n"
"jnz .second_128_samples\n"
/* loop 3: delay[0..127]; %eax now points at delay, %edx walks backwards */
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".first_128_delay:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* im0 */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm0\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .first_128_delay\n"
/* loop 4: delay[128..255]; %eax and %edx carry on from loop 3 */
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
".second_128_delay:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm1, %%mm4\n" /* -re0 */
"pfsub %%mm3, %%mm5\n" /* -re1 */
"punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
"punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm3\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
"pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
"movq %%mm1, (%%eax)\n"
"movq %%mm3, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .second_128_delay\n"
/* six pushes matched by six pops, so %esp == %ebp again before leave */
"popl %%edi\n"
"popl %%esi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"leave\n"
"femms\n"
::);
}
/*****************************************************************************
 * imdct512_window_delay_nol_3dn: window the IFFT output, refresh delay only
 *****************************************************************************
 * Same structure as imdct512_window_delay_3dn (four loops of 32 iterations,
 * four floats per iteration), except that loops 1 and 2 store the windowed
 * values to data[0..255] without loading or adding the delay buffer.
 * Loops 3 and 4 write delay[0..255] while the window pointer walks backwards.
 * %ebx is loaded with the delay pointer and advanced in loops 1 and 2 but is
 * never dereferenced there. The block ends with femms.
 *
 * NOTE(review): the asm statement declares no operands and no clobbers. It
 * builds its own %ebp frame and fetches the four arguments from 8, 12, 16
 * and 20(%ebp). That appears to require that the compiler has neither set up
 * a frame pointer nor moved %esp before the asm, and that this function is
 * never inlined - confirm against the build flags.
 * NOTE(review): the labels are ordinary assembler labels, so a second copy
 * of this asm in the same object would presumably clash - verify.
 *****************************************************************************/
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $32, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
/* loop 1: data[0..127] = window * (-im going up, re going down) */
".first_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
/* negate by subtracting from zero */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -im0 */
"pfsub %%mm2, %%mm5\n" /* -im1 */
"punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
"punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
"pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
"addl $16, %%edx\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%ebx\n"
"addl $16, %%esi\n"
"addl $16, %%eax\n"
"addl $-16, %%edi\n"
"decl %%ecx\n"
"jnz .first_128_samples2\n"
/* loop 2: data[128..255] = window * (-re going up, im going down) */
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".second_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* buf[127-i].im */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"addl $16, %%esi\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"addl $-16, %%edi\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%edx\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"decl %%ecx\n"
"jnz .second_128_samples2\n"
/* loop 3: delay[0..127]; %eax now points at delay, %edx walks backwards */
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".first_128_delays:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* im0 */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm0\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .first_128_delays\n"
/* loop 4: delay[128..255]; %eax and %edx carry on from loop 3 */
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
".second_128_delays:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm1, %%mm4\n" /* -re0 */
"pfsub %%mm3, %%mm5\n" /* -re1 */
"punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
"punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm3\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
"pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
"movq %%mm1, (%%eax)\n"
"movq %%mm3, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .second_128_delays\n"
/* six pushes matched by six pops, so %esp == %ebp again before leave */
"popl %%edi\n"
"popl %%esi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"leave\n"
"femms\n"
::);
}
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct_c.c: ac3 DCT in C * ac3_imdct_c.c: ac3 DCT in C
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_c.c,v 1.1 2001/05/15 16:19:42 sam Exp $ * $Id: ac3_imdct_c.c,v 1.2 2001/05/16 14:51:29 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#include "ac3_imdct.h" #include "ac3_imdct.h"
#include "ac3_imdct_common.h" #include "ac3_imdct_common.h"
#include "ac3_retables.h"
#ifndef M_PI #ifndef M_PI
# define M_PI 3.14159265358979323846 # define M_PI 3.14159265358979323846
...@@ -50,65 +51,6 @@ ...@@ -50,65 +51,6 @@
void _M( fft_64p ) ( complex_t *x ); void _M( fft_64p ) ( complex_t *x );
void _M( fft_128p ) ( complex_t *x ); void _M( fft_128p ) ( complex_t *x );
static float window[] = {
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
};
static const int pm128[128] =
{
0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
static const int pm64[64] =
{
0, 8, 16, 24, 32, 40, 48, 56,
4, 20, 36, 52, 12, 28, 44, 60,
2, 10, 18, 26, 34, 42, 50, 58,
6, 14, 22, 30, 38, 46, 54, 62,
1, 9, 17, 25, 33, 41, 49, 57,
5, 21, 37, 53, 13, 29, 45, 61,
3, 11, 19, 27, 35, 43, 51, 59,
7, 23, 39, 55, 15, 31, 47, 63
};
void _M( imdct_init ) (imdct_t * p_imdct) void _M( imdct_init ) (imdct_t * p_imdct)
{ {
int i; int i;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct_common.c: common ac3 DCT functions * ac3_imdct_common.c: common ac3 DCT functions
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_common.c,v 1.2 2001/05/15 19:36:27 sam Exp $ * $Id: ac3_imdct_common.c,v 1.3 2001/05/16 14:51:29 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -45,71 +45,13 @@ ...@@ -45,71 +45,13 @@
#include "mtime.h" #include "mtime.h"
#include "ac3_imdct.h" #include "ac3_imdct.h"
#include "ac3_retables.h"
#ifndef M_PI #ifndef M_PI
# define M_PI 3.14159265358979323846 # define M_PI 3.14159265358979323846
#endif #endif
static float window[] = { void _M( fft_64p ) ( complex_t *x );
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
};
static const int pm128[128] =
{
0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
static const int pm64[64] =
{
0, 8, 16, 24, 32, 40, 48, 56,
4, 20, 36, 52, 12, 28, 44, 60,
2, 10, 18, 26, 34, 42, 50, 58,
6, 14, 22, 30, 38, 46, 54, 62,
1, 9, 17, 25, 33, 41, 49, 57,
5, 21, 37, 53, 13, 29, 45, 61,
3, 11, 19, 27, 35, 43, 51, 59,
7, 23, 39, 55, 15, 31, 47, 63
};
void _M( fft_64p ) ( complex_t *a );
void _M( imdct_do_256 ) (imdct_t * p_imdct, float data[],float delay[]) void _M( imdct_do_256 ) (imdct_t * p_imdct, float data[],float delay[])
{ {
...@@ -266,4 +208,3 @@ void _M( imdct_do_256_nol ) (imdct_t * p_imdct, float data[], float delay[]) ...@@ -266,4 +208,3 @@ void _M( imdct_do_256_nol ) (imdct_t * p_imdct, float data[], float delay[])
*delay_ptr++ = -buf2[64-i-1].real * *--window_ptr; *delay_ptr++ = -buf2[64-i-1].real * *--window_ptr;
} }
} }
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct_common.h: common ac3 DCT headers * ac3_imdct_common.h: common ac3 DCT headers
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_common.h,v 1.1 2001/05/15 16:19:42 sam Exp $ * $Id: ac3_imdct_common.h,v 1.2 2001/05/16 14:51:29 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -27,4 +27,3 @@ void _M( imdct_do_256 ) ( imdct_t * p_imdct, float data[], float delay[] ); ...@@ -27,4 +27,3 @@ void _M( imdct_do_256 ) ( imdct_t * p_imdct, float data[], float delay[] );
void _M( imdct_do_256_nol ) ( imdct_t * p_imdct, float data[], float delay[] ); void _M( imdct_do_256_nol ) ( imdct_t * p_imdct, float data[], float delay[] );
void _M( imdct_do_512 ) ( imdct_t * p_imdct, float data[], float delay[] ); void _M( imdct_do_512 ) ( imdct_t * p_imdct, float data[], float delay[] );
void _M( imdct_do_512_nol ) ( imdct_t * p_imdct, float data[], float delay[] ); void _M( imdct_do_512_nol ) ( imdct_t * p_imdct, float data[], float delay[] );
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct_sse.c: accelerated SSE ac3 DCT * ac3_imdct_sse.c: accelerated SSE ac3 DCT
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.1 2001/05/15 16:19:42 sam Exp $ * $Id: ac3_imdct_sse.c,v 1.2 2001/05/16 14:51:29 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -40,61 +40,15 @@ ...@@ -40,61 +40,15 @@
#include "ac3_imdct.h" #include "ac3_imdct.h"
#include "ac3_imdct_common.h" #include "ac3_imdct_common.h"
#include "ac3_retables.h"
static const float window[] = {
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
};
/*
 * pm128: index permutation table with 128 entries; every value in 0..127
 * appears exactly once.
 * NOTE(review): presumably the sample reordering applied ahead of the
 * 128-point FFT -- confirm against the code that indexes this table.
 */
static const int pm128[128] =
{
0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
void _M( fft_64p ) ( complex_t *x ); void _M( fft_64p ) ( complex_t *x );
void _M( fft_128p ) ( complex_t *a ); void _M( fft_128p ) ( complex_t *a );
static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse); static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse); static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse);
static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt); static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt); static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
void _M( imdct_init ) (imdct_t * p_imdct) void _M( imdct_init ) (imdct_t * p_imdct)
...@@ -260,7 +214,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse) ...@@ -260,7 +214,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
: "a" (buf), "c" (xcos_sin_sse) ); : "a" (buf), "c" (xcos_sin_sse) );
} }
static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt) static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebp\n" "pushl %%ebp\n"
...@@ -448,7 +402,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const fl ...@@ -448,7 +402,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, const fl
::); ::);
} }
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, const float *window_prt, float *delay_prt) static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebp\n" "pushl %%ebp\n"
......
/*****************************************************************************
* ac3_retables.h: ac3 DCT tables
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_retables.h,v 1.1 2001/05/16 14:51:29 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/*
 * window: 256 single-precision coefficients rising monotonically from
 * 0.00014 to 1.00000 (32 rows of 8 values).
 * NOTE(review): presumably the AC-3 IMDCT synthesis window applied during
 * the window/overlap step -- confirm against the users of this table.
 * NOTE(review): the array is not const-qualified, so it can be passed to
 * functions taking a plain `float *`; it is not written to anywhere in the
 * code visible here.
 */
static float window[] = {
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
};
/*
 * pm128: index permutation table with 128 entries; every value in 0..127
 * appears exactly once.
 * NOTE(review): presumably the sample reordering applied ahead of the
 * 128-point FFT -- confirm against the code that indexes this table.
 */
static const int pm128[128] =
{
0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
/*
 * pm64: index permutation table with 64 entries; every value in 0..63
 * appears exactly once.
 * NOTE(review): presumably the counterpart of pm128 for the 64-point FFT
 * path -- confirm against the code that indexes this table.
 */
static const int pm64[64] =
{
0, 8, 16, 24, 32, 40, 48, 56,
4, 20, 36, 52, 12, 28, 44, 60,
2, 10, 18, 26, 34, 42, 50, 58,
6, 14, 22, 30, 38, 46, 54, 62,
1, 9, 17, 25, 33, 41, 49, 57,
5, 21, 37, 53, 13, 29, 45, 61,
3, 11, 19, 27, 35, 43, 51, 59,
7, 23, 39, 55, 15, 31, 47, 63
};
/*****************************************************************************
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#define MODULE_NAME imdct3dn
#include "modules_inner.h"
/*****************************************************************************
* Preamble
*****************************************************************************/
#include <stdio.h>
#include "defs.h"
#include <math.h>
#include <stdio.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "ac3_imdct.h"
#include "ac3_srfft.h"
/* Symbols whose bodies consist only of inline-asm `.float` directives
 * (defined further down in this file). */
void hsqrt2_3dn (void);
void C_1_3dn (void);
/* File-local transform kernels: in-place 4- and 8-point stages, and the
 * step that merges already-transformed sub-blocks using the twiddle tables
 * passed as d / d_3. */
static void fft_4_3dn (complex_t *x);
static void fft_8_3dn (complex_t *x);
static void fft_asmb_3dn (int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3);
void _M( fft_64p ) ( complex_t *a )
{
    /*
     * In-place 64-point transform of a[0..63], built bottom-up: 8- and
     * 4-point kernels are run on consecutive sub-blocks and then merged by
     * fft_asmb_3dn with the matching delta tables.
     */

    /* a[0..15] */
    fft_8_3dn(a);
    fft_4_3dn(a + 8);
    fft_4_3dn(a + 12);
    fft_asmb_3dn(2, a, a + 8, &delta16[0], &delta16_3[0]);

    /* a[16..31], then merge into a[0..31] */
    fft_8_3dn(a + 16);
    fft_8_3dn(a + 24);
    fft_asmb_3dn(4, a, a + 16, &delta32[0], &delta32_3[0]);

    /* a[32..47] */
    fft_8_3dn(a + 32);
    fft_4_3dn(a + 40);
    fft_4_3dn(a + 44);
    fft_asmb_3dn(2, a + 32, a + 40, &delta16[0], &delta16_3[0]);

    /* a[48..63] */
    fft_8_3dn(a + 48);
    fft_4_3dn(a + 56);
    fft_4_3dn(a + 60);
    fft_asmb_3dn(2, a + 48, a + 56, &delta16[0], &delta16_3[0]);

    /* final merge over a[0..63] */
    fft_asmb_3dn(8, a, a + 32, &delta64[0], &delta64_3[0]);
}
void _M( fft_128p ) ( complex_t *a )
{
    /*
     * In-place 128-point transform of a[0..127]. The first half repeats the
     * 64-point schedule on a[0..63]; the second half builds two 32-point
     * blocks at a[64] and a[96]; a last merge spans the whole buffer.
     */

    /* ---- a[0..63] ---- */
    fft_8_3dn(a);
    fft_4_3dn(a + 8);
    fft_4_3dn(a + 12);
    fft_asmb_3dn(2, a, a + 8, &delta16[0], &delta16_3[0]);

    fft_8_3dn(a + 16);
    fft_8_3dn(a + 24);
    fft_asmb_3dn(4, a, a + 16, &delta32[0], &delta32_3[0]);

    fft_8_3dn(a + 32);
    fft_4_3dn(a + 40);
    fft_4_3dn(a + 44);
    fft_asmb_3dn(2, a + 32, a + 40, &delta16[0], &delta16_3[0]);

    fft_8_3dn(a + 48);
    fft_4_3dn(a + 56);
    fft_4_3dn(a + 60);
    fft_asmb_3dn(2, a + 48, a + 56, &delta16[0], &delta16_3[0]);

    fft_asmb_3dn(8, a, a + 32, &delta64[0], &delta64_3[0]);

    /* ---- a[64..95] ---- */
    fft_8_3dn(a + 64);
    fft_4_3dn(a + 72);
    fft_4_3dn(a + 76);
    /* completes the 16-point block at a[64] */
    fft_asmb_3dn(2, a + 64, a + 72, &delta16[0], &delta16_3[0]);

    fft_8_3dn(a + 80);
    fft_8_3dn(a + 88);
    /* completes the 32-point block at a[64] */
    fft_asmb_3dn(4, a + 64, a + 80, &delta32[0], &delta32_3[0]);

    /* ---- a[96..127] ---- */
    fft_8_3dn(a + 96);
    fft_4_3dn(a + 104);
    fft_4_3dn(a + 108);
    /* completes the 16-point block at a[96] */
    fft_asmb_3dn(2, a + 96, a + 104, &delta16[0], &delta16_3[0]);

    fft_8_3dn(a + 112);
    fft_8_3dn(a + 120);
    /* completes the 32-point block at a[96] */
    fft_asmb_3dn(4, a + 96, a + 112, &delta32[0], &delta32_3[0]);

    /* ---- final 128-point merge over a[0..127] ---- */
    fft_asmb_3dn(16, a, a + 64, &delta128[0], &delta128_3[0]);
}
/*
 * hsqrt2_3dn: not ordinary code -- the body only emits four single-precision
 * constants (+0.7071..., +0.7071..., -0.7071..., -0.7071...) at the place
 * where this function is assembled.
 * NOTE(review): presumably intended to be read as a data table through the
 * symbol's address rather than called; whether the symbol address coincides
 * with the first constant depends on the compiler emitting nothing before
 * the asm statement -- confirm.
 * NOTE(review): nothing in the visible part of this file references this
 * symbol (fft_8_3dn uses the HSQRT2 macro instead).
 */
void hsqrt2_3dn (void)
{
__asm__ (
".float 0f0.707106781188\n"
".float 0f0.707106781188\n"
".float 0f-0.707106781188\n"
".float 0f-0.707106781188\n"
);
}
/*
 * C_1_3dn: not ordinary code -- the body only emits the four single-precision
 * constants -1.0, +1.0, -1.0, +1.0 at the place where this function is
 * assembled.
 * NOTE(review): presumably a sign mask meant to be read through the symbol's
 * address rather than called; whether the symbol address coincides with the
 * first constant depends on the compiler emitting nothing before the asm
 * statement -- confirm.
 * NOTE(review): nothing in the visible part of this file references this
 * symbol.
 */
void C_1_3dn (void)
{
__asm__ (
".float 0f-1.0\n"
".float 0f1.0\n"
".float 0f-1.0\n"
".float 0f1.0\n"
);
}
/*
 * fft_4_3dn: in-place 4-point butterfly on x[0..3] using 3DNow! instructions.
 *
 * Each MMX register holds one complex value as two packed floats; elements
 * are addressed at byte offsets 0/8/16/24 from x (assumes complex_t is two
 * consecutive 32-bit floats, real first -- TODO confirm against ac3_imdct.h).
 *
 * Results (same butterflies as the scalar 4-point stage inside fft_8_3dn):
 *   x[0] = x0 + x2 + x1 + x3
 *   x[2] = x0 + x2 - x1 - x3
 *   x[1].re = x0.re - x2.re + x1.im - x3.im,  x[1].im = x0.im - x2.im - x1.re + x3.re
 *   x[3].re = x0.re - x2.re - x1.im + x3.im,  x[3].im = x0.im - x2.im + x1.re - x3.re
 *
 * Fix: the copy into mm6 used for x[2] is now taken AFTER x[2] has been
 * added to mm0. Previously it was taken before, so x[2] was computed as
 * x0 - x1 - x3 and the x[2] input term was lost.
 *
 * The asm only reads the pointer in eax, so it is a pure input; the
 * "memory" clobber tells the compiler that x[0..3] are rewritten behind its
 * back. femms leaves the FPU/MMX state clean for following x87 code.
 */
static void fft_4_3dn (complex_t *x)
{
    __asm__ __volatile__ (
    "movq (%%eax), %%mm0\n"        /* x[0] */
    "movq 8(%%eax), %%mm1\n"       /* x[1] */
    "movq 16(%%eax), %%mm2\n"      /* x[2] */
    "movq 24(%%eax), %%mm3\n"      /* x[3] */
    "movq %%mm0, %%mm4\n"          /* x[0] */
    "movq %%mm1, %%mm5\n"          /* x[1] */
    "pfadd %%mm2, %%mm0\n"         /* x[0] + x[2] */
    "pfadd %%mm3, %%mm1\n"         /* x[1] + x[3] */
    "pfsub %%mm2, %%mm4\n"         /* x[0] - x[2] */
    "pfsub %%mm3, %%mm5\n"         /* x[1] - x[3] */
    "movq %%mm0, %%mm6\n"          /* x[0] + x[2] (copy taken after the add) */

    "pfadd %%mm1, %%mm0\n"         /* x[0] + x[2] + x[1] + x[3] */
    "pfsub %%mm1, %%mm6\n"         /* x[0] + x[2] - x[1] - x[3] */

    "movq %%mm0, (%%eax)\n"        /* -> x[0] */
    "movq %%mm6, 16(%%eax)\n"      /* -> x[2] */

    "pxor %%mm6, %%mm6\n"          /* 0 */
    "movq %%mm5, %%mm2\n"          /* x[1] - x[3] */
    "movq %%mm4, %%mm3\n"          /* x[0] - x[2] */
    "pfsub %%mm5, %%mm6\n"         /* x[3] - x[1] */

    "punpckhdq %%mm2,%%mm2\n"      /* (x[1] - x[3]).im in both halves */
    "punpckldq %%mm6,%%mm6\n"      /* (x[3] - x[1]).re in both halves */
    "punpckhdq %%mm6,%%mm2\n"      /* high: (x[3]-x[1]).re, low: (x[1]-x[3]).im */

    "pfsub %%mm2, %%mm4\n"         /* x0i-x2i-x3r+x1r, x0r-x2r-x1i+x3i */
    "pfadd %%mm3, %%mm2\n"         /* x0i-x2i+x3r-x1r, x0r-x2r+x1i-x3i */

    "movq %%mm2, 8(%%eax)\n"       /* -> x[1] */
    "movq %%mm4, 24(%%eax)\n"      /* -> x[3] */
    "femms\n"
    :
    : "a" (x)
    : "memory" );
}
/*
 * fft_8_3dn: in-place 8-point stage on x[0..7].
 *
 * Despite the _3dn suffix this routine is plain scalar C; it contains no
 * 3DNow! instructions.
 *
 * Outline:
 *   1. x[1] and x[3] are saved in locals, and x[2], x[4], x[6] are packed
 *      down into x[1..3] so that x[0..3] holds the even-indexed inputs.
 *   2. A 4-point butterfly is run in place on x[0..3].
 *   3. The saved x[1]/x[3] and the untouched x[5]/x[7] are combined with
 *      those results to produce the output pairs (0,4), (2,6), (1,5), (3,7).
 *
 * The temporaries are reused aggressively, so the statement order matters.
 * HSQRT2 is a macro defined outside this file (presumably 1/sqrt(2) --
 * TODO confirm).
 */
static void fft_8_3dn (complex_t *x)
{
register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
/* keep the original x[1] and x[3]; their slots are overwritten just below */
wT1_r = x[1].real;
wT1_i = x[1].imag;
wB1_r = x[3].real;
wB1_i = x[3].imag;
/* gather the even-indexed inputs into x[0..3] */
x[1] = x[2];
x[2] = x[4];
x[3] = x[6];
{ /* fft_4: 4-point butterfly, in place on x[0..3] */
register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
/* real parts: yt = x0 + x2, yb = x0 - x2, u = x1 + x3 */
yt_r = x[0].real;
yb_r = yt_r - x[2].real;
yt_r += x[2].real;
u_r = x[1].real;
vi_i = x[3].real - u_r;
u_r += x[3].real;
u_i = x[1].imag;
vi_r = u_i - x[3].imag;
u_i += x[3].imag;
/* yt_i is used as scratch for the real outputs here */
yt_i = yt_r;
yt_i += u_r;
x[0].real = yt_i;
yt_r -= u_r;
x[2].real = yt_r;
yt_i = yb_r;
yt_i += vi_r;
x[1].real = yt_i;
yb_r -= vi_r;
x[3].real = yb_r;
/* imaginary parts, same pattern; yt_r is used as scratch */
yt_i = x[0].imag;
yb_i = yt_i - x[2].imag;
yt_i += x[2].imag;
yt_r = yt_i;
yt_r += u_i;
x[0].imag = yt_r;
yt_i -= u_i;
x[2].imag = yt_i;
yt_r = yb_i;
yt_r += vi_i;
x[1].imag = yt_r;
yb_i -= vi_i;
x[3].imag = yb_i;
}
/* x[0] x[4]: s = x5 + x7 + saved x1 + saved x3; x[0] += s, x[4] = x[0] - s */
wT2_r = x[5].real;
wT2_r += x[7].real;
wT2_r += wT1_r;
wT2_r += wB1_r;
wT2_i = wT2_r;
wT2_r += x[0].real;
wT2_i = x[0].real - wT2_i;
x[0].real = wT2_r;
x[4].real = wT2_i;
wT2_i = x[5].imag;
wT2_i += x[7].imag;
wT2_i += wT1_i;
wT2_i += wB1_i;
wT2_r = wT2_i;
wT2_r += x[0].imag;
wT2_i = x[0].imag - wT2_i;
x[0].imag = wT2_r;
x[4].imag = wT2_i;
/* x[2] x[6]: the real outputs use the imaginary differences and vice versa */
wT2_r = x[5].imag;
wT2_r -= x[7].imag;
wT2_r += wT1_i;
wT2_r -= wB1_i;
wT2_i = wT2_r;
wT2_r += x[2].real;
wT2_i = x[2].real - wT2_i;
x[2].real = wT2_r;
x[6].real = wT2_i;
wT2_i = x[5].real;
wT2_i -= x[7].real;
wT2_i += wT1_r;
wT2_i -= wB1_r;
wT2_r = wT2_i;
wT2_r += x[2].imag;
wT2_i = x[2].imag - wT2_i;
/* note: difference goes to x[2].imag and sum to x[6].imag (opposite of the real part) */
x[2].imag = wT2_i;
x[6].imag = wT2_r;
/* x[1] x[5] */
wT2_r = wT1_r;
wT2_r += wB1_i;
wT2_r -= x[5].real;
wT2_r -= x[7].imag;
wT2_i = wT1_i;
wT2_i -= wB1_r;
wT2_i -= x[5].imag;
wT2_i += x[7].real;
/* form (sum, difference) of the two terms and scale both by HSQRT2 */
wB2_r = wT2_r;
wB2_r += wT2_i;
wT2_i -= wT2_r;
wB2_r *= HSQRT2;
wT2_i *= HSQRT2;
wT2_r = wB2_r;
wB2_r += x[1].real;
wT2_r = x[1].real - wT2_r;
/* save the input x[5].real before it is overwritten; the x[3]/x[7] step needs it */
wB2_i = x[5].real;
x[1].real = wB2_r;
x[5].real = wT2_r;
wT2_r = wT2_i;
wT2_r += x[1].imag;
wT2_i = x[1].imag - wT2_i;
/* likewise save the input x[5].imag before it is overwritten */
wB2_r = x[5].imag;
x[1].imag = wT2_r;
x[5].imag = wT2_i;
/* x[3] x[7]: uses the saved x[1], x[3], the saved input x[5], and x[7] */
wT1_r -= wB1_i;
wT1_i += wB1_r;
wB1_r = wB2_i - x[7].imag;
wB1_i = wB2_r + x[7].real;
wT1_r -= wB1_r;
wT1_i -= wB1_i;
wB1_r = wT1_r + wT1_i;
wB1_r *= HSQRT2;
wT1_i -= wT1_r;
wT1_i *= HSQRT2;
wB2_r = x[3].real;
wB2_i = wB2_r + wT1_i;
wB2_r -= wT1_i;
x[3].real = wB2_i;
x[7].real = wB2_r;
wB2_i = x[3].imag;
wB2_r = wB2_i + wB1_r;
wB2_i -= wB1_r;
x[3].imag = wB2_i;
x[7].imag = wB2_r;
}
/*
 * fft_asmb_3dn: merge step that combines previously transformed sub-blocks.
 *
 * k    block parameter; the routine touches 2*k consecutive indices of each
 *      of x, x + 2k, x + 4k, x + 6k, wTB and wTB + 2k. Visible callers pass
 *      2, 4, 8 or 16.
 * x    start of the output region (updated in place through the macros)
 * wTB  start of the two sub-blocks being folded in (wB = wTB + 2k)
 * d, d_3  twiddle tables, indexed in step with the data
 *
 * Index 0 is handled by TRANSZERO, index 1 by TRANS, and the loop then
 * handles two further indices per pass for k - 1 passes.
 *
 * NOTE(review): k must be >= 2. With k == 1 the decrement before the loop
 * leaves 0 and the `!--k` test would not see zero on the first pass.
 * NOTE(review): TRANSZERO and TRANS are macros defined outside this file.
 * The float locals below are not referenced directly in this function, so
 * they are presumably used by those macros by name (possibly along with
 * other locals/parameters) -- do not rename or reorder without checking
 * the macro definitions.
 */
static void fft_asmb_3dn (int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3)
{
register complex_t *x2k, *x3k, *x4k, *wB;
register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
/* the four output quarters are 2*k elements apart */
x2k = x + 2 * k;
x3k = x2k + 2 * k;
x4k = x3k + 2 * k;
wB = wTB + 2 * k;
TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
--k;
for(;;) {
TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
/* stop once all remaining passes are done; pointers are not advanced on the last pass */
if (!--k) break;
x += 2;
x2k += 2;
x3k += 2;
x4k += 2;
d += 2;
d_3 += 2;
wTB += 2;
wB += 2;
}
}
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions * ac3_srfft_sse.c: accelerated SSE ac3 fft functions
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN * Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_sse.c,v 1.1 2001/05/15 16:19:42 sam Exp $ * $Id: ac3_srfft_sse.c,v 1.2 2001/05/16 14:51:29 reno Exp $
* *
* Authors: Renaud Dartus <reno@videolan.org> * Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -43,8 +43,8 @@ ...@@ -43,8 +43,8 @@
#include "ac3_imdct.h" #include "ac3_imdct.h"
#include "ac3_srfft.h" #include "ac3_srfft.h"
void hsqrt2 (void); void hsqrt2_sse (void);
void C_1 (void); void C_1_sse (void);
static void fft_4_sse (complex_t *x); static void fft_4_sse (complex_t *x);
static void fft_8_sse (complex_t *x); static void fft_8_sse (complex_t *x);
static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...@@ -104,7 +104,7 @@ void _M( fft_128p ) ( complex_t *a ) ...@@ -104,7 +104,7 @@ void _M( fft_128p ) ( complex_t *a )
fft_asmb_sse(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); fft_asmb_sse(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
} }
void hsqrt2 (void) void hsqrt2_sse (void)
{ {
__asm__ ( __asm__ (
".float 0f0.707106781188\n" ".float 0f0.707106781188\n"
...@@ -114,7 +114,7 @@ void hsqrt2 (void) ...@@ -114,7 +114,7 @@ void hsqrt2 (void)
); );
} }
void C_1 (void) void C_1_sse (void)
{ {
__asm__ ( __asm__ (
".float 0f-1.0\n" ".float 0f-1.0\n"
...@@ -174,7 +174,7 @@ static void fft_8_sse (complex_t *x) ...@@ -174,7 +174,7 @@ static void fft_8_sse (complex_t *x)
"subps %%xmm5, %%xmm7\n" /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */ "subps %%xmm5, %%xmm7\n" /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
"movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */ "movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */
"movl $hsqrt2, %%ebx\n" "movl $hsqrt2_sse, %%ebx\n"
"movlps 40(%%eax), %%xmm2\n" /* x[5] */ "movlps 40(%%eax), %%xmm2\n" /* x[5] */
"movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */ "movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */
"movups %%xmm1, %%xmm3\n" /* x[3] | x[1] */ "movups %%xmm1, %%xmm3\n" /* x[3] | x[1] */
...@@ -191,7 +191,7 @@ static void fft_8_sse (complex_t *x) ...@@ -191,7 +191,7 @@ static void fft_8_sse (complex_t *x)
"movlhps %%xmm6, %%xmm1\n" /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */ "movlhps %%xmm6, %%xmm1\n" /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */ "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
"movups %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */ "movups %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"movl $C_1, %%ebx\n" "movl $C_1_sse, %%ebx\n"
"addps %%xmm5, %%xmm1\n" /* u */ "addps %%xmm5, %%xmm1\n" /* u */
"subps %%xmm5, %%xmm3\n" /* v */ "subps %%xmm5, %%xmm3\n" /* v */
"movups %%xmm0, %%xmm2\n" /* yb */ "movups %%xmm0, %%xmm2\n" /* yb */
...@@ -258,7 +258,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -258,7 +258,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */ "movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */
"movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */ "movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */ "shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movl $C_1, %%edi\n" "movl $C_1_sse, %%edi\n"
"movups (%%edi), %%xmm4\n" "movups (%%edi), %%xmm4\n"
"mulps %%xmm4, %%xmm7\n" "mulps %%xmm4, %%xmm7\n"
"addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */ "addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
...@@ -318,7 +318,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ...@@ -318,7 +318,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */ "mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */ "mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */ "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movl $C_1, %%edi\n" "movl $C_1_sse, %%edi\n"
"movups (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */ "movups (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */ "movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
......
/*****************************************************************************
* imdct3dn.c : accelerated 3D Now! IMDCT module
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: imdct3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#define MODULE_NAME imdct3dn
#include "modules_inner.h"
/*****************************************************************************
* Preamble
*****************************************************************************/
#include "defs.h"
#include <stdlib.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "tests.h"
#include "ac3_imdct.h"
#include "ac3_imdct_common.h"
#include "modules.h"
/*****************************************************************************
* Local and extern prototypes.
*****************************************************************************/
/* Fills the function list with this plugin's probe and IMDCT entry points. */
static void imdct_getfunctions( function_list_t * p_function_list );
/* Returns this plugin's preference score (0 when it cannot be used). */
static int imdct_Probe ( probedata_t *p_data );
/*****************************************************************************
* Build configuration tree.
*****************************************************************************/
/* Configuration entries for this module: one window title and one comment
 * line, no actual settings. The MODULE_CONFIG_* / ADD_* macros are defined
 * outside this file. */
MODULE_CONFIG_START
ADD_WINDOW( "Configuration for IMDCT module" )
ADD_COMMENT( "Ha, ha -- nothing to configure yet" )
MODULE_CONFIG_END
/*****************************************************************************
* InitModule: get the module structure and configuration.
*****************************************************************************
* We have to fill psz_name, psz_longname and psz_version. These variables
* will be strdup()ed later by the main application because the module can
* be unloaded later to save memory, and we want to be able to access this
* data even after the module has been unloaded.
*****************************************************************************/
MODULE_INIT
{
    /* Capabilities advertised by this plugin: IMDCT only. */
    p_module->i_capabilities = MODULE_CAPABILITY_NULL | MODULE_CAPABILITY_IMDCT;

    /* Identification strings for this module. */
    p_module->psz_version  = VERSION;
    p_module->psz_longname = "AC3 IMDCT module";
    p_module->psz_name     = MODULE_STRING;

    return 0;
}
/*****************************************************************************
* ActivateModule: set the module to an usable state.
*****************************************************************************
* This function fills the capability functions and the configuration
* structure. Once ActivateModule() has been called, the i_usage can
* be set to 0 and calls to NeedModule() be made to increment it. To unload
* the module, one has to wait until i_usage == 0 and call DeactivateModule().
*****************************************************************************/
MODULE_ACTIVATE
{
    /* Allocate the function table; report failure with -1 if that is not
     * possible, otherwise fill it in, attach the configuration and
     * return 0. */
    p_module->p_functions = malloc( sizeof( module_functions_t ) );

    if( p_module->p_functions != NULL )
    {
        imdct_getfunctions( &p_module->p_functions->imdct );
        p_module->p_config = p_config;
        return 0;
    }

    return -1;
}
/*****************************************************************************
* DeactivateModule: make sure the module can be unloaded.
*****************************************************************************
* This function must only be called when i_usage == 0. If it successfully
* returns, i_usage can be set to -1 and the module unloaded. Be careful to
* lock usage_lock during the whole process.
*****************************************************************************/
MODULE_DEACTIVATE
{
    /* Release the function table and clear the pointer, so that a stale use
     * or a repeated deactivation cannot touch freed memory (free(NULL) is a
     * no-op). Always returns 0. */
    free( p_module->p_functions );
    p_module->p_functions = NULL;

    return( 0 );
}
/* Following functions are local */
/*****************************************************************************
* Functions exported as capabilities. They are declared as static so that
* we don't pollute the namespace too much.
*****************************************************************************/
static void imdct_getfunctions( function_list_t * p_function_list )
{
    /* Probe callback used to score this plugin. */
    p_function_list->pf_probe = imdct_Probe;

    /* IMDCT entry points provided by this plugin (names are decorated by
     * the _M() macro). */
    p_function_list->functions.imdct.pf_imdct_init    = _M( imdct_init );
    p_function_list->functions.imdct.pf_imdct_256     = _M( imdct_do_256 );
    p_function_list->functions.imdct.pf_imdct_256_nol = _M( imdct_do_256_nol );
    p_function_list->functions.imdct.pf_imdct_512     = _M( imdct_do_512 );
    p_function_list->functions.imdct.pf_imdct_512_nol = _M( imdct_do_512_nol );
}
/*****************************************************************************
* imdct_Probe: returns a preference score
*****************************************************************************/
static int imdct_Probe( probedata_t *p_data )
{
    /*
     * Score: 0 when the CPU lacks 3D Now!, 999 when this plugin was
     * explicitly requested by name, 200 otherwise.
     *
     * NOTE(review): the method lookup uses DOWNMIX_METHOD_VAR although this
     * is an IMDCT plugin; this looks copied from the downmix module --
     * confirm whether an IMDCT-specific variable should be used instead.
     */
    int i_score = 0;

    if( TestCPU( CPU_CAPABILITY_3DNOW ) )
    {
        i_score = TestMethod( DOWNMIX_METHOD_VAR, "imdct3dn" ) ? 999 : 200;
    }

    return( i_score );
}
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* ac3_imdct.c: ac3 DCT * ac3_imdct.c: ac3 DCT
***************************************************************************** *****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN * Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.c,v 1.20 2001/05/15 16:19:42 sam Exp $ * $Id: ac3_imdct.c,v 1.21 2001/05/16 14:51:29 reno Exp $
* *
* Authors: Michel Kaempf <maxx@via.ecp.fr> * Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca> * Aaron Holtzman <aholtzma@engr.uvic.ca>
...@@ -54,7 +54,7 @@ void imdct_init(imdct_t * p_imdct) ...@@ -54,7 +54,7 @@ void imdct_init(imdct_t * p_imdct)
int i; int i;
float scale = 181.019; float scale = 181.019;
p_imdct->pf_imdct_init( p_imdct ); p_imdct->pf_imdct_init( p_imdct );
/* More twiddle factors to turn IFFT into IMDCT */ /* More twiddle factors to turn IFFT into IMDCT */
for (i=0; i < 64; i++) { for (i=0; i < 64; i++) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment