Commit db7b0421 authored by Renaud Dartus's avatar Renaud Dartus

* Beginning of SSE/3DNow! support for imdct and downmix

If you have a PIII or an Athlon and want to try this, comment out the #if 0
in ac3_downmix.c and ac3_imdct.c, and add to the AC3_DECODER section of the Makefile:
	src/ac3_decoder/ac3_imdct_sse.o \
	src/ac3_decoder/ac3_srfft_sse.o \
	src/ac3_decoder/ac3_downmix_sse.o \
	src/ac3_decoder/ac3_downmix_3dn.o \
parent c1df8159
......@@ -2,7 +2,7 @@
* tests.h: several test functions needed by the plugins
*****************************************************************************
* Copyright (C) 1996, 1997, 1998, 1999, 2000 VideoLAN
* $Id: tests.h,v 1.9 2001/03/21 13:42:33 sam Exp $
* $Id: tests.h,v 1.10 2001/05/14 15:58:03 reno Exp $
*
* Authors: Samuel Hocevar <sam@zoy.org>
*
......@@ -28,6 +28,7 @@
#define CPU_CAPABILITY_MMX 1<<3
#define CPU_CAPABILITY_3DNOW 1<<4
#define CPU_CAPABILITY_MMXEXT 1<<5
#define CPU_CAPABILITY_SSE 1<<6
#define CPU_CAPABILITY_ALTIVEC 1<<16
/*****************************************************************************
......
......@@ -2,7 +2,7 @@
* ac3_bit_allocate.c: ac3 allocation tables
*****************************************************************************
* Copyright (C) 2000 VideoLAN
* $Id: ac3_bit_allocate.c,v 1.20 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_bit_allocate.c,v 1.21 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -37,7 +37,7 @@
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_internal.h"
#include "ac3_internal.h" /* DELTA_BIT_REUSE */
static void ba_compute_psd (bit_allocate_t * p_bit, s16 start, s16 end, s16 exps[]);
......
......@@ -2,7 +2,7 @@
* ac3_decoder.c: core ac3 decoder
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder.c,v 1.32 2001/05/07 03:14:09 stef Exp $
* $Id: ac3_decoder.c,v 1.33 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Michel Lespinasse <walken@zoy.org>
......@@ -40,21 +40,14 @@
#include "audio_output.h"
#include "ac3_decoder.h"
#include "ac3_decoder_thread.h"
#include "ac3_decoder_thread.h" /* ac3dec_thread_t */
#include "ac3_internal.h"
#include <stdio.h>
void imdct_init (imdct_t * p_imdct);
void downmix_init (downmix_t * p_downmix);
static float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
static float smixlev_lut[4] = { 0.707, 0.500, 0.0 , 0.500 };
static const float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
static const float smixlev_lut[4] = { 0.707, 0.500, 0.0 , 0.500 };
int ac3_init (ac3dec_t * p_ac3dec)
{
// p_ac3dec->bit_stream.buffer = 0;
// p_ac3dec->bit_stream.i_available = 0;
p_ac3dec->mantissa.lfsr_state = 1; /* dither_gen initialization */
imdct_init(&p_ac3dec->imdct);
downmix_init(&p_ac3dec->downmix);
......@@ -69,7 +62,7 @@ int ac3_decode_frame (ac3dec_t * p_ac3dec, s16 * buffer)
if (parse_bsi (p_ac3dec))
{
intf_WarnMsg (3,"Error during ac3parsing");
intf_WarnMsg (3,"ac3dec warn: error during parsing");
parse_auxdata (p_ac3dec);
return 1;
}
......@@ -102,7 +95,7 @@ int ac3_decode_frame (ac3dec_t * p_ac3dec, s16 * buffer)
if (parse_audblk (p_ac3dec, i))
{
intf_WarnMsg (3,"Error during ac3audioblock");
intf_WarnMsg (3,"ac3dec warn: error during audioblock");
parse_auxdata (p_ac3dec);
return 1;
}
......@@ -114,7 +107,7 @@ int ac3_decode_frame (ac3dec_t * p_ac3dec, s16 * buffer)
if (exponent_unpack (p_ac3dec))
{
intf_WarnMsg (3,"Error during ac3unpack");
intf_WarnMsg (3,"ac3dec warn: error during unpack");
parse_auxdata (p_ac3dec);
return 1;
}
......
......@@ -2,7 +2,7 @@
* ac3_decoder.h : ac3 decoder interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder.h,v 1.7 2001/04/30 21:04:20 reno Exp $
* $Id: ac3_decoder.h,v 1.8 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
......
......@@ -2,7 +2,7 @@
* ac3_decoder_thread.h : ac3 decoder thread interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.h,v 1.6 2001/05/01 04:18:18 sam Exp $
* $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
*
......@@ -30,14 +30,11 @@ typedef struct ac3dec_thread_s
* Thread properties
*/
vlc_thread_t thread_id; /* id for thread functions */
// bit_stream_t bit_stream;
/*
* Input properties
*/
decoder_fifo_t * p_fifo; /* stores the PES stream data */
// data_packet_t * p_data;
int sync_ptr; /* sync ptr from ac3 magic header */
adec_config_t * p_config;
......
......@@ -2,7 +2,7 @@
* ac3_downmix.c: ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_downmix.c,v 1.22 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_downmix.c,v 1.23 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -31,29 +31,41 @@
#include "threads.h"
#include "mtime.h"
#include "intf_msg.h" /* intf_DbgMsg(), intf_ErrMsg() */
#include "tests.h"
#include "stream_control.h"
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_internal.h"
#include "ac3_downmix.h"
void downmix_init (downmix_t * p_downmix)
{
#if 0
if ( TestCPU (CPU_CAPABILITY_MMX) )
if ( TestCPU (CPU_CAPABILITY_SSE) )
{
fprintf(stderr,"Using MMX for downmix\n");
p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_kni;
p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_kni;
p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_kni;
p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_kni;
p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_kni;
p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_kni;
p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_kni;
} else
intf_WarnMsg (1,"ac3dec: using MMX_SSE for downmix");
p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_sse;
p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_sse;
p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_sse;
p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_sse;
p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_sse;
p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_sse;
p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_sse;
}
else if ( TestCPU (CPU_CAPABILITY_3DNOW) )
{
intf_WarnMsg (1,"ac3dec: using MMX_3DNOW for downmix");
p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_3dn;
p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_3dn;
p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_3dn;
p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_3dn;
p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_3dn;
p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_3dn;
p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_3dn;
}
else
#endif
{
p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_c;
......
......@@ -2,7 +2,7 @@
* ac3_downmix.h: ac3 downmix functions
*****************************************************************************
* Copyright (C) 2000, 2001 VideoLAN
* $Id: ac3_downmix.h,v 1.6 2001/04/30 21:04:20 reno Exp $
* $Id: ac3_downmix.h,v 1.7 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -30,13 +30,22 @@ void downmix_3f_0r_to_2ch_c(float *samples, dm_par_t * dm_par);
void stream_sample_2ch_to_s16_c(s16 *s16_samples, float *left, float *right);
void stream_sample_1ch_to_s16_c(s16 *s16_samples, float *center);
#if 0
/* Kni functions */
void downmix_3f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
void downmix_3f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
void downmix_2f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
void downmix_2f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
void downmix_3f_0r_to_2ch_kni(float *samples, dm_par_t * dm_par);
void stream_sample_2ch_to_s16_kni(s16 *s16_samples, float *left, float *right);
void stream_sample_1ch_to_s16_kni(s16 *s16_samples, float *center);
#endif
/* SSE functions */
void downmix_3f_2r_to_2ch_sse(float *samples, dm_par_t * dm_par);
void downmix_3f_1r_to_2ch_sse(float *samples, dm_par_t * dm_par);
void downmix_2f_2r_to_2ch_sse(float *samples, dm_par_t * dm_par);
void downmix_2f_1r_to_2ch_sse(float *samples, dm_par_t * dm_par);
void downmix_3f_0r_to_2ch_sse(float *samples, dm_par_t * dm_par);
void stream_sample_2ch_to_s16_sse(s16 *s16_samples, float *left, float *right);
void stream_sample_1ch_to_s16_sse(s16 *s16_samples, float *center);
/* 3DNow! functions */
void downmix_3f_2r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
void downmix_3f_1r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
void downmix_2f_2r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
void downmix_2f_1r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
void downmix_3f_0r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
void stream_sample_2ch_to_s16_3dn(s16 *s16_samples, float *left, float *right);
void stream_sample_1ch_to_s16_3dn(s16 *s16_samples, float *center);
/*****************************************************************************
* ac3_downmix_3dn.c: ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_3dn.c,v 1.1 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include "defs.h"
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "tests.h"
#include "stream_control.h"
#include "input_ext-dec.h"
#include "ac3_decoder.h"
/*****************************************************************************
 * downmix_3f_2r_to_2ch_3dn: 3 front + 2 rear channels to 2 channels (3DNow!)
 *****************************************************************************
 * samples is read as five consecutive 256-float planes (byte offsets 0,
 * 1024, 2048, 3072, 4096 = left, center, right, left surround, right
 * surround). The mixed left channel is written back to plane 0 and the
 * mixed right channel to plane 1 (offset 1024):
 *   left'  = unit * left  + clev * center + slev * left_sur
 *   right' = unit * right + clev * center + slev * right_sur
 * unit, clev and slev are the three floats at byte offsets 0, 4 and 8 of
 * *dm_par (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * The compiler picks the registers ("r"/"+r"), so no register is saved by
 * hand and %ebx (the i386 PIC register) is not required. The clobber list
 * tells the compiler that memory, flags and the MMX registers are modified.
 * The numeric local label stays unique if the function is inlined or cloned.
 *****************************************************************************/
void downmix_3f_2r_to_2ch_3dn (float * samples, dm_par_t * dm_par)
{
    int i_loops = 128;              /* 128 iterations x 2 floats = 256 samples */

    __asm__ __volatile__ (
    "movd      (%2), %%mm5\n"       /* unit */
    "punpckldq %%mm5, %%mm5\n"      /* unit | unit */
    "movd      4(%2), %%mm6\n"      /* clev */
    "punpckldq %%mm6, %%mm6\n"      /* clev | clev */
    "movd      8(%2), %%mm7\n"      /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */
    "1:\n"
    "movq      (%0), %%mm0\n"       /* left */
    "movq      2048(%0), %%mm1\n"   /* right */
    "movq      1024(%0), %%mm2\n"   /* center */
    "movq      3072(%0), %%mm3\n"   /* leftsur */
    "movq      4096(%0), %%mm4\n"   /* rightsur */
    "pfmul     %%mm5, %%mm0\n"
    "pfmul     %%mm5, %%mm1\n"
    "pfmul     %%mm6, %%mm2\n"
    "pfadd     %%mm2, %%mm0\n"
    "pfadd     %%mm2, %%mm1\n"
    "pfmul     %%mm7, %%mm3\n"
    "pfmul     %%mm7, %%mm4\n"
    "pfadd     %%mm3, %%mm0\n"
    "pfadd     %%mm4, %%mm1\n"
    "movq      %%mm0, (%0)\n"
    "movq      %%mm1, 1024(%0)\n"
    "add       $8, %0\n"
    "dec       %1\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc",
      "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
}
/*****************************************************************************
 * downmix_2f_2r_to_2ch_3dn: 2 front + 2 rear channels to 2 channels (3DNow!)
 *****************************************************************************
 * samples is read as four consecutive 256-float planes (byte offsets 0,
 * 1024, 2048, 3072 = left, right, left surround, right surround). Results
 * are written back to planes 0 and 1:
 *   left'  = unit * left  + slev * left_sur
 *   right' = unit * right + slev * right_sur
 * unit and slev are the floats at byte offsets 0 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the MMX registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_2f_2r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
{
    int i_loops = 128;              /* 128 iterations x 2 floats = 256 samples */

    __asm__ __volatile__ (
    "movd      (%2), %%mm5\n"       /* unit */
    "punpckldq %%mm5, %%mm5\n"      /* unit | unit */
    "movd      8(%2), %%mm7\n"      /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */
    "1:\n"
    "movq      (%0), %%mm0\n"       /* left */
    "movq      1024(%0), %%mm1\n"   /* right */
    "movq      2048(%0), %%mm3\n"   /* leftsur */
    "movq      3072(%0), %%mm4\n"   /* rightsur */
    "pfmul     %%mm5, %%mm0\n"
    "pfmul     %%mm5, %%mm1\n"
    "pfmul     %%mm7, %%mm3\n"
    "pfmul     %%mm7, %%mm4\n"
    "pfadd     %%mm3, %%mm0\n"
    "pfadd     %%mm4, %%mm1\n"
    "movq      %%mm0, (%0)\n"
    "movq      %%mm1, 1024(%0)\n"
    "add       $8, %0\n"
    "dec       %1\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "mm0", "mm1", "mm3", "mm4", "mm5", "mm7");
}
/*****************************************************************************
 * downmix_3f_1r_to_2ch_3dn: 3 front + 1 rear channels to 2 channels (3DNow!)
 *****************************************************************************
 * samples is read as four consecutive 256-float planes (byte offsets 0,
 * 1024, 2048, 3072 = left, center, right, surround). Results are written
 * back to planes 0 and 1:
 *   left'  = unit * left  + clev * center - slev * sur
 *   right' = unit * right + clev * center + slev * sur
 * unit, clev and slev are the floats at byte offsets 0, 4 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the MMX registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_3f_1r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
{
    int i_loops = 128;              /* 128 iterations x 2 floats = 256 samples */

    __asm__ __volatile__ (
    "movd      (%2), %%mm5\n"       /* unit */
    "punpckldq %%mm5, %%mm5\n"      /* unit | unit */
    "movd      4(%2), %%mm6\n"      /* clev */
    "punpckldq %%mm6, %%mm6\n"      /* clev | clev */
    "movd      8(%2), %%mm7\n"      /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */
    "1:\n"
    "movq      (%0), %%mm0\n"       /* left */
    "movq      2048(%0), %%mm1\n"   /* right */
    "movq      1024(%0), %%mm2\n"   /* center */
    "movq      3072(%0), %%mm3\n"   /* sur */
    "pfmul     %%mm5, %%mm0\n"
    "pfmul     %%mm5, %%mm1\n"
    "pfmul     %%mm6, %%mm2\n"
    "pfadd     %%mm2, %%mm0\n"
    "pfmul     %%mm7, %%mm3\n"
    "pfadd     %%mm2, %%mm1\n"
    "pfsub     %%mm3, %%mm0\n"      /* surround is subtracted on the left */
    "pfadd     %%mm3, %%mm1\n"      /* and added on the right */
    "movq      %%mm0, (%0)\n"
    "movq      %%mm1, 1024(%0)\n"
    "add       $8, %0\n"
    "dec       %1\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "mm0", "mm1", "mm2", "mm3", "mm5", "mm6", "mm7");
}
/*****************************************************************************
 * downmix_2f_1r_to_2ch_3dn: 2 front + 1 rear channels to 2 channels (3DNow!)
 *****************************************************************************
 * samples is read as three consecutive 256-float planes (byte offsets 0,
 * 1024, 2048 = left, right, surround). Results are written back to planes
 * 0 and 1:
 *   left'  = unit * left  - slev * sur
 *   right' = unit * right + slev * sur
 * unit and slev are the floats at byte offsets 0 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the MMX registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_2f_1r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
{
    int i_loops = 128;              /* 128 iterations x 2 floats = 256 samples */

    __asm__ __volatile__ (
    "movd      (%2), %%mm5\n"       /* unit */
    "punpckldq %%mm5, %%mm5\n"      /* unit | unit */
    "movd      8(%2), %%mm7\n"      /* slev */
    "punpckldq %%mm7, %%mm7\n"      /* slev | slev */
    "1:\n"
    "movq      (%0), %%mm0\n"       /* left */
    "movq      1024(%0), %%mm1\n"   /* right */
    "movq      2048(%0), %%mm3\n"   /* sur */
    "pfmul     %%mm5, %%mm0\n"
    "pfmul     %%mm5, %%mm1\n"
    "pfmul     %%mm7, %%mm3\n"
    "pfsub     %%mm3, %%mm0\n"      /* surround is subtracted on the left */
    "pfadd     %%mm3, %%mm1\n"      /* and added on the right */
    "movq      %%mm0, (%0)\n"
    "movq      %%mm1, 1024(%0)\n"
    "add       $8, %0\n"
    "dec       %1\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "mm0", "mm1", "mm3", "mm5", "mm7");
}
/*****************************************************************************
 * downmix_3f_0r_to_2ch_3dn: 3 front channels to 2 channels (3DNow!)
 *****************************************************************************
 * samples is read as three consecutive 256-float planes (byte offsets 0,
 * 1024, 2048 = left, center, right). Results are written back to planes
 * 0 and 1:
 *   left'  = unit * left  + clev * center
 *   right' = unit * right + clev * center
 * unit and clev are the floats at byte offsets 0 and 4 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the MMX registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_3f_0r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
{
    int i_loops = 128;              /* 128 iterations x 2 floats = 256 samples */

    __asm__ __volatile__ (
    "movd      (%2), %%mm5\n"       /* unit */
    "punpckldq %%mm5, %%mm5\n"      /* unit | unit */
    "movd      4(%2), %%mm6\n"      /* clev */
    "punpckldq %%mm6, %%mm6\n"      /* clev | clev */
    "1:\n"
    "movq      (%0), %%mm0\n"       /* left */
    "movq      2048(%0), %%mm1\n"   /* right */
    "movq      1024(%0), %%mm2\n"   /* center */
    "pfmul     %%mm5, %%mm0\n"
    "pfmul     %%mm5, %%mm1\n"
    "pfmul     %%mm6, %%mm2\n"
    "pfadd     %%mm2, %%mm0\n"
    "pfadd     %%mm2, %%mm1\n"
    "movq      %%mm0, (%0)\n"
    "movq      %%mm1, 1024(%0)\n"
    "add       $8, %0\n"
    "dec       %1\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "mm0", "mm1", "mm2", "mm5", "mm6");
}
/*****************************************************************************
 * stream_sample_1ch_to_s16_3dn: mono float samples to interleaved s16 (3DNow!)
 *****************************************************************************
 * Reads 256 floats from left, scales each by 0.7071068, converts to a
 * saturated 16-bit integer and writes it twice in a row, producing 512 s16
 * values (c0 c0 c1 c1 ...) in s16_samples.
 *
 * Changes from the previous version:
 *  - "packssdw mm0, mm0" alone yields c0 c1 c0 c1, not the "c1 c1 c0 c0"
 *    layout its comment described; punpcklwd now duplicates each sample.
 *  - the scale factor is a local constant passed as a memory operand; it is
 *    no longer read from the code address of the function sqrt2(), which
 *    lives in another file.
 *  - registers are chosen by the compiler, clobbers are declared, and the
 *    loop label is a numeric local label.
 *****************************************************************************/
void stream_sample_1ch_to_s16_3dn (s16 *s16_samples, float *left)
{
    float f_inv_sqrt2 = 0.7071068f; /* same value the sqrt2 symbol provided */
    int i_loops = 128;              /* 128 iterations x 2 floats = 256 samples */

    __asm__ __volatile__ (
    "movd      %3, %%mm7\n"
    "punpckldq %%mm7, %%mm7\n"      /* scale | scale */
    "1:\n"
    "movq      (%1), %%mm0\n"       /* c1 | c0 */
    "pfmul     %%mm7, %%mm0\n"
    "pf2id     %%mm0, %%mm0\n"      /* c1 c0 --> mm0, int_32 */
    "packssdw  %%mm0, %%mm0\n"      /* c1 c0 c1 c0 --> mm0, int_16 */
    "punpcklwd %%mm0, %%mm0\n"      /* c1 c1 c0 c0 --> mm0, int_16 */
    "movq      %%mm0, (%0)\n"
    "add       $8, %0\n"
    "add       $8, %1\n"
    "dec       %2\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (s16_samples), "+r" (left), "+r" (i_loops)
    : "m" (f_inv_sqrt2)
    : "memory", "cc", "mm0", "mm7");
}
/*****************************************************************************
 * stream_sample_2ch_to_s16_3dn: stereo float samples to interleaved s16 (3DNow!)
 *****************************************************************************
 * Reads 256 floats from left and 256 from right, converts them to saturated
 * 16-bit integers and writes them interleaved (l0 r0 l1 r1 ...) to
 * s16_samples, 512 values in total.
 *
 * Changes from the previous version:
 *  - removed the extra "movq %mm2, 8(%eax)" store. Each iteration produces
 *    only 8 output bytes (all held in mm0); the second store wrote 8 more
 *    bytes that the next iteration overwrote, and on the last iteration it
 *    wrote past the end of the output buffer.
 *  - registers are chosen by the compiler, clobbers are declared, and the
 *    loop label is a numeric local label.
 *****************************************************************************/
void stream_sample_2ch_to_s16_3dn (s16 *s16_samples, float *left, float *right)
{
    int i_loops = 128;              /* 128 iterations x 2 sample pairs = 256 */

    __asm__ __volatile__ (
    "1:\n"
    "movq      (%1), %%mm0\n"       /* l1 | l0 */
    "movq      (%2), %%mm1\n"       /* r1 | r0 */
    "movq      %%mm0, %%mm2\n"      /* l1 | l0 */
    "punpckldq %%mm1, %%mm0\n"      /* r0 | l0 */
    "punpckhdq %%mm1, %%mm2\n"      /* r1 | l1 */
    "pf2id     %%mm0, %%mm0\n"      /* r0 l0 --> mm0, int_32 */
    "pf2id     %%mm2, %%mm2\n"      /* r1 l1 --> mm2, int_32 */
    "packssdw  %%mm2, %%mm0\n"      /* r1 l1 r0 l0 --> mm0, int_16 */
    "movq      %%mm0, (%0)\n"
    "add       $8, %0\n"
    "add       $8, %1\n"
    "add       $8, %2\n"
    "dec       %3\n"
    "jnz       1b\n"
    "femms\n"                       /* leave MMX state so the FPU is usable */
    : "+r" (s16_samples), "+r" (left), "+r" (right), "+r" (i_loops)
    :
    : "memory", "cc", "mm0", "mm1", "mm2");
}
......@@ -2,7 +2,7 @@
* ac3_downmix_c.c: ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_c.c,v 1.7 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_downmix_c.c,v 1.8 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -35,11 +35,8 @@
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_internal.h"
#include "ac3_downmix.h"
void __inline__ downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
void downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *center, *left_sur, *right_sur;
......@@ -59,7 +56,7 @@ void __inline__ downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
}
}
void __inline__ downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
void downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *left_sur, *right_sur;
......@@ -78,7 +75,7 @@ void __inline__ downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
}
}
void __inline__ downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
void downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *center, *right_sur;
......@@ -98,7 +95,7 @@ void __inline__ downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
}
void __inline__ downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
void downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *right_sur;
......@@ -117,7 +114,7 @@ void __inline__ downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
}
void __inline__ downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
void downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *center;
......@@ -136,7 +133,7 @@ void __inline__ downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
}
void __inline__ stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *right)
void stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *right)
{
int i;
for (i=0; i < 256; i++) {
......@@ -146,7 +143,7 @@ void __inline__ stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *ri
}
void __inline__ stream_sample_1ch_to_s16_c (s16 *out_buf, float *center)
void stream_sample_1ch_to_s16_c (s16 *out_buf, float *center)
{
int i;
float tmp;
......
/*****************************************************************************
* ac3_downmix_sse.c: ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include "defs.h"
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "tests.h"
#include "stream_control.h"
#include "input_ext-dec.h"
#include "ac3_decoder.h"
/*****************************************************************************
 * sqrt2: storage for the constant 0.7071068
 *****************************************************************************
 * Not meant to be called: the body only emits a single-precision float into
 * the function's code. The 1-channel conversion routines load it with
 * "movl $sqrt2, %edx" followed by a 4-byte read from (%edx).
 * NOTE(review): this only yields the float if the compiler emits nothing
 * (e.g. a frame-pointer prologue) before the .float directive - confirm the
 * build flags guarantee that, or replace with a real float object.
 *****************************************************************************/
void sqrt2 (void)
{
__asm__ (".float 0f0.7071068");
}
/*****************************************************************************
 * downmix_3f_2r_to_2ch_sse: 3 front + 2 rear channels to 2 channels (SSE)
 *****************************************************************************
 * samples is read as five consecutive 256-float planes (byte offsets 0,
 * 1024, 2048, 3072, 4096 = left, center, right, left surround, right
 * surround). Results are written back to planes 0 and 1:
 *   left'  = unit * left  + clev * center + slev * left_sur
 *   right' = unit * right + clev * center + slev * right_sur
 * unit, clev and slev are the floats at byte offsets 0, 4 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Unaligned loads/stores (movups) are used, so samples needs no particular
 * alignment. Registers are chosen by the compiler ("r"/"+r"), so %ebx (the
 * i386 PIC register) is not required; memory, flags and the XMM registers
 * used are declared as clobbered; the loop label is a numeric local label,
 * so it stays unique if the function is inlined or cloned.
 *****************************************************************************/
void downmix_3f_2r_to_2ch_sse (float * samples, dm_par_t * dm_par)
{
    int i_loops = 64;               /* 64 iterations x 4 floats = 256 samples */

    __asm__ __volatile__ (
    "movss   (%2), %%xmm5\n"        /* unit */
    "shufps  $0, %%xmm5, %%xmm5\n"  /* unit | unit | unit | unit */
    "movss   4(%2), %%xmm6\n"       /* clev */
    "shufps  $0, %%xmm6, %%xmm6\n"  /* clev | clev | clev | clev */
    "movss   8(%2), %%xmm7\n"       /* slev */
    "shufps  $0, %%xmm7, %%xmm7\n"  /* slev | slev | slev | slev */
    "1:\n"
    "movups  (%0), %%xmm0\n"        /* left */
    "movups  2048(%0), %%xmm1\n"    /* right */
    "movups  1024(%0), %%xmm2\n"    /* center */
    "movups  3072(%0), %%xmm3\n"    /* leftsur */
    "movups  4096(%0), %%xmm4\n"    /* rightsur */
    "mulps   %%xmm5, %%xmm0\n"
    "mulps   %%xmm5, %%xmm1\n"
    "mulps   %%xmm6, %%xmm2\n"
    "addps   %%xmm2, %%xmm0\n"
    "addps   %%xmm2, %%xmm1\n"
    "mulps   %%xmm7, %%xmm3\n"
    "mulps   %%xmm7, %%xmm4\n"
    "addps   %%xmm3, %%xmm0\n"
    "addps   %%xmm4, %%xmm1\n"
    "movups  %%xmm0, (%0)\n"
    "movups  %%xmm1, 1024(%0)\n"
    "add     $16, %0\n"
    "dec     %1\n"
    "jnz     1b\n"
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
/*****************************************************************************
 * downmix_2f_2r_to_2ch_sse: 2 front + 2 rear channels to 2 channels (SSE)
 *****************************************************************************
 * samples is read as four consecutive 256-float planes (byte offsets 0,
 * 1024, 2048, 3072 = left, right, left surround, right surround). Results
 * are written back to planes 0 and 1:
 *   left'  = unit * left  + slev * left_sur
 *   right' = unit * right + slev * right_sur
 * unit and slev are the floats at byte offsets 0 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the XMM registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_2f_2r_to_2ch_sse (float *samples, dm_par_t * dm_par)
{
    int i_loops = 64;               /* 64 iterations x 4 floats = 256 samples */

    __asm__ __volatile__ (
    "movss   (%2), %%xmm5\n"        /* unit */
    "shufps  $0, %%xmm5, %%xmm5\n"  /* unit | unit | unit | unit */
    "movss   8(%2), %%xmm7\n"       /* slev */
    "shufps  $0, %%xmm7, %%xmm7\n"  /* slev | slev | slev | slev */
    "1:\n"
    "movups  (%0), %%xmm0\n"        /* left */
    "movups  1024(%0), %%xmm1\n"    /* right */
    "movups  2048(%0), %%xmm3\n"    /* leftsur */
    "movups  3072(%0), %%xmm4\n"    /* rightsur */
    "mulps   %%xmm5, %%xmm0\n"
    "mulps   %%xmm5, %%xmm1\n"
    "mulps   %%xmm7, %%xmm3\n"
    "mulps   %%xmm7, %%xmm4\n"
    "addps   %%xmm3, %%xmm0\n"
    "addps   %%xmm4, %%xmm1\n"
    "movups  %%xmm0, (%0)\n"
    "movups  %%xmm1, 1024(%0)\n"
    "add     $16, %0\n"
    "dec     %1\n"
    "jnz     1b\n"
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "xmm0", "xmm1", "xmm3", "xmm4", "xmm5", "xmm7");
}
/*****************************************************************************
 * downmix_3f_1r_to_2ch_sse: 3 front + 1 rear channels to 2 channels (SSE)
 *****************************************************************************
 * samples is read as four consecutive 256-float planes (byte offsets 0,
 * 1024, 2048, 3072 = left, center, right, surround). Results are written
 * back to planes 0 and 1:
 *   left'  = unit * left  + clev * center - slev * sur
 *   right' = unit * right + clev * center + slev * sur
 * unit, clev and slev are the floats at byte offsets 0, 4 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the XMM registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_3f_1r_to_2ch_sse (float *samples, dm_par_t * dm_par)
{
    int i_loops = 64;               /* 64 iterations x 4 floats = 256 samples */

    __asm__ __volatile__ (
    "movss   (%2), %%xmm5\n"        /* unit */
    "shufps  $0, %%xmm5, %%xmm5\n"  /* unit | unit | unit | unit */
    "movss   4(%2), %%xmm6\n"       /* clev */
    "shufps  $0, %%xmm6, %%xmm6\n"  /* clev | clev | clev | clev */
    "movss   8(%2), %%xmm7\n"       /* slev */
    "shufps  $0, %%xmm7, %%xmm7\n"  /* slev | slev | slev | slev */
    "1:\n"
    "movups  (%0), %%xmm0\n"        /* left */
    "movups  2048(%0), %%xmm1\n"    /* right */
    "movups  1024(%0), %%xmm2\n"    /* center */
    "movups  3072(%0), %%xmm3\n"    /* sur */
    "mulps   %%xmm5, %%xmm0\n"
    "mulps   %%xmm5, %%xmm1\n"
    "mulps   %%xmm6, %%xmm2\n"
    "addps   %%xmm2, %%xmm0\n"
    "mulps   %%xmm7, %%xmm3\n"
    "addps   %%xmm2, %%xmm1\n"
    "subps   %%xmm3, %%xmm0\n"      /* surround is subtracted on the left */
    "addps   %%xmm3, %%xmm1\n"      /* and added on the right */
    "movups  %%xmm0, (%0)\n"
    "movups  %%xmm1, 1024(%0)\n"
    "add     $16, %0\n"
    "dec     %1\n"
    "jnz     1b\n"
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
/*****************************************************************************
 * downmix_2f_1r_to_2ch_sse: 2 front + 1 rear channels to 2 channels (SSE)
 *****************************************************************************
 * samples is read as three consecutive 256-float planes (byte offsets 0,
 * 1024, 2048 = left, right, surround). Results are written back to planes
 * 0 and 1:
 *   left'  = unit * left  - slev * sur
 *   right' = unit * right + slev * sur
 * unit and slev are the floats at byte offsets 0 and 8 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the XMM registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_2f_1r_to_2ch_sse (float *samples, dm_par_t * dm_par)
{
    int i_loops = 64;               /* 64 iterations x 4 floats = 256 samples */

    __asm__ __volatile__ (
    "movss   (%2), %%xmm5\n"        /* unit */
    "shufps  $0, %%xmm5, %%xmm5\n"  /* unit | unit | unit | unit */
    "movss   8(%2), %%xmm7\n"       /* slev */
    "shufps  $0, %%xmm7, %%xmm7\n"  /* slev | slev | slev | slev */
    "1:\n"
    "movups  (%0), %%xmm0\n"        /* left */
    "movups  1024(%0), %%xmm1\n"    /* right */
    "movups  2048(%0), %%xmm3\n"    /* sur */
    "mulps   %%xmm5, %%xmm0\n"
    "mulps   %%xmm5, %%xmm1\n"
    "mulps   %%xmm7, %%xmm3\n"
    "subps   %%xmm3, %%xmm0\n"      /* surround is subtracted on the left */
    "addps   %%xmm3, %%xmm1\n"      /* and added on the right */
    "movups  %%xmm0, (%0)\n"
    "movups  %%xmm1, 1024(%0)\n"
    "add     $16, %0\n"
    "dec     %1\n"
    "jnz     1b\n"
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "xmm0", "xmm1", "xmm3", "xmm5", "xmm7");
}
/*****************************************************************************
 * downmix_3f_0r_to_2ch_sse: 3 front channels to 2 channels (SSE)
 *****************************************************************************
 * samples is read as three consecutive 256-float planes (byte offsets 0,
 * 1024, 2048 = left, center, right). Results are written back to planes
 * 0 and 1:
 *   left'  = unit * left  + clev * center
 *   right' = unit * right + clev * center
 * unit and clev are the floats at byte offsets 0 and 4 of *dm_par
 * (NOTE(review): dm_par_t layout is not visible here - confirm).
 *
 * Registers are chosen by the compiler; memory, flags and the XMM registers
 * used are declared as clobbered; the loop label is a numeric local label.
 *****************************************************************************/
void downmix_3f_0r_to_2ch_sse (float *samples, dm_par_t * dm_par)
{
    int i_loops = 64;               /* 64 iterations x 4 floats = 256 samples */

    __asm__ __volatile__ (
    "movss   (%2), %%xmm5\n"        /* unit */
    "shufps  $0, %%xmm5, %%xmm5\n"  /* unit | unit | unit | unit */
    "movss   4(%2), %%xmm6\n"       /* clev */
    "shufps  $0, %%xmm6, %%xmm6\n"  /* clev | clev | clev | clev */
    "1:\n"
    "movups  (%0), %%xmm0\n"        /* left */
    "movups  2048(%0), %%xmm1\n"    /* right */
    "movups  1024(%0), %%xmm2\n"    /* center */
    "mulps   %%xmm5, %%xmm0\n"
    "mulps   %%xmm5, %%xmm1\n"
    "mulps   %%xmm6, %%xmm2\n"
    "addps   %%xmm2, %%xmm0\n"
    "addps   %%xmm2, %%xmm1\n"
    "movups  %%xmm0, (%0)\n"
    "movups  %%xmm1, 1024(%0)\n"
    "add     $16, %0\n"
    "dec     %1\n"
    "jnz     1b\n"
    : "+r" (samples), "+r" (i_loops)
    : "r" (dm_par)
    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6");
}
/*****************************************************************************
 * stream_sample_1ch_to_s16_sse: mono float samples to interleaved s16 (SSE)
 *****************************************************************************
 * Reads 256 floats from left, scales each by 0.7071068, converts to a
 * saturated 16-bit integer and writes it twice in a row, producing 512 s16
 * values (c0 c0 c1 c1 ...) in s16_samples.
 *
 * Changes from the previous version:
 *  - "packssdw mmN, mmN" alone yields c0 c1 c0 c1, not the "c1 c1 c0 c0"
 *    layout its comment described; punpcklwd now duplicates each sample.
 *  - the scale factor is a local constant passed as a memory operand; it is
 *    no longer read from the code address of the function sqrt2().
 *  - registers are chosen by the compiler, clobbers are declared, and the
 *    loop label is a numeric local label.
 *****************************************************************************/
void stream_sample_1ch_to_s16_sse (s16 *s16_samples, float *left)
{
    float f_inv_sqrt2 = 0.7071068f; /* same value the sqrt2 symbol provided */
    int i_loops = 64;               /* 64 iterations x 4 floats = 256 samples */

    __asm__ __volatile__ (
    "movss     %3, %%xmm7\n"
    "shufps    $0, %%xmm7, %%xmm7\n" /* scale | scale | scale | scale */
    "1:\n"
    "movups    (%1), %%xmm0\n"      /* c3 | c2 | c1 | c0 */
    "mulps     %%xmm7, %%xmm0\n"
    "movhlps   %%xmm0, %%xmm2\n"    /* c3 | c2 */
    "cvtps2pi  %%xmm0, %%mm0\n"     /* c1 c0 --> mm0, int_32 */
    "cvtps2pi  %%xmm2, %%mm1\n"     /* c3 c2 --> mm1, int_32 */
    "packssdw  %%mm0, %%mm0\n"      /* c1 c0 c1 c0 --> mm0, int_16 */
    "packssdw  %%mm1, %%mm1\n"      /* c3 c2 c3 c2 --> mm1, int_16 */
    "punpcklwd %%mm0, %%mm0\n"      /* c1 c1 c0 c0 --> mm0, int_16 */
    "punpcklwd %%mm1, %%mm1\n"      /* c3 c3 c2 c2 --> mm1, int_16 */
    "movq      %%mm0, (%0)\n"
    "movq      %%mm1, 8(%0)\n"
    "add       $16, %0\n"
    "add       $16, %1\n"
    "dec       %2\n"
    "jnz       1b\n"
    "emms\n"                        /* leave MMX state so the FPU is usable */
    : "+r" (s16_samples), "+r" (left), "+r" (i_loops)
    : "m" (f_inv_sqrt2)
    : "memory", "cc", "mm0", "mm1", "xmm0", "xmm2", "xmm7");
}
/*****************************************************************************
 * stream_sample_2ch_to_s16_sse: stereo float samples to interleaved s16 (SSE)
 *****************************************************************************
 * Reads 256 floats from left and 256 from right, converts them to saturated
 * 16-bit integers and writes them interleaved (l0 r0 l1 r1 ...) to
 * s16_samples, 512 values in total. Four sample pairs (16 output bytes) are
 * produced per iteration.
 *
 * Registers are chosen by the compiler ("+r"), so nothing is saved by hand;
 * memory, flags and the MMX/XMM registers used are declared as clobbered;
 * the loop label is a numeric local label, so it stays unique if the
 * function is inlined or cloned.
 *****************************************************************************/
void stream_sample_2ch_to_s16_sse (s16 *s16_samples, float *left, float *right)
{
    int i_loops = 64;               /* 64 iterations x 4 sample pairs = 256 */

    __asm__ __volatile__ (
    "1:\n"
    "movups   (%1), %%xmm0\n"       /* l3 | l2 | l1 | l0 */
    "movups   (%2), %%xmm1\n"       /* r3 | r2 | r1 | r0 */
    "movhlps  %%xmm0, %%xmm2\n"     /* l3 | l2 */
    "movhlps  %%xmm1, %%xmm3\n"     /* r3 | r2 */
    "unpcklps %%xmm1, %%xmm0\n"     /* r1 | l1 | r0 | l0 */
    "unpcklps %%xmm3, %%xmm2\n"     /* r3 | l3 | r2 | l2 */
    "cvtps2pi %%xmm0, %%mm0\n"      /* r0 l0 --> mm0, int_32 */
    "movhlps  %%xmm0, %%xmm0\n"
    "cvtps2pi %%xmm0, %%mm1\n"      /* r1 l1 --> mm1, int_32 */
    "cvtps2pi %%xmm2, %%mm2\n"      /* r2 l2 --> mm2, int_32 */
    "movhlps  %%xmm2, %%xmm2\n"
    "cvtps2pi %%xmm2, %%mm3\n"      /* r3 l3 --> mm3, int_32 */
    "packssdw %%mm1, %%mm0\n"       /* r1 l1 r0 l0 --> mm0, int_16 */
    "packssdw %%mm3, %%mm2\n"       /* r3 l3 r2 l2 --> mm2, int_16 */
    "movq     %%mm0, (%0)\n"
    "movq     %%mm2, 8(%0)\n"
    "add      $16, %0\n"
    "add      $16, %1\n"
    "add      $16, %2\n"
    "dec      %3\n"
    "jnz      1b\n"
    "emms\n"                        /* leave MMX state so the FPU is usable */
    : "+r" (s16_samples), "+r" (left), "+r" (right), "+r" (i_loops)
    :
    : "memory", "cc", "mm0", "mm1", "mm2", "mm3",
      "xmm0", "xmm1", "xmm2", "xmm3");
}
......@@ -2,7 +2,7 @@
* ac3_exponent.c: ac3 exponent calculations
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_exponent.c,v 1.23 2001/04/20 12:14:34 reno Exp $
* $Id: ac3_exponent.c,v 1.24 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Michel Lespinasse <walken@zoy.org>
......@@ -31,15 +31,14 @@
#include "threads.h"
#include "mtime.h"
#include "intf_msg.h" /* intf_DbgMsg(), intf_ErrMsg() */
#include "stream_control.h"
#include "input_ext-dec.h"
#include "audio_output.h"
#include "ac3_decoder.h"
#include "ac3_decoder_thread.h"
#include "intf_msg.h"
#include "ac3_internal.h"
......
......@@ -2,7 +2,7 @@
* ac3_imdct.c: ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.c,v 1.18 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_imdct.c,v 1.19 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -39,15 +39,11 @@
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_internal.h"
#include "ac3_downmix.h"
#include "ac3_imdct_c.h"
#if 0
#include "ac3_imdct_kni.h"
#endif
#include "ac3_imdct_c.h" /* imdct_init_c */
#include "ac3_imdct_sse.h" /* imdct_init_sse */
#include "tests.h"
#include "tests.h" /* TestCPU */
#ifndef M_PI
# define M_PI 3.14159265358979323846
......@@ -57,13 +53,13 @@
void imdct_init(imdct_t * p_imdct)
{
int i;
float scale = 255.99609372;
float scale = 181.019;
#if 0
if ( TestCPU (CPU_CAPABILITY_MMX) )
if ( TestCPU (CPU_CAPABILITY_SSE) )
{
imdct_init_kni (p_imdct);
} else
imdct_init_sse (p_imdct);
}
else
#endif
{
imdct_init_c (p_imdct);
......
......@@ -2,7 +2,7 @@
* ac3_imdct_c.c: ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_c.c,v 1.2 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_imdct_c.c,v 1.3 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -38,7 +38,7 @@
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_internal.h"
#include "ac3_imdct_c.h"
#ifndef M_PI
# define M_PI 3.14159265358979323846
......@@ -46,9 +46,6 @@
void fft_64p_c (complex_t *x);
void fft_128p_c (complex_t *x);
void imdct_do_512_c (imdct_t * p_imdct, float data[], float delay[]);
void imdct_do_512_nol_c (imdct_t * p_imdct, float data[], float delay[]);
static float window[] = {
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
......@@ -112,7 +109,7 @@ static const int pm64[64] =
int imdct_init_c (imdct_t * p_imdct)
{
int i;
float scale = 255.99609372;
float scale = 181.019;
p_imdct->imdct_do_512 = imdct_do_512_c;
p_imdct->imdct_do_512_nol = imdct_do_512_nol_c;
......
/*****************************************************************************
* ac3_imdct_sse.c: ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include "defs.h"
#include <math.h>
#include <stdio.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "intf_msg.h" /* intf_DbgMsg(), intf_ErrMsg() */
#include "stream_control.h"
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_imdct_sse.h"
/* 256 window coefficients, rising from 0.00014 to 1.00000. The table is
 * handed to imdct512_window_delay_sse() by imdct_do_512_sse(). */
static const float window[] = {
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
};
/*
 * pm128: 128-entry index table handed to imdct512_pre_ifft_twiddle_sse as
 * `pmt`. Entry k is the index j of the data/twiddle pair whose product is
 * written to buf[k] by the pre-IFFT twiddle.
 * NOTE(review): presumably this is the input ordering fft_128p_sse expects --
 * confirm against the FFT implementation.
 */
static const int pm128[128] =
{
0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
/* FFT entry points; not defined in this file (presumably ac3_srfft_sse.c). */
void fft_64p_sse (complex_t *x);
void fft_128p_sse(complex_t *a);
/* Local SSE stages of the 512-point IMDCT, defined further down. */
static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse);
/* Windowing with, respectively without ("nol"), adding the old delay line. */
static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
/*****************************************************************************
 * imdct_init_sse: hook the SSE routines into p_imdct and build its twiddles
 *****************************************************************************
 * Installs the SSE imdct/fft function pointers and fills
 * p_imdct->xcos_sin_sse with 128 groups of four floats. Group i holds
 * ( c, -s, -s, -c ) where c and s are the cosine and sine of
 * 2*pi*(8*i+1)/(8*N), each multiplied by the scale factor. Always returns 0.
 *****************************************************************************/
int imdct_init_sse (imdct_t * p_imdct)
{
    const float scale = 181.019;
    int i;

    intf_WarnMsg (1, "ac3dec: using MMX_SSE for imdct");

    p_imdct->imdct_do_512     = imdct_do_512_sse;
    p_imdct->imdct_do_512_nol = imdct_do_512_nol_sse;
    p_imdct->fft_64p          = fft_64p_sse;

    for (i = 0; i < 128; i++)
    {
        /* the four table slots belonging to twiddle number i */
        float *quad = &p_imdct->xcos_sin_sse[i * 4];

        float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
        float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;

        quad[0] =  xcos_i;
        quad[1] = -xsin_i;
        quad[2] = -xsin_i;
        quad[3] = -xcos_i;
    }

    return 0;
}
/*****************************************************************************
 * imdct_do_512_sse: one 512-point IMDCT block, SSE version
 *****************************************************************************
 * Runs the four stages in order on p_imdct->buf: pre-IFFT twiddle (reads
 * data), 128-point FFT, post-IFFT twiddle, then windowing, which adds the old
 * delay contents into data and refills delay.
 *****************************************************************************/
void imdct_do_512_sse (imdct_t * p_imdct, float data[], float delay[])
{
    complex_t *work    = p_imdct->buf;
    float     *twiddle = p_imdct->xcos_sin_sse;

    imdct512_pre_ifft_twiddle_sse (pm128, work, data, twiddle);
    fft_128p_sse (work);
    imdct512_post_ifft_twiddle_sse (work, twiddle);
    imdct512_window_delay_sse (work, data, window, delay);
}
/*****************************************************************************
 * imdct_do_512_nol_sse: 512-point IMDCT block without overlap, SSE version
 *****************************************************************************
 * Same pipeline as imdct_do_512_sse, except that the final windowing stage
 * does not add the previous delay contents into data; delay is still
 * refilled.
 *****************************************************************************/
void imdct_do_512_nol_sse (imdct_t * p_imdct, float data[], float delay[])
{
    complex_t *work    = p_imdct->buf;
    float     *twiddle = p_imdct->xcos_sin_sse;

    imdct512_pre_ifft_twiddle_sse (pm128, work, data, twiddle);
    fft_128p_sse (work);
    imdct512_post_ifft_twiddle_sse (work, twiddle);
    imdct512_window_delay_nol_sse (work, data, window, delay);
}
/*
 * imdct512_pre_ifft_twiddle_sse: fill buf[0..127] from data and the twiddles.
 *
 *  pmt           128 indices, read two per iteration
 *  buf           128 complex outputs, written in order
 *  data          input samples; elements 2j and 255-2j are read
 *  xcos_sin_sse  four floats per index j, read from byte offset 16*j
 *
 * 64 iterations, two outputs each. With j = pmt[k] and the twiddle group
 * laid out as the lane comments say (lanes are listed highest first), the
 * stored pair (first float, second float of buf[k]) is
 *      data[255-2j] * c_j - data[2j] * s_j
 *    -(data[255-2j] * s_j + data[2j] * c_j)
 *
 * NOTE(review): the asm declares no operands or clobbers. It pushes %ebp,
 * copies %esp into it and then reads the C arguments at 8..20(%ebp). That
 * only addresses the arguments if the compiler emitted no prologue of its
 * own and did not inline this function -- confirm the build flags.
 * NOTE(review): ".loop" is an ordinary assembler symbol, so a second copy of
 * this asm in the same object would presumably clash.
 */
static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
/* private frame: 4 bytes at -4(%ebp) hold the loop counter */
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
/* every general register used below is saved by hand */
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%edi\n"
"pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */
"movl 12(%%ebp), %%ebx\n" /* buf */
"movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $64, -4(%%ebp)\n"
".loop:\n"
"movl (%%eax), %%esi\n" /* j = pmt[k] */
"movl 4(%%eax), %%edi\n" /* j+1 stands for pmt[k+1] in the comments below */
"movss (%%ecx, %%esi, 8), %%xmm1\n" /* 2j */
"movss (%%ecx, %%edi, 8), %%xmm3\n" /* 2(j+1) */
/* doubled index: scaled by 8 it gives the 16-byte twiddle offset */
"shll $1, %%esi\n"
"shll $1, %%edi\n"
"movups (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */
"movups (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
/* negated doubled index: 1020 - 8j bytes is float number 255-2j */
"negl %%esi\n"
"negl %%edi\n"
"movss 1020(%%ecx, %%esi, 4), %%xmm4\n" /* 255-2j */
"addl $8, %%eax\n" /* two pmt entries consumed */
"movss 1020(%%ecx, %%edi, 4), %%xmm5\n" /* 255-2(j+1) */
"shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */
"shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
"mulps %%xmm4, %%xmm0\n"
"mulps %%xmm5, %%xmm2\n"
/* add the high pair onto the low pair: low two lanes become one complex value */
"movhlps %%xmm0, %%xmm1\n"
"movhlps %%xmm2, %%xmm3\n"
"addl $16, %%ebx\n" /* two complex outputs per iteration */
"addps %%xmm1, %%xmm0\n"
"addps %%xmm3, %%xmm2\n"
"movlhps %%xmm2, %%xmm0\n" /* pack both results into one register */
"movups %%xmm0, -16(%%ebx)\n"
"decl -4(%%ebp)\n"
"jnz .loop\n"
/* restore in reverse push order, drop the counter slot */
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"addl $4, %%esp\n"
"popl %%ebp\n"
::);
}
/*
 * imdct512_post_ifft_twiddle_sse: in-place post-IFFT twiddle of buf[0..127].
 *
 *  buf           128 complex values, rewritten in place
 *  xcos_sin_sse  128 groups of four floats, read only
 *
 * 32 iterations, four complex values and four twiddle groups each. Lane
 * comments list xmm lanes highest first. With a twiddle group laid out as the
 * comments state (c in the lowest lane, then -s, -s, -c), a value (re, im)
 * is replaced by
 *      re' = -(re * c + im * s)
 *      im' =   im * c - re * s
 *
 * Operand notes:
 *  - both %eax and %ebx are advanced by the loop, so each is declared as an
 *    output tied to its input rather than as a plain input;
 *  - "memory" tells the compiler that buf is rewritten behind its back;
 *  - the loop uses a numeric local label so that several copies of this asm
 *    (for instance after inlining into both callers) still assemble.
 * NOTE(review): xmm0-xmm7 are overwritten but not listed as clobbers, because
 * compilers without SSE support reject those register names -- add them once
 * the minimum compiler version allows.
 */
static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
{
    __asm__ __volatile__ (
    "pushl %%ecx\n"
    "movl $32, %%ecx\n"                 /* loop counter */

    "1:\n"
    "movups (%%eax), %%xmm0\n"          /*  im1 |  re1 |  im0 |  re0 */
    "movups (%%ebx), %%xmm2\n"          /*  -c  |  -s  |  -s  |   c  */
    "movhlps %%xmm0, %%xmm1\n"          /*  im1 |  re1 */
    "movups 16(%%ebx), %%xmm3\n"        /*  -c1 |  -s1 |  -s1 |  c1  */

    "shufps $0x50, %%xmm0, %%xmm0\n"    /*  im0 |  im0 |  re0 |  re0 */
    "shufps $0x50, %%xmm1, %%xmm1\n"    /*  im1 |  im1 |  re1 |  re1 */

    "movups 16(%%eax), %%xmm4\n"        /*  im3 |  re3 |  im2 |  re2 */
    "shufps $0x27, %%xmm2, %%xmm2\n"    /*   c  |  -s  |  -s  |  -c  */
    "movhlps %%xmm4, %%xmm5\n"          /*  im3 |  re3 */
    "shufps $0x27, %%xmm3, %%xmm3\n"    /*   c1 |  -s1 |  -s1 |  -c1 */

    "movups 32(%%ebx), %%xmm6\n"        /*  -c2 |  -s2 |  -s2 |  c2  */
    "movups 48(%%ebx), %%xmm7\n"        /*  -c3 |  -s3 |  -s3 |  c3  */

    "shufps $0x50, %%xmm4, %%xmm4\n"    /*  im2 |  im2 |  re2 |  re2 */
    "shufps $0x50, %%xmm5, %%xmm5\n"    /*  im3 |  im3 |  re3 |  re3 */

    "mulps %%xmm2, %%xmm0\n"
    "mulps %%xmm3, %%xmm1\n"

    "shufps $0x27, %%xmm6, %%xmm6\n"    /*   c2 |  -s2 |  -s2 |  -c2 */
    "shufps $0x27, %%xmm7, %%xmm7\n"    /*   c3 |  -s3 |  -s3 |  -c3 */

    /* fold the high pair onto the low pair: low lanes = (re', im') */
    "movhlps %%xmm0, %%xmm2\n"
    "movhlps %%xmm1, %%xmm3\n"

    "mulps %%xmm6, %%xmm4\n"
    "mulps %%xmm7, %%xmm5\n"

    "addps %%xmm2, %%xmm0\n"
    "addps %%xmm3, %%xmm1\n"

    "movhlps %%xmm4, %%xmm6\n"
    "movhlps %%xmm5, %%xmm7\n"

    "addps %%xmm6, %%xmm4\n"
    "addps %%xmm7, %%xmm5\n"

    /* pack two results per register and write them back */
    "movlhps %%xmm1, %%xmm0\n"
    "movlhps %%xmm5, %%xmm4\n"

    "movups %%xmm0, (%%eax)\n"
    "movups %%xmm4, 16(%%eax)\n"

    "addl $64, %%ebx\n"                 /* four twiddle groups consumed */
    "addl $32, %%eax\n"                 /* four complex values done */
    "decl %%ecx\n"
    "jnz 1b\n"

    "popl %%ecx\n"
    : "=a" (buf), "=b" (xcos_sin_sse)
    : "0" (buf), "1" (xcos_sin_sse)
    : "memory", "cc" );
}
/*
 * imdct512_window_delay_sse: window the IMDCT output, overlap-add it with the
 * delay line into data, then refill the delay line.
 *
 *  buf         128 complex values, read only
 *  data_ptr    256 floats written
 *  window_prt  256 floats read (forwards by loops 1-2, backwards by loops 3-4)
 *  delay_prt   256 floats read by loops 1-2, then overwritten by loops 3-4
 *
 * Four loops of 16 iterations, 8 floats per iteration. Lane comments list xmm
 * lanes highest first. Even/odd output pairs, for i = 0..63:
 *   loop 1: data[2i]       =  buf[64+i].im  * w + d
 *           data[2i+1]     = -buf[63-i].re  * w + d
 *   loop 2: data[128+2i]   =  buf[i].re     * w + d
 *           data[128+2i+1] = -buf[127-i].im * w + d
 *   loop 3: delay[2i]      =  buf[64+i].re  * w
 *           delay[2i+1]    = -buf[63-i].im  * w
 *   loop 4: delay[128+2i]  = -buf[i].im     * w
 *           delay[128+2i+1]=  buf[127-i].re * w
 * (re/im naming follows the offset comments: first float re, second im.)
 *
 * NOTE(review): no operands or clobbers are declared; the arguments are read
 * at 8..20(%ebp) after the asm's own push/mov of %ebp, which only works if
 * the compiler emitted no prologue and did not inline this function --
 * confirm the build flags.
 * NOTE(review): loops 3 and 4 step the window pointer downwards 16 bytes at a
 * time, but the four floats of each load are applied in ascending address
 * order. If the window is meant to be applied fully reversed, a lane reversal
 * (e.g. shufps $0x1b) is missing -- confirm against the C implementation.
 */
static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
/* registers are saved by hand because none is declared to the compiler */
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $16, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
/* loop 1: esi walks up from buf[64].im, edi walks down from buf[63].re */
".first_128_samples:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss (%%edi), %%xmm1\n"
"movss -8(%%edi), %%xmm3\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movss 24(%%esi), %%xmm7\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
"mulps %%xmm4, %%xmm0\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"addps %%xmm5, %%xmm0\n" /* overlap-add with the delay line */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx\n"
"movups %%xmm0, (%%eax)\n"
"addl $32, %%ebx\n"
"mulps %%xmm4, %%xmm6\n"
"addl $32, %%esi\n"
"addl $32, %%eax\n"
"addps %%xmm5, %%xmm6\n"
"addl $-32, %%edi\n"
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .first_128_samples\n"
/* loop 2: data, window and delay pointers carry on from loop 1 */
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */
".second_128_samples:\n"
"movss (%%esi), %%xmm0\n" /* buf[i].re */
"movss 8(%%esi), %%xmm2\n" /* re1 */
"movss (%%edi), %%xmm1\n" /* buf[127-i].im */
"movss -8(%%edi), %%xmm3\n" /* im1 */
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addps %%xmm5, %%xmm0\n"
"mulps %%xmm4, %%xmm6\n"
"addl $-32, %%edi\n"
"movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"movups %%xmm0, (%%eax)\n"
"addps %%xmm5, %%xmm6\n"
"addl $32, %%edx\n"
"addl $32, %%eax\n"
"addl $32, %%ebx\n"
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .second_128_samples\n"
/* loop 3: output now goes to delay; edx is left just past the window bytes
   read so far and is walked downwards from here on */
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".first_128_delay:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss (%%edi), %%xmm1\n"
"movss -8(%%edi), %%xmm3\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
"movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups %%xmm0, (%%eax)\n"
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm6\n"
"addl $32, %%eax\n"
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .first_128_delay\n"
/* loop 4: ebx is only a scratch copy of buf here; eax and edx carry on */
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */
".second_128_delay:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss (%%edi), %%xmm1\n"
"movss -8(%%edi), %%xmm3\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movss 24(%%esi), %%xmm7\n" /* im3 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
"subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
"addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups %%xmm1, (%%eax)\n"
"addl $32, %%esi\n"
"subps %%xmm6, %%xmm2\n" /* re3 | -im3 | re2 | -im2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm2\n"
"addl $32, %%eax\n"
"movups %%xmm2, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .second_128_delay\n"
/* restore in reverse push order; leave undoes the private frame */
"popl %%edi\n"
"popl %%esi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"leave\n"
::);
}
/*
 * imdct512_window_delay_nol_sse: window the IMDCT output into data without
 * adding the delay line, then refill the delay line.
 *
 *  buf         128 complex values, read only
 *  data_ptr    256 floats written
 *  window_prt  256 floats read (forwards by loops 1-2, backwards by loops 3-4)
 *  delay_prt   256 floats written by loops 3-4; never read here
 *
 * Same four loops as imdct512_window_delay_sse; the delay loads and adds of
 * loops 1-2 are kept only as comments. Lane comments list xmm lanes highest
 * first. Even/odd output pairs, for i = 0..63:
 *   loop 1: data[2i]       =  buf[64+i].im  * w
 *           data[2i+1]     = -buf[63-i].re  * w
 *   loop 2: data[128+2i]   =  buf[i].re     * w
 *           data[128+2i+1] = -buf[127-i].im * w
 *   loop 3: delay[2i]      =  buf[64+i].re  * w
 *           delay[2i+1]    = -buf[63-i].im  * w
 *   loop 4: delay[128+2i]  = -buf[i].im     * w
 *           delay[128+2i+1]=  buf[127-i].re * w
 *
 * NOTE(review): no operands or clobbers are declared; the arguments are read
 * at 8..20(%ebp) after the asm's own push/mov of %ebp, which only works if
 * the compiler emitted no prologue and did not inline this function --
 * confirm the build flags.
 * NOTE(review): loops 3 and 4 step the window pointer downwards 16 bytes at a
 * time, but the four floats of each load are applied in ascending address
 * order. If the window is meant to be applied fully reversed, a lane reversal
 * (e.g. shufps $0x1b) is missing -- confirm against the C implementation.
 */
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
/* registers are saved by hand because none is declared to the compiler */
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
/* movl 20(%%ebp), %%ebx delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $16, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
/* loop 1: esi walks up from buf[64].im, edi walks down from buf[63].re */
".first_128_sample:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss (%%edi), %%xmm1\n"
"movss -8(%%edi), %%xmm3\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movss 24(%%esi), %%xmm7\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
"mulps %%xmm4, %%xmm0\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
/* addps %%xmm5, %%xmm0 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx\n"
"movups %%xmm0, (%%eax)\n"
/* addl $32, %%ebx */
"mulps %%xmm4, %%xmm6\n"
"addl $32, %%esi\n"
"addl $32, %%eax\n"
/* addps %%xmm5, %%xmm6 */
"addl $-32, %%edi\n"
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .first_128_sample\n"
/* loop 2: data and window pointers carry on from loop 1 */
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */
".second_128_sample:\n"
"movss (%%esi), %%xmm0\n" /* buf[i].re */
"movss 8(%%esi), %%xmm2\n" /* re1 */
"movss (%%edi), %%xmm1\n" /* buf[127-i].im */
"movss -8(%%edi), %%xmm3\n" /* im1 */
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
/* addps %%xmm5, %%xmm0 */
"mulps %%xmm4, %%xmm6\n"
"addl $-32, %%edi\n"
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
"movups %%xmm0, (%%eax)\n"
/* addps %%xmm5, %%xmm6 */
"addl $32, %%edx\n"
"addl $32, %%eax\n"
/* addl $32, %%ebx */
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .second_128_sample\n"
/* loop 3: output now goes to delay; edx is left just past the window bytes
   read so far and is walked downwards from here on */
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".first_128_delays:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss (%%edi), %%xmm1\n"
"movss -8(%%edi), %%xmm3\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
"movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups %%xmm0, (%%eax)\n"
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm6\n"
"addl $32, %%eax\n"
"movups %%xmm6, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .first_128_delays\n"
/* loop 4: ebx is only a scratch copy of buf here; eax and edx carry on */
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */
".second_128_delays:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss (%%edi), %%xmm1\n"
"movss -8(%%edi), %%xmm3\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movss 24(%%esi), %%xmm7\n" /* im3 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
"subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
"addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups %%xmm1, (%%eax)\n"
"addl $32, %%esi\n"
"subps %%xmm6, %%xmm2\n" /* re3 | -im3 | re2 | -im2 */
"addl $-32, %%edi\n"
"mulps %%xmm5, %%xmm2\n"
"addl $32, %%eax\n"
"movups %%xmm2, -16(%%eax)\n"
"decl %%ecx\n"
"jnz .second_128_delays\n"
/* restore in reverse push order; leave undoes the private frame */
"popl %%edi\n"
"popl %%esi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"leave\n"
::);
}
int imdct_init_sse (imdct_t * p_imdct);
void imdct_do_512_sse(imdct_t * p_imdct, float data[], float delay[]);
void imdct_do_512_nol_sse(imdct_t * p_imdct, float data[], float delay[]);
......@@ -2,7 +2,7 @@
* ac3_internals.h: needed by the ac3 decoder
*****************************************************************************
* Copyright (C) 2000 VideoLAN
* $Id: ac3_internal.h,v 1.8 2001/03/21 13:42:34 sam Exp $
* $Id: ac3_internal.h,v 1.9 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Lespinasse <walken@zoy.org>
*
......@@ -37,12 +37,13 @@
void bit_allocate (ac3dec_t *);
/* ac3_downmix.c */
int downmix (ac3dec_t *, float *, s16 *);
void downmix_init (downmix_t * p_downmix);
/* ac3_exponent.c */
int exponent_unpack (ac3dec_t *);
/* ac3_imdct.c */
void imdct_init (imdct_t * p_imdct);
void imdct (ac3dec_t * p_ac3dec, s16 * buffer);
/* ac3_mantissa.c */
......
......@@ -2,7 +2,7 @@
* ac3_mantissa.c: ac3 mantissa computation
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_mantissa.c,v 1.27 2001/05/07 03:14:09 stef Exp $
* $Id: ac3_mantissa.c,v 1.28 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -38,9 +38,6 @@
#include "audio_output.h"
#include "ac3_decoder.h"
#include "ac3_decoder_thread.h"
#include "ac3_internal.h"
#include "intf_msg.h"
......@@ -291,7 +288,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
p_ac3dec->total_bits_read += 5;
if ((group_code = GetBits (&p_ac3dec->bit_stream,5)) > 26)
{
intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (1)" );
intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (1)" );
return 0;
}
......@@ -312,7 +309,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
p_ac3dec->total_bits_read += 7;
if ((group_code = GetBits (&p_ac3dec->bit_stream,7)) > 124)
{
intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (2)" );
intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (2)" );
return 0;
}
......@@ -327,7 +324,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
p_ac3dec->total_bits_read += 3;
if ((group_code = GetBits (&p_ac3dec->bit_stream,3)) > 6)
{
intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (3)" );
intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (3)" );
return 0;
}
......@@ -343,7 +340,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
p_ac3dec->total_bits_read += 7;
if ((group_code = GetBits (&p_ac3dec->bit_stream,7)) > 120)
{
intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (4)" );
intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (4)" );
return 0;
}
......@@ -357,7 +354,7 @@ static __inline__ float coeff_get_float (ac3dec_t * p_ac3dec, u16 bap, u16 dithf
p_ac3dec->total_bits_read += 4;
if ((group_code = GetBits (&p_ac3dec->bit_stream,4)) > 14)
{
intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (5)" );
intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (5)" );
return 0;
}
......
......@@ -2,7 +2,7 @@
* ac3_parse.c: ac3 parsing procedures
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_parse.c,v 1.21 2001/05/07 04:42:42 sam Exp $
* $Id: ac3_parse.c,v 1.22 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -40,9 +40,9 @@
#include "intf_msg.h"
#include "ac3_decoder.h"
#include "ac3_decoder_thread.h"
#include "ac3_decoder_thread.h" /* ac3dec_thread_t */
#include "ac3_internal.h"
#include "ac3_internal.h" /* EXP_REUSE */
/* Misc LUT */
static const u16 nfchans[] = { 2, 1, 2, 3, 3, 4, 4, 5 };
......@@ -97,8 +97,10 @@ static const struct frmsize_s frmsizecod_tbl[] =
static const int fscod_tbl[] = {48000, 44100, 32000};
/* Some internal functions */
void parse_bsi_stats (ac3dec_t * p_ac3dec);
void parse_audblk_stats (ac3dec_t * p_ac3dec);
#ifdef STATS
static void parse_bsi_stats (ac3dec_t * p_ac3dec);
static void parse_audblk_stats (ac3dec_t * p_ac3dec);
#endif
/* Parse a syncinfo structure */
int ac3_sync_frame (ac3dec_t * p_ac3dec, ac3_sync_info_t * p_sync_info)
......@@ -778,7 +780,7 @@ int parse_audblk (ac3dec_t * p_ac3dec, int blknum)
}
#ifdef STATS
// parse_audblk_stats(p_ac3dec);
parse_audblk_stats(p_ac3dec);
#endif
return 0;
......@@ -806,7 +808,8 @@ void parse_auxdata (ac3dec_t * p_ac3dec)
RemoveBits (&p_ac3dec->bit_stream,16);
}
void parse_bsi_stats (ac3dec_t * p_ac3dec) /*Some stats */
#ifdef STATS
static void parse_bsi_stats (ac3dec_t * p_ac3dec) /* Some stats */
{
struct mixlev_s
{
......@@ -850,7 +853,7 @@ void parse_bsi_stats (ac3dec_t * p_ac3dec) /*Some stats */
i = 0;
}
void parse_audblk_stats (ac3dec_t * p_ac3dec)
static void parse_audblk_stats (ac3dec_t * p_ac3dec)
{
char *exp_strat_tbl[4] = {"R ","D15 ","D25 ","D45 "};
u32 i;
......@@ -871,3 +874,4 @@ void parse_audblk_stats (ac3dec_t * p_ac3dec)
intf_ErrMsg ("\n");
}
#endif
......@@ -2,7 +2,7 @@
* ac3_rematrix.c: ac3 audio rematrixing
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_rematrix.c,v 1.16 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_rematrix.c,v 1.17 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -34,7 +34,6 @@
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_internal.h"
struct rematrix_band_s {
u32 start;
......
......@@ -2,7 +2,7 @@
* ac3_srfft.c: ac3 FFT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_srfft.c,v 1.3 2001/05/06 04:32:02 sam Exp $
* $Id: ac3_srfft.c,v 1.4 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -40,9 +40,9 @@
#include "ac3_decoder.h"
#include "ac3_srfft.h"
void fft_8 (complex_t *x);
static void fft_8 (complex_t *x);
void fft_4(complex_t *x)
static void fft_4(complex_t *x)
{
/* delta_p = 1 here */
/* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
......@@ -90,7 +90,7 @@ void fft_4(complex_t *x)
}
void fft_8 (complex_t *x)
static void fft_8 (complex_t *x)
{
/* delta_p = diag{1, sqrt(i)} here */
/* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
......@@ -205,7 +205,7 @@ void fft_8 (complex_t *x)
}
void fft_asmb(int k, complex_t *x, complex_t *wTB,
static void fft_asmb(int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3)
{
register complex_t *x2k, *x3k, *x4k, *wB;
......@@ -236,7 +236,7 @@ void fft_asmb(int k, complex_t *x, complex_t *wTB,
}
void fft_asmb16(complex_t *x, complex_t *wTB)
static void fft_asmb16(complex_t *x, complex_t *wTB)
{
register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
int k = 2;
......
......@@ -2,7 +2,7 @@
* ac3_srfft.h: ac3 FFT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_srfft.h,v 1.2 2001/04/30 21:10:25 reno Exp $
* $Id: ac3_srfft.h,v 1.3 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -22,19 +22,19 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
static complex_t delta16[4] =
static const complex_t delta16[4] =
{ {1.00000000000000, 0.00000000000000},
{0.92387953251129, -0.38268343236509},
{0.70710678118655, -0.70710678118655},
{0.38268343236509, -0.92387953251129}};
static complex_t delta16_3[4] =
static const complex_t delta16_3[4] =
{ {1.00000000000000, 0.00000000000000},
{0.38268343236509, -0.92387953251129},
{-0.70710678118655, -0.70710678118655},
{-0.92387953251129, 0.38268343236509}};
static complex_t delta32[8] =
static const complex_t delta32[8] =
{ {1.00000000000000, 0.00000000000000},
{0.98078528040323, -0.19509032201613},
{0.92387953251129, -0.38268343236509},
......@@ -44,7 +44,7 @@ static complex_t delta32[8] =
{0.38268343236509, -0.92387953251129},
{0.19509032201613, -0.98078528040323}};
static complex_t delta32_3[8] =
static const complex_t delta32_3[8] =
{ {1.00000000000000, 0.00000000000000},
{0.83146961230255, -0.55557023301960},
{0.38268343236509, -0.92387953251129},
......@@ -54,7 +54,7 @@ static complex_t delta32_3[8] =
{-0.92387953251129, 0.38268343236509},
{-0.55557023301960, 0.83146961230255}};
static complex_t delta64[16] =
static const complex_t delta64[16] =
{ {1.00000000000000, 0.00000000000000},
{0.99518472667220, -0.09801714032956},
{0.98078528040323, -0.19509032201613},
......@@ -72,7 +72,7 @@ static complex_t delta64[16] =
{0.19509032201613, -0.98078528040323},
{0.09801714032956, -0.99518472667220}};
static complex_t delta64_3[16] =
static const complex_t delta64_3[16] =
{ {1.00000000000000, 0.00000000000000},
{0.95694033573221, -0.29028467725446},
{0.83146961230255, -0.55557023301960},
......@@ -90,7 +90,7 @@ static complex_t delta64_3[16] =
{-0.55557023301960, 0.83146961230255},
{-0.29028467725446, 0.95694033573221}};
static complex_t delta128[32] =
static const complex_t delta128[32] =
{ {1.00000000000000, 0.00000000000000},
{0.99879545620517, -0.04906767432742},
{0.99518472667220, -0.09801714032956},
......@@ -124,7 +124,7 @@ static complex_t delta128[32] =
{0.09801714032956, -0.99518472667220},
{0.04906767432742, -0.99879545620517}};
static complex_t delta128_3[32] =
static const complex_t delta128_3[32] =
{ {1.00000000000000, 0.00000000000000},
{0.98917650996478, -0.14673047445536},
{0.95694033573221, -0.29028467725446},
......
/*****************************************************************************
* ac3_srfft_sse.c: ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include <stdio.h>
#include "defs.h"
#include <math.h>
#include <stdio.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "stream_control.h"
#include "input_ext-dec.h"
#include "ac3_decoder.h"
#include "ac3_srfft.h"
/* Declared as functions, but their bodies only emit constant data that the
 * SSE code addresses by symbol name (see the definitions below). */
void hsqrt2 (void);
void C_1 (void);
/* In-place 4- and 8-point transforms on consecutive complex values. */
static void fft_4_sse (complex_t *x);
static void fft_8_sse (complex_t *x);
/* Combining step; not defined in the part of the file shown here. */
static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3);
/*****************************************************************************
 * fft_64p_sse: in-place transform of the 64 complex values at a
 *****************************************************************************
 * Built bottom-up from fft_8_sse / fft_4_sse leaves that are merged by
 * fft_asmb_sse with the delta16, delta32 and delta64 tables.
 *****************************************************************************/
void fft_64p_sse(complex_t *a)
{
    /* a[0..15] */
    fft_8_sse(a);
    fft_4_sse(a + 8);
    fft_4_sse(a + 12);
    fft_asmb_sse(2, a, a + 8, delta16, delta16_3);

    /* a[0..31] */
    fft_8_sse(a + 16);
    fft_8_sse(a + 24);
    fft_asmb_sse(4, a, a + 16, delta32, delta32_3);

    /* a[32..47] */
    fft_8_sse(a + 32);
    fft_4_sse(a + 40);
    fft_4_sse(a + 44);
    fft_asmb_sse(2, a + 32, a + 40, delta16, delta16_3);

    /* a[48..63] */
    fft_8_sse(a + 48);
    fft_4_sse(a + 56);
    fft_4_sse(a + 60);
    fft_asmb_sse(2, a + 48, a + 56, delta16, delta16_3);

    /* a[0..63] */
    fft_asmb_sse(8, a, a + 32, delta64, delta64_3);
}
/*****************************************************************************
 * fft_128p_sse: in-place transform of the 128 complex values at a
 *****************************************************************************
 * The lower 64 values go through exactly the call sequence of fft_64p_sse,
 * so that routine is reused. The upper 64 values are built from two 32-point
 * halves, and one last fft_asmb_sse pass with the delta128 tables merges
 * everything.
 *****************************************************************************/
void fft_128p_sse(complex_t *a)
{
    complex_t *upper = a + 64;

    /* a[0..63] */
    fft_64p_sse(a);

    /* a[64..79] */
    fft_8_sse(upper);
    fft_4_sse(upper + 8);
    fft_4_sse(upper + 12);
    fft_asmb_sse(2, upper, upper + 8, delta16, delta16_3);

    /* a[64..95] */
    fft_8_sse(upper + 16);
    fft_8_sse(upper + 24);
    fft_asmb_sse(4, upper, upper + 16, delta32, delta32_3);

    /* a[96..111] */
    fft_8_sse(upper + 32);
    fft_4_sse(upper + 40);
    fft_4_sse(upper + 44);
    fft_asmb_sse(2, upper + 32, upper + 40, delta16, delta16_3);

    /* a[96..127] */
    fft_8_sse(upper + 48);
    fft_8_sse(upper + 56);
    fft_asmb_sse(4, upper + 32, upper + 48, delta32, delta32_3);

    /* a[0..127] */
    fft_asmb_sse(16, a, upper, delta128, delta128_3);
}
/*
 * hsqrt2: constant table disguised as a function. The body emits four floats
 * ( 0.7071.., 0.7071.., -0.7071.., -0.7071.. ); fft_8_sse takes the symbol's
 * address (movl $hsqrt2) and loads 16 bytes from it. Never meant to be called.
 * NOTE(review): the floats are placed after whatever prologue the compiler
 * generates for this function, so the load only finds them at the symbol
 * address when no prologue is emitted -- confirm the build flags.
 */
void hsqrt2 (void)
{
__asm__ (
".float 0f0.707106781188\n"
".float 0f0.707106781188\n"
".float 0f-0.707106781188\n"
".float 0f-0.707106781188\n"
);
}
/*
 * C_1: constant table disguised as a function. The body emits four floats
 * ( -1.0, 1.0, -1.0, 1.0 ). Never meant to be called.
 * NOTE(review): presumably read by symbol address from SSE code further down
 * the file, like hsqrt2 -- confirm. The same prologue caveat applies: the
 * floats only start at the symbol address when no prologue is emitted.
 */
void C_1 (void)
{
__asm__ (
".float 0f-1.0\n"
".float 0f1.0\n"
".float 0f-1.0\n"
".float 0f1.0\n"
);
}
/*
 * fft_4_sse: in-place 4-point butterfly on x[0..3].
 *
 *  x  four consecutive complex values (two floats each), read and rewritten
 *
 * With s02 = x[0]+x[2], d02 = x[0]-x[2], s13 = x[1]+x[3], d13 = x[1]-x[3]
 * and r = ( d13.im, -d13.re ), the stored results are
 *      x[0] = s02 + s13        x[1] = d02 + r
 *      x[2] = s02 - s13        x[3] = d02 - r
 * Lane comments list xmm lanes highest first.
 *
 * %eax is never modified, so x is a plain input. The "memory" clobber tells
 * the compiler that x[0..3] are read and rewritten by the asm.
 * NOTE(review): xmm0-xmm6 are overwritten but not listed as clobbers, because
 * compilers without SSE support reject those register names -- add them once
 * the minimum compiler version allows.
 */
static void fft_4_sse (complex_t *x)
{
    __asm__ __volatile__ (
    "movups (%%eax), %%xmm0\n"          /* x[1] | x[0] */
    "movups 16(%%eax), %%xmm2\n"        /* x[3] | x[2] */
    "movups %%xmm0, %%xmm1\n"           /* x[1] | x[0] */
    "addps %%xmm2, %%xmm0\n"            /* x[1] + x[3] | x[0] + x[2] */
    "subps %%xmm2, %%xmm1\n"            /* x[1] - x[3] | x[0] - x[2] */
    "xorps %%xmm6, %%xmm6\n"            /* zero, used to negate below */
    "movhlps %%xmm1, %%xmm4\n"          /* ? | x[1] - x[3] */
    "movhlps %%xmm0, %%xmm3\n"          /* ? | x[1] + x[3] */
    "subss %%xmm4, %%xmm6\n"            /* 0 | -(x[1] - x[3]).re */
    "movlhps %%xmm1, %%xmm0\n"          /* x[0] - x[2] | x[0] + x[2] */
    "movlhps %%xmm6, %%xmm4\n"          /* 0 | -(x[1]-x[3]).re | (x[1]-x[3]).im | (x[1]-x[3]).re */
    "movups %%xmm0, %%xmm2\n"           /* x[0] - x[2] | x[0] + x[2] */
    "shufps $0x94, %%xmm4, %%xmm3\n"    /* -(x[1]-x[3]).re | (x[1]-x[3]).im | x[1] + x[3] */
    "addps %%xmm3, %%xmm0\n"            /* new x[1] | new x[0] */
    "subps %%xmm3, %%xmm2\n"            /* new x[3] | new x[2] */
    "movups %%xmm0, (%%eax)\n"
    "movups %%xmm2, 16(%%eax)\n"
    :
    : "a" (x)
    : "memory" );
}
/*****************************************************************************
 * fft_8_sse: in-place 8-point complex FFT (SSE)
 *****************************************************************************
 * Loads the 64 bytes at x as x[0]..x[7] and overwrites them with the result.
 * The even inputs are combined into
 *     yt = (x0 - x4) - i*(x2 - x6) | (x0 + x4) + (x2 + x6)
 *     yb = (x0 - x4) + i*(x2 - x6) | (x0 + x4) - (x2 + x6)
 * the odd inputs, using the hsqrt2 table for the 1/sqrt2 twiddles, into
 *     u  = w1 + w3 | (x1 + x5) + (x3 + x7)
 *     v  = w1 - w3 | (x1 + x5) - (x3 + x7)
 *   with w1 = (1-i)/sqrt2 * (x1 - x5), w3 = (-1-i)/sqrt2 * (x3 - x7)
 * and the stores are
 *     x[0..1] = yt + u      x[4..5] = yt - u
 *     x[2..3] = yb - i*v    x[6..7] = yb + i*v
 * NOTE(review): assumes complex_t is a packed { float re; float im; } pair,
 * as the 8-byte movlps/movhps element loads imply - confirm.
 * Register comments show the high 64 bits first: "high | low".
 *
 * %ebx is saved and restored by hand (presumably because it may be the PIC
 * register - confirm); the tables are addressed absolutely ("$hsqrt2",
 * "$C_1"), which is not position independent.
 *
 * Fixes over the previous version:
 *  - the copy of -(x2-x6).re into xmm4 is a movss, not a full-register
 *    movups: the full move zeroed (x2-x6).im in lane 1, which the following
 *    shufps $0x14 needs to build -i*(x2 - x6);
 *  - "memory" and xmm0-xmm7 are declared as clobbers, and %eax is an input
 *    only, since the asm writes x[] and every SSE register but not %eax.
 *****************************************************************************/
static void fft_8_sse (complex_t *x)
{
    __asm__ __volatile__ (
    "pushl    %%ebx\n"

    /* even samples: 4-point transform of x[0], x[2], x[4], x[6] */
    "movlps   (%%eax), %%xmm0\n"        /* x[0] */
    "movlps 32(%%eax), %%xmm1\n"        /* x[4] */
    "movhps 16(%%eax), %%xmm0\n"        /* x[2] | x[0] */
    "movhps 48(%%eax), %%xmm1\n"        /* x[6] | x[4] */
    "movups   %%xmm0, %%xmm2\n"         /* x[2] | x[0] */
    "xorps    %%xmm3, %%xmm3\n"         /* 0 | 0 | 0 | 0 */
    "addps    %%xmm1, %%xmm0\n"         /* x[2] + x[6] | x[0] + x[4] */
    "subps    %%xmm1, %%xmm2\n"         /* x[2] - x[6] | x[0] - x[4] */
    "movhlps  %%xmm0, %%xmm5\n"         /* ? | x[2] + x[6] */
    "movhlps  %%xmm2, %%xmm4\n"         /* ? | x[2] - x[6] */
    "movlhps  %%xmm2, %%xmm0\n"         /* x[0] - x[4] | x[0] + x[4] */
    "subss    %%xmm4, %%xmm3\n"         /* 0 | 0 | 0 | -(x[2]-x[6]).re */
    "movups   %%xmm0, %%xmm7\n"         /* x[0] - x[4] | x[0] + x[4] */
    /* replace only lane 0 so that lane 1 keeps (x[2]-x[6]).im */
    "movss    %%xmm3, %%xmm4\n"         /* ? | (x[2]-x[6]).im | -(x[2]-x[6]).re */
    "movlps  8(%%eax), %%xmm1\n"        /* x[1] */
    "shufps   $0x14, %%xmm4, %%xmm5\n"  /* -i*(x[2] - x[6]) | x[2] + x[6] */
    "addps    %%xmm5, %%xmm0\n"         /* yt = x0-x4 - i*(x2-x6) | x0+x4+x2+x6 */
    "subps    %%xmm5, %%xmm7\n"         /* yb = x0-x4 + i*(x2-x6) | x0+x4-x2-x6 */

    /* odd samples: x[1], x[3], x[5], x[7] with the 1/sqrt2 twiddles */
    "movhps 24(%%eax), %%xmm1\n"        /* x[3] | x[1] */
    "movl     $hsqrt2, %%ebx\n"
    "movlps 40(%%eax), %%xmm2\n"        /* x[5] */
    "movhps 56(%%eax), %%xmm2\n"        /* x[7] | x[5] */
    "movups   %%xmm1, %%xmm3\n"         /* x[3] | x[1] */
    "addps    %%xmm2, %%xmm1\n"         /* x[3] + x[7] | x[1] + x[5] */
    "subps    %%xmm2, %%xmm3\n"         /* x[3] - x[7] | x[1] - x[5] */
    "movups   (%%ebx), %%xmm4\n"        /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
    "movups   %%xmm3, %%xmm6\n"         /* x[3] - x[7] | x[1] - x[5] */
    "mulps    %%xmm4, %%xmm3\n"         /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
    "shufps   $0xc8, %%xmm4, %%xmm4\n"  /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
    "shufps   $0xb1, %%xmm6, %%xmm6\n"  /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
    "mulps    %%xmm4, %%xmm6\n"         /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
    "addps    %%xmm3, %%xmm6\n"         /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1]-x[5]) */
    "movhlps  %%xmm1, %%xmm5\n"         /* ? | x[3] + x[7] */
    "movlhps  %%xmm6, %%xmm1\n"         /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
    "shufps   $0xe4, %%xmm6, %%xmm5\n"  /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
    "movups   %%xmm1, %%xmm3\n"         /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
    "movl     $C_1, %%ebx\n"
    "addps    %%xmm5, %%xmm1\n"         /* u */
    "subps    %%xmm5, %%xmm3\n"         /* v */

    /* final butterflies */
    "movups   %%xmm0, %%xmm2\n"         /* yt */
    "movups   %%xmm7, %%xmm4\n"         /* yb */
    "movups   (%%ebx), %%xmm5\n"        /* 1.0 | -1.0 | 1.0 | -1.0 */
    "mulps    %%xmm5, %%xmm3\n"         /* negate the real parts of v */
    "addps    %%xmm1, %%xmm0\n"         /* yt + u */
    "subps    %%xmm1, %%xmm2\n"         /* yt - u */
    "shufps   $0xb1, %%xmm3, %%xmm3\n"  /* swap re/im: -i * v */
    "movups   %%xmm0, (%%eax)\n"        /* x[1] | x[0] */
    "movups   %%xmm2, 32(%%eax)\n"      /* x[5] | x[4] */
    "addps    %%xmm3, %%xmm4\n"         /* yb - i*v */
    "subps    %%xmm3, %%xmm7\n"         /* yb + i*v */
    "movups   %%xmm4, 16(%%eax)\n"      /* x[3] | x[2] */
    "movups   %%xmm7, 48(%%eax)\n"      /* x[7] | x[6] */

    "popl     %%ebx\n"
    : /* no outputs: %eax is left untouched */
    : "a" (x)
    : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7" );
}
/*****************************************************************************
 * fft_asmb_sse: combine sub-transforms with twiddle factors (SSE)
 *****************************************************************************
 * k    pair count; the asm uses 16*k bytes as the distance between the four
 *      quarters of x, and processes 2*k elements per quarter
 * x    output buffer; quarters at x, x + 16k, x + 32k, x + 48k bytes
 * wTB  inputs: wT at wTB, wB at wTB + 16k bytes
 * d    twiddle factors multiplied with wT (d[0] is never read)
 * d_3  twiddle factors multiplied with wB (d_3[0] is never read)
 *
 * For each element j:
 *     u = wT[j]*d[j] + wB[j]*d_3[j]      v = wT[j]*d[j] - wB[j]*d_3[j]
 *     x[j]        = x[j] + u             x[j + 32k bytes] = x[j] - u
 *     x[j + 16k]  = x[j + 16k] - i*v     x[j + 48k bytes] = x[j + 16k] + i*v
 * except j == 0, where wT[0] and wB[0] are used without multiplication.
 *
 * k must be >= 2: the counter is decremented once before the loop, which is
 * then entered unconditionally and only left when the counter reaches zero.
 *
 * NOTE(review): assumes complex_t is a packed { float re; float im; } pair,
 * as the 8-byte steps through d and d_3 imply - confirm.
 * NOTE(review): the asm builds its own frame and fetches the arguments from
 * 8(%ebp)..24(%ebp). That is only correct for the i386 cdecl convention,
 * when the compiler emits no prologue of its own (e.g. -fomit-frame-pointer)
 * and when this function is neither inlined nor has its apparently unused
 * parameters optimised away. Passing the arguments as asm operands would
 * remove these assumptions.
 * NOTE(review): "$C_1" is an absolute address, not position independent.
 *
 * Changes over the previous version:
 *  - numeric local labels ("1:" / "1b") replace the named ".loop"/".end"
 *    symbols, which would be defined twice if this asm were ever emitted
 *    more than once in the translation unit;
 *  - "memory", "cc" and xmm0-xmm7 are declared as clobbers;
 *  - a dead "leal" was dropped: %edi was overwritten before being read.
 * Register comments show the high 64 bits first: "high | low".
 *****************************************************************************/
static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
                          const complex_t *d, const complex_t *d_3)
{
    __asm__ __volatile__ (
    /* private frame: -4(%ebp) holds the loop counter */
    "pushl    %%ebp\n"
    "movl     %%esp, %%ebp\n"
    "subl     $4, %%esp\n"

    "pushl    %%eax\n"
    "pushl    %%ebx\n"
    "pushl    %%ecx\n"
    "pushl    %%edx\n"
    "pushl    %%esi\n"
    "pushl    %%edi\n"

    "movl     8(%%ebp), %%ecx\n"        /* k */
    "movl    12(%%ebp), %%eax\n"        /* x */
    "movl     %%ecx, -4(%%ebp)\n"       /* k */
    "movl    16(%%ebp), %%ebx\n"        /* wT */
    "movl    20(%%ebp), %%edx\n"        /* d */
    "movl    24(%%ebp), %%esi\n"        /* d3 */
    "shll     $4, %%ecx\n"              /* 16k */
    "addl     $8, %%edx\n"              /* &d[1] */
    "addl     $8, %%esi\n"              /* &d3[1] */

    /* TRANSZERO and TRANS */
    "movups   (%%eax), %%xmm0\n"        /* x[1] | x[0] */
    "movups   (%%ebx), %%xmm1\n"        /* wT[1] | wT[0] */
    "movups   (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
    "movlps   (%%edx), %%xmm3\n"        /* d[1] */
    "movlps   (%%esi), %%xmm4\n"        /* d3[1] */
    "movhlps  %%xmm1, %%xmm5\n"         /* wT[1] */
    "movhlps  %%xmm2, %%xmm6\n"         /* wB[1] */
    "shufps   $0x50, %%xmm3, %%xmm3\n"  /* d[1].im | d[1].im | d[1].re | d[1].re */
    "shufps   $0x50, %%xmm4, %%xmm4\n"  /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
    "movlhps  %%xmm5, %%xmm5\n"         /* wT[1] | wT[1] */
    "movlhps  %%xmm6, %%xmm6\n"         /* wB[1] | wB[1] */
    "mulps    %%xmm3, %%xmm5\n"
    "mulps    %%xmm4, %%xmm6\n"
    "movhlps  %%xmm5, %%xmm7\n"         /* wT[1].im * d[1].im | wT[1].re * d[1].im */
    "movlhps  %%xmm6, %%xmm5\n"         /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
    "shufps   $0xb1, %%xmm6, %%xmm7\n"  /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
    "movl     $C_1, %%edi\n"
    "movups   (%%edi), %%xmm4\n"        /* 1.0 | -1.0 | 1.0 | -1.0 */
    "mulps    %%xmm4, %%xmm7\n"
    "addps    %%xmm7, %%xmm5\n"         /* wB[1] * d3[1] | wT[1] * d[1] */
    "movlhps  %%xmm5, %%xmm1\n"         /* d[1] * wT[1] | wT[0] */
    "shufps   $0xe4, %%xmm5, %%xmm2\n"  /* d3[1] * wB[1] | wB[0] */
    "movups   %%xmm1, %%xmm3\n"         /* d[1] * wT[1] | wT[0] */
    "leal     (%%eax, %%ecx, 2), %%edi\n" /* x + 32k bytes */
    "addps    %%xmm2, %%xmm1\n"         /* u */
    "subps    %%xmm2, %%xmm3\n"         /* v */
    "mulps    %%xmm4, %%xmm3\n"         /* negate the real parts of v */
    "movups   (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
    "shufps   $0xb1, %%xmm3, %%xmm3\n"  /* swap re/im: -i * v */
    "movups   %%xmm0, %%xmm2\n"         /* x[1] | x[0] */
    "movups   %%xmm5, %%xmm6\n"         /* xk[1] | xk[0] */
    "addps    %%xmm1, %%xmm0\n"         /* x + u */
    "subps    %%xmm1, %%xmm2\n"         /* x - u */
    "addps    %%xmm3, %%xmm5\n"         /* xk - i*v */
    "subps    %%xmm3, %%xmm6\n"         /* xk + i*v */
    "movups   %%xmm0, (%%eax)\n"
    "movups   %%xmm2, (%%edi)\n"
    "movups   %%xmm5, (%%eax, %%ecx)\n"
    "movups   %%xmm6, (%%edi, %%ecx)\n"
    "addl     $16, %%eax\n"
    "addl     $16, %%ebx\n"
    "addl     $8, %%edx\n"              /* &d[2] */
    "addl     $8, %%esi\n"              /* &d3[2] */
    "decl     -4(%%ebp)\n"

    /* main loop: two elements per iteration, indices relative to the
     * current pointers */
    "1:\n"
    "movups   (%%ebx), %%xmm0\n"        /* wT[1] | wT[0] */
    "movups   (%%edx), %%xmm1\n"        /* d[1] | d[0] */
    "movups   (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
    "movups   (%%esi), %%xmm5\n"        /* d3[1] | d3[0] */
    "movhlps  %%xmm0, %%xmm2\n"         /* wT[1] */
    "movhlps  %%xmm1, %%xmm3\n"         /* d[1] */
    "movhlps  %%xmm4, %%xmm6\n"         /* wB[1] */
    "movhlps  %%xmm5, %%xmm7\n"         /* d3[1] */
    "shufps   $0x50, %%xmm1, %%xmm1\n"  /* d[0].im | d[0].im | d[0].re | d[0].re */
    "shufps   $0x50, %%xmm3, %%xmm3\n"  /* d[1].im | d[1].im | d[1].re | d[1].re */
    "movlhps  %%xmm0, %%xmm0\n"         /* wT[0] | wT[0] */
    "shufps   $0x50, %%xmm5, %%xmm5\n"  /* d3[0].im | d3[0].im | d3[0].re | d3[0].re */
    "movlhps  %%xmm2, %%xmm2\n"         /* wT[1] | wT[1] */
    "shufps   $0x50, %%xmm7, %%xmm7\n"  /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
    "mulps    %%xmm1, %%xmm0\n"         /* d[0].im * wT[0].im | d[0].im * wT[0].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
    "mulps    %%xmm3, %%xmm2\n"         /* d[1].im * wT[1].im | d[1].im * wT[1].re | d[1].re * wT[1].im | d[1].re * wT[1].re */
    "movlhps  %%xmm4, %%xmm4\n"         /* wB[0] | wB[0] */
    "movlhps  %%xmm6, %%xmm6\n"         /* wB[1] | wB[1] */
    "movhlps  %%xmm0, %%xmm1\n"         /* d[0].im * wT[0].im | d[0].im * wT[0].re */
    "movlhps  %%xmm2, %%xmm0\n"         /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
    "mulps    %%xmm5, %%xmm4\n"         /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
    "mulps    %%xmm7, %%xmm6\n"         /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
    "shufps   $0xb1, %%xmm2, %%xmm1\n"  /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
    "movl     $C_1, %%edi\n"
    "movups   (%%edi), %%xmm3\n"        /* 1.0 | -1.0 | 1.0 | -1.0 */
    "movhlps  %%xmm4, %%xmm5\n"         /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
    "mulps    %%xmm3, %%xmm1\n"         /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
    "movlhps  %%xmm6, %%xmm4\n"         /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
    "addps    %%xmm1, %%xmm0\n"         /* wT[1] * d[1] | wT[0] * d[0] */
    "shufps   $0xb1, %%xmm6, %%xmm5\n"  /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
    "mulps    %%xmm3, %%xmm5\n"         /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
    "addps    %%xmm5, %%xmm4\n"         /* wB[1] * d3[1] | wB[0] * d3[0] */
    "movups   %%xmm0, %%xmm1\n"         /* wT[1] * d[1] | wT[0] * d[0] */
    "addps    %%xmm4, %%xmm0\n"         /* u */
    "subps    %%xmm4, %%xmm1\n"         /* v */
    "movups   (%%eax), %%xmm6\n"        /* x[1] | x[0] */
    "leal     (%%eax, %%ecx, 2), %%edi\n" /* x + 32k bytes */
    "mulps    %%xmm3, %%xmm1\n"         /* negate the real parts of v */
    "addl     $16, %%ebx\n"
    "addl     $16, %%esi\n"
    "shufps   $0xb1, %%xmm1, %%xmm1\n"  /* swap re/im: -i * v */
    "movups   (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
    "movups   %%xmm6, %%xmm2\n"
    "movups   %%xmm7, %%xmm4\n"
    "addps    %%xmm0, %%xmm6\n"         /* x + u */
    "subps    %%xmm0, %%xmm2\n"         /* x - u */
    "movups   %%xmm6, (%%eax)\n"
    "movups   %%xmm2, (%%edi)\n"
    "addps    %%xmm1, %%xmm7\n"         /* xk - i*v */
    "subps    %%xmm1, %%xmm4\n"         /* xk + i*v */
    "addl     $16, %%edx\n"
    "movups   %%xmm7, (%%eax, %%ecx)\n"
    "movups   %%xmm4, (%%edi, %%ecx)\n"

    "addl     $16, %%eax\n"
    "decl     -4(%%ebp)\n"
    "jnz      1b\n"

    "popl     %%edi\n"
    "popl     %%esi\n"
    "popl     %%edx\n"
    "popl     %%ecx\n"
    "popl     %%ebx\n"
    "popl     %%eax\n"

    "addl     $4, %%esp\n"
    "leave\n"
    : /* no outputs */
    : /* no inputs: the arguments are read from the stack, see above */
    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7" );
}
......@@ -4,7 +4,7 @@
* and spawn threads.
*****************************************************************************
* Copyright (C) 1998, 1999, 2000 VideoLAN
* $Id: main.c,v 1.93 2001/05/07 03:14:09 stef Exp $
* $Id: main.c,v 1.94 2001/05/14 15:58:04 reno Exp $
*
* Authors: Vincent Seguin <seguin@via.ecp.fr>
* Samuel Hocevar <sam@zoy.org>
......@@ -974,6 +974,7 @@ static int CPUCapabilities( void )
if( i_edx & 0x02000000 )
{
i_capabilities |= CPU_CAPABILITY_MMXEXT;
i_capabilities |= CPU_CAPABILITY_SSE;
}
/* test for additional capabilities */
......@@ -996,7 +997,6 @@ static int CPUCapabilities( void )
{
i_capabilities |= CPU_CAPABILITY_MMXEXT;
}
#else
/* default behaviour */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment