Commit 7501b475 authored by lorenm's avatar lorenm

split-radix FFT

The C implementation is 1.9x faster than the previous C code (on various x86 CPUs); the SSE implementation is 1.6x faster than the previous SSE code.


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@14698 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent 8b78f51f
...@@ -388,6 +388,8 @@ OBJS += i386/fdct_mmx.o \ ...@@ -388,6 +388,8 @@ OBJS += i386/fdct_mmx.o \
i386/simple_idct_mmx.o \ i386/simple_idct_mmx.o \
i386/idct_mmx_xvid.o \ i386/idct_mmx_xvid.o \
i386/idct_sse2_xvid.o \ i386/idct_sse2_xvid.o \
OBJS-$(HAVE_YASM) += i386/fft_mmx.o \
i386/fft_sse.o \ i386/fft_sse.o \
i386/fft_3dn.o \ i386/fft_3dn.o \
i386/fft_3dn2.o \ i386/fft_3dn2.o \
......
...@@ -639,6 +639,8 @@ typedef struct FFTContext { ...@@ -639,6 +639,8 @@ typedef struct FFTContext {
uint16_t *revtab; uint16_t *revtab;
FFTComplex *exptab; FFTComplex *exptab;
FFTComplex *exptab1; /* only used by SSE code */ FFTComplex *exptab1; /* only used by SSE code */
FFTComplex *tmp_buf;
void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
void (*fft_calc)(struct FFTContext *s, FFTComplex *z); void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp); const FFTSample *input, FFTSample *tmp);
...@@ -647,13 +649,18 @@ typedef struct FFTContext { ...@@ -647,13 +649,18 @@ typedef struct FFTContext {
} FFTContext; } FFTContext;
int ff_fft_init(FFTContext *s, int nbits, int inverse); int ff_fft_init(FFTContext *s, int nbits, int inverse);
void ff_fft_permute(FFTContext *s, FFTComplex *z); void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_c(FFTContext *s, FFTComplex *z); void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
/**
 * Invoke the fft_permute callback stored in the context.
 *
 * @param s context whose fft_permute function pointer is called; it is also
 *          passed through as the callback's first argument
 * @param z buffer handed unchanged to the callback
 */
static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
{
    (*s->fft_permute)(s, z);
}
static inline void ff_fft_calc(FFTContext *s, FFTComplex *z) static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
{ {
s->fft_calc(s, z); s->fft_calc(s, z);
......
This diff is collapsed.
/* /*
* FFT/MDCT transform with 3DNow! optimizations * FFT/MDCT transform with 3DNow! optimizations
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt * Copyright (c) 2008 Loren Merritt
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.
* *
...@@ -20,109 +19,5 @@ ...@@ -20,109 +19,5 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
#include "libavutil/x86_cpu.h" #define EMULATE_3DNOWEXT
#include "libavcodec/dsputil.h" #include "fft_3dn2.c"
/*
 * Sign-bit masks for a pair of 32-bit lanes: the marked lane has only bit 31
 * set, the other lane is zero.  p1m1 marks the second lane, m1p1 the first.
 *
 * The mask value is spelled -0x7fffffff - 1 instead of 1 << 31: shifting a 1
 * into the sign bit of a signed int is undefined behaviour in C.  With a
 * 32-bit two's-complement int the stored bit pattern (0x80000000) is the same.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, -0x7fffffff - 1 };

static const int m1p1[2] __attribute__((aligned(8))) =
    { -0x7fffffff - 1, 0 };
/*
 * FFT computed in place on z with 3DNow! inline assembly.
 *
 * Reads from the context:
 *   s->nbits   - the array is treated as (1 << nbits) 8-byte entries
 *   s->inverse - selects which sign mask (m1p1 or p1m1) is loaded into mm7
 *   s->exptab1 - coefficient table walked by the later passes
 *
 * All loads and stores go to the same addresses inside z, so the input is
 * overwritten with the result.
 * NOTE(review): presumably z must already be in permuted order when this is
 * called - confirm against the callers.
 */
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
{
int ln = s->nbits;
long j;
x86_reg i;
long nblocks, nloops;
FFTComplex *p, *cptr;
/* mm7 = sign mask consumed by the pxor in the first loop below */
asm volatile(
/* FEMMS is not a must here but recommended by AMD */
"femms \n\t"
"movq %0, %%mm7 \n\t"
::"m"(*(s->inverse ? m1p1 : p1m1))
);
/* byte size of the whole array: (1 << ln) entries of 8 bytes each */
i = 8 << ln;
/*
 * First loop (presumably passes 0 and 1 fused, given the "pass 2 .. ln-1"
 * comment further down): every iteration handles the four 8-byte entries
 * at z+i, walking from the end of the array down to offset 0.
 */
asm volatile(
"1: \n\t"
"sub $32, %0 \n\t"
/* mm0 = e0, mm2 = e1, mm1 = e2, mm3 = e3 (entries at offsets 0, 8, 16, 24) */
"movq (%0,%1), %%mm0 \n\t"
"movq 16(%0,%1), %%mm1 \n\t"
"movq 8(%0,%1), %%mm2 \n\t"
"movq 24(%0,%1), %%mm3 \n\t"
"movq %%mm0, %%mm4 \n\t"
"movq %%mm1, %%mm5 \n\t"
/* mm0 = e0+e1, mm1 = e2+e3, mm4 = e0-e1, mm5 = e2-e3 */
"pfadd %%mm2, %%mm0 \n\t"
"pfadd %%mm3, %%mm1 \n\t"
"pfsub %%mm2, %%mm4 \n\t"
"pfsub %%mm3, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
/* swap the two dwords of mm5, using mm6 as scratch */
"punpckldq %%mm5, %%mm6 \n\t"
"punpckhdq %%mm6, %%mm5 \n\t"
"movq %%mm4, %%mm3 \n\t"
/* flip the sign of the dword selected by the mask in mm7 */
"pxor %%mm7, %%mm5 \n\t"
/* mm0 = (e0+e1)+(e2+e3), mm4 = (e0-e1)+mm5,
   mm2 = (e0+e1)-(e2+e3), mm3 = (e0-e1)-mm5 */
"pfadd %%mm1, %%mm0 \n\t"
"pfadd %%mm5, %%mm4 \n\t"
"pfsub %%mm1, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
"movq %%mm0, (%0,%1) \n\t"
"movq %%mm4, 8(%0,%1) \n\t"
"movq %%mm2, 16(%0,%1) \n\t"
"movq %%mm3, 24(%0,%1) \n\t"
/* loop while i > 0: uses the flags of the sub at the top of the loop
   (assumes the MMX/3DNow! instructions in between leave EFLAGS alone) */
"jg 1b \n\t"
:"+r"(i)
:"r"(z)
);
/* pass 2 .. ln-1 */
/* NOTE(review): for ln < 3 the shift count below is negative; presumably
   callers never request such small sizes - confirm */
nblocks = 1 << (ln-3);
nloops = 1 << 2;
cptr = s->exptab1;
/* one outer iteration per pass: nblocks halves and nloops doubles each time */
do {
p = z;
j = nblocks;
/* one inner iteration per block of 2*nloops entries */
do {
/* byte size of one half-block (nloops entries of 8 bytes) */
i = nloops*8;
/*
 * %1 = first half of the block (p), %2 = second half (p + nloops),
 * %3 = coefficient table.  Two entries from each half are handled per
 * iteration, going from the end of the half-block down to offset 0.
 * The table is indexed with twice the byte offset, i.e. four qwords
 * of coefficients per iteration.
 */
asm volatile(
"1: \n\t"
"sub $16, %0 \n\t"
"movq (%1,%0), %%mm0 \n\t"
"movq 8(%1,%0), %%mm1 \n\t"
"movq (%2,%0), %%mm2 \n\t"
"movq 8(%2,%0), %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* mm2/mm3 = low dword duplicated, mm4/mm5 = high dword duplicated */
"punpckldq %%mm2, %%mm2 \n\t"
"punpckldq %%mm3, %%mm3 \n\t"
"punpckhdq %%mm4, %%mm4 \n\t"
"punpckhdq %%mm5, %%mm5 \n\t"
"pfmul (%3,%0,2), %%mm2 \n\t" // cre*re cim*re
"pfmul 8(%3,%0,2), %%mm3 \n\t"
"pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
"pfmul 24(%3,%0,2), %%mm5 \n\t"
"pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
"pfadd %%mm3, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
/* butterfly: first half gets sum, second half gets difference */
"pfadd %%mm4, %%mm0 \n\t"
"pfadd %%mm5, %%mm1 \n\t"
"pfsub %%mm4, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
"movq %%mm0, (%1,%0) \n\t"
"movq %%mm1, 8(%1,%0) \n\t"
"movq %%mm2, (%2,%0) \n\t"
"movq %%mm3, 8(%2,%0) \n\t"
/* loop while i > 0, using the flags of the sub at the top of the loop */
"jg 1b \n\t"
:"+r"(i)
:"r"(p), "r"(p + nloops), "r"(cptr)
);
/* both halves of this block are done; move to the next block */
p += nloops*2;
} while (--j);
/* step past the coefficients read during this pass */
cptr += nloops*2;
nblocks >>= 1;
nloops <<= 1;
} while (nblocks != 0);
/* second femms, matching the one issued at entry */
asm volatile("femms");
}
...@@ -23,105 +23,26 @@ ...@@ -23,105 +23,26 @@
#include "libavutil/x86_cpu.h" #include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h" #include "libavcodec/dsputil.h"
static const int p1m1[2] __attribute__((aligned(8))) = #ifdef EMULATE_3DNOWEXT
{ 0, 1 << 31 }; #define ff_fft_calc_3dn2 ff_fft_calc_3dn
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
#endif
static const int m1p1[2] __attribute__((aligned(8))) = void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
{ 1 << 31, 0 }; void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
{ {
int ln = s->nbits; int n = 1<<s->nbits;
long j; int i;
x86_reg i; ff_fft_dispatch_interleave_3dn2(z, s->nbits);
long nblocks, nloops;
FFTComplex *p, *cptr;
asm volatile(
/* FEMMS is not a must here but recommended by AMD */
"femms \n\t"
"movq %0, %%mm7 \n\t"
::"m"(*(s->inverse ? m1p1 : p1m1))
);
i = 8 << ln;
asm volatile(
"1: \n\t"
"sub $32, %0 \n\t"
"movq (%0,%1), %%mm0 \n\t"
"movq 16(%0,%1), %%mm1 \n\t"
"movq 8(%0,%1), %%mm2 \n\t"
"movq 24(%0,%1), %%mm3 \n\t"
"movq %%mm0, %%mm4 \n\t"
"movq %%mm1, %%mm5 \n\t"
"pfadd %%mm2, %%mm0 \n\t"
"pfadd %%mm3, %%mm1 \n\t"
"pfsub %%mm2, %%mm4 \n\t"
"pfsub %%mm3, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"pswapd %%mm5, %%mm5 \n\t"
"movq %%mm4, %%mm3 \n\t"
"pxor %%mm7, %%mm5 \n\t"
"pfadd %%mm1, %%mm0 \n\t"
"pfadd %%mm5, %%mm4 \n\t"
"pfsub %%mm1, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
"movq %%mm0, (%0,%1) \n\t"
"movq %%mm4, 8(%0,%1) \n\t"
"movq %%mm2, 16(%0,%1) \n\t"
"movq %%mm3, 24(%0,%1) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(z)
);
/* pass 2 .. ln-1 */
nblocks = 1 << (ln-3);
nloops = 1 << 2;
cptr = s->exptab1;
do {
p = z;
j = nblocks;
do {
i = nloops*8;
asm volatile(
"1: \n\t"
"sub $16, %0 \n\t"
"movq (%1,%0), %%mm0 \n\t"
"movq 8(%1,%0), %%mm1 \n\t"
"movq (%2,%0), %%mm2 \n\t"
"movq 8(%2,%0), %%mm3 \n\t"
"movq (%3,%0,2), %%mm4 \n\t"
"movq 8(%3,%0,2), %%mm5 \n\t"
"pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
"pswapd %%mm5, %%mm7 \n\t"
"pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im
"pfmul %%mm3, %%mm5 \n\t"
"pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im
"pfmul %%mm3, %%mm7 \n\t"
"pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
"pfpnacc %%mm7, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
"pfadd %%mm4, %%mm0 \n\t"
"pfadd %%mm5, %%mm1 \n\t"
"pfsub %%mm4, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
"movq %%mm0, (%1,%0) \n\t"
"movq %%mm1, 8(%1,%0) \n\t"
"movq %%mm2, (%2,%0) \n\t"
"movq %%mm3, 8(%2,%0) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(p), "r"(p + nloops), "r"(cptr)
);
p += nloops*2;
} while (--j);
cptr += nloops*2;
nblocks >>= 1;
nloops <<= 1;
} while (nblocks != 0);
asm volatile("femms"); asm volatile("femms");
if(n <= 8)
for(i=0; i<n; i+=2)
FFSWAP(FFTSample, z[i].im, z[i+1].re);
} }
static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp) static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
...@@ -162,7 +83,7 @@ static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp) ...@@ -162,7 +83,7 @@ static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
); );
} }
ff_fft_calc(&s->fft, z); ff_fft_calc_3dn2(&s->fft, z);
/* post rotation + reordering */ /* post rotation + reordering */
for(k = 0; k < n4; k++) { for(k = 0; k < n4; k++) {
......
This diff is collapsed.
...@@ -22,124 +22,55 @@ ...@@ -22,124 +22,55 @@
#include "libavutil/x86_cpu.h" #include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h" #include "libavcodec/dsputil.h"
/*
 * Sign-bit masks for four 32-bit lanes: each lane marked "m1" in the name has
 * only bit 31 set, each lane marked "p1" is zero.
 *
 * The mask value is spelled -0x7fffffff - 1 instead of 1 << 31: shifting a 1
 * into the sign bit of a signed int is undefined behaviour in C.  With a
 * 32-bit two's-complement int the stored bit pattern (0x80000000) is the same.
 */
static const int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, -0x7fffffff - 1 };

static const int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, -0x7fffffff - 1, 0 };

static const int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, -0x7fffffff - 1, -0x7fffffff - 1 };
static const int p1m1p1m1[4] __attribute__((aligned(16))) = static const int p1m1p1m1[4] __attribute__((aligned(16))) =
{ 0, 1 << 31, 0, 1 << 31 }; { 0, 1 << 31, 0, 1 << 31 };
static const int m1m1m1m1[4] __attribute__((aligned(16))) = static const int m1m1m1m1[4] __attribute__((aligned(16))) =
{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
#if 0 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
static void print_v4sf(const char *str, __m128 a) void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
{
float *p = (float *)&a;
printf("%s: %f %f %f %f\n",
str, p[0], p[1], p[2], p[3]);
}
#endif
/* XXX: handle reverse case */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{ {
int ln = s->nbits; int n = 1 << s->nbits;
x86_reg i;
long j;
long nblocks, nloops;
FFTComplex *p, *cptr;
asm volatile( ff_fft_dispatch_interleave_sse(z, s->nbits);
"movaps %0, %%xmm4 \n\t"
"movaps %1, %%xmm5 \n\t"
::"m"(*p1p1m1m1),
"m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
);
i = 8 << ln; if(n <= 16) {
asm volatile( x86_reg i = -8*n;
"1: \n\t" asm volatile(
"sub $32, %0 \n\t" "1: \n"
/* do the pass 0 butterfly */ "movaps (%0,%1), %%xmm0 \n"
"movaps (%0,%1), %%xmm0 \n\t" "movaps %%xmm0, %%xmm1 \n"
"movaps %%xmm0, %%xmm1 \n\t" "unpcklps 16(%0,%1), %%xmm0 \n"
"shufps $0x4E, %%xmm0, %%xmm0 \n\t" "unpckhps 16(%0,%1), %%xmm1 \n"
"xorps %%xmm4, %%xmm1 \n\t" "movaps %%xmm0, (%0,%1) \n"
"addps %%xmm1, %%xmm0 \n\t" "movaps %%xmm1, 16(%0,%1) \n"
"movaps 16(%0,%1), %%xmm2 \n\t" "add $32, %0 \n"
"movaps %%xmm2, %%xmm3 \n\t" "jl 1b \n"
"shufps $0x4E, %%xmm2, %%xmm2 \n\t" :"+r"(i)
"xorps %%xmm4, %%xmm3 \n\t" :"r"(z+n)
"addps %%xmm3, %%xmm2 \n\t" :"memory"
/* multiply third by -i */ );
/* by toggling the sign bit */ }
"shufps $0xB4, %%xmm2, %%xmm2 \n\t" }
"xorps %%xmm5, %%xmm2 \n\t"
/* do the pass 1 butterfly */
"movaps %%xmm0, %%xmm1 \n\t"
"addps %%xmm2, %%xmm0 \n\t"
"subps %%xmm2, %%xmm1 \n\t"
"movaps %%xmm0, (%0,%1) \n\t"
"movaps %%xmm1, 16(%0,%1) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(z)
);
/* pass 2 .. ln-1 */
nblocks = 1 << (ln-3); void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
nloops = 1 << 2; {
cptr = s->exptab1; int n = 1 << s->nbits;
do { int i;
p = z; for(i=0; i<n; i+=2) {
j = nblocks; asm volatile(
do { "movaps %2, %%xmm0 \n"
i = nloops*8; "movlps %%xmm0, %0 \n"
asm volatile( "movhps %%xmm0, %1 \n"
"1: \n\t" :"=m"(s->tmp_buf[s->revtab[i]]),
"sub $32, %0 \n\t" "=m"(s->tmp_buf[s->revtab[i+1]])
"movaps (%2,%0), %%xmm1 \n\t" :"m"(z[i])
"movaps (%1,%0), %%xmm0 \n\t" );
"movaps 16(%2,%0), %%xmm5 \n\t" }
"movaps 16(%1,%0), %%xmm4 \n\t" memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
"movaps %%xmm1, %%xmm2 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"shufps $0xA0, %%xmm1, %%xmm1 \n\t"
"shufps $0xF5, %%xmm2, %%xmm2 \n\t"
"shufps $0xA0, %%xmm5, %%xmm5 \n\t"
"shufps $0xF5, %%xmm6, %%xmm6 \n\t"
"mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re
"mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
"mulps 32(%3,%0,2), %%xmm5 \n\t" // cre*re cim*re
"mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
"addps %%xmm2, %%xmm1 \n\t"
"addps %%xmm6, %%xmm5 \n\t"
"movaps %%xmm0, %%xmm3 \n\t"
"movaps %%xmm4, %%xmm7 \n\t"
"addps %%xmm1, %%xmm0 \n\t"
"subps %%xmm1, %%xmm3 \n\t"
"addps %%xmm5, %%xmm4 \n\t"
"subps %%xmm5, %%xmm7 \n\t"
"movaps %%xmm0, (%1,%0) \n\t"
"movaps %%xmm3, (%2,%0) \n\t"
"movaps %%xmm4, 16(%1,%0) \n\t"
"movaps %%xmm7, 16(%2,%0) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(p), "r"(p + nloops), "r"(cptr)
);
p += nloops*2;
} while (--j);
cptr += nloops*2;
nblocks >>= 1;
nloops <<= 1;
} while (nblocks != 0);
} }
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp) static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment