Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc
Commits
dee3179d
Commit
dee3179d
authored
Jul 08, 2001
by
Renaud Dartus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
* Alignment in asm functions
* 16-byte alignment for data (needed for SSE) * Optimization in SSE
parent
5b49dba8
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
462 additions
and
400 deletions
+462
-400
include/ac3_imdct.h
include/ac3_imdct.h
+5
-4
plugins/downmix/ac3_downmix_3dn.c
plugins/downmix/ac3_downmix_3dn.c
+15
-2
plugins/downmix/ac3_downmix_sse.c
plugins/downmix/ac3_downmix_sse.c
+162
-152
plugins/imdct/ac3_imdct_3dn.c
plugins/imdct/ac3_imdct_3dn.c
+15
-1
plugins/imdct/ac3_imdct_sse.c
plugins/imdct/ac3_imdct_sse.c
+134
-130
plugins/imdct/ac3_srfft_3dn.c
plugins/imdct/ac3_srfft_3dn.c
+2
-1
plugins/imdct/ac3_srfft_sse.c
plugins/imdct/ac3_srfft_sse.c
+105
-98
src/ac3_decoder/ac3_decoder.h
src/ac3_decoder/ac3_decoder.h
+4
-3
src/ac3_decoder/ac3_decoder_thread.c
src/ac3_decoder/ac3_decoder_thread.c
+9
-2
src/ac3_decoder/ac3_decoder_thread.h
src/ac3_decoder/ac3_decoder_thread.h
+11
-7
No files found.
include/ac3_imdct.h
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_imdct.h : AC3 IMDCT types
* ac3_imdct.h : AC3 IMDCT types
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.h,v 1.
4 2001/06/12 00:30:4
1 reno Exp $
* $Id: ac3_imdct.h,v 1.
5 2001/07/08 23:15:1
1 reno Exp $
*
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
* Renaud Dartus <reno@videolan.org>
...
@@ -42,18 +42,19 @@ typedef struct imdct_s
...
@@ -42,18 +42,19 @@ typedef struct imdct_s
float
xsin1
[
N
/
4
]
__attribute__
((
aligned
(
16
)));
float
xsin1
[
N
/
4
]
__attribute__
((
aligned
(
16
)));
float
xcos2
[
N
/
8
]
__attribute__
((
aligned
(
16
)));
float
xcos2
[
N
/
8
]
__attribute__
((
aligned
(
16
)));
float
xsin2
[
N
/
8
]
__attribute__
((
aligned
(
16
)));
float
xsin2
[
N
/
8
]
__attribute__
((
aligned
(
16
)));
float
xcos_sin_sse
[
128
*
4
]
__attribute__
((
aligned
(
16
)));
/* Twiddle factor LUT */
/* Twiddle factor LUT */
complex_t
*
w
[
7
]
__attribute__
((
aligned
(
16
)));
complex_t
w_1
[
1
]
__attribute__
((
aligned
(
16
)));
complex_t
w_1
[
1
]
__attribute__
((
aligned
(
16
)));
float
used_for_alignement1
;
float
used_for_alignement2
;
complex_t
w_2
[
2
]
__attribute__
((
aligned
(
16
)));
complex_t
w_2
[
2
]
__attribute__
((
aligned
(
16
)));
complex_t
w_4
[
4
]
__attribute__
((
aligned
(
16
)));
complex_t
w_4
[
4
]
__attribute__
((
aligned
(
16
)));
complex_t
w_8
[
8
]
__attribute__
((
aligned
(
16
)));
complex_t
w_8
[
8
]
__attribute__
((
aligned
(
16
)));
complex_t
w_16
[
16
]
__attribute__
((
aligned
(
16
)));
complex_t
w_16
[
16
]
__attribute__
((
aligned
(
16
)));
complex_t
w_32
[
32
]
__attribute__
((
aligned
(
16
)));
complex_t
w_32
[
32
]
__attribute__
((
aligned
(
16
)));
complex_t
w_64
[
64
]
__attribute__
((
aligned
(
16
)));
complex_t
w_64
[
64
]
__attribute__
((
aligned
(
16
)));
complex_t
*
w
[
7
]
__attribute__
((
aligned
(
16
)));
float
xcos_sin_sse
[
128
*
4
]
__attribute__
((
aligned
(
16
)));
/* Module used and shortcuts */
/* Module used and shortcuts */
struct
module_s
*
p_module
;
struct
module_s
*
p_module
;
...
...
plugins/downmix/ac3_downmix_3dn.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_3dn.c,v 1.
3 2001/07/01 08:49:09 gbazin
Exp $
* $Id: ac3_downmix_3dn.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
*
* Authors: Renaud Dartus <reno@videolan.org>
* Authors: Renaud Dartus <reno@videolan.org>
*
*
...
@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
...
@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
void
_M
(
downmix_3f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_3f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
"movl $128, %%ebx
\n
"
/* loop counter */
...
@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
...
@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop:
\n
"
".loop:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
...
@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
...
@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void
_M
(
downmix_2f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_2f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
"movl $128, %%ebx
\n
"
/* loop counter */
...
@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop3:
\n
"
".loop3:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 1024(%%eax), %%mm1
\n
"
/* right */
"movq 1024(%%eax), %%mm1
\n
"
/* right */
...
@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_3f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_3f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
"movl $128, %%ebx
\n
"
/* loop counter */
...
@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop4:
\n
"
".loop4:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
...
@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_2f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_2f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
"movl $128, %%ebx
\n
"
/* loop counter */
...
@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop5:
\n
"
".loop5:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 1024(%%eax), %%mm1
\n
"
/* right */
"movq 1024(%%eax), %%mm1
\n
"
/* right */
...
@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_3f_0r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_3f_0r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
"movl $128, %%ebx
\n
"
/* loop counter */
...
@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 4(%%ecx), %%mm6
\n
"
/* clev */
"movd 4(%%ecx), %%mm6
\n
"
/* clev */
"punpckldq %%mm6, %%mm6
\n
"
/* clev | clev */
"punpckldq %%mm6, %%mm6
\n
"
/* clev | clev */
".align 16
\n
"
".loop6:
\n
"
".loop6:
\n
"
"movq (%%eax), %%mm0
\n
"
/*left */
"movq (%%eax), %%mm0
\n
"
/*left */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
...
@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
stream_sample_1ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
)
void
_M
(
stream_sample_1ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"pushl %%edx
\n
"
"pushl %%edx
\n
"
...
@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
...
@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
"punpckldq %%mm7, %%mm7
\n
"
/* sqrt2 | sqrt2 */
"punpckldq %%mm7, %%mm7
\n
"
/* sqrt2 | sqrt2 */
"movl $128, %%ebx
\n
"
"movl $128, %%ebx
\n
"
".align 16
\n
"
".loop2:
\n
"
".loop2:
\n
"
"movq (%%ecx), %%mm0
\n
"
/* c1 | c0 */
"movq (%%ecx), %%mm0
\n
"
/* c1 | c0 */
"pfmul %%mm7, %%mm0
\n
"
"pfmul %%mm7, %%mm0
\n
"
...
@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
...
@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
"movl $128, %%ebx
\n
"
".align 16
\n
"
".loop1:
\n
"
".loop1:
\n
"
"movq (%%ecx), %%mm0
\n
"
/* l1 | l0 */
"movq (%%ecx), %%mm0
\n
"
/* l1 | l0 */
"movq (%%edx), %%mm1
\n
"
/* r1 | r0 */
"movq (%%edx), %%mm1
\n
"
/* r1 | r0 */
...
...
plugins/downmix/ac3_downmix_sse.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
* ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_sse.c,v 1.
3 2001/07/01 08:49:09 gbazin
Exp $
* $Id: ac3_downmix_sse.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
*
* Authors: Renaud Dartus <reno@videolan.org>
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
@@ -41,12 +41,14 @@
...
@@ -41,12 +41,14 @@
void
sqrt2_sse
(
void
)
__asm__
(
"sqrt2_sse"
);
void
sqrt2_sse
(
void
)
__asm__
(
"sqrt2_sse"
);
void
sqrt2_sse
(
void
)
void
sqrt2_sse
(
void
)
{
{
__asm__
(
".float 0f0.7071068"
);
__asm__
(
".align 16
\n
"
".float 0f0.7071068"
);
}
}
void
_M
(
downmix_3f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_3f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
...
@@ -59,12 +61,13 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
...
@@ -59,12 +61,13 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop:
\n
"
".loop:
\n
"
"mov
ups (%%eax),
%%xmm0
\n
"
/* left */
"mov
aps (%%eax),
%%xmm0
\n
"
/* left */
"mov
ups
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
aps
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
aps 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
ups 3072(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
aps 3072(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
ups 4096(%%eax), %%xmm4
\n
"
/* rightsur */
"mov
aps 4096(%%eax), %%xmm4
\n
"
/* rightsur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"mulps %%xmm6, %%xmm2
\n
"
...
@@ -75,8 +78,8 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
...
@@ -75,8 +78,8 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm4, %%xmm1
\n
"
"addps %%xmm4, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"decl %%ebx
\n
"
...
@@ -90,6 +93,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
...
@@ -90,6 +93,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void
_M
(
downmix_2f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_2f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
...
@@ -99,11 +103,12 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -99,11 +103,12 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop3:
\n
"
".loop3:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* left */
"mov
aps (%%eax), %%xmm0
\n
"
/* left */
"mov
ups
1024(%%eax), %%xmm1
\n
"
/* right */
"mov
aps
1024(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 2048(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
aps 2048(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
ups 3072(%%eax), %%xmm4
\n
"
/* rightsur */
"mov
aps 3072(%%eax), %%xmm4
\n
"
/* rightsur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"mulps %%xmm7, %%xmm3
\n
"
...
@@ -111,8 +116,8 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -111,8 +116,8 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm4, %%xmm1
\n
"
"addps %%xmm4, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"decl %%ebx
\n
"
...
@@ -126,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -126,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_3f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_3f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
...
@@ -139,11 +144,12 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -139,11 +144,12 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop4:
\n
"
".loop4:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* left */
"mov
aps (%%eax), %%xmm0
\n
"
/* left */
"mov
ups
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
aps
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
aps 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
ups 3072(%%eax), %%xmm3
\n
"
/* sur */
"mov
aps 3072(%%eax), %%xmm3
\n
"
/* sur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"mulps %%xmm6, %%xmm2
\n
"
...
@@ -153,8 +159,8 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -153,8 +159,8 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"subps %%xmm3, %%xmm0
\n
"
"subps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm1
\n
"
"addps %%xmm3, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"decl %%ebx
\n
"
...
@@ -163,12 +169,12 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -163,12 +169,12 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"popl %%ebx
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
:
"a"
(
samples
),
"c"
(
dm_par
));
}
}
void
_M
(
downmix_2f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_2f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
...
@@ -178,18 +184,19 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -178,18 +184,19 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop5:
\n
"
".loop5:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* left */
"mov
aps (%%eax), %%xmm0
\n
"
/* left */
"mov
ups
1024(%%eax), %%xmm1
\n
"
/* right */
"mov
aps
1024(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 2048(%%eax), %%xmm3
\n
"
/* sur */
"mov
aps 2048(%%eax), %%xmm3
\n
"
/* sur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"subps %%xmm3, %%xmm0
\n
"
"subps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm1
\n
"
"addps %%xmm3, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"decl %%ebx
\n
"
...
@@ -198,13 +205,12 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -198,13 +205,12 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"popl %%ebx
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
:
"a"
(
samples
),
"c"
(
dm_par
));
}
}
void
_M
(
downmix_3f_0r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
void
_M
(
downmix_3f_0r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
...
@@ -214,18 +220,19 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -214,18 +220,19 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps $0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
"shufps $0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
".align 16
\n
"
".loop6:
\n
"
".loop6:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/*left */
"mov
aps (%%eax), %%xmm0
\n
"
/*left */
"mov
ups
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
aps
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
aps 1024(%%eax), %%xmm2
\n
"
/* center */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"addps %%xmm2, %%xmm0
\n
"
"addps %%xmm2, %%xmm0
\n
"
"addps %%xmm2, %%xmm1
\n
"
"addps %%xmm2, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"decl %%ebx
\n
"
...
@@ -239,6 +246,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
...
@@ -239,6 +246,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
stream_sample_1ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
)
void
_M
(
stream_sample_1ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"pushl %%edx
\n
"
"pushl %%edx
\n
"
...
@@ -247,8 +255,9 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
...
@@ -247,8 +255,9 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
"shufps $0, %%xmm7, %%xmm7
\n
"
/* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"shufps $0, %%xmm7, %%xmm7
\n
"
/* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"movl $64, %%ebx
\n
"
"movl $64, %%ebx
\n
"
".align 16
\n
"
".loop2:
\n
"
".loop2:
\n
"
"mov
ups (%%ecx), %%xmm0
\n
"
/* c3 | c2 | c1 | c0 */
"mov
aps (%%ecx), %%xmm0
\n
"
/* c3 | c2 | c1 | c0 */
"mulps %%xmm7, %%xmm0
\n
"
"mulps %%xmm7, %%xmm0
\n
"
"movhlps %%xmm0, %%xmm2
\n
"
/* c3 | c2 */
"movhlps %%xmm0, %%xmm2
\n
"
/* c3 | c2 */
...
@@ -275,14 +284,15 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
...
@@ -275,14 +284,15 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
void
_M
(
stream_sample_2ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
,
float
*
right
)
void
_M
(
stream_sample_2ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
,
float
*
right
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
"movl $64, %%ebx
\n
"
".align 16
\n
"
".loop1:
\n
"
".loop1:
\n
"
"mov
ups (%%ecx), %%xmm0
\n
"
/* l3 | l2 | l1 | l0 */
"mov
aps (%%ecx), %%xmm0
\n
"
/* l3 | l2 | l1 | l0 */
"mov
ups (%%edx), %%xmm1
\n
"
/* r3 | r2 | r1 | r0 */
"mov
aps (%%edx), %%xmm1
\n
"
/* r3 | r2 | r1 | r0 */
"movhlps %%xmm0, %%xmm2
\n
"
/* l3 | l2 */
"movhlps %%xmm0, %%xmm2
\n
"
/* l3 | l2 */
"movhlps %%xmm1, %%xmm3
\n
"
/* r3 | r2 */
"movhlps %%xmm1, %%xmm3
\n
"
/* r3 | r2 */
"unpcklps %%xmm1, %%xmm0
\n
"
/* r1 | l1 | r0 | l0 */
"unpcklps %%xmm1, %%xmm0
\n
"
/* r1 | l1 | r0 | l0 */
...
...
plugins/imdct/ac3_imdct_3dn.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.
4 2001/06/03 12:47:21 sam
Exp $
* $Id: ac3_imdct_3dn.c,v 1.
5 2001/07/08 23:15:11 reno
Exp $
*
*
* Authors: Renaud Dartus <reno@videolan.org>
* Authors: Renaud Dartus <reno@videolan.org>
*
*
...
@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
...
@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static
void
imdct512_pre_ifft_twiddle_3dn
(
const
int
*
pmt
,
complex_t
*
buf
,
float
*
data
,
float
*
xcos_sin_sse
)
static
void
imdct512_pre_ifft_twiddle_3dn
(
const
int
*
pmt
,
complex_t
*
buf
,
float
*
data
,
float
*
xcos_sin_sse
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
...
@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
...
@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl $128, -4(%%ebp)
\n
"
"movl $128, -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
".loop:
\n
"
"movl (%%eax), %%esi
\n
"
"movl (%%eax), %%esi
\n
"
"movd (%%ecx, %%esi, 8), %%mm1
\n
"
/* 2j */
"movd (%%ecx, %%esi, 8), %%mm1
\n
"
/* 2j */
...
@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
...
@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
static
void
imdct512_post_ifft_twiddle_3dn
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
static
void
imdct512_post_ifft_twiddle_3dn
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
".align 16
\n
"
".loop1:
\n
"
".loop1:
\n
"
"movq (%%eax), %%mm0
\n
"
/* im0 | re0 */
"movq (%%eax), %%mm0
\n
"
/* im0 | re0 */
"movq %%mm0, %%mm1
\n
"
/* im0 | re0 */
"movq %%mm0, %%mm1
\n
"
/* im0 | re0 */
...
@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
...
@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
static
void
imdct512_window_delay_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
static
void
imdct512_window_delay_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
...
@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_samples:
\n
"
".first_128_samples:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
...
@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $32, %%ecx
\n
"
/* loop count */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_samples:
\n
"
".second_128_samples:
\n
"
"movd (%%esi), %%mm0
\n
"
/* buf[i].re */
"movd (%%esi), %%mm0
\n
"
/* buf[i].re */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
...
@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movl $32, %%ecx
\n
"
/* loop count */
"movl $32, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delay:
\n
"
".first_128_delay:
\n
"
"movd (%%esi), %%mm0
\n
"
/* re0 */
"movd (%%esi), %%mm0
\n
"
/* re0 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
...
@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ecx
\n
"
/* loop count */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delay:
\n
"
".second_128_delay:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
...
@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
static
void
imdct512_window_delay_nol_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
static
void
imdct512_window_delay_nol_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
...
@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_samples2:
\n
"
".first_128_samples2:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
...
@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $32, %%ecx
\n
"
/* loop count */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_samples2:
\n
"
".second_128_samples2:
\n
"
"movd (%%esi), %%mm0
\n
"
/* buf[i].re */
"movd (%%esi), %%mm0
\n
"
/* buf[i].re */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
...
@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"movl $32, %%ecx
\n
"
/* loop count */
"movl $32, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delays:
\n
"
".first_128_delays:
\n
"
"movd (%%esi), %%mm0
\n
"
/* re0 */
"movd (%%esi), %%mm0
\n
"
/* re0 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
...
@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ecx
\n
"
/* loop count */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delays:
\n
"
".second_128_delays:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
...
plugins/imdct/ac3_imdct_sse.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_imdct_sse.c: accelerated SSE ac3 DCT
* ac3_imdct_sse.c: accelerated SSE ac3 DCT
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.
3 2001/05/28 02:38:48 sam
Exp $
* $Id: ac3_imdct_sse.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
*
* Authors: Renaud Dartus <reno@videolan.org>
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
@@ -91,6 +91,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
...
@@ -91,6 +91,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static
void
imdct512_pre_ifft_twiddle_sse
(
const
int
*
pmt
,
complex_t
*
buf
,
float
*
data
,
float
*
xcos_sin_sse
)
static
void
imdct512_pre_ifft_twiddle_sse
(
const
int
*
pmt
,
complex_t
*
buf
,
float
*
data
,
float
*
xcos_sin_sse
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
...
@@ -108,6 +109,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
...
@@ -108,6 +109,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl $64, -4(%%ebp)
\n
"
"movl $64, -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
".loop:
\n
"
"movl (%%eax), %%esi
\n
"
"movl (%%eax), %%esi
\n
"
"movl 4(%%eax), %%edi
\n
"
"movl 4(%%eax), %%edi
\n
"
...
@@ -117,8 +119,8 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
...
@@ -117,8 +119,8 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"shll $1, %%esi
\n
"
"shll $1, %%esi
\n
"
"shll $1, %%edi
\n
"
"shll $1, %%edi
\n
"
"mov
u
ps (%%edx, %%esi, 8), %%xmm0
\n
"
/* -c_j | -s_j | -s_j | c_j */
"mov
a
ps (%%edx, %%esi, 8), %%xmm0
\n
"
/* -c_j | -s_j | -s_j | c_j */
"mov
u
ps (%%edx, %%edi, 8), %%xmm2
\n
"
/* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
"mov
a
ps (%%edx, %%edi, 8), %%xmm2
\n
"
/* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
"negl %%esi
\n
"
"negl %%esi
\n
"
"negl %%edi
\n
"
"negl %%edi
\n
"
...
@@ -138,7 +140,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
...
@@ -138,7 +140,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"addps %%xmm3, %%xmm2
\n
"
"addps %%xmm3, %%xmm2
\n
"
"movlhps %%xmm2, %%xmm0
\n
"
"movlhps %%xmm2, %%xmm0
\n
"
"mov
u
ps %%xmm0, -16(%%ebx)
\n
"
"mov
a
ps %%xmm0, -16(%%ebx)
\n
"
"decl -4(%%ebp)
\n
"
"decl -4(%%ebp)
\n
"
"jnz .loop
\n
"
"jnz .loop
\n
"
...
@@ -157,27 +159,29 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
...
@@ -157,27 +159,29 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
static
void
imdct512_post_ifft_twiddle_sse
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
static
void
imdct512_post_ifft_twiddle_sse
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movl $32, %%ebx
\n
"
/* loop counter */
"movl $32, %%ebx
\n
"
/* loop counter */
".align 16
\n
"
".loop1:
\n
"
".loop1:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* im1 | re1 | im0 | re0 */
"mov
aps (%%eax), %%xmm0
\n
"
/* im1 | re1 | im0 | re0 */
"mov
ups
(%%ecx), %%xmm2
\n
"
/* -c | -s | -s | c */
"mov
aps
(%%ecx), %%xmm2
\n
"
/* -c | -s | -s | c */
"movhlps %%xmm0, %%xmm1
\n
"
/* im1 | re1 */
"movhlps %%xmm0, %%xmm1
\n
"
/* im1 | re1 */
"mov
ups
16(%%ecx), %%xmm3
\n
"
/* -c1 | -s1 | -s1 | c1 */
"mov
aps
16(%%ecx), %%xmm3
\n
"
/* -c1 | -s1 | -s1 | c1 */
"shufps $0x50, %%xmm0, %%xmm0
\n
"
/* im0 | im0 | re0 | re0 */
"shufps $0x50, %%xmm0, %%xmm0
\n
"
/* im0 | im0 | re0 | re0 */
"shufps $0x50, %%xmm1, %%xmm1
\n
"
/* im1 | im1 | re1 | re1 */
"shufps $0x50, %%xmm1, %%xmm1
\n
"
/* im1 | im1 | re1 | re1 */
"mov
ups 16(%%eax), %%xmm4
\n
"
/* im3 | re3 | im2 | re2 */
"mov
aps 16(%%eax), %%xmm4
\n
"
/* im3 | re3 | im2 | re2 */
"shufps $0x27, %%xmm2, %%xmm2
\n
"
/* c | -s | -s | -c */
"shufps $0x27, %%xmm2, %%xmm2
\n
"
/* c | -s | -s | -c */
"movhlps %%xmm4, %%xmm5
\n
"
/* im3 | re3 */
"movhlps %%xmm4, %%xmm5
\n
"
/* im3 | re3 */
"shufps $0x27, %%xmm3, %%xmm3
\n
"
/* c1 | -s1 | -s1 | -c1 */
"shufps $0x27, %%xmm3, %%xmm3
\n
"
/* c1 | -s1 | -s1 | -c1 */
"mov
ups
32(%%ecx), %%xmm6
\n
"
/* -c2 | -s2 | -s2 | c2 */
"mov
aps
32(%%ecx), %%xmm6
\n
"
/* -c2 | -s2 | -s2 | c2 */
"mov
ups
48(%%ecx), %%xmm7
\n
"
/* -c3 | -s3 | -s3 | c3 */
"mov
aps
48(%%ecx), %%xmm7
\n
"
/* -c3 | -s3 | -s3 | c3 */
"shufps $0x50, %%xmm4, %%xmm4
\n
"
/* im2 | im2 | re2 | re2 */
"shufps $0x50, %%xmm4, %%xmm4
\n
"
/* im2 | im2 | re2 | re2 */
"shufps $0x50, %%xmm5, %%xmm5
\n
"
/* im3 | im3 | re3 | re3 */
"shufps $0x50, %%xmm5, %%xmm5
\n
"
/* im3 | im3 | re3 | re3 */
...
@@ -206,8 +210,8 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
...
@@ -206,8 +210,8 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
"movlhps %%xmm1, %%xmm0
\n
"
"movlhps %%xmm1, %%xmm0
\n
"
"movlhps %%xmm5, %%xmm4
\n
"
"movlhps %%xmm5, %%xmm4
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm4, 16(%%eax)
\n
"
"mov
a
ps %%xmm4, 16(%%eax)
\n
"
"addl $64, %%ecx
\n
"
"addl $64, %%ecx
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"decl %%ebx
\n
"
"decl %%ebx
\n
"
...
@@ -221,6 +225,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
...
@@ -221,6 +225,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
static
void
imdct512_window_delay_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
static
void
imdct512_window_delay_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
@@ -240,6 +245,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -240,6 +245,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_samples:
\n
"
".first_128_samples:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
@@ -250,7 +256,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -250,7 +256,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"mov
u
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"mov
a
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
...
@@ -264,17 +270,17 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -264,17 +270,17 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"addps %%xmm5, %%xmm0
\n
"
"addps %%xmm5, %%xmm0
\n
"
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"mov
u
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"mov
a
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx
\n
"
"addl $32, %%edx
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addl $32, %%ebx
\n
"
"addl $32, %%ebx
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .first_128_samples
\n
"
"jnz .first_128_samples
\n
"
...
@@ -282,6 +288,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -282,6 +288,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $16, %%ecx
\n
"
/* loop count */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_samples:
\n
"
".second_128_samples:
\n
"
"movss (%%esi), %%xmm0
\n
"
/* buf[i].re */
"movss (%%esi), %%xmm0
\n
"
/* buf[i].re */
"movss 8(%%esi), %%xmm2
\n
"
/* re1 */
"movss 8(%%esi), %%xmm2
\n
"
/* re1 */
...
@@ -292,7 +299,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -292,7 +299,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im1 */
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"mov
u
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"mov
a
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
...
@@ -310,13 +317,13 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -310,13 +317,13 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"addps %%xmm5, %%xmm0
\n
"
"addps %%xmm5, %%xmm0
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mov
u
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"mov
a
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addl $32, %%edx
\n
"
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%ebx
\n
"
"addl $32, %%ebx
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .second_128_samples
\n
"
"jnz .second_128_samples
\n
"
...
@@ -326,6 +333,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -326,6 +333,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movl $16, %%ecx
\n
"
/* loop count */
"movl $16, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delay:
\n
"
".first_128_delay:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
@@ -348,13 +356,13 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -348,13 +356,13 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0
\n
"
"mulps %%xmm4, %%xmm0
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .first_128_delay
\n
"
"jnz .first_128_delay
\n
"
...
@@ -363,6 +371,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -363,6 +371,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $16, %%ecx
\n
"
/* loop count */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delay:
\n
"
".second_128_delay:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
@@ -385,13 +394,13 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -385,13 +394,13 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm1
\n
"
"mulps %%xmm4, %%xmm1
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"mov
u
ps %%xmm1, (%%eax)
\n
"
"mov
a
ps %%xmm1, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"subps %%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm2, -16(%%eax)
\n
"
"mov
a
ps %%xmm2, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .second_128_delay
\n
"
"jnz .second_128_delay
\n
"
...
@@ -409,6 +418,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
...
@@ -409,6 +418,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
static
void
imdct512_window_delay_nol_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
static
void
imdct512_window_delay_nol_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
@@ -428,6 +438,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -428,6 +438,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_sample:
\n
"
".first_128_sample:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
@@ -438,7 +449,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -438,7 +449,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
...
@@ -449,20 +459,16 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -449,20 +459,16 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"mulps %%xmm4, %%xmm0
\n
"
"mulps %%xmm4, %%xmm0
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | re3 | 0.0 | re2 */
/* addps %%xmm5, %%xmm0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx
\n
"
"addl $32, %%edx
\n
"
"movups %%xmm0, (%%eax)
\n
"
"movaps %%xmm0, (%%eax)
\n
"
/* addl $32, %%ebx */
"mulps %%xmm4, %%xmm6
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
/* addps %%xmm5, %%xmm6 */
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .first_128_sample
\n
"
"jnz .first_128_sample
\n
"
...
@@ -470,6 +476,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -470,6 +476,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $16, %%ecx
\n
"
/* loop count */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_sample:
\n
"
".second_128_sample:
\n
"
"movss (%%esi), %%xmm0
\n
"
/* buf[i].re */
"movss (%%esi), %%xmm0
\n
"
/* buf[i].re */
"movss 8(%%esi), %%xmm2
\n
"
/* re1 */
"movss 8(%%esi), %%xmm2
\n
"
/* re1 */
...
@@ -480,7 +487,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -480,7 +487,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im1 */
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
...
@@ -495,16 +501,12 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -495,16 +501,12 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
/* addps %%xmm5, %%xmm0 */
"mulps %%xmm4, %%xmm6
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
"movaps %%xmm0, (%%eax)
\n
"
"movups %%xmm0, (%%eax)
\n
"
/* addps %%xmm5, %%xmm6 */
"addl $32, %%edx
\n
"
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
/* addl $32, %%ebx */
"movaps %%xmm6, -16(%%eax)
\n
"
"movups %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .second_128_sample
\n
"
"jnz .second_128_sample
\n
"
...
@@ -514,6 +516,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -514,6 +516,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movl $16, %%ecx
\n
"
/* loop count */
"movl $16, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delays:
\n
"
".first_128_delays:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
@@ -536,13 +539,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -536,13 +539,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"mulps %%xmm4, %%xmm0
\n
"
"mulps %%xmm4, %%xmm0
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .first_128_delays
\n
"
"jnz .first_128_delays
\n
"
...
@@ -551,6 +554,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -551,6 +554,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $16, %%ecx
\n
"
/* loop count */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delays:
\n
"
".second_128_delays:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
@@ -573,13 +577,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
...
@@ -573,13 +577,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"mulps %%xmm4, %%xmm1
\n
"
"mulps %%xmm4, %%xmm1
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"mov
u
ps %%xmm1, (%%eax)
\n
"
"mov
a
ps %%xmm1, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"subps %%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"addl $-32, %%edi
\n
"
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm2, -16(%%eax)
\n
"
"mov
a
ps %%xmm2, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"decl %%ecx
\n
"
"jnz .second_128_delays
\n
"
"jnz .second_128_delays
\n
"
...
...
plugins/imdct/ac3_srfft_3dn.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_3dn.c,v 1.
1 2001/05/16 14:51:29
reno Exp $
* $Id: ac3_srfft_3dn.c,v 1.
2 2001/07/08 23:15:11
reno Exp $
*
*
* Authors: Renaud Dartus <reno@videolan.org>
* Authors: Renaud Dartus <reno@videolan.org>
*
*
...
@@ -126,6 +126,7 @@ void C_1_3dn (void)
...
@@ -126,6 +126,7 @@ void C_1_3dn (void)
static
void
fft_4_3dn
(
complex_t
*
x
)
static
void
fft_4_3dn
(
complex_t
*
x
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"movq (%%eax), %%mm0
\n
"
/* x[0] */
"movq (%%eax), %%mm0
\n
"
/* x[0] */
"movq 8(%%eax), %%mm1
\n
"
/* x[1] */
"movq 8(%%eax), %%mm1
\n
"
/* x[1] */
"movq 16(%%eax), %%mm2
\n
"
/* x[2] */
"movq 16(%%eax), %%mm2
\n
"
/* x[2] */
...
...
plugins/imdct/ac3_srfft_sse.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_sse.c,v 1.
3 2001/07/01 08:49:09 gbazin
Exp $
* $Id: ac3_srfft_sse.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
*
* Authors: Renaud Dartus <reno@videolan.org>
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
@@ -106,7 +106,7 @@ void _M( fft_128p ) ( complex_t *a )
...
@@ -106,7 +106,7 @@ void _M( fft_128p ) ( complex_t *a )
void
hsqrt2_sse
(
void
)
void
hsqrt2_sse
(
void
)
{
{
__asm__
(
__asm__
__volatile__
(
".float 0f0.707106781188
\n
"
".float 0f0.707106781188
\n
"
".float 0f0.707106781188
\n
"
".float 0f0.707106781188
\n
"
".float 0f-0.707106781188
\n
"
".float 0f-0.707106781188
\n
"
...
@@ -116,7 +116,7 @@ void hsqrt2_sse (void)
...
@@ -116,7 +116,7 @@ void hsqrt2_sse (void)
void
C_1_sse
(
void
)
void
C_1_sse
(
void
)
{
{
__asm__
(
__asm__
__volatile__
(
".float 0f-1.0
\n
"
".float 0f-1.0
\n
"
".float 0f1.0
\n
"
".float 0f1.0
\n
"
".float 0f-1.0
\n
"
".float 0f-1.0
\n
"
...
@@ -127,9 +127,10 @@ void C_1_sse (void)
...
@@ -127,9 +127,10 @@ void C_1_sse (void)
static
void
fft_4_sse
(
complex_t
*
x
)
static
void
fft_4_sse
(
complex_t
*
x
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
"movups (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
".align 16
\n
"
"movups 16(%%eax), %%xmm2
\n
"
/* x[3] | x[2] */
"movaps (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
"movups %%xmm0, %%xmm1
\n
"
/* x[1] | x[0] */
"movaps 16(%%eax), %%xmm2
\n
"
/* x[3] | x[2] */
"movaps %%xmm0, %%xmm1
\n
"
/* x[1] | x[0] */
"addps %%xmm2, %%xmm0
\n
"
/* x[1] + x[3] | x[0] + x[2] */
"addps %%xmm2, %%xmm0
\n
"
/* x[1] + x[3] | x[0] + x[2] */
"subps %%xmm2, %%xmm1
\n
"
/* x[1] - x[3] | x[0] - x[2] */
"subps %%xmm2, %%xmm1
\n
"
/* x[1] - x[3] | x[0] - x[2] */
"xorps %%xmm6, %%xmm6
\n
"
"xorps %%xmm6, %%xmm6
\n
"
...
@@ -138,12 +139,12 @@ static void fft_4_sse (complex_t *x)
...
@@ -138,12 +139,12 @@ static void fft_4_sse (complex_t *x)
"subss %%xmm4, %%xmm6
\n
"
/* 0 | -(x[1] - x[3]).re */
"subss %%xmm4, %%xmm6
\n
"
/* 0 | -(x[1] - x[3]).re */
"movlhps %%xmm1, %%xmm0
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"movlhps %%xmm1, %%xmm0
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"movlhps %%xmm6, %%xmm4
\n
"
/* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
"movlhps %%xmm6, %%xmm4
\n
"
/* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
"mov
ups %%xmm0, %%xmm2
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"mov
aps %%xmm0, %%xmm2
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"shufps $0x94, %%xmm4, %%xmm3
\n
"
/* i*(x[1] - x[3]) | x[1] + x[3] */
"shufps $0x94, %%xmm4, %%xmm3
\n
"
/* i*(x[1] - x[3]) | x[1] + x[3] */
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm0
\n
"
"subps %%xmm3, %%xmm2
\n
"
"subps %%xmm3, %%xmm2
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm2, 16(%%eax)
\n
"
"mov
a
ps %%xmm2, 16(%%eax)
\n
"
:
"=a"
(
x
)
:
"=a"
(
x
)
:
"a"
(
x
)
);
:
"a"
(
x
)
);
}
}
...
@@ -151,13 +152,14 @@ static void fft_4_sse (complex_t *x)
...
@@ -151,13 +152,14 @@ static void fft_4_sse (complex_t *x)
static
void
fft_8_sse
(
complex_t
*
x
)
static
void
fft_8_sse
(
complex_t
*
x
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"movlps (%%eax), %%xmm0
\n
"
/* x[0] */
"movlps (%%eax), %%xmm0
\n
"
/* x[0] */
"movlps 32(%%eax), %%xmm1
\n
"
/* x[4] */
"movlps 32(%%eax), %%xmm1
\n
"
/* x[4] */
"movhps 16(%%eax), %%xmm0
\n
"
/* x[2] | x[0] */
"movhps 16(%%eax), %%xmm0
\n
"
/* x[2] | x[0] */
"movhps 48(%%eax), %%xmm1
\n
"
/* x[6] | x[4] */
"movhps 48(%%eax), %%xmm1
\n
"
/* x[6] | x[4] */
"mov
ups %%xmm0, %%xmm2
\n
"
/* x[2] | x[0] */
"mov
aps %%xmm0, %%xmm2
\n
"
/* x[2] | x[0] */
"xorps %%xmm3, %%xmm3
\n
"
"xorps %%xmm3, %%xmm3
\n
"
"addps %%xmm1, %%xmm0
\n
"
/* x[2] + x[6] | x[0] + x[4] */
"addps %%xmm1, %%xmm0
\n
"
/* x[2] + x[6] | x[0] + x[4] */
"subps %%xmm1, %%xmm2
\n
"
/* x[2] - x[6] | x[0] - x[4] */
"subps %%xmm1, %%xmm2
\n
"
/* x[2] - x[6] | x[0] - x[4] */
...
@@ -165,8 +167,8 @@ static void fft_8_sse (complex_t *x)
...
@@ -165,8 +167,8 @@ static void fft_8_sse (complex_t *x)
"movhlps %%xmm2, %%xmm4
\n
"
/* x[2] - x[6] */
"movhlps %%xmm2, %%xmm4
\n
"
/* x[2] - x[6] */
"movlhps %%xmm2, %%xmm0
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"movlhps %%xmm2, %%xmm0
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"subss %%xmm4, %%xmm3
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"subss %%xmm4, %%xmm3
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"mov
ups %%xmm0, %%xmm7
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"mov
aps %%xmm0, %%xmm7
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"mov
ups %%xmm3, %%xmm4
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"mov
aps %%xmm3, %%xmm4
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"movlps 8(%%eax), %%xmm1
\n
"
/* x[1] */
"movlps 8(%%eax), %%xmm1
\n
"
/* x[1] */
"shufps $0x14, %%xmm4, %%xmm5
\n
"
/* i*(x[2] - x[6]) | x[2] + x[6] */
"shufps $0x14, %%xmm4, %%xmm5
\n
"
/* i*(x[2] - x[6]) | x[2] + x[6] */
...
@@ -177,11 +179,11 @@ static void fft_8_sse (complex_t *x)
...
@@ -177,11 +179,11 @@ static void fft_8_sse (complex_t *x)
"movl $hsqrt2_sse, %%ebx
\n
"
"movl $hsqrt2_sse, %%ebx
\n
"
"movlps 40(%%eax), %%xmm2
\n
"
/* x[5] */
"movlps 40(%%eax), %%xmm2
\n
"
/* x[5] */
"movhps 56(%%eax), %%xmm2
\n
"
/* x[7] | x[5] */
"movhps 56(%%eax), %%xmm2
\n
"
/* x[7] | x[5] */
"mov
ups %%xmm1, %%xmm3
\n
"
/* x[3] | x[1] */
"mov
aps %%xmm1, %%xmm3
\n
"
/* x[3] | x[1] */
"addps %%xmm2, %%xmm1
\n
"
/* x[3] + x[7] | x[1] + x[5] */
"addps %%xmm2, %%xmm1
\n
"
/* x[3] + x[7] | x[1] + x[5] */
"subps %%xmm2, %%xmm3
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"subps %%xmm2, %%xmm3
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"movups (%%ebx), %%xmm4
\n
"
/* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
"movups (%%ebx), %%xmm4
\n
"
/* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
"mov
ups %%xmm3, %%xmm6
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"mov
aps %%xmm3, %%xmm6
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"mulps %%xmm4, %%xmm3
\n
"
/* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
"mulps %%xmm4, %%xmm3
\n
"
/* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
"shufps $0xc8, %%xmm4, %%xmm4
\n
"
/* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
"shufps $0xc8, %%xmm4, %%xmm4
\n
"
/* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
"shufps $0xb1, %%xmm6, %%xmm6
\n
"
/* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
"shufps $0xb1, %%xmm6, %%xmm6
\n
"
/* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
...
@@ -190,23 +192,23 @@ static void fft_8_sse (complex_t *x)
...
@@ -190,23 +192,23 @@ static void fft_8_sse (complex_t *x)
"movhlps %%xmm1, %%xmm5
\n
"
/* x[3] + x[7] */
"movhlps %%xmm1, %%xmm5
\n
"
/* x[3] + x[7] */
"movlhps %%xmm6, %%xmm1
\n
"
/* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"movlhps %%xmm6, %%xmm1
\n
"
/* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"shufps $0xe4, %%xmm6, %%xmm5
\n
"
/* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
"shufps $0xe4, %%xmm6, %%xmm5
\n
"
/* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
"mov
ups %%xmm1, %%xmm3
\n
"
/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"mov
aps %%xmm1, %%xmm3
\n
"
/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"movl $C_1_sse, %%ebx
\n
"
"movl $C_1_sse, %%ebx
\n
"
"addps %%xmm5, %%xmm1
\n
"
/* u */
"addps %%xmm5, %%xmm1
\n
"
/* u */
"subps %%xmm5, %%xmm3
\n
"
/* v */
"subps %%xmm5, %%xmm3
\n
"
/* v */
"mov
ups %%xmm0, %%xmm2
\n
"
/* yb */
"mov
aps %%xmm0, %%xmm2
\n
"
/* yb */
"mov
ups %%xmm7, %%xmm4
\n
"
/* yt */
"mov
aps %%xmm7, %%xmm4
\n
"
/* yt */
"movups (%%ebx), %%xmm5
\n
"
"movups (%%ebx), %%xmm5
\n
"
"mulps %%xmm5, %%xmm3
\n
"
"mulps %%xmm5, %%xmm3
\n
"
"addps %%xmm1, %%xmm0
\n
"
/* yt + u */
"addps %%xmm1, %%xmm0
\n
"
/* yt + u */
"subps %%xmm1, %%xmm2
\n
"
/* yt - u */
"subps %%xmm1, %%xmm2
\n
"
/* yt - u */
"shufps $0xb1, %%xmm3, %%xmm3
\n
"
/* -i * v */
"shufps $0xb1, %%xmm3, %%xmm3
\n
"
/* -i * v */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm2, 32(%%eax)
\n
"
"mov
a
ps %%xmm2, 32(%%eax)
\n
"
"addps %%xmm3, %%xmm4
\n
"
/* yb - i*v */
"addps %%xmm3, %%xmm4
\n
"
/* yb - i*v */
"subps %%xmm3, %%xmm7
\n
"
/* yb + i*v */
"subps %%xmm3, %%xmm7
\n
"
/* yb + i*v */
"mov
u
ps %%xmm4, 16(%%eax)
\n
"
"mov
a
ps %%xmm4, 16(%%eax)
\n
"
"mov
u
ps %%xmm7, 48(%%eax)
\n
"
"mov
a
ps %%xmm7, 48(%%eax)
\n
"
"popl %%ebx
\n
"
"popl %%ebx
\n
"
:
"=a"
(
x
)
:
"=a"
(
x
)
...
@@ -218,6 +220,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...
@@ -218,6 +220,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
const
complex_t
*
d
,
const
complex_t
*
d_3
)
const
complex_t
*
d
,
const
complex_t
*
d_3
)
{
{
__asm__
__volatile__
(
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
@@ -225,10 +228,11 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...
@@ -225,10 +228,11 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"pushl %%eax
\n
"
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%ecx
\n
"
//
"pushl %%edx
\n
"
"pushl %%edx
\n
"
"pushl %%esi
\n
"
"pushl %%esi
\n
"
"pushl %%edi
\n
"
// "movl %%edi, %%ecx\n" /* k */
"pushl %%edi
\n
"
//
"movl 8(%%ebp), %%ecx
\n
"
/* k */
"movl 8(%%ebp), %%ecx
\n
"
/* k */
"movl 12(%%ebp), %%eax
\n
"
/* x */
"movl 12(%%ebp), %%eax
\n
"
/* x */
...
@@ -236,15 +240,16 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...
@@ -236,15 +240,16 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movl 16(%%ebp), %%ebx
\n
"
/* wT */
"movl 16(%%ebp), %%ebx
\n
"
/* wT */
"movl 20(%%ebp), %%edx
\n
"
/* d */
"movl 20(%%ebp), %%edx
\n
"
/* d */
"movl 24(%%ebp), %%esi
\n
"
/* d3 */
"movl 24(%%ebp), %%esi
\n
"
/* d3 */
"shll $4, %%ecx
\n
"
/* 16k */
"shll $4, %%ecx
\n
"
/* 16k */
///
"addl $8, %%edx
\n
"
"addl $8, %%edx
\n
"
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"addl $8, %%esi
\n
"
"addl $8, %%esi
\n
"
/* TRANSZERO and TRANS */
/* TRANSZERO and TRANS */
"movups (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
".align 16
\n
"
"movups (%%ebx), %%xmm1
\n
"
/* wT[1] | wT[0] */
"movaps (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
"movups (%%ebx, %%ecx), %%xmm2
\n
"
/* wB[1] | wB[0] */
"movaps (%%ebx), %%xmm1
\n
"
/* wT[1] | wT[0] */
"movaps (%%ebx, %%ecx), %%xmm2
\n
"
/* wB[1] | wB[0] */
"movlps (%%edx), %%xmm3
\n
"
/* d */
"movlps (%%edx), %%xmm3
\n
"
/* d */
"movlps (%%esi), %%xmm4
\n
"
/* d3 */
"movlps (%%esi), %%xmm4
\n
"
/* d3 */
"movhlps %%xmm1, %%xmm5
\n
"
/* wT[1] */
"movhlps %%xmm1, %%xmm5
\n
"
/* wT[1] */
...
@@ -259,40 +264,41 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...
@@ -259,40 +264,41 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movlhps %%xmm6, %%xmm5
\n
"
/* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"movlhps %%xmm6, %%xmm5
\n
"
/* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7
\n
"
/* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"shufps $0xb1, %%xmm6, %%xmm7
\n
"
/* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movl $C_1_sse, %%edi
\n
"
"movl $C_1_sse, %%edi
\n
"
"mov
u
ps (%%edi), %%xmm4
\n
"
"mov
a
ps (%%edi), %%xmm4
\n
"
"mulps %%xmm4, %%xmm7
\n
"
"mulps %%xmm4, %%xmm7
\n
"
"addps %%xmm7, %%xmm5
\n
"
/* wB[1] * d3[1] | wT[1] * d[1] */
"addps %%xmm7, %%xmm5
\n
"
/* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1
\n
"
/* d[1] * wT[1] | wT[0] */
"movlhps %%xmm5, %%xmm1
\n
"
/* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2
\n
"
/* d3[1] * wB[1] | wB[0] */
"shufps $0xe4, %%xmm5, %%xmm2
\n
"
/* d3[1] * wB[1] | wB[0] */
"mov
u
ps %%xmm1, %%xmm3
\n
"
/* d[1] * wT[1] | wT[0] */
"mov
a
ps %%xmm1, %%xmm3
\n
"
/* d[1] * wT[1] | wT[0] */
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"addps %%xmm2, %%xmm1
\n
"
/* u */
"addps %%xmm2, %%xmm1
\n
"
/* u */
"subps %%xmm2, %%xmm3
\n
"
/* v */
"subps %%xmm2, %%xmm3
\n
"
/* v */
"mulps %%xmm4, %%xmm3
\n
"
"mulps %%xmm4, %%xmm3
\n
"
"mov
u
ps (%%eax, %%ecx), %%xmm5
\n
"
/* xk[1] | xk[0] */
"mov
a
ps (%%eax, %%ecx), %%xmm5
\n
"
/* xk[1] | xk[0] */
"shufps $0xb1, %%xmm3, %%xmm3
\n
"
/* -i * v */
"shufps $0xb1, %%xmm3, %%xmm3
\n
"
/* -i * v */
"mov
ups %%xmm0, %%xmm2
\n
"
/* x[1] | x[0] */
"mov
aps %%xmm0, %%xmm2
\n
"
/* x[1] | x[0] */
"mov
ups %%xmm5, %%xmm6
\n
"
/* xk[1] | xk[0] */
"mov
aps %%xmm5, %%xmm6
\n
"
/* xk[1] | xk[0] */
"addps %%xmm1, %%xmm0
\n
"
"addps %%xmm1, %%xmm0
\n
"
"subps %%xmm1, %%xmm2
\n
"
"subps %%xmm1, %%xmm2
\n
"
"addps %%xmm3, %%xmm5
\n
"
"addps %%xmm3, %%xmm5
\n
"
"subps %%xmm3, %%xmm6
\n
"
"subps %%xmm3, %%xmm6
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm2, (%%edi)
\n
"
"mov
a
ps %%xmm2, (%%edi)
\n
"
"mov
u
ps %%xmm5, (%%eax, %%ecx)
\n
"
"mov
a
ps %%xmm5, (%%eax, %%ecx)
\n
"
"mov
u
ps %%xmm6, (%%edi, %%ecx)
\n
"
"mov
a
ps %%xmm6, (%%edi, %%ecx)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%ebx
\n
"
"addl $16, %%ebx
\n
"
"addl $8, %%edx
\n
"
"addl $8, %%edx
\n
"
"addl $8, %%esi
\n
"
"addl $8, %%esi
\n
"
"decl -4(%%ebp)
\n
"
"decl -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
".loop:
\n
"
"mov
u
ps (%%ebx), %%xmm0
\n
"
/* wT[1] | wT[0] */
"mov
a
ps (%%ebx), %%xmm0
\n
"
/* wT[1] | wT[0] */
"mov
u
ps (%%edx), %%xmm1
\n
"
/* d[1] | d[0] */
"mov
a
ps (%%edx), %%xmm1
\n
"
/* d[1] | d[0] */
"mov
u
ps (%%ebx, %%ecx), %%xmm4
\n
"
/* wB[1] | wB[0] */
"mov
a
ps (%%ebx, %%ecx), %%xmm4
\n
"
/* wB[1] | wB[0] */
"mov
u
ps (%%esi), %%xmm5
\n
"
/* d3[1] | d3[0] */
"mov
a
ps (%%esi), %%xmm5
\n
"
/* d3[1] | d3[0] */
"movhlps %%xmm0, %%xmm2
\n
"
/* wT[1] */
"movhlps %%xmm0, %%xmm2
\n
"
/* wT[1] */
"movhlps %%xmm1, %%xmm3
\n
"
/* d[1] */
"movhlps %%xmm1, %%xmm3
\n
"
/* d[1] */
...
@@ -319,7 +325,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...
@@ -319,7 +325,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"mulps %%xmm7, %%xmm6
\n
"
/* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"mulps %%xmm7, %%xmm6
\n
"
/* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1
\n
"
/* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"shufps $0xb1, %%xmm2, %%xmm1
\n
"
/* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movl $C_1_sse, %%edi
\n
"
"movl $C_1_sse, %%edi
\n
"
"mov
u
ps (%%edi), %%xmm3
\n
"
/* 1.0 | -1.0 | 1.0 | -1.0 */
"mov
a
ps (%%edi), %%xmm3
\n
"
/* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5
\n
"
/* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"movhlps %%xmm4, %%xmm5
\n
"
/* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"mulps %%xmm3, %%xmm1
\n
"
/* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
"mulps %%xmm3, %%xmm1
\n
"
/* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
...
@@ -330,37 +336,38 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
...
@@ -330,37 +336,38 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"mulps %%xmm3, %%xmm5
\n
"
/* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
"mulps %%xmm3, %%xmm5
\n
"
/* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
"addps %%xmm5, %%xmm4
\n
"
/* wB[1] * d3[1] | wB[0] * d3[0] */
"addps %%xmm5, %%xmm4
\n
"
/* wB[1] * d3[1] | wB[0] * d3[0] */
"mov
u
ps %%xmm0, %%xmm1
\n
"
/* wT[1] * d[1] | wT[0] * d[0] */
"mov
a
ps %%xmm0, %%xmm1
\n
"
/* wT[1] * d[1] | wT[0] * d[0] */
"addps %%xmm4, %%xmm0
\n
"
/* u */
"addps %%xmm4, %%xmm0
\n
"
/* u */
"subps %%xmm4, %%xmm1
\n
"
/* v */
"subps %%xmm4, %%xmm1
\n
"
/* v */
"mov
u
ps (%%eax), %%xmm6
\n
"
/* x[1] | x[0] */
"mov
a
ps (%%eax), %%xmm6
\n
"
/* x[1] | x[0] */
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"mulps %%xmm3, %%xmm1
\n
"
"mulps %%xmm3, %%xmm1
\n
"
"addl $16, %%ebx
\n
"
"addl $16, %%ebx
\n
"
"addl $16, %%esi
\n
"
"addl $16, %%esi
\n
"
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* -i * v */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* -i * v */
"mov
u
ps (%%eax, %%ecx), %%xmm7
\n
"
/* xk[1] | xk[0] */
"mov
a
ps (%%eax, %%ecx), %%xmm7
\n
"
/* xk[1] | xk[0] */
"mov
u
ps %%xmm6, %%xmm2
\n
"
"mov
a
ps %%xmm6, %%xmm2
\n
"
"mov
u
ps %%xmm7, %%xmm4
\n
"
"mov
a
ps %%xmm7, %%xmm4
\n
"
"addps %%xmm0, %%xmm6
\n
"
"addps %%xmm0, %%xmm6
\n
"
"subps %%xmm0, %%xmm2
\n
"
"subps %%xmm0, %%xmm2
\n
"
"mov
u
ps %%xmm6, (%%eax)
\n
"
"mov
a
ps %%xmm6, (%%eax)
\n
"
"mov
u
ps %%xmm2, (%%edi)
\n
"
"mov
a
ps %%xmm2, (%%edi)
\n
"
"addps %%xmm1, %%xmm7
\n
"
"addps %%xmm1, %%xmm7
\n
"
"subps %%xmm1, %%xmm4
\n
"
"subps %%xmm1, %%xmm4
\n
"
"addl $16, %%edx
\n
"
"addl $16, %%edx
\n
"
"mov
u
ps %%xmm7, (%%eax, %%ecx)
\n
"
"mov
a
ps %%xmm7, (%%eax, %%ecx)
\n
"
"mov
u
ps %%xmm4, (%%edi, %%ecx)
\n
"
"mov
a
ps %%xmm4, (%%edi, %%ecx)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%eax
\n
"
"decl -4(%%ebp)
\n
"
"decl -4(%%ebp)
\n
"
"jnz .loop
\n
"
"jnz .loop
\n
"
".align 16
\n
"
".end:
\n
"
".end:
\n
"
"popl %%edi
\n
"
"popl %%edi
\n
"
//
"popl %%esi
\n
"
"popl %%esi
\n
"
"popl %%edx
\n
"
"popl %%edx
\n
"
"popl %%ecx
\n
"
"popl %%ecx
\n
"
//
"popl %%ebx
\n
"
"popl %%ebx
\n
"
"popl %%eax
\n
"
"popl %%eax
\n
"
...
...
src/ac3_decoder/ac3_decoder.h
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_decoder.h : ac3 decoder interface
* ac3_decoder.h : ac3 decoder interface
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder.h,v 1.1
0 2001/06/12 00:30:4
1 reno Exp $
* $Id: ac3_decoder.h,v 1.1
1 2001/07/08 23:15:1
1 reno Exp $
*
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
* Renaud Dartus <reno@videolan.org>
...
@@ -354,6 +354,9 @@ typedef struct mantissa_s
...
@@ -354,6 +354,9 @@ typedef struct mantissa_s
struct
ac3dec_s
struct
ac3dec_s
{
{
float
samples
[
6
][
256
]
__attribute__
((
aligned
(
16
)));
imdct_t
imdct
__attribute__
((
aligned
(
16
)));
/*
/*
* Input properties
* Input properties
*/
*/
...
@@ -370,12 +373,10 @@ struct ac3dec_s
...
@@ -370,12 +373,10 @@ struct ac3dec_s
bsi_t
bsi
;
bsi_t
bsi
;
audblk_t
audblk
;
audblk_t
audblk
;
float
samples
[
6
][
256
]
__attribute__
((
aligned
(
16
)));
dm_par_t
dm_par
;
dm_par_t
dm_par
;
bit_allocate_t
bit_allocate
;
bit_allocate_t
bit_allocate
;
mantissa_t
mantissa
;
mantissa_t
mantissa
;
imdct_t
imdct
;
downmix_t
downmix
;
downmix_t
downmix
;
};
};
...
...
src/ac3_decoder/ac3_decoder_thread.c
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_decoder_thread.c: ac3 decoder thread
* ac3_decoder_thread.c: ac3 decoder thread
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.c,v 1.3
4 2001/05/31 01:37:08 sam
Exp $
* $Id: ac3_decoder_thread.c,v 1.3
5 2001/07/08 23:15:11 reno
Exp $
*
*
* Authors: Michel Lespinasse <walken@zoy.org>
* Authors: Michel Lespinasse <walken@zoy.org>
*
*
...
@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
...
@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
intf_DbgMsg
(
"ac3dec debug: creating ac3 decoder thread"
);
intf_DbgMsg
(
"ac3dec debug: creating ac3 decoder thread"
);
/* Allocate the memory needed to store the thread's structure */
/* Allocate the memory needed to store the thread's structure */
if
((
p_ac3thread
=
(
ac3dec_thread_t
*
)
malloc
(
sizeof
(
ac3dec_thread_t
)))
==
NULL
)
p_ac3thread
=
(
ac3dec_thread_t
*
)
malloc
(
sizeof
(
ac3dec_thread_t
));
/* We need to be 16 bytes aligned */
p_ac3thread
->
ac3thread
=
(
int
)
p_ac3thread
&
(
-
15
);
p_ac3thread
=
(
ac3dec_thread_t
*
)
p_ac3thread
->
ac3thread
;
if
(
p_ac3thread
==
NULL
)
{
{
intf_ErrMsg
(
"ac3dec error: not enough memory "
intf_ErrMsg
(
"ac3dec error: not enough memory "
"for ac3dec_CreateThread() to create the new thread"
);
"for ac3dec_CreateThread() to create the new thread"
);
...
@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)
...
@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)
/* Destroy descriptor */
/* Destroy descriptor */
free
(
p_ac3thread
->
p_config
);
free
(
p_ac3thread
->
p_config
);
p_ac3thread
=
(
ac3dec_thread_t
*
)
p_ac3thread
->
ac3thread
;
free
(
p_ac3thread
);
free
(
p_ac3thread
);
intf_DbgMsg
(
"ac3dec debug: ac3 decoder thread %p destroyed"
,
p_ac3thread
);
intf_DbgMsg
(
"ac3dec debug: ac3 decoder thread %p destroyed"
,
p_ac3thread
);
...
...
src/ac3_decoder/ac3_decoder_thread.h
View file @
dee3179d
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
* ac3_decoder_thread.h : ac3 decoder thread interface
* ac3_decoder_thread.h : ac3 decoder thread interface
*****************************************************************************
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.h,v 1.
7 2001/05/14 15:58:03
reno Exp $
* $Id: ac3_decoder_thread.h,v 1.
8 2001/07/08 23:15:11
reno Exp $
*
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Authors: Michel Kaempf <maxx@via.ecp.fr>
*
*
...
@@ -24,8 +24,16 @@
...
@@ -24,8 +24,16 @@
/*****************************************************************************
/*****************************************************************************
* ac3dec_thread_t : ac3 decoder thread descriptor
* ac3dec_thread_t : ac3 decoder thread descriptor
*****************************************************************************/
*****************************************************************************/
typedef
struct
ac3dec_thread_s
typedef
struct
ac3dec_thread_s
{
{
/*
* Decoder properties
*/
float
used_for_alignement1
;
float
used_for_alignement2
;
ac3dec_t
ac3_decoder
__attribute__
((
aligned
(
16
)));
/*
/*
* Thread properties
* Thread properties
*/
*/
...
@@ -38,15 +46,11 @@ typedef struct ac3dec_thread_s
...
@@ -38,15 +46,11 @@ typedef struct ac3dec_thread_s
int
sync_ptr
;
/* sync ptr from ac3 magic header */
int
sync_ptr
;
/* sync ptr from ac3 magic header */
adec_config_t
*
p_config
;
adec_config_t
*
p_config
;
/*
* Decoder properties
*/
ac3dec_t
ac3_decoder
;
/*
/*
* Output properties
* Output properties
*/
*/
aout_fifo_t
*
p_aout_fifo
;
/* stores the decompressed audio frames */
aout_fifo_t
*
p_aout_fifo
;
/* stores the decompressed audio frames */
int
ac3thread
;
/* save the old pointer */
}
ac3dec_thread_t
;
}
ac3dec_thread_t
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment