Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc-1.1
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc-1.1
Commits
dee3179d
Commit
dee3179d
authored
Jul 08, 2001
by
Renaud Dartus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
* Alignment in asm functions
* 16-byte alignment for data (needed for SSE) * Optimization in SSE
parent
5b49dba8
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
462 additions
and
400 deletions
+462
-400
include/ac3_imdct.h
include/ac3_imdct.h
+5
-4
plugins/downmix/ac3_downmix_3dn.c
plugins/downmix/ac3_downmix_3dn.c
+15
-2
plugins/downmix/ac3_downmix_sse.c
plugins/downmix/ac3_downmix_sse.c
+162
-152
plugins/imdct/ac3_imdct_3dn.c
plugins/imdct/ac3_imdct_3dn.c
+15
-1
plugins/imdct/ac3_imdct_sse.c
plugins/imdct/ac3_imdct_sse.c
+134
-130
plugins/imdct/ac3_srfft_3dn.c
plugins/imdct/ac3_srfft_3dn.c
+2
-1
plugins/imdct/ac3_srfft_sse.c
plugins/imdct/ac3_srfft_sse.c
+105
-98
src/ac3_decoder/ac3_decoder.h
src/ac3_decoder/ac3_decoder.h
+4
-3
src/ac3_decoder/ac3_decoder_thread.c
src/ac3_decoder/ac3_decoder_thread.c
+9
-2
src/ac3_decoder/ac3_decoder_thread.h
src/ac3_decoder/ac3_decoder_thread.h
+11
-7
No files found.
include/ac3_imdct.h
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_imdct.h : AC3 IMDCT types
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.h,v 1.
4 2001/06/12 00:30:4
1 reno Exp $
* $Id: ac3_imdct.h,v 1.
5 2001/07/08 23:15:1
1 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
...
...
@@ -42,18 +42,19 @@ typedef struct imdct_s
float
xsin1
[
N
/
4
]
__attribute__
((
aligned
(
16
)));
float
xcos2
[
N
/
8
]
__attribute__
((
aligned
(
16
)));
float
xsin2
[
N
/
8
]
__attribute__
((
aligned
(
16
)));
float
xcos_sin_sse
[
128
*
4
]
__attribute__
((
aligned
(
16
)));
/* Twiddle factor LUT */
complex_t
*
w
[
7
]
__attribute__
((
aligned
(
16
)));
complex_t
w_1
[
1
]
__attribute__
((
aligned
(
16
)));
float
used_for_alignement1
;
float
used_for_alignement2
;
complex_t
w_2
[
2
]
__attribute__
((
aligned
(
16
)));
complex_t
w_4
[
4
]
__attribute__
((
aligned
(
16
)));
complex_t
w_8
[
8
]
__attribute__
((
aligned
(
16
)));
complex_t
w_16
[
16
]
__attribute__
((
aligned
(
16
)));
complex_t
w_32
[
32
]
__attribute__
((
aligned
(
16
)));
complex_t
w_64
[
64
]
__attribute__
((
aligned
(
16
)));
float
xcos_sin_sse
[
128
*
4
]
__attribute__
((
aligned
(
16
)));
complex_t
*
w
[
7
]
__attribute__
((
aligned
(
16
)));
/* Module used and shortcuts */
struct
module_s
*
p_module
;
...
...
plugins/downmix/ac3_downmix_3dn.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_3dn.c,v 1.
3 2001/07/01 08:49:09 gbazin
Exp $
* $Id: ac3_downmix_3dn.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
...
...
@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
void
_M
(
downmix_3f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
...
...
@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
...
...
@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void
_M
(
downmix_2f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
...
...
@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop3:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 1024(%%eax), %%mm1
\n
"
/* right */
...
...
@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_3f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
...
...
@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop4:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
...
...
@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_2f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
...
...
@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7
\n
"
/* slev */
"punpckldq %%mm7, %%mm7
\n
"
/* slev | slev */
".align 16
\n
"
".loop5:
\n
"
"movq (%%eax), %%mm0
\n
"
/* left */
"movq 1024(%%eax), %%mm1
\n
"
/* right */
...
...
@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_3f_0r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
...
...
@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 4(%%ecx), %%mm6
\n
"
/* clev */
"punpckldq %%mm6, %%mm6
\n
"
/* clev | clev */
".align 16
\n
"
".loop6:
\n
"
"movq (%%eax), %%mm0
\n
"
/*left */
"movq 2048(%%eax), %%mm1
\n
"
/* right */
...
...
@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
stream_sample_1ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%edx
\n
"
...
...
@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
"punpckldq %%mm7, %%mm7
\n
"
/* sqrt2 | sqrt2 */
"movl $128, %%ebx
\n
"
".align 16
\n
"
".loop2:
\n
"
"movq (%%ecx), %%mm0
\n
"
/* c1 | c0 */
"pfmul %%mm7, %%mm0
\n
"
...
...
@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $128, %%ebx
\n
"
".align 16
\n
"
".loop1:
\n
"
"movq (%%ecx), %%mm0
\n
"
/* l1 | l0 */
"movq (%%edx), %%mm1
\n
"
/* r1 | r0 */
...
...
plugins/downmix/ac3_downmix_sse.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_sse.c,v 1.
3 2001/07/01 08:49:09 gbazin
Exp $
* $Id: ac3_downmix_sse.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
...
@@ -41,48 +41,51 @@
void
sqrt2_sse
(
void
)
__asm__
(
"sqrt2_sse"
);
void
sqrt2_sse
(
void
)
{
__asm__
(
".float 0f0.7071068"
);
__asm__
(
".align 16
\n
"
".float 0f0.7071068"
);
}
void
_M
(
downmix_3f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64,
%%ebx
\n
"
/* loop counter */
"movl $64, %%ebx
\n
"
/* loop counter */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps
$0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps
$0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps
$0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps
$0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop:
\n
"
"mov
ups (%%eax),
%%xmm0
\n
"
/* left */
"mov
ups
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
ups 3072(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
ups 4096(%%eax), %%xmm4
\n
"
/* rithgsur */
"mulps
%%xmm5, %%xmm0
\n
"
"mulps
%%xmm5, %%xmm1
\n
"
"mulps
%%xmm6, %%xmm2
\n
"
"addps
%%xmm2, %%xmm0
\n
"
"addps
%%xmm2, %%xmm1
\n
"
"mulps
%%xmm7, %%xmm3
\n
"
"mulps
%%xmm7, %%xmm4
\n
"
"addps
%%xmm3, %%xmm0
\n
"
"addps
%%xmm4, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"addl
$16, %%eax
\n
"
"decl
%%ebx
\n
"
"jnz
.loop
\n
"
"mov
aps (%%eax),
%%xmm0
\n
"
/* left */
"mov
aps
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
aps 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
aps 3072(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
aps 4096(%%eax), %%xmm4
\n
"
/* rithgsur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"addps %%xmm2, %%xmm0
\n
"
"addps %%xmm2, %%xmm1
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"mulps %%xmm7, %%xmm4
\n
"
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm4, %%xmm1
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"jnz .loop
\n
"
"popl
%%ebx
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
}
...
...
@@ -90,35 +93,37 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void
_M
(
downmix_2f_2r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps $0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop3:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* left */
"mov
ups
1024(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 2048(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
ups 3072(%%eax), %%xmm4
\n
"
/* rightsur */
"mulps
%%xmm5, %%xmm0
\n
"
"mulps
%%xmm5, %%xmm1
\n
"
"mulps
%%xmm7, %%xmm3
\n
"
"mulps
%%xmm7, %%xmm4
\n
"
"addps
%%xmm3, %%xmm0
\n
"
"addps
%%xmm4, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"addl
$16, %%eax
\n
"
"decl
%%ebx
\n
"
"jnz
.loop3
\n
"
"popl
%%ebx
\n
"
"mov
aps (%%eax), %%xmm0
\n
"
/* left */
"mov
aps
1024(%%eax), %%xmm1
\n
"
/* right */
"mov
aps 2048(%%eax), %%xmm3
\n
"
/* leftsur */
"mov
aps 3072(%%eax), %%xmm4
\n
"
/* rightsur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"mulps %%xmm7, %%xmm4
\n
"
"addps %%xmm3, %%xmm0
\n
"
"addps %%xmm4, %%xmm1
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"jnz .loop3
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
}
...
...
@@ -126,112 +131,114 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
downmix_3f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movss (%%ecx), %%xmm5
\n
"
/* unit */
"shufps $0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss (%%ecx), %%xmm5
\n
"
/* unit */
"shufps $0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps
$0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps
$0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop4:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* left */
"mov
ups
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
ups 3072(%%eax), %%xmm3
\n
"
/* sur */
"mulps
%%xmm5, %%xmm0
\n
"
"mulps
%%xmm5, %%xmm1
\n
"
"mulps
%%xmm6, %%xmm2
\n
"
"addps
%%xmm2, %%xmm0
\n
"
"mulps
%%xmm7, %%xmm3
\n
"
"addps
%%xmm2, %%xmm1
\n
"
"subps
%%xmm3, %%xmm0
\n
"
"addps
%%xmm3, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"addl
$16, %%eax
\n
"
"decl
%%ebx
\n
"
"jnz
.loop4
\n
"
"popl
%%ebx
\n
"
"mov
aps (%%eax), %%xmm0
\n
"
/* left */
"mov
aps
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
aps 1024(%%eax), %%xmm2
\n
"
/* center */
"mov
aps 3072(%%eax), %%xmm3
\n
"
/* sur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"addps %%xmm2, %%xmm0
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"addps %%xmm2, %%xmm1
\n
"
"subps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm1
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"jnz .loop4
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
}
void
_M
(
downmix_2f_1r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps
$0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps
$0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7
\n
"
/* slev */
"shufps
$0, %%xmm7, %%xmm7
\n
"
/* slev | slev | slev | slev */
".align 16
\n
"
".loop5:
\n
"
"movups (%%eax), %%xmm0
\n
"
/* left */
"movups 1024(%%eax), %%xmm1
\n
"
/* right */
"movups 2048(%%eax), %%xmm3
\n
"
/* sur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"subps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm1
\n
"
"movups %%xmm0, (%%eax)
\n
"
"movups %%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"jnz .loop5
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
"movaps (%%eax), %%xmm0
\n
"
/* left */
"movaps 1024(%%eax), %%xmm1
\n
"
/* right */
"movaps 2048(%%eax), %%xmm3
\n
"
/* sur */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm7, %%xmm3
\n
"
"subps %%xmm3, %%xmm0
\n
"
"addps %%xmm3, %%xmm1
\n
"
"movaps %%xmm0, (%%eax)
\n
"
"movaps %%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"jnz .loop5
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
}
void
_M
(
downmix_3f_0r_to_2ch
)
(
float
*
samples
,
dm_par_t
*
dm_par
)
{
__asm__
__volatile__
(
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps
$0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss
(%%ecx), %%xmm5
\n
"
/* unit */
"shufps
$0, %%xmm5, %%xmm5
\n
"
/* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps
$0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
"movss 4(%%ecx), %%xmm6
\n
"
/* clev */
"shufps
$0, %%xmm6, %%xmm6
\n
"
/* clev | clev | clev | clev */
".align 16
\n
"
".loop6:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/*left */
"mov
ups
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
ups 1024(%%eax), %%xmm2
\n
"
/* center */
"mulps
%%xmm5, %%xmm0
\n
"
"mulps
%%xmm5, %%xmm1
\n
"
"mulps
%%xmm6, %%xmm2
\n
"
"addps
%%xmm2, %%xmm0
\n
"
"addps
%%xmm2, %%xmm1
\n
"
"mov
ups
%%xmm0, (%%eax)
\n
"
"mov
ups
%%xmm1, 1024(%%eax)
\n
"
"addl
$16, %%eax
\n
"
"decl
%%ebx
\n
"
"jnz
.loop6
\n
"
"popl
%%ebx
\n
"
"mov
aps (%%eax), %%xmm0
\n
"
/*left */
"mov
aps
2048(%%eax), %%xmm1
\n
"
/* right */
"mov
aps 1024(%%eax), %%xmm2
\n
"
/* center */
"mulps %%xmm5, %%xmm0
\n
"
"mulps %%xmm5, %%xmm1
\n
"
"mulps %%xmm6, %%xmm2
\n
"
"addps %%xmm2, %%xmm0
\n
"
"addps %%xmm2, %%xmm1
\n
"
"mov
aps
%%xmm0, (%%eax)
\n
"
"mov
aps
%%xmm1, 1024(%%eax)
\n
"
"addl $16, %%eax
\n
"
"decl %%ebx
\n
"
"jnz .loop6
\n
"
"popl %%ebx
\n
"
:
"=a"
(
samples
)
:
"a"
(
samples
),
"c"
(
dm_par
));
}
...
...
@@ -239,24 +246,26 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void
_M
(
stream_sample_1ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"pushl %%edx
\n
"
"movl $sqrt2_sse, %%edx
\n
"
"movss (%%edx), %%xmm7
\n
"
"shufps $0, %%xmm7, %%xmm7
\n
"
/* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"movl $64, %%ebx
\n
"
"movss (%%edx), %%xmm7
\n
"
"shufps $0, %%xmm7, %%xmm7
\n
"
/* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"movl $64, %%ebx
\n
"
".align 16
\n
"
".loop2:
\n
"
"mov
ups (%%ecx), %%xmm0
\n
"
/* c3 | c2 | c1 | c0 */
"mov
aps (%%ecx), %%xmm0
\n
"
/* c3 | c2 | c1 | c0 */
"mulps %%xmm7, %%xmm0
\n
"
"movhlps %%xmm0, %%xmm2
\n
"
/* c3 | c2 */
"movhlps %%xmm0, %%xmm2
\n
"
/* c3 | c2 */
"cvtps2pi %%xmm0, %%mm0
\n
"
/* c1 c0 --> mm0, int_32 */
"cvtps2pi %%xmm2, %%mm1
\n
"
/* c3 c2 --> mm1, int_32 */
"cvtps2pi %%xmm0, %%mm0
\n
"
/* c1 c0 --> mm0, int_32 */
"cvtps2pi %%xmm2, %%mm1
\n
"
/* c3 c2 --> mm1, int_32 */
"packssdw %%mm0, %%mm0
\n
"
/* c1 c1 c0 c0 --> mm0, int_16 */
"packssdw %%mm1, %%mm1
\n
"
/* c3 c3 c2 c2 --> mm1, int_16 */
"packssdw %%mm0, %%mm0
\n
"
/* c1 c1 c0 c0 --> mm0, int_16 */
"packssdw %%mm1, %%mm1
\n
"
/* c3 c3 c2 c2 --> mm1, int_16 */
"movq %%mm0, (%%eax)
\n
"
"movq %%mm1, 8(%%eax)
\n
"
...
...
@@ -275,18 +284,19 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
void
_M
(
stream_sample_2ch_to_s16
)
(
s16
*
s16_samples
,
float
*
left
,
float
*
right
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
"movl
$64, %%ebx
\n
"
".align 16
\n
"
".loop1:
\n
"
"mov
ups (%%ecx), %%xmm0
\n
"
/* l3 | l2 | l1 | l0 */
"mov
ups (%%edx), %%xmm1
\n
"
/* r3 | r2 | r1 | r0 */
"movhlps %%xmm0, %%xmm2
\n
"
/* l3 | l2 */
"movhlps %%xmm1, %%xmm3
\n
"
/* r3 | r2 */
"unpcklps %%xmm1, %%xmm0
\n
"
/* r1 | l1 | r0 | l0 */
"unpcklps %%xmm3, %%xmm2
\n
"
/* r3 | l3 | r2 | l2 */
"mov
aps (%%ecx), %%xmm0
\n
"
/* l3 | l2 | l1 | l0 */
"mov
aps (%%edx), %%xmm1
\n
"
/* r3 | r2 | r1 | r0 */
"movhlps %%xmm0, %%xmm2
\n
"
/* l3 | l2 */
"movhlps %%xmm1, %%xmm3
\n
"
/* r3 | r2 */
"unpcklps %%xmm1, %%xmm0
\n
"
/* r1 | l1 | r0 | l0 */
"unpcklps %%xmm3, %%xmm2
\n
"
/* r3 | l3 | r2 | l2 */
"cvtps2pi %%xmm0, %%mm0
\n
"
/* r0 l0 --> mm0, int_32 */
"movhlps %%xmm0, %%xmm0
\n
"
...
...
@@ -295,8 +305,8 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
"movhlps %%xmm2, %%xmm2
\n
"
"cvtps2pi %%xmm2, %%mm3
\n
"
/* r3 l3 --> mm3, int_32 */
"packssdw %%mm1, %%mm0
\n
"
/* r1 l1 r0 l0 --> mm0, int_16 */
"packssdw %%mm3, %%mm2
\n
"
/* r3 l3 r2 l2 --> mm2, int_16 */
"packssdw %%mm1, %%mm0
\n
"
/* r1 l1 r0 l0 --> mm0, int_16 */
"packssdw %%mm3, %%mm2
\n
"
/* r3 l3 r2 l2 --> mm2, int_16 */
"movq %%mm0, (%%eax)
\n
"
"movq %%mm2, 8(%%eax)
\n
"
...
...
plugins/imdct/ac3_imdct_3dn.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.
4 2001/06/03 12:47:21 sam
Exp $
* $Id: ac3_imdct_3dn.c,v 1.
5 2001/07/08 23:15:11 reno
Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
...
...
@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static
void
imdct512_pre_ifft_twiddle_3dn
(
const
int
*
pmt
,
complex_t
*
buf
,
float
*
data
,
float
*
xcos_sin_sse
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
...
...
@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl $128, -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
"movl (%%eax), %%esi
\n
"
"movd (%%ecx, %%esi, 8), %%mm1
\n
"
/* 2j */
...
...
@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
static
void
imdct512_post_ifft_twiddle_3dn
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl $64, %%ebx
\n
"
/* loop counter */
".align 16
\n
"
".loop1:
\n
"
"movq (%%eax), %%mm0
\n
"
/* im0 | re0 */
"movq %%mm0, %%mm1
\n
"
/* im0 | re0 */
...
...
@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
static
void
imdct512_window_delay_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
...
@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_samples:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
...
@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_samples:
\n
"
"movd (%%esi), %%mm0
\n
"
/* buf[i].re */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
...
@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movl $32, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delay:
\n
"
"movd (%%esi), %%mm0
\n
"
/* re0 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
...
@@ -339,6 +347,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delay:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
...
@@ -386,6 +395,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
static
void
imdct512_window_delay_nol_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
...
@@ -405,6 +415,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_samples2:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
...
@@ -439,6 +450,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_samples2:
\n
"
"movd (%%esi), %%mm0
\n
"
/* buf[i].re */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
...
@@ -478,6 +490,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"movl $32, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delays:
\n
"
"movd (%%esi), %%mm0
\n
"
/* re0 */
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
...
...
@@ -515,6 +528,7 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delays:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
"movd 8(%%esi), %%mm2
\n
"
/* im1 */
...
...
plugins/imdct/ac3_imdct_sse.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_imdct_sse.c: accelerated SSE ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.
3 2001/05/28 02:38:48 sam
Exp $
* $Id: ac3_imdct_sse.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
...
@@ -91,6 +91,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static
void
imdct512_pre_ifft_twiddle_sse
(
const
int
*
pmt
,
complex_t
*
buf
,
float
*
data
,
float
*
xcos_sin_sse
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
...
...
@@ -103,11 +104,12 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"pushl %%esi
\n
"
"movl 8(%%ebp), %%eax
\n
"
/* pmt */
"movl 12(%%ebp), %%ebx
\n
"
/* buf */
"movl 16(%%ebp), %%ecx
\n
"
/* data */
"movl 12(%%ebp), %%ebx
\n
"
/* buf */
"movl 16(%%ebp), %%ecx
\n
"
/* data */
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl $64, -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
"movl (%%eax), %%esi
\n
"
"movl 4(%%eax), %%edi
\n
"
...
...
@@ -117,18 +119,18 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"shll $1, %%esi
\n
"
"shll $1, %%edi
\n
"
"mov
u
ps (%%edx, %%esi, 8), %%xmm0
\n
"
/* -c_j | -s_j | -s_j | c_j */
"mov
u
ps (%%edx, %%edi, 8), %%xmm2
\n
"
/* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
"mov
a
ps (%%edx, %%esi, 8), %%xmm0
\n
"
/* -c_j | -s_j | -s_j | c_j */
"mov
a
ps (%%edx, %%edi, 8), %%xmm2
\n
"
/* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
"negl %%esi
\n
"
"negl %%edi
\n
"
"movss 1020(%%ecx, %%esi, 4), %%xmm4
\n
"
/* 255-2j */
"addl $8, %%eax
\n
"
"addl
$8, %%eax
\n
"
"movss 1020(%%ecx, %%edi, 4), %%xmm5
\n
"
/* 255-2(j+1) */
"shufps $0, %%xmm1, %%xmm4
\n
"
/* 2j | 2j | 255-2j | 255-2j */
"shufps $0, %%xmm3, %%xmm5
\n
"
/* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
"shufps
$0, %%xmm1, %%xmm4
\n
"
/* 2j | 2j | 255-2j | 255-2j */
"shufps
$0, %%xmm3, %%xmm5
\n
"
/* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
"mulps %%xmm4, %%xmm0
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"movhlps %%xmm0, %%xmm1
\n
"
...
...
@@ -138,9 +140,9 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"addps %%xmm3, %%xmm2
\n
"
"movlhps %%xmm2, %%xmm0
\n
"
"mov
u
ps %%xmm0, -16(%%ebx)
\n
"
"decl -4(%%ebp)
\n
"
"jnz
.loop
\n
"
"mov
a
ps %%xmm0, -16(%%ebx)
\n
"
"decl
-4(%%ebp)
\n
"
"jnz
.loop
\n
"
"popl %%esi
\n
"
"popl %%edi
\n
"
...
...
@@ -157,36 +159,38 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
static
void
imdct512_post_ifft_twiddle_sse
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movl
$32, %%ebx
\n
"
/* loop counter */
"movl
$32, %%ebx
\n
"
/* loop counter */
".align 16
\n
"
".loop1:
\n
"
"mov
ups (%%eax), %%xmm0
\n
"
/* im1 | re1 | im0 | re0 */
"mov
aps (%%eax), %%xmm0
\n
"
/* im1 | re1 | im0 | re0 */
"mov
ups
(%%ecx), %%xmm2
\n
"
/* -c | -s | -s | c */
"movhlps
%%xmm0, %%xmm1
\n
"
/* im1 | re1 */
"mov
ups
16(%%ecx), %%xmm3
\n
"
/* -c1 | -s1 | -s1 | c1 */
"mov
aps
(%%ecx), %%xmm2
\n
"
/* -c | -s | -s | c */
"movhlps %%xmm0, %%xmm1
\n
"
/* im1 | re1 */
"mov
aps
16(%%ecx), %%xmm3
\n
"
/* -c1 | -s1 | -s1 | c1 */
"shufps $0x50, %%xmm0, %%xmm0
\n
"
/* im0 | im0 | re0 | re0 */
"shufps $0x50, %%xmm1, %%xmm1
\n
"
/* im1 | im1 | re1 | re1 */
"shufps $0x50, %%xmm0, %%xmm0
\n
"
/* im0 | im0 | re0 | re0 */
"shufps $0x50, %%xmm1, %%xmm1
\n
"
/* im1 | im1 | re1 | re1 */
"mov
ups 16(%%eax), %%xmm4
\n
"
/* im3 | re3 | im2 | re2 */
"mov
aps 16(%%eax), %%xmm4
\n
"
/* im3 | re3 | im2 | re2 */
"shufps
$0x27, %%xmm2, %%xmm2
\n
"
/* c | -s | -s | -c */
"movhlps
%%xmm4, %%xmm5
\n
"
/* im3 | re3 */
"shufps
$0x27, %%xmm3, %%xmm3
\n
"
/* c1 | -s1 | -s1 | -c1 */
"shufps
$0x27, %%xmm2, %%xmm2
\n
"
/* c | -s | -s | -c */
"movhlps %%xmm4, %%xmm5
\n
"
/* im3 | re3 */
"shufps
$0x27, %%xmm3, %%xmm3
\n
"
/* c1 | -s1 | -s1 | -c1 */
"mov
ups
32(%%ecx), %%xmm6
\n
"
/* -c2 | -s2 | -s2 | c2 */
"mov
ups
48(%%ecx), %%xmm7
\n
"
/* -c3 | -s3 | -s3 | c3 */
"mov
aps
32(%%ecx), %%xmm6
\n
"
/* -c2 | -s2 | -s2 | c2 */
"mov
aps
48(%%ecx), %%xmm7
\n
"
/* -c3 | -s3 | -s3 | c3 */
"shufps $0x50, %%xmm4, %%xmm4
\n
"
/* im2 | im2 | re2 | re2 */
"shufps $0x50, %%xmm5, %%xmm5
\n
"
/* im3 | im3 | re3 | re3 */
"shufps $0x50, %%xmm4, %%xmm4
\n
"
/* im2 | im2 | re2 | re2 */
"shufps $0x50, %%xmm5, %%xmm5
\n
"
/* im3 | im3 | re3 | re3 */
"mulps %%xmm2, %%xmm0
\n
"
"mulps %%xmm3, %%xmm1
\n
"
"shufps $0x27, %%xmm6, %%xmm6
\n
"
/* c2 | -s2 | -s2 | -c2 */
"shufps $0x27, %%xmm7, %%xmm7
\n
"
/* c3 | -s3 | -s3 | -c3 */
"shufps $0x27, %%xmm6, %%xmm6
\n
"
/* c2 | -s2 | -s2 | -c2 */
"shufps $0x27, %%xmm7, %%xmm7
\n
"
/* c3 | -s3 | -s3 | -c3 */
"movhlps %%xmm0, %%xmm2
\n
"
"movhlps %%xmm1, %%xmm3
\n
"
...
...
@@ -206,8 +210,8 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
"movlhps %%xmm1, %%xmm0
\n
"
"movlhps %%xmm5, %%xmm4
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm4, 16(%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm4, 16(%%eax)
\n
"
"addl $64, %%ecx
\n
"
"addl $32, %%eax
\n
"
"decl %%ebx
\n
"
...
...
@@ -221,6 +225,7 @@ static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
static
void
imdct512_window_delay_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
...
@@ -240,6 +245,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_samples:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
...
@@ -250,7 +256,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"mov
u
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"mov
a
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
...
...
@@ -261,27 +267,28 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"addps %%xmm5, %%xmm0
\n
"
"addps
%%xmm5, %%xmm0
\n
"
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"mov
u
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"subps
%%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"addl $32, %%ebx
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%eax
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"mov
a
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"subps
%%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl
$32, %%edx
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addl
$32, %%ebx
\n
"
"mulps
%%xmm4, %%xmm6
\n
"
"addl
$32, %%esi
\n
"
"addl
$32, %%eax
\n
"
"addps
%%xmm5, %%xmm6
\n
"
"addl
$-32, %%edi
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl
%%ecx
\n
"
"jnz .first_128_samples
\n
"
"movl 8(%%ebp), %%esi
\n
"
/* buf[0].re */
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_samples:
\n
"
"movss (%%esi), %%xmm0
\n
"
/* buf[i].re */
"movss 8(%%esi), %%xmm2
\n
"
/* re1 */
...
...
@@ -292,7 +299,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"mov
u
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"mov
a
ps (%%ebx), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
...
...
@@ -305,19 +312,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0
\n
"
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"addl $32, %%esi
\n
"
"subps
%%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addps %%xmm5, %%xmm0
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"mov
u
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%ebx
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"addl
$32, %%esi
\n
"
"subps
%%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addps
%%xmm5, %%xmm0
\n
"
"mulps
%%xmm4, %%xmm6
\n
"
"addl
$-32, %%edi
\n
"
"mov
a
ps 16(%%ebx), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addps
%%xmm5, %%xmm6
\n
"
"addl
$32, %%edx
\n
"
"addl
$32, %%eax
\n
"
"addl
$32, %%ebx
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl
%%ecx
\n
"
"jnz .second_128_samples
\n
"
"movl 8(%%ebp), %%eax
\n
"
...
...
@@ -326,6 +333,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movl $16, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delay:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
...
@@ -341,21 +349,21 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movss 24(%%esi), %%xmm7
\n
"
/* re3 */
"movss -16(%%edi), %%xmm2
\n
"
/* im2 */
"movss -24(%%edi), %%xmm3
\n
"
/* im3 */
"subps
%%xmm1, %%xmm0
\n
"
/* -im1 | re1 | -im0 | re0 */
"addl $-32, %%edx
\n
"
"subps
%%xmm1, %%xmm0
\n
"
/* -im1 | re1 | -im0 | re0 */
"addl
$-32, %%edx
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"subps
%%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addl
$32, %%esi
\n
"
"subps
%%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl
$-32, %%edi
\n
"
"mulps
%%xmm5, %%xmm6
\n
"
"addl
$32, %%eax
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl
%%ecx
\n
"
"jnz .first_128_delay
\n
"
"movl 8(%%ebp), %%ebx
\n
"
...
...
@@ -363,6 +371,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delay:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
...
@@ -378,21 +387,21 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movss 24(%%esi), %%xmm7
\n
"
/* im3 */
"movss -16(%%edi), %%xmm2
\n
"
/* re2 */
"movss -24(%%edi), %%xmm3
\n
"
/* re3 */
"subps
%%xmm0, %%xmm1
\n
"
/* re1 | -im1 | re0 | -im0 */
"addl $-32, %%edx
\n
"
"subps
%%xmm0, %%xmm1
\n
"
/* re1 | -im1 | re0 | -im0 */
"addl
$-32, %%edx
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"mov
u
ps %%xmm1, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"subps
%%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm2, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"mov
a
ps %%xmm1, (%%eax)
\n
"
"addl
$32, %%esi
\n
"
"subps
%%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"addl
$-32, %%edi
\n
"
"mulps
%%xmm5, %%xmm2
\n
"
"addl
$32, %%eax
\n
"
"mov
a
ps %%xmm2, -16(%%eax)
\n
"
"decl
%%ecx
\n
"
"jnz .second_128_delay
\n
"
"popl %%edi
\n
"
...
...
@@ -409,6 +418,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
static
void
imdct512_window_delay_nol_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
...
@@ -428,6 +438,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
".align 16
\n
"
".first_128_sample:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
...
@@ -438,7 +449,6 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
...
...
@@ -446,30 +456,27 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"subps %%xmm1, %%xmm0
\n
"
/* -re1 | im1 | -re0 | im0 */
"movss -16(%%edi), %%xmm2
\n
"
/* re2 */
"movss -24(%%edi), %%xmm3
\n
"
/* re3 */
"mulps %%xmm4, %%xmm0
\n
"
"mulps
%%xmm4, %%xmm0
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | re3 | 0.0 | re2 */
/* addps %%xmm5, %%xmm0 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx
\n
"
"movups %%xmm0, (%%eax)
\n
"
/* addl $32, %%ebx */
"mulps %%xmm4, %%xmm6
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%eax
\n
"
/* addps %%xmm5, %%xmm6 */
"addl $-32, %%edi
\n
"
"movups %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx
\n
"
"movaps %%xmm0, (%%eax)
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%eax
\n
"
"addl $-32, %%edi
\n
"
"movaps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"jnz .first_128_sample
\n
"
"movl 8(%%ebp), %%esi
\n
"
/* buf[0].re */
"leal 1020(%%esi), %%edi
\n
"
/* buf[127].im */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_sample:
\n
"
"movss (%%esi), %%xmm0
\n
"
/* buf[i].re */
"movss 8(%%esi), %%xmm2
\n
"
/* re1 */
...
...
@@ -480,32 +487,27 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
/* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
"movss 24(%%esi), %%xmm7
\n
"
/* re3 */
"movss -16(%%edi), %%xmm2
\n
"
/* im2 */
"movss -24(%%edi), %%xmm3
\n
"
/* im3 */
"subps
%%xmm1, %%xmm0
\n
"
/* -im1 | re1 | -im0 | re0 */
"subps
%%xmm1, %%xmm0
\n
"
/* -im1 | re1 | -im0 | re0 */
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0
\n
"
"mulps
%%xmm4, %%xmm0
\n
"
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"addl $32, %%esi
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
/* addps %%xmm5, %%xmm0 */
"mulps %%xmm4, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
/* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
"movups %%xmm0, (%%eax)
\n
"
/* addps %%xmm5, %%xmm6 */
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
/* addl $32, %%ebx */
"movups %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"mulps %%xmm4, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"movaps %%xmm0, (%%eax)
\n
"
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
"movaps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"jnz .second_128_sample
\n
"
"movl 8(%%ebp), %%eax
\n
"
...
...
@@ -514,6 +516,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movl $16, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
".align 16
\n
"
".first_128_delays:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
...
@@ -530,20 +533,20 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movss -16(%%edi), %%xmm2
\n
"
/* im2 */
"movss -24(%%edi), %%xmm3
\n
"
/* im3 */
"subps %%xmm1, %%xmm0
\n
"
/* -im1 | re1 | -im0 | re0 */
"addl $-32, %%edx
\n
"
"addl
$-32, %%edx
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0
\n
"
"mulps
%%xmm4, %%xmm0
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"subps
%%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm6, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"addl
$32, %%esi
\n
"
"subps
%%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl
$-32, %%edi
\n
"
"mulps
%%xmm5, %%xmm6
\n
"
"addl
$32, %%eax
\n
"
"mov
a
ps %%xmm6, -16(%%eax)
\n
"
"decl
%%ecx
\n
"
"jnz .first_128_delays
\n
"
"movl 8(%%ebp), %%ebx
\n
"
...
...
@@ -551,6 +554,7 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $16, %%ecx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delays:
\n
"
"movss (%%esi), %%xmm0
\n
"
"movss 8(%%esi), %%xmm2
\n
"
...
...
@@ -566,21 +570,21 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"movss 24(%%esi), %%xmm7
\n
"
/* im3 */
"movss -16(%%edi), %%xmm2
\n
"
/* re2 */
"movss -24(%%edi), %%xmm3
\n
"
/* re3 */
"subps
%%xmm0, %%xmm1
\n
"
/* re1 | -im1 | re0 | -im0 */
"addl $-32, %%edx
\n
"
"subps
%%xmm0, %%xmm1
\n
"
/* re1 | -im1 | re0 | -im0 */
"addl
$-32, %%edx
\n
"
"movlhps %%xmm7, %%xmm6
\n
"
/* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2
\n
"
/* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1
\n
"
"mulps
%%xmm4, %%xmm1
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"mov
u
ps %%xmm1, (%%eax)
\n
"
"addl $32, %%esi
\n
"
"subps
%%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"addl $32, %%eax
\n
"
"mov
u
ps %%xmm2, -16(%%eax)
\n
"
"decl %%ecx
\n
"
"mov
a
ps %%xmm1, (%%eax)
\n
"
"addl
$32, %%esi
\n
"
"subps
%%xmm6, %%xmm2
\n
"
/* re | -im3 | re | -im2 */
"addl
$-32, %%edi
\n
"
"mulps
%%xmm5, %%xmm2
\n
"
"addl
$32, %%eax
\n
"
"mov
a
ps %%xmm2, -16(%%eax)
\n
"
"decl
%%ecx
\n
"
"jnz .second_128_delays
\n
"
"popl %%edi
\n
"
...
...
plugins/imdct/ac3_srfft_3dn.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_3dn.c,v 1.
1 2001/05/16 14:51:29
reno Exp $
* $Id: ac3_srfft_3dn.c,v 1.
2 2001/07/08 23:15:11
reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
...
...
@@ -126,6 +126,7 @@ void C_1_3dn (void)
static
void
fft_4_3dn
(
complex_t
*
x
)
{
__asm__
__volatile__
(
".align 16
\n
"
"movq (%%eax), %%mm0
\n
"
/* x[0] */
"movq 8(%%eax), %%mm1
\n
"
/* x[1] */
"movq 16(%%eax), %%mm2
\n
"
/* x[2] */
...
...
plugins/imdct/ac3_srfft_sse.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_srfft_sse.c,v 1.
3 2001/07/01 08:49:09 gbazin
Exp $
* $Id: ac3_srfft_sse.c,v 1.
4 2001/07/08 23:15:11 reno
Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
...
@@ -106,44 +106,45 @@ void _M( fft_128p ) ( complex_t *a )
void
hsqrt2_sse
(
void
)
{
__asm__
(
".float 0f0.707106781188
\n
"
".float 0f0.707106781188
\n
"
".float 0f-0.707106781188
\n
"
".float 0f-0.707106781188
\n
"
);
__asm__
__volatile__
(
".float 0f0.707106781188
\n
"
".float 0f0.707106781188
\n
"
".float 0f-0.707106781188
\n
"
".float 0f-0.707106781188
\n
"
);
}
void
C_1_sse
(
void
)
{
__asm__
(
".float 0f-1.0
\n
"
".float 0f1.0
\n
"
".float 0f-1.0
\n
"
".float 0f1.0
\n
"
);
__asm__
__volatile__
(
".float 0f-1.0
\n
"
".float 0f1.0
\n
"
".float 0f-1.0
\n
"
".float 0f1.0
\n
"
);
}
static
void
fft_4_sse
(
complex_t
*
x
)
{
__asm__
__volatile__
(
"movups (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
"movups 16(%%eax), %%xmm2
\n
"
/* x[3] | x[2] */
"movups %%xmm0, %%xmm1
\n
"
/* x[1] | x[0] */
"addps %%xmm2, %%xmm0
\n
"
/* x[1] + x[3] | x[0] + x[2] */
"subps %%xmm2, %%xmm1
\n
"
/* x[1] - x[3] | x[0] - x[2] */
".align 16
\n
"
"movaps (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
"movaps 16(%%eax), %%xmm2
\n
"
/* x[3] | x[2] */
"movaps %%xmm0, %%xmm1
\n
"
/* x[1] | x[0] */
"addps %%xmm2, %%xmm0
\n
"
/* x[1] + x[3] | x[0] + x[2] */
"subps %%xmm2, %%xmm1
\n
"
/* x[1] - x[3] | x[0] - x[2] */
"xorps %%xmm6, %%xmm6
\n
"
"movhlps %%xmm1, %%xmm4
\n
"
/* ? | x[1] - x[3] */
"movhlps %%xmm0, %%xmm3
\n
"
/* ? | x[1] + x[3] */
"subss %%xmm4, %%xmm6
\n
"
/* 0 | -(x[1] - x[3]).re */
"movlhps %%xmm1, %%xmm0
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"movlhps %%xmm6, %%xmm4
\n
"
/* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
"mov
ups %%xmm0, %%xmm2
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"movhlps %%xmm1, %%xmm4
\n
"
/* ? | x[1] - x[3] */
"movhlps %%xmm0, %%xmm3
\n
"
/* ? | x[1] + x[3] */
"subss %%xmm4, %%xmm6
\n
"
/* 0 | -(x[1] - x[3]).re */
"movlhps %%xmm1, %%xmm0
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"movlhps %%xmm6, %%xmm4
\n
"
/* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
"mov
aps %%xmm0, %%xmm2
\n
"
/* x[0] - x[2] | x[0] + x[2] */
"shufps $0x94, %%xmm4, %%xmm3
\n
"
/* i*(x[1] - x[3]) | x[1] + x[3] */
"addps %%xmm3, %%xmm0
\n
"
"subps %%xmm3, %%xmm2
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm2, 16(%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm2, 16(%%eax)
\n
"
:
"=a"
(
x
)
:
"a"
(
x
)
);
}
...
...
@@ -151,62 +152,63 @@ static void fft_4_sse (complex_t *x)
static
void
fft_8_sse
(
complex_t
*
x
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebx
\n
"
"movlps (%%eax), %%xmm0
\n
"
/* x[0] */
"movlps 32(%%eax), %%xmm1
\n
"
/* x[4] */
"movhps 16(%%eax), %%xmm0
\n
"
/* x[2] | x[0] */
"movhps 48(%%eax), %%xmm1
\n
"
/* x[6] | x[4] */
"mov
ups %%xmm0, %%xmm2
\n
"
/* x[2] | x[0] */
"mov
aps %%xmm0, %%xmm2
\n
"
/* x[2] | x[0] */
"xorps %%xmm3, %%xmm3
\n
"
"addps %%xmm1, %%xmm0
\n
"
/* x[2] + x[6] | x[0] + x[4] */
"subps %%xmm1, %%xmm2
\n
"
/* x[2] - x[6] | x[0] - x[4] */
"movhlps %%xmm0, %%xmm5
\n
"
/* x[2] + x[6] */
"addps %%xmm1, %%xmm0
\n
"
/* x[2] + x[6] | x[0] + x[4] */
"subps %%xmm1, %%xmm2
\n
"
/* x[2] - x[6] | x[0] - x[4] */
"movhlps %%xmm0, %%xmm5
\n
"
/* x[2] + x[6] */
"movhlps %%xmm2, %%xmm4
\n
"
/* x[2] - x[6] */
"movlhps %%xmm2, %%xmm0
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"subss %%xmm4, %%xmm3
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"mov
ups %%xmm0, %%xmm7
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"mov
ups %%xmm3, %%xmm4
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"movlps 8(%%eax), %%xmm1
\n
"
/* x[1] */
"movlhps %%xmm2, %%xmm0
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"subss %%xmm4, %%xmm3
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"mov
aps %%xmm0, %%xmm7
\n
"
/* x[0] - x[4] | x[0] + x[4] */
"mov
aps %%xmm3, %%xmm4
\n
"
/* (x[2]-x[6]).im | -(x[2]-x[6]).re */
"movlps 8(%%eax), %%xmm1
\n
"
/* x[1] */
"shufps $0x14, %%xmm4, %%xmm5
\n
"
/* i*(x[2] - x[6]) | x[2] + x[6] */
"addps %%xmm5, %%xmm0
\n
"
/* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
"subps %%xmm5, %%xmm7
\n
"
/* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
"addps %%xmm5, %%xmm0
\n
"
/* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
"subps %%xmm5, %%xmm7
\n
"
/* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
"movhps 24(%%eax), %%xmm1
\n
"
/* x[3] | x[1] */
"movl $hsqrt2_sse, %%ebx
\n
"
"movlps 40(%%eax), %%xmm2
\n
"
/* x[5] */
"movhps 56(%%eax), %%xmm2
\n
"
/* x[7] | x[5] */
"mov
ups %%xmm1, %%xmm3
\n
"
/* x[3] | x[1] */
"addps %%xmm2, %%xmm1
\n
"
/* x[3] + x[7] | x[1] + x[5] */
"subps %%xmm2, %%xmm3
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"movups (%%ebx), %%xmm4
\n
"
/* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
"mov
ups %%xmm3, %%xmm6
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"mov
aps %%xmm1, %%xmm3
\n
"
/* x[3] | x[1] */
"addps %%xmm2, %%xmm1
\n
"
/* x[3] + x[7] | x[1] + x[5] */
"subps %%xmm2, %%xmm3
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"movups (%%ebx), %%xmm4
\n
"
/* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
"mov
aps %%xmm3, %%xmm6
\n
"
/* x[3] - x[7] | x[1] - x[5] */
"mulps %%xmm4, %%xmm3
\n
"
/* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
"shufps $0xc8, %%xmm4, %%xmm4
\n
"
/* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
"shufps $0xb1, %%xmm6, %%xmm6
\n
"
/* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
"mulps %%xmm4, %%xmm6
\n
"
/* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
"addps %%xmm3, %%xmm6
\n
"
/* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
"movhlps %%xmm1, %%xmm5
\n
"
/* x[3] + x[7] */
"movlhps %%xmm6, %%xmm1
\n
"
/* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"shufps $0xe4, %%xmm6, %%xmm5
\n
"
/* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
"mov
ups %%xmm1, %%xmm3
\n
"
/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"addps %%xmm3, %%xmm6
\n
"
/* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
"movhlps %%xmm1, %%xmm5
\n
"
/* x[3] + x[7] */
"movlhps %%xmm6, %%xmm1
\n
"
/* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"shufps $0xe4, %%xmm6, %%xmm5
\n
"
/* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
"mov
aps %%xmm1, %%xmm3
\n
"
/* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
"movl $C_1_sse, %%ebx
\n
"
"addps %%xmm5, %%xmm1
\n
"
/* u */
"subps %%xmm5, %%xmm3
\n
"
/* v */
"mov
ups %%xmm0, %%xmm2
\n
"
/* yb */
"mov
ups %%xmm7, %%xmm4
\n
"
/* yt */
"addps %%xmm5, %%xmm1
\n
"
/* u */
"subps %%xmm5, %%xmm3
\n
"
/* v */
"mov
aps %%xmm0, %%xmm2
\n
"
/* yb */
"mov
aps %%xmm7, %%xmm4
\n
"
/* yt */
"movups (%%ebx), %%xmm5
\n
"
"mulps %%xmm5, %%xmm3
\n
"
"addps %%xmm1, %%xmm0
\n
"
/* yt + u */
"subps %%xmm1, %%xmm2
\n
"
/* yt - u */
"addps %%xmm1, %%xmm0
\n
"
/* yt + u */
"subps %%xmm1, %%xmm2
\n
"
/* yt - u */
"shufps $0xb1, %%xmm3, %%xmm3
\n
"
/* -i * v */
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm2, 32(%%eax)
\n
"
"addps %%xmm3, %%xmm4
\n
"
/* yb - i*v */
"subps %%xmm3, %%xmm7
\n
"
/* yb + i*v */
"mov
u
ps %%xmm4, 16(%%eax)
\n
"
"mov
u
ps %%xmm7, 48(%%eax)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm2, 32(%%eax)
\n
"
"addps %%xmm3, %%xmm4
\n
"
/* yb - i*v */
"subps %%xmm3, %%xmm7
\n
"
/* yb + i*v */
"mov
a
ps %%xmm4, 16(%%eax)
\n
"
"mov
a
ps %%xmm7, 48(%%eax)
\n
"
"popl %%ebx
\n
"
:
"=a"
(
x
)
...
...
@@ -218,6 +220,7 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
const
complex_t
*
d
,
const
complex_t
*
d_3
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
...
...
@@ -225,10 +228,11 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%ecx
\n
"
//
"pushl %%edx
\n
"
"pushl %%esi
\n
"
"pushl %%edi
\n
"
// "movl %%edi, %%ecx\n" /* k */
"pushl %%edi
\n
"
//
"movl 8(%%ebp), %%ecx
\n
"
/* k */
"movl 12(%%ebp), %%eax
\n
"
/* x */
...
...
@@ -236,19 +240,20 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movl 16(%%ebp), %%ebx
\n
"
/* wT */
"movl 20(%%ebp), %%edx
\n
"
/* d */
"movl 24(%%ebp), %%esi
\n
"
/* d3 */
"shll $4, %%ecx
\n
"
/* 16k */
"addl $8, %%edx
\n
"
"shll $4, %%ecx
\n
"
/* 16k */
///
"addl $8, %%edx
\n
"
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"addl $8, %%esi
\n
"
/* TRANSZERO and TRANS */
"movups (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
"movups (%%ebx), %%xmm1
\n
"
/* wT[1] | wT[0] */
"movups (%%ebx, %%ecx), %%xmm2
\n
"
/* wB[1] | wB[0] */
"movlps (%%edx), %%xmm3
\n
"
/* d */
"movlps (%%esi), %%xmm4
\n
"
/* d3 */
"movhlps %%xmm1, %%xmm5
\n
"
/* wT[1] */
"movhlps %%xmm2, %%xmm6
\n
"
/* wB[1] */
".align 16
\n
"
"movaps (%%eax), %%xmm0
\n
"
/* x[1] | x[0] */
"movaps (%%ebx), %%xmm1
\n
"
/* wT[1] | wT[0] */
"movaps (%%ebx, %%ecx), %%xmm2
\n
"
/* wB[1] | wB[0] */
"movlps (%%edx), %%xmm3
\n
"
/* d */
"movlps (%%esi), %%xmm4
\n
"
/* d3 */
"movhlps %%xmm1, %%xmm5
\n
"
/* wT[1] */
"movhlps %%xmm2, %%xmm6
\n
"
/* wB[1] */
"shufps $0x50, %%xmm3, %%xmm3
\n
"
/* d[1].im | d[1].im | d[1].re | d[1].re */
"shufps $0x50, %%xmm4, %%xmm4
\n
"
/* d3[1].im | d3[1].im | d3[i].re | d3[i].re */
"movlhps %%xmm5, %%xmm5
\n
"
/* wT[1] | wT[1] */
...
...
@@ -259,40 +264,41 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movlhps %%xmm6, %%xmm5
\n
"
/* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7
\n
"
/* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movl $C_1_sse, %%edi
\n
"
"mov
u
ps (%%edi), %%xmm4
\n
"
"mov
a
ps (%%edi), %%xmm4
\n
"
"mulps %%xmm4, %%xmm7
\n
"
"addps %%xmm7, %%xmm5
\n
"
/* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1
\n
"
/* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2
\n
"
/* d3[1] * wB[1] | wB[0] */
"mov
u
ps %%xmm1, %%xmm3
\n
"
/* d[1] * wT[1] | wT[0] */
"mov
a
ps %%xmm1, %%xmm3
\n
"
/* d[1] * wT[1] | wT[0] */
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"addps %%xmm2, %%xmm1
\n
"
/* u */
"subps %%xmm2, %%xmm3
\n
"
/* v */
"mulps %%xmm4, %%xmm3
\n
"
"mov
u
ps (%%eax, %%ecx), %%xmm5
\n
"
/* xk[1] | xk[0] */
"mov
a
ps (%%eax, %%ecx), %%xmm5
\n
"
/* xk[1] | xk[0] */
"shufps $0xb1, %%xmm3, %%xmm3
\n
"
/* -i * v */
"mov
ups %%xmm0, %%xmm2
\n
"
/* x[1] | x[0] */
"mov
ups %%xmm5, %%xmm6
\n
"
/* xk[1] | xk[0] */
"mov
aps %%xmm0, %%xmm2
\n
"
/* x[1] | x[0] */
"mov
aps %%xmm5, %%xmm6
\n
"
/* xk[1] | xk[0] */
"addps %%xmm1, %%xmm0
\n
"
"subps %%xmm1, %%xmm2
\n
"
"addps %%xmm3, %%xmm5
\n
"
"subps %%xmm3, %%xmm6
\n
"
"mov
u
ps %%xmm0, (%%eax)
\n
"
"mov
u
ps %%xmm2, (%%edi)
\n
"
"mov
u
ps %%xmm5, (%%eax, %%ecx)
\n
"
"mov
u
ps %%xmm6, (%%edi, %%ecx)
\n
"
"mov
a
ps %%xmm0, (%%eax)
\n
"
"mov
a
ps %%xmm2, (%%edi)
\n
"
"mov
a
ps %%xmm5, (%%eax, %%ecx)
\n
"
"mov
a
ps %%xmm6, (%%edi, %%ecx)
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%ebx
\n
"
"addl $8, %%edx
\n
"
"addl $8, %%esi
\n
"
"decl -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
"mov
u
ps (%%ebx), %%xmm0
\n
"
/* wT[1] | wT[0] */
"mov
u
ps (%%edx), %%xmm1
\n
"
/* d[1] | d[0] */
"mov
a
ps (%%ebx), %%xmm0
\n
"
/* wT[1] | wT[0] */
"mov
a
ps (%%edx), %%xmm1
\n
"
/* d[1] | d[0] */
"mov
u
ps (%%ebx, %%ecx), %%xmm4
\n
"
/* wB[1] | wB[0] */
"mov
u
ps (%%esi), %%xmm5
\n
"
/* d3[1] | d3[0] */
"mov
a
ps (%%ebx, %%ecx), %%xmm4
\n
"
/* wB[1] | wB[0] */
"mov
a
ps (%%esi), %%xmm5
\n
"
/* d3[1] | d3[0] */
"movhlps %%xmm0, %%xmm2
\n
"
/* wT[1] */
"movhlps %%xmm1, %%xmm3
\n
"
/* d[1] */
...
...
@@ -317,50 +323,51 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
"movlhps %%xmm2, %%xmm0
\n
"
/* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
"mulps %%xmm5, %%xmm4
\n
"
/* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6
\n
"
/* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1
\n
"
/* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"shufps $0xb1, %%xmm2, %%xmm1
\n
"
/* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movl $C_1_sse, %%edi
\n
"
"mov
u
ps (%%edi), %%xmm3
\n
"
/* 1.0 | -1.0 | 1.0 | -1.0 */
"mov
a
ps (%%edi), %%xmm3
\n
"
/* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5
\n
"
/* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"mulps %%xmm3, %%xmm1
\n
"
/* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
"movlhps %%xmm6, %%xmm4
\n
"
/* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].im * d3[0].re */
"addps %%xmm1, %%xmm0
\n
"
/* wT[1] * d[1] | wT[0] * d[0] */
"shufps $0xb1, %%xmm6, %%xmm5
\n
"
/* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
"shufps $0xb1, %%xmm6, %%xmm5
\n
"
/* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
"mulps %%xmm3, %%xmm5
\n
"
/* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
"addps %%xmm5, %%xmm4
\n
"
/* wB[1] * d3[1] | wB[0] * d3[0] */
"mov
u
ps %%xmm0, %%xmm1
\n
"
/* wT[1] * d[1] | wT[0] * d[0] */
"mov
a
ps %%xmm0, %%xmm1
\n
"
/* wT[1] * d[1] | wT[0] * d[0] */
"addps %%xmm4, %%xmm0
\n
"
/* u */
"subps %%xmm4, %%xmm1
\n
"
/* v */
"mov
u
ps (%%eax), %%xmm6
\n
"
/* x[1] | x[0] */
"mov
a
ps (%%eax), %%xmm6
\n
"
/* x[1] | x[0] */
"leal (%%eax, %%ecx, 2), %%edi
\n
"
"mulps %%xmm3, %%xmm1
\n
"
"addl $16, %%ebx
\n
"
"addl $16, %%esi
\n
"
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* -i * v */
"mov
u
ps (%%eax, %%ecx), %%xmm7
\n
"
/* xk[1] | xk[0] */
"mov
u
ps %%xmm6, %%xmm2
\n
"
"mov
u
ps %%xmm7, %%xmm4
\n
"
"mov
a
ps (%%eax, %%ecx), %%xmm7
\n
"
/* xk[1] | xk[0] */
"mov
a
ps %%xmm6, %%xmm2
\n
"
"mov
a
ps %%xmm7, %%xmm4
\n
"
"addps %%xmm0, %%xmm6
\n
"
"subps %%xmm0, %%xmm2
\n
"
"mov
u
ps %%xmm6, (%%eax)
\n
"
"mov
u
ps %%xmm2, (%%edi)
\n
"
"mov
a
ps %%xmm6, (%%eax)
\n
"
"mov
a
ps %%xmm2, (%%edi)
\n
"
"addps %%xmm1, %%xmm7
\n
"
"subps %%xmm1, %%xmm4
\n
"
"addl $16, %%edx
\n
"
"mov
u
ps %%xmm7, (%%eax, %%ecx)
\n
"
"mov
u
ps %%xmm4, (%%edi, %%ecx)
\n
"
"mov
a
ps %%xmm7, (%%eax, %%ecx)
\n
"
"mov
a
ps %%xmm4, (%%edi, %%ecx)
\n
"
"addl $16, %%eax
\n
"
"decl -4(%%ebp)
\n
"
"jnz .loop
\n
"
".align 16
\n
"
".end:
\n
"
"popl %%edi
\n
"
"popl %%edi
\n
"
//
"popl %%esi
\n
"
"popl %%edx
\n
"
"popl %%ecx
\n
"
"popl %%ecx
\n
"
//
"popl %%ebx
\n
"
"popl %%eax
\n
"
...
...
src/ac3_decoder/ac3_decoder.h
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_decoder.h : ac3 decoder interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder.h,v 1.1
0 2001/06/12 00:30:4
1 reno Exp $
* $Id: ac3_decoder.h,v 1.1
1 2001/07/08 23:15:1
1 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
...
...
@@ -354,6 +354,9 @@ typedef struct mantissa_s
struct
ac3dec_s
{
float
samples
[
6
][
256
]
__attribute__
((
aligned
(
16
)));
imdct_t
imdct
__attribute__
((
aligned
(
16
)));
/*
* Input properties
*/
...
...
@@ -370,12 +373,10 @@ struct ac3dec_s
bsi_t
bsi
;
audblk_t
audblk
;
float
samples
[
6
][
256
]
__attribute__
((
aligned
(
16
)));
dm_par_t
dm_par
;
bit_allocate_t
bit_allocate
;
mantissa_t
mantissa
;
imdct_t
imdct
;
downmix_t
downmix
;
};
...
...
src/ac3_decoder/ac3_decoder_thread.c
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_decoder_thread.c: ac3 decoder thread
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.c,v 1.3
4 2001/05/31 01:37:08 sam
Exp $
* $Id: ac3_decoder_thread.c,v 1.3
5 2001/07/08 23:15:11 reno
Exp $
*
* Authors: Michel Lespinasse <walken@zoy.org>
*
...
...
@@ -82,7 +82,13 @@ vlc_thread_t ac3dec_CreateThread( adec_config_t * p_config )
intf_DbgMsg
(
"ac3dec debug: creating ac3 decoder thread"
);
/* Allocate the memory needed to store the thread's structure */
if
((
p_ac3thread
=
(
ac3dec_thread_t
*
)
malloc
(
sizeof
(
ac3dec_thread_t
)))
==
NULL
)
p_ac3thread
=
(
ac3dec_thread_t
*
)
malloc
(
sizeof
(
ac3dec_thread_t
));
/* We need to be 16 bytes aligned */
p_ac3thread
->
ac3thread
=
(
int
)
p_ac3thread
&
(
-
15
);
p_ac3thread
=
(
ac3dec_thread_t
*
)
p_ac3thread
->
ac3thread
;
if
(
p_ac3thread
==
NULL
)
{
intf_ErrMsg
(
"ac3dec error: not enough memory "
"for ac3dec_CreateThread() to create the new thread"
);
...
...
@@ -335,6 +341,7 @@ static void EndThread (ac3dec_thread_t * p_ac3thread)
/* Destroy descriptor */
free
(
p_ac3thread
->
p_config
);
p_ac3thread
=
(
ac3dec_thread_t
*
)
p_ac3thread
->
ac3thread
;
free
(
p_ac3thread
);
intf_DbgMsg
(
"ac3dec debug: ac3 decoder thread %p destroyed"
,
p_ac3thread
);
...
...
src/ac3_decoder/ac3_decoder_thread.h
View file @
dee3179d
...
...
@@ -2,7 +2,7 @@
* ac3_decoder_thread.h : ac3 decoder thread interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_decoder_thread.h,v 1.
7 2001/05/14 15:58:03
reno Exp $
* $Id: ac3_decoder_thread.h,v 1.
8 2001/07/08 23:15:11
reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
*
...
...
@@ -24,8 +24,16 @@
/*****************************************************************************
* ac3dec_thread_t : ac3 decoder thread descriptor
*****************************************************************************/
typedef
struct
ac3dec_thread_s
{
/*
* Decoder properties
*/
float
used_for_alignement1
;
float
used_for_alignement2
;
ac3dec_t
ac3_decoder
__attribute__
((
aligned
(
16
)));
/*
* Thread properties
*/
...
...
@@ -38,16 +46,12 @@ typedef struct ac3dec_thread_s
int
sync_ptr
;
/* sync ptr from ac3 magic header */
adec_config_t
*
p_config
;
/*
* Decoder properties
*/
ac3dec_t
ac3_decoder
;
/*
* Output properties
*/
aout_fifo_t
*
p_aout_fifo
;
/* stores the decompressed audio frames */
int
ac3thread
;
/* save the old pointer */
}
ac3dec_thread_t
;
/*****************************************************************************
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment