Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc
Commits
cf0b7cf9
Commit
cf0b7cf9
authored
Jan 17, 2001
by
Christophe Massiot
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
* Borrowed LiViD's MMX and MMX EXT IDCT.
This might break things. Expect a performance increase.
parent
fd1220a2
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
802 additions
and
1056 deletions
+802
-1056
Makefile.in
Makefile.in
+3
-4
include/modules.h
include/modules.h
+1
-0
plugins/idct/idct.c
plugins/idct/idct.c
+7
-22
plugins/idct/idct.h
plugins/idct/idct.h
+11
-2
plugins/idct/idct_common.c
plugins/idct/idct_common.c
+18
-3
plugins/idct/idct_mmxext.c
plugins/idct/idct_mmxext.c
+0
-79
plugins/idct/idctclassic.c
plugins/idct/idctclassic.c
+7
-22
plugins/idct/idctmmx.c
plugins/idct/idctmmx.c
+352
-22
plugins/idct/idctmmx_asm.S
plugins/idct/idctmmx_asm.S
+0
-829
plugins/idct/idctmmxext.c
plugins/idct/idctmmxext.c
+343
-12
src/video_decoder/vdec_idct.h
src/video_decoder/vdec_idct.h
+2
-2
src/video_decoder/video_parser.h
src/video_decoder/video_parser.h
+8
-2
src/video_decoder/vpar_blocks.h
src/video_decoder/vpar_blocks.h
+4
-3
src/video_decoder/vpar_headers.h
src/video_decoder/vpar_headers.h
+2
-2
src/video_parser/video_parser.c
src/video_parser/video_parser.c
+3
-1
src/video_parser/vpar_blocks.c
src/video_parser/vpar_blocks.c
+27
-12
src/video_parser/vpar_headers.c
src/video_parser/vpar_headers.c
+14
-39
No files found.
Makefile.in
View file @
cf0b7cf9
...
...
@@ -278,7 +278,7 @@ endif
ifneq
(,$(findstring 86,$(ARCH)))
ifneq
(,$(findstring mmx,$(ARCH)))
ASM_OBJ
=
STD_PLUGIN_ASM
=
plugins/idct/idctmmx_asm.o
STD_PLUGIN_ASM
=
endif
endif
...
...
@@ -351,11 +351,10 @@ PLUGIN_IDCTCLASSIC = plugins/idct/idctclassic.o \
plugins/idct/idct_common.o
PLUGIN_IDCTMMX
=
plugins/idct/idctmmx.o
\
plugins/idct/idct_common.o
\
plugins/idct/idctmmx_asm.o
plugins/idct/idct_common.o
PLUGIN_IDCTMMXEXT
=
plugins/idct/idctmmxext.o
\
plugins/idct/idct_
mmxext
.o
plugins/idct/idct_
common
.o
PLUGIN_ALSA
=
plugins/alsa/alsa.o
\
plugins/alsa/aout_alsa.o
...
...
include/modules.h
View file @
cf0b7cf9
...
...
@@ -83,6 +83,7 @@ typedef struct function_list_s
void
(
*
pf_idct
)
(
struct
vdec_thread_s
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
);
void
(
*
pf_norm_scan
)
(
u8
ppi_scan
[
2
][
64
]
);
}
idct
;
struct
...
...
plugins/idct/idct.c
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* idct.c : IDCT module
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idct.c,v 1.
4 2001/01/16 05:04:25 sam
Exp $
* $Id: idct.c,v 1.
5 2001/01/17 18:17:30 massiot
Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
*
...
...
@@ -50,13 +50,8 @@
* Local and extern prototypes.
*****************************************************************************/
static
void
idct_getfunctions
(
function_list_t
*
p_function_list
);
static
int
idct_Probe
(
probedata_t
*
p_data
);
static
void
vdec_InitIDCT
(
vdec_thread_t
*
p_vdec
);
void
vdec_SparseIDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_sparse_pos
);
static
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
);
static
void
vdec_NormScan
(
u8
ppi_scan
[
2
][
64
]
);
/*****************************************************************************
...
...
@@ -136,6 +131,7 @@ static void idct_getfunctions( function_list_t * p_function_list )
p_function_list
->
functions
.
idct
.
pf_init
=
vdec_InitIDCT
;
p_function_list
->
functions
.
idct
.
pf_sparse_idct
=
vdec_SparseIDCT
;
p_function_list
->
functions
.
idct
.
pf_idct
=
vdec_IDCT
;
p_function_list
->
functions
.
idct
.
pf_norm_scan
=
vdec_NormScan
;
}
/*****************************************************************************
...
...
@@ -153,28 +149,17 @@ static int idct_Probe( probedata_t *p_data )
}
/*****************************************************************************
* vdec_
InitIDCT : initialize datas for vdec_Sparse
IDCT
* vdec_
NormScan : Unused in this
IDCT
*****************************************************************************/
static
void
vdec_
InitIDCT
(
vdec_thread_t
*
p_vdec
)
static
void
vdec_
NormScan
(
u8
ppi_scan
[
2
][
64
]
)
{
int
i
;
dctelem_t
*
p_pre
=
p_vdec
->
p_pre_idct
;
memset
(
p_pre
,
0
,
64
*
64
*
sizeof
(
dctelem_t
)
);
for
(
i
=
0
;
i
<
64
;
i
++
)
{
p_pre
[
i
*
64
+
i
]
=
1
<<
SPARSE_SCALE_FACTOR
;
vdec_IDCT
(
p_vdec
,
&
p_pre
[
i
*
64
],
0
)
;
}
return
;
}
/*****************************************************************************
* vdec_IDCT : IDCT function for normal matrices
*****************************************************************************/
static
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
)
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
)
{
s32
tmp0
,
tmp1
,
tmp2
,
tmp3
;
s32
tmp10
,
tmp11
,
tmp12
,
tmp13
;
...
...
plugins/idct/idct.h
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* idct.h : macros for the inverse discrete cosine transform
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idct.h,v 1.
1 2001/01/13 12:57:20 sam
Exp $
* $Id: idct.h,v 1.
2 2001/01/17 18:17:30 massiot
Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
* Christophe Massiot <massiot@via.ecp.fr>
...
...
@@ -26,7 +26,7 @@
#define DCTSIZE 8
/* 8*8 DCT */
/*****************************************************************************
*
Macros
* Macros
*****************************************************************************/
/* We assume that right shift corresponds to signed division by 2 with
...
...
@@ -140,3 +140,12 @@
#define MULTIPLY(var,const) ((var) * (const))
#endif
/*****************************************************************************
* Prototypes
*****************************************************************************/
/* IDCT for sparse matrices; i_sparse_pos is the position argument passed
 * through the pf_sparse_idct capability. */
void vdec_SparseIDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                      int i_sparse_pos );

/* Fills p_vdec->p_pre_idct, the data later used by vdec_SparseIDCT. */
void vdec_InitIDCT( vdec_thread_t * p_vdec );

/* IDCT for normal matrices, applied to p_block. The last argument is a
 * placeholder (vdec_InitIDCT passes 0). */
void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                int i_idontcare );
plugins/idct/idct_common.c
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* idct_common.c : common IDCT functions
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idct_common.c,v 1.
2 2001/01/15 06:18:23 sam
Exp $
* $Id: idct_common.c,v 1.
3 2001/01/17 18:17:30 massiot
Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
*
...
...
@@ -40,8 +40,23 @@
#include "idct.h"
void
vdec_SparseIDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_sparse_pos
);
/*****************************************************************************
 * vdec_InitIDCT : initialize data for vdec_SparseIDCT
 *****************************************************************************
 * Treats p_vdec->p_pre_idct as 64 consecutive blocks of 64 coefficients.
 * Block number i is cleared, gets the single value 1 << SPARSE_SCALE_FACTOR
 * at index i, and is then run through vdec_IDCT.
 *****************************************************************************/
void vdec_InitIDCT( vdec_thread_t * p_vdec )
{
    dctelem_t * const p_table = p_vdec->p_pre_idct;
    int               i_coef;

    /* Start from an all-zero table (64 blocks of 64 coefficients) */
    memset( p_table, 0, 64 * 64 * sizeof(dctelem_t) );

    for( i_coef = 0; i_coef < 64; i_coef++ )
    {
        dctelem_t * p_block = p_table + i_coef * 64;

        /* One non-zero coefficient per block, then transform the block */
        p_block[i_coef] = 1 << SPARSE_SCALE_FACTOR;
        vdec_IDCT( p_vdec, p_block, 0 );
    }
}
/*****************************************************************************
* vdec_SparseIDCT : IDCT function for sparse matrices
...
...
plugins/idct/idct_mmxext.c
deleted
100644 → 0
View file @
fd1220a2
/*****************************************************************************
* idct_mmxext.c : MMX EXT IDCT functions
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idct_mmxext.c,v 1.1 2001/01/16 13:26:46 sam Exp $
*
* Authors:
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
#include "defs.h"
#include <stdlib.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "tests.h"
#include "video.h"
#include "video_output.h"
#include "video_decoder.h"
#include "idct.h"
/*****************************************************************************
* Local and extern prototypes.
*****************************************************************************/
/* Capability entry points implemented further down in this file */
void vdec_InitIDCT  ( vdec_thread_t * p_vdec );
void vdec_SparseIDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                      int i_sparse_pos );
void vdec_IDCT      ( vdec_thread_t * p_vdec, dctelem_t * p_block,
                      int i_idontcare );
/*****************************************************************************
 * vdec_InitIDCT : empty stub, performs no initialization
 *****************************************************************************/
void vdec_InitIDCT( vdec_thread_t * p_vdec )
{
    /* Intentionally empty: p_vdec is left untouched */
}
/*****************************************************************************
 * vdec_IDCT : empty stub, leaves the block unchanged
 *****************************************************************************/
void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                int i_idontcare )
{
    /* Intentionally empty: p_block is neither read nor written */
}
/*****************************************************************************
 * vdec_SparseIDCT : empty stub, leaves the block unchanged
 *****************************************************************************/
void vdec_SparseIDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                      int i_sparse_pos )
{
    /* Intentionally empty: p_block is neither read nor written */
}
plugins/idct/idctclassic.c
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* idctclassic.c : Classic IDCT module
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idctclassic.c,v 1.
4 2001/01/16 05:04:25 sam
Exp $
* $Id: idctclassic.c,v 1.
5 2001/01/17 18:17:30 massiot
Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
*
...
...
@@ -50,13 +50,8 @@
* Local and extern prototypes.
*****************************************************************************/
static
void
idct_getfunctions
(
function_list_t
*
p_function_list
);
static
int
idct_Probe
(
probedata_t
*
p_data
);
static
void
vdec_InitIDCT
(
vdec_thread_t
*
p_vdec
);
void
vdec_SparseIDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_sparse_pos
);
static
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
);
static
void
vdec_NormScan
(
u8
ppi_scan
[
2
][
64
]
);
/*****************************************************************************
...
...
@@ -136,6 +131,7 @@ static void idct_getfunctions( function_list_t * p_function_list )
p_function_list
->
functions
.
idct
.
pf_init
=
vdec_InitIDCT
;
p_function_list
->
functions
.
idct
.
pf_sparse_idct
=
vdec_SparseIDCT
;
p_function_list
->
functions
.
idct
.
pf_idct
=
vdec_IDCT
;
p_function_list
->
functions
.
idct
.
pf_norm_scan
=
vdec_NormScan
;
}
/*****************************************************************************
...
...
@@ -153,28 +149,17 @@ static int idct_Probe( probedata_t *p_data )
}
/*****************************************************************************
* vdec_
InitIDCT : initialize datas for vdec_Sparse
IDCT
* vdec_
NormScan : Unused in this
IDCT
*****************************************************************************/
static
void
vdec_
InitIDCT
(
vdec_thread_t
*
p_vdec
)
static
void
vdec_
NormScan
(
u8
ppi_scan
[
2
][
64
]
)
{
int
i
;
dctelem_t
*
p_pre
=
p_vdec
->
p_pre_idct
;
memset
(
p_pre
,
0
,
64
*
64
*
sizeof
(
dctelem_t
)
);
for
(
i
=
0
;
i
<
64
;
i
++
)
{
p_pre
[
i
*
64
+
i
]
=
1
<<
SPARSE_SCALE_FACTOR
;
vdec_IDCT
(
p_vdec
,
&
p_pre
[
i
*
64
],
0
)
;
}
return
;
}
/*****************************************************************************
* vdec_IDCT : IDCT function for normal matrices
*****************************************************************************/
static
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
)
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
)
{
/* dct classique: pour tester la meilleure entre la classique et la */
/* no classique */
...
...
plugins/idct/idctmmx.c
View file @
cf0b7cf9
...
...
@@ -2,9 +2,13 @@
* idctmmx.c : MMX IDCT module
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idctmmx.c,v 1.
5 2001/01/16 16:09:52 sam
Exp $
* $Id: idctmmx.c,v 1.
6 2001/01/17 18:17:30 massiot
Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
* Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
* Michel Lespinasse <walken@zoy.org>
* Peter Gubanov <peter@elecard.net.ru>
* (from the LiViD project)
* Christophe Massiot <massiot@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
...
...
@@ -46,17 +50,15 @@
#include "idct.h"
#include "attributes.h"
#include "mmx.h"
/*****************************************************************************
* Local prototypes.
*****************************************************************************/
static
void
idct_getfunctions
(
function_list_t
*
p_function_list
);
static
int
idct_Probe
(
probedata_t
*
p_data
);
static
void
vdec_InitIDCT
(
vdec_thread_t
*
p_vdec
);
void
vdec_SparseIDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_sparse_pos
);
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
);
static
void
vdec_NormScan
(
u8
ppi_scan
[
2
][
64
]
);
/*****************************************************************************
...
...
@@ -127,8 +129,7 @@ int DeactivateModule( module_t * p_module )
/* Following functions are local */
/*****************************************************************************
* Functions exported as capabilities. They are declared as static so that
* we don't pollute the namespace too much.
* Functions exported as capabilities.
*****************************************************************************/
static
void
idct_getfunctions
(
function_list_t
*
p_function_list
)
{
...
...
@@ -136,6 +137,7 @@ static void idct_getfunctions( function_list_t * p_function_list )
p_function_list
->
functions
.
idct
.
pf_init
=
vdec_InitIDCT
;
p_function_list
->
functions
.
idct
.
pf_sparse_idct
=
vdec_SparseIDCT
;
p_function_list
->
functions
.
idct
.
pf_idct
=
vdec_IDCT
;
p_function_list
->
functions
.
idct
.
pf_norm_scan
=
vdec_NormScan
;
}
/*****************************************************************************
...
...
@@ -149,10 +151,10 @@ static int idct_Probe( probedata_t *p_data )
{
return
(
999
);
}
else
else
{
return
(
100
);
}
}
}
else
{
...
...
@@ -161,20 +163,348 @@ static int idct_Probe( probedata_t *p_data )
}
/*****************************************************************************
* vdec_
InitIDCT : initialize data for vdec_SparseIDCT
* vdec_
NormScan : This IDCT uses reordered coeffs, so we patch the scan table
*****************************************************************************/
static
void
vdec_
InitIDCT
(
vdec_thread_t
*
p_vdec
)
static
void
vdec_
NormScan
(
u8
ppi_scan
[
2
][
64
]
)
{
int
i
;
dctelem_t
*
p_pre
=
p_vdec
->
p_pre_idct
;
memset
(
p_pre
,
0
,
64
*
64
*
sizeof
(
dctelem_t
)
);
int
i
,
j
;
for
(
i
=
0
;
i
<
64
;
i
++
)
for
(
i
=
0
;
i
<
64
;
i
++
)
{
p_pre
[
i
*
64
+
i
]
=
1
<<
SPARSE_SCALE_FACTOR
;
vdec_IDCT
(
p_vdec
,
&
p_pre
[
i
*
64
],
0
)
;
j
=
ppi_scan
[
0
][
i
];
ppi_scan
[
0
][
i
]
=
(
j
&
0x38
)
|
((
j
&
6
)
>>
1
)
|
((
j
&
1
)
<<
2
);
j
=
ppi_scan
[
1
][
i
];
ppi_scan
[
1
][
i
]
=
(
j
&
0x38
)
|
((
j
&
6
)
>>
1
)
|
((
j
&
1
)
<<
2
);
}
return
;
}
/*****************************************************************************
* vdec_IDCT :
*****************************************************************************/
/* Right-shift count applied (psrad) to the 32-bit row results in Row,
 * RowTail and RowMid; also the fixed-point scale used by round() below. */
#define ROW_SHIFT 11
/* Right-shift count applied (psraw) to the 16-bit column results in Col. */
#define COL_SHIFT 6
/* Converts a bias to an int scaled by 2^ROW_SHIFT, after adding 0.5.
 * NOTE(review): shares its name with C99 round() from <math.h> and would
 * shadow it if that header were ever included here. */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
/* Initializer holding the same rounded bias twice; the rounderN arrays built
 * from it are added to a whole MMX register with paddd in Row. */
#define rounder(bias) {round (bias), round (bias)}
/* Expands to a 32-entry coefficient initializer laid out as eight groups of
 * four. RowHead/RowMid read the groups at offsets 0, 4 and 8; Row reads the
 * groups at offsets 12, 16, 20, 24 and 28. The order must therefore stay in
 * step with those functions. */
#define table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
c4, c6, -c4, -c2, \
c1, c3, c3, -c7, \
c5, c7, -c1, -c5, \
c4, -c6, c4, -c2, \
-c4, c2, c4, -c6, \
c5, -c1, c7, -c5, \
c7, c3, c3, -c1 }
/*
 * RowHead: first stage of the row pass for the row starting at row[offset].
 *
 * Loads the eight coefficients of that row and the first three groups of the
 * coefficient table into MMX registers and starts the first multiply-add.
 * Nothing is stored; the register state is picked up by Row().
 * The instruction order is hand-scheduled: do not reorder.
 */
static __inline__ void RowHead( dctelem_t * row, int offset,
                                dctelem_t * table )
{
    movq_m2r( *(row + offset), mm2 );       /* mm2 = x6 x4 x2 x0 */

    movq_m2r( *(row + offset + 4), mm5 );   /* mm5 = x7 x5 x3 x1 */
    movq_r2r( mm2, mm0 );                   /* mm0 = x6 x4 x2 x0 */

    movq_m2r( *table, mm3 );                /* mm3 = C6 C4 C2 C4 */
    movq_r2r( mm5, mm6 );                   /* mm6 = x7 x5 x3 x1 */

    punpckldq_r2r( mm0, mm0 );              /* mm0 = x2 x0 x2 x0 */

    movq_m2r( *(table + 4), mm4 );          /* mm4 = -C2 -C4 C6 C4 */
    pmaddwd_r2r( mm0, mm3 );                /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

    movq_m2r( *(table + 8), mm1 );          /* mm1 = -C7 C3 C3 C1 */
    punpckhdq_r2r( mm2, mm2 );              /* mm2 = x6 x4 x6 x4 */
}
/*
 * Row: core of the row pass.
 *
 * Works only on the MMX register state left by RowHead() or RowMid(), plus
 * the table groups at offsets 12..28 and the two-element rounder. On exit
 * mm1 and mm3 hold shifted results, while mm0 and mm7 still need the final
 * ROW_SHIFT shift, which RowMid()/RowTail() perform.
 * The instruction order is hand-scheduled: do not reorder.
 */
static __inline__ void Row( dctelem_t * table, s32 * rounder )
{
    pmaddwd_r2r( mm2, mm4 );            /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
    punpckldq_r2r( mm5, mm5 );          /* mm5 = x3 x1 x3 x1 */

    pmaddwd_m2r( *(table + 16), mm0 );  /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
    punpckhdq_r2r( mm6, mm6 );          /* mm6 = x7 x5 x7 x5 */

    movq_m2r( *(table + 12), mm7 );     /* mm7 = -C5 -C1 C7 C5 */
    pmaddwd_r2r( mm5, mm1 );            /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */

    paddd_m2r( *rounder, mm3 );         /* mm3 += rounder */
    pmaddwd_r2r( mm6, mm7 );            /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */

    pmaddwd_m2r( *(table + 20), mm2 );  /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
    paddd_r2r( mm4, mm3 );              /* mm3 = a1 a0 + rounder */

    pmaddwd_m2r( *(table + 24), mm5 );  /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
    movq_r2r( mm3, mm4 );               /* mm4 = a1 a0 + rounder */

    pmaddwd_m2r( *(table + 28), mm6 );  /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
    paddd_r2r( mm7, mm1 );              /* mm1 = b1 b0 */

    paddd_m2r( *rounder, mm0 );         /* mm0 += rounder */
    psubd_r2r( mm1, mm3 );              /* mm3 = a1-b1 a0-b0 + rounder */

    psrad_i2r( ROW_SHIFT, mm3 );        /* mm3 = y6 y7 */
    paddd_r2r( mm4, mm1 );              /* mm1 = a1+b1 a0+b0 + rounder */

    paddd_r2r( mm2, mm0 );              /* mm0 = a3 a2 + rounder */
    psrad_i2r( ROW_SHIFT, mm1 );        /* mm1 = y1 y0 */

    paddd_r2r( mm6, mm5 );              /* mm5 = b3 b2 */
    movq_r2r( mm0, mm7 );               /* mm7 = a3 a2 + rounder */

    paddd_r2r( mm5, mm0 );              /* mm0 = a3+b3 a2+b2 + rounder */
    psubd_r2r( mm5, mm7 );              /* mm7 = a3-b3 a2-b2 + rounder */
}
/*
 * RowTail: last stage of the row pass.
 *
 * Finishes the shifts left pending by Row(), packs the eight results to
 * 16 bits, puts y4..y7 back in order and writes the row to row[store]
 * and row[store + 4].
 * The instruction order is hand-scheduled: do not reorder.
 */
static __inline__ void RowTail( dctelem_t * row, int store )
{
    psrad_i2r( ROW_SHIFT, mm0 );            /* mm0 = y3 y2 */
    psrad_i2r( ROW_SHIFT, mm7 );            /* mm7 = y4 y5 */

    packssdw_r2r( mm0, mm1 );               /* mm1 = y3 y2 y1 y0 */
    packssdw_r2r( mm3, mm7 );               /* mm7 = y6 y7 y4 y5 */

    movq_r2m( mm1, *(row + store) );        /* save y3 y2 y1 y0 */
    movq_r2r( mm7, mm4 );                   /* mm4 = y6 y7 y4 y5 */

    /* Swap the 16-bit halves of each 32-bit lane */
    pslld_i2r( 16, mm7 );                   /* mm7 = y7 0 y5 0 */
    psrld_i2r( 16, mm4 );                   /* mm4 = 0 y6 0 y4 */
    por_r2r( mm4, mm7 );                    /* mm7 = y7 y6 y5 y4 */

    /* (free scheduling slot in the original pairing) */

    movq_r2m( mm7, *(row + store + 4) );    /* save y7 y6 y5 y4 */
}
/*
 * RowMid: RowTail() for the previous row interleaved with RowHead() for
 * the next one.
 *
 * Writes the finished row to row[store] / row[store + 4] while loading the
 * row at row[offset] and the first three table groups, leaving the registers
 * ready for the next Row() call. The loads from row[offset] are issued before
 * the stores to row[store].
 * The instruction order is hand-scheduled: do not reorder.
 */
static __inline__ void RowMid( dctelem_t * row, int store,
                               int offset, dctelem_t * table )
{
    movq_m2r( *(row + offset), mm2 );       /* mm2 = x6 x4 x2 x0 */
    psrad_i2r( ROW_SHIFT, mm0 );            /* mm0 = y3 y2 */

    movq_m2r( *(row + offset + 4), mm5 );   /* mm5 = x7 x5 x3 x1 */
    psrad_i2r( ROW_SHIFT, mm7 );            /* mm7 = y4 y5 */

    packssdw_r2r( mm0, mm1 );               /* mm1 = y3 y2 y1 y0 */
    movq_r2r( mm5, mm6 );                   /* mm6 = x7 x5 x3 x1 */

    packssdw_r2r( mm3, mm7 );               /* mm7 = y6 y7 y4 y5 */
    movq_r2r( mm2, mm0 );                   /* mm0 = x6 x4 x2 x0 */

    movq_r2m( mm1, *(row + store) );        /* save y3 y2 y1 y0 */
    movq_r2r( mm7, mm1 );                   /* mm1 = y6 y7 y4 y5 */

    punpckldq_r2r( mm0, mm0 );              /* mm0 = x2 x0 x2 x0 */
    psrld_i2r( 16, mm7 );                   /* mm7 = 0 y6 0 y4 */

    movq_m2r( *table, mm3 );                /* mm3 = C6 C4 C2 C4 */
    pslld_i2r( 16, mm1 );                   /* mm1 = y7 0 y5 0 */

    movq_m2r( *(table + 4), mm4 );          /* mm4 = -C2 -C4 C6 C4 */
    por_r2r( mm1, mm7 );                    /* mm7 = y7 y6 y5 y4 */

    movq_m2r( *(table + 8), mm1 );          /* mm1 = -C7 C3 C3 C1 */
    punpckhdq_r2r( mm2, mm2 );              /* mm2 = x6 x4 x6 x4 */

    movq_r2m( mm7, *(row + store + 4) );    /* save y7 y6 y5 y4 */
    pmaddwd_r2r( mm0, mm3 );                /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
}
/*
 * Col: column pass over four adjacent columns of the block.
 *
 * Reads the eight rows at col[offset + k*8] (k = 0..7), four 16-bit values
 * per row, and overwrites them with the results shifted right by COL_SHIFT.
 *
 * NOTE(review): scratch0/scratch1 have static storage, so they are shared by
 * every caller of this function; concurrent calls from several threads would
 * race on them. Confirm whether more than one decoder thread can run this.
 * NOTE(review): T3 (43790) is outside the range of a 16-bit signed short, so
 * the stored value relies on the compiler's conversion; the code compensates
 * by adding the operand back after pmulhw (see the "T3-1" steps below).
 *
 * The instruction order is hand-scheduled: do not reorder.
 */
static __inline__ void Col( dctelem_t * col, int offset )
{
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170

    static short _T1[] ATTR_ALIGN(8) = { T1, T1, T1, T1 };
    static short _T2[] ATTR_ALIGN(8) = { T2, T2, T2, T2 };
    static short _T3[] ATTR_ALIGN(8) = { T3, T3, T3, T3 };
    static short _C4[] ATTR_ALIGN(8) = { C4, C4, C4, C4 };
    static mmx_t scratch0, scratch1;    /* spill slots for b3 and b0 */

    /* Column code adapted from Peter Gubanov,
     * http://www.elecard.com/peter/idct.shtml */

    movq_m2r( *_T1, mm0 );                          /* mm0 = T1 */
    movq_m2r( *(col + offset + 1*8), mm1 );         /* mm1 = x1 */
    movq_r2r( mm0, mm2 );                           /* mm2 = T1 */

    movq_m2r( *(col + offset + 7*8), mm4 );         /* mm4 = x7 */
    pmulhw_r2r( mm1, mm0 );                         /* mm0 = T1*x1 */

    movq_m2r( *_T3, mm5 );                          /* mm5 = T3 */
    pmulhw_r2r( mm4, mm2 );                         /* mm2 = T1*x7 */

    movq_m2r( *(col + offset + 5*8), mm6 );         /* mm6 = x5 */
    movq_r2r( mm5, mm7 );                           /* mm7 = T3-1 */

    movq_m2r( *(col + offset + 3*8), mm3 );         /* mm3 = x3 */
    psubsw_r2r( mm4, mm0 );                         /* mm0 = v17 */

    movq_m2r( *_T2, mm4 );                          /* mm4 = T2 */
    pmulhw_r2r( mm3, mm5 );                         /* mm5 = (T3-1)*x3 */

    paddsw_r2r( mm2, mm1 );                         /* mm1 = u17 */
    pmulhw_r2r( mm6, mm7 );                         /* mm7 = (T3-1)*x5 */

    /* (free scheduling slot in the original pairing) */

    movq_r2r( mm4, mm2 );                           /* mm2 = T2 */
    paddsw_r2r( mm3, mm5 );                         /* mm5 = T3*x3 */

    pmulhw_m2r( *(col + offset + 2*8), mm4 );       /* mm4 = T2*x2 */
    paddsw_r2r( mm6, mm7 );                         /* mm7 = T3*x5 */

    psubsw_r2r( mm6, mm5 );                         /* mm5 = v35 */
    paddsw_r2r( mm3, mm7 );                         /* mm7 = u35 */

    movq_m2r( *(col + offset + 6*8), mm3 );         /* mm3 = x6 */
    movq_r2r( mm0, mm6 );                           /* mm6 = v17 */

    pmulhw_r2r( mm3, mm2 );                         /* mm2 = T2*x6 */
    psubsw_r2r( mm5, mm0 );                         /* mm0 = b3 */

    psubsw_r2r( mm3, mm4 );                         /* mm4 = v26 */
    paddsw_r2r( mm6, mm5 );                         /* mm5 = v12 */

    movq_r2m( mm0, scratch0 );                      /* save b3 */
    movq_r2r( mm1, mm6 );                           /* mm6 = u17 */

    paddsw_m2r( *(col + offset + 2*8), mm2 );       /* mm2 = u26 */
    paddsw_r2r( mm7, mm6 );                         /* mm6 = b0 */

    psubsw_r2r( mm7, mm1 );                         /* mm1 = u12 */
    movq_r2r( mm1, mm7 );                           /* mm7 = u12 */

    movq_m2r( *(col + offset + 0*8), mm3 );         /* mm3 = x0 */
    paddsw_r2r( mm5, mm1 );                         /* mm1 = u12+v12 */

    movq_m2r( *_C4, mm0 );                          /* mm0 = C4/2 */
    psubsw_r2r( mm5, mm7 );                         /* mm7 = u12-v12 */

    movq_r2m( mm6, scratch1 );                      /* save b0 */
    pmulhw_r2r( mm0, mm1 );                         /* mm1 = b1/2 */

    movq_r2r( mm4, mm6 );                           /* mm6 = v26 */
    pmulhw_r2r( mm0, mm7 );                         /* mm7 = b2/2 */

    movq_m2r( *(col + offset + 4*8), mm5 );         /* mm5 = x4 */
    movq_r2r( mm3, mm0 );                           /* mm0 = x0 */

    psubsw_r2r( mm5, mm3 );                         /* mm3 = v04 */
    paddsw_r2r( mm5, mm0 );                         /* mm0 = u04 */

    paddsw_r2r( mm3, mm4 );                         /* mm4 = a1 */
    movq_r2r( mm0, mm5 );                           /* mm5 = u04 */

    psubsw_r2r( mm6, mm3 );                         /* mm3 = a2 */
    paddsw_r2r( mm2, mm5 );                         /* mm5 = a0 */

    paddsw_r2r( mm1, mm1 );                         /* mm1 = b1 */
    psubsw_r2r( mm2, mm0 );                         /* mm0 = a3 */

    paddsw_r2r( mm7, mm7 );                         /* mm7 = b2 */
    movq_r2r( mm3, mm2 );                           /* mm2 = a2 */

    movq_r2r( mm4, mm6 );                           /* mm6 = a1 */
    paddsw_r2r( mm7, mm3 );                         /* mm3 = a2+b2 */

    psraw_i2r( COL_SHIFT, mm3 );                    /* mm3 = y2 */
    paddsw_r2r( mm1, mm4 );                         /* mm4 = a1+b1 */

    psraw_i2r( COL_SHIFT, mm4 );                    /* mm4 = y1 */
    psubsw_r2r( mm1, mm6 );                         /* mm6 = a1-b1 */

    movq_m2r( scratch1, mm1 );                      /* mm1 = b0 */
    psubsw_r2r( mm7, mm2 );                         /* mm2 = a2-b2 */

    psraw_i2r( COL_SHIFT, mm6 );                    /* mm6 = y6 */
    movq_r2r( mm5, mm7 );                           /* mm7 = a0 */

    movq_r2m( mm4, *(col + offset + 1*8) );         /* save y1 */
    psraw_i2r( COL_SHIFT, mm2 );                    /* mm2 = y5 */

    movq_r2m( mm3, *(col + offset + 2*8) );         /* save y2 */
    paddsw_r2r( mm1, mm5 );                         /* mm5 = a0+b0 */

    movq_m2r( scratch0, mm4 );                      /* mm4 = b3 */
    psubsw_r2r( mm1, mm7 );                         /* mm7 = a0-b0 */

    psraw_i2r( COL_SHIFT, mm5 );                    /* mm5 = y0 */
    movq_r2r( mm0, mm3 );                           /* mm3 = a3 */

    movq_r2m( mm2, *(col + offset + 5*8) );         /* save y5 */
    psubsw_r2r( mm4, mm3 );                         /* mm3 = a3-b3 */

    psraw_i2r( COL_SHIFT, mm7 );                    /* mm7 = y7 */
    paddsw_r2r( mm0, mm4 );                         /* mm4 = a3+b3 */

    movq_r2m( mm5, *(col + offset + 0*8) );         /* save y0 */
    psraw_i2r( COL_SHIFT, mm3 );                    /* mm3 = y4 */

    movq_r2m( mm6, *(col + offset + 6*8) );         /* save y6 */
    psraw_i2r( COL_SHIFT, mm4 );                    /* mm4 = y3 */

    movq_r2m( mm7, *(col + offset + 7*8) );         /* save y7 */

    movq_r2m( mm3, *(col + offset + 4*8) );         /* save y4 */

    movq_r2m( mm4, *(col + offset + 3*8) );         /* save y3 */
}
/*
 * Per-row rounding biases handed to Row(). The digit in each name is the row
 * it is used for in vdec_IDCT; each array holds the same value twice (see
 * the rounder() macro). The trailing formulas are the original authors'
 * derivations of the constants.
 */
static s32 rounder0[] ATTR_ALIGN(8) =
    rounder( (1 << (COL_SHIFT - 1)) - 0.5 );
static s32 rounder4[] ATTR_ALIGN(8) =
    rounder( 0 );
static s32 rounder1[] ATTR_ALIGN(8) =
    rounder( 1.25683487303 );           /* C1*(C1/C4+C1+C7)/2 */
static s32 rounder7[] ATTR_ALIGN(8) =
    rounder( -0.25 );                   /* C1*(C7/C4+C7-C1)/2 */
static s32 rounder2[] ATTR_ALIGN(8) =
    rounder( 0.60355339059 );           /* C2 * (C6+C2)/2 */
static s32 rounder6[] ATTR_ALIGN(8) =
    rounder( -0.25 );                   /* C2 * (C6-C2)/2 */
static s32 rounder3[] ATTR_ALIGN(8) =
    rounder( 0.087788325588 );          /* C3*(-C3/C4+C3+C5)/2 */
static s32 rounder5[] ATTR_ALIGN(8) =
    rounder( -0.441341716183 );         /* C3*(-C5/C4+C5-C3)/2 */
/*
 * vdec_IDCT: MMX IDCT of one block, done in place in p_block.
 *
 * Runs the row pass over the eight rows (offsets 0*8 .. 7*8) in the order
 * 0, 4, 1, 7, 2, 6, 3, 5, then the column pass over both halves of the
 * block. Rows that share a coefficient table are handled back to back:
 * (0,4), (1,7), (2,6) and (3,5). RowMid() stores the row just computed while
 * loading the next one, so the sequence of calls below must not be changed.
 *
 * p_vdec and i_idontcare are not used by this implementation.
 */
void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                int i_idontcare )
{
    static dctelem_t table04[] ATTR_ALIGN(16) =
        table( 22725, 21407, 19266, 16384, 12873,  8867, 4520 );
    static dctelem_t table17[] ATTR_ALIGN(16) =
        table( 31521, 29692, 26722, 22725, 17855, 12299, 6270 );
    static dctelem_t table26[] ATTR_ALIGN(16) =
        table( 29692, 27969, 25172, 21407, 16819, 11585, 5906 );
    static dctelem_t table35[] ATTR_ALIGN(16) =
        table( 26722, 25172, 22654, 19266, 15137, 10426, 5315 );

    /* Rows 0 and 4 */
    RowHead( p_block, 0*8, table04 );
    Row( table04, rounder0 );
    RowMid( p_block, 0*8, 4*8, table04 );
    Row( table04, rounder4 );

    /* Rows 1 and 7 */
    RowMid( p_block, 4*8, 1*8, table17 );
    Row( table17, rounder1 );
    RowMid( p_block, 1*8, 7*8, table17 );
    Row( table17, rounder7 );

    /* Rows 2 and 6 */
    RowMid( p_block, 7*8, 2*8, table26 );
    Row( table26, rounder2 );
    RowMid( p_block, 2*8, 6*8, table26 );
    Row( table26, rounder6 );

    /* Rows 3 and 5 */
    RowMid( p_block, 6*8, 3*8, table35 );
    Row( table35, rounder3 );
    RowMid( p_block, 3*8, 5*8, table35 );
    Row( table35, rounder5 );
    RowTail( p_block, 5*8 );

    /* Columns 0-3, then columns 4-7 */
    Col( p_block, 0 );
    Col( p_block, 4 );
}
plugins/idct/idctmmx_asm.S
deleted
100644 → 0
View file @
fd1220a2
/*****************************************************************************
*
vdec_idctmmx
.
S
:
MMX
IDCT
implementation
****************************************************************************
*
*
Copyright
(
C
)
1999
,
2000
VideoLAN
*
$Id
:
idctmmx_asm
.
S
,
v
1
.1
2001
/
01
/
13
12
:
57
:
20
sam
Exp
$
*
*
Authors
:
*
*
This
program
is
free
software
; you can redistribute it and/or modify
*
it
under
the
terms
of
the
GNU
General
Public
License
as
published
by
*
the
Free
Software
Foundation
; either version 2 of the License, or
*
(
at
your
option
)
any
later
version
.
*
*
This
program
is
distributed
in
the
hope
that
it
will
be
useful
,
*
but
WITHOUT
ANY
WARRANTY
; without even the implied warranty of
*
MERCHANTABILITY
or
FITNESS
FOR
A
PARTICULAR
PURPOSE
.
See
the
*
GNU
General
Public
License
for
more
details
.
*
*
You
should
have
received
a
copy
of
the
GNU
General
Public
License
*
along
with
this
program
; if not, write to the Free Software
*
Foundation
,
Inc
.
,
59
Temple
Place
-
Suite
330
,
Boston
,
MA
02111
,
USA
.
*****************************************************************************/
/*
*
the
input
data
is
transposed
and
each
16
bit
element
in
the
8
x8
matrix
*
is
left
aligned
:
*
for
example
in
11
...1110000
format
*
If
the
iDCT
is
of
I
macroblock
then
0
.5
needs
to
be
added
to
the
;DC Component
*
(
element
[
0
][
0
]
of
the
matrix
)
*/
.
data
.
align
16
.
type
preSC
,
@
object
preSC
:
.
short
16384
,
22725
,
21407
,
19266
,
16384
,
12873
,
8867
,
4520
.
short
22725
,
31521
,
29692
,
26722
,
22725
,
17855
,
12299
,
6270
.
short
21407
,
29692
,
27969
,
25172
,
21407
,
16819
,
11585
,
5906
.
short
19266
,
26722
,
25172
,
22654
,
19266
,
15137
,
10426
,
5315
.
short
16384
,
22725
,
21407
,
19266
,
16384
,
12873
,
8867
,
4520
.
short
12873
,
17855
,
16819
,
15137
,
25746
,
20228
,
13933
,
7103
.
short
17734
,
24598
,
23170
,
20853
,
17734
,
13933
,
9597
,
4892
.
short
18081
,
25080
,
23624
,
21261
,
18081
,
14206
,
9785
,
4988
.
size
preSC
,
128
.
align
8
.
type
x0005000200010001
,
@
object
.
size
x0005000200010001
,
8
x0005000200010001
:
.
long
0x00010001
,
0x00050002
.
align
8
.
type
x0040000000000000
,
@
object
.
size
x0040000000000000
,
8
x0040000000000000
:
.
long
0
,
0x00400000
.
align
8
.
type
x5a825a825a825a82
,
@
object
.
size
x5a825a825a825a82
,
8
x5a825a825a825a82
:
.
long
0x5a825a82
,
0x5a825a82
.
align
8
.
type
x539f539f539f539f
,
@
object
.
size
x539f539f539f539f
,
8
x539f539f539f539f
:
.
long
0x539f539f
,
0x539f539f
.
align
8
.
type
x4546454645464546
,
@
object
.
size
x4546454645464546
,
8
x4546454645464546
:
.
long
0x45464546
,
0x45464546
.
align
8
.
type
x61f861f861f861f8
,
@
object
.
size
x61f861f861f861f8
,
8
x61f861f861f861f8
:
.
long
0x61f861f8
,
0x61f861f8
.
align
8
.
type
scratch1
,
@
object
.
size
scratch1
,
8
scratch1
:
.
long
0
,
0
.
align
8
.
type
scratch3
,
@
object
.
size
scratch3
,
8
scratch3
:
.
long
0
,
0
.
align
8
.
type
scratch5
,
@
object
.
size
scratch5
,
8
scratch5
:
.
long
0
,
0
.
align
8
.
type
scratch7
,
@
object
.
size
scratch7
,
8
scratch7
:
.
long
0
,
0
.
type
x0
,
@
object
.
size
x0
,
8
x0
:
.
long
0
,
0
.
align
8
.
text
.
align
4
/*
this
seems
to
annoy
the
compiler
in
-
g
mode
,
is
it
normal
?
*/
.
globl
vdec_IDCT
.
type
vdec_IDCT
,
@
function
vdec_IDCT
:
pushl
%
ebp
movl
%
esp
,%
ebp
pushl
%
ebx
pushl
%
ecx
pushl
%
edx
pushl
%
esi
pushl
%
edi
leal
preSC
,
%
ecx
movl
12
(%
ebp
),%
esi
movq
(%
esi
),
%
mm0
movq
8
(%
esi
),
%
mm1
movq
16
(%
esi
),
%
mm2
movq
24
(%
esi
),
%
mm3
movq
32
(%
esi
),
%
mm4
movq
40
(%
esi
),
%
mm5
movq
48
(%
esi
),
%
mm6
movq
56
(%
esi
),
%
mm7
psllw
$
4
,
%
mm0
psllw
$
4
,
%
mm1
psllw
$
4
,
%
mm2
psllw
$
4
,
%
mm3
psllw
$
4
,
%
mm4
psllw
$
4
,
%
mm5
psllw
$
4
,
%
mm6
psllw
$
4
,
%
mm7
movq
%
mm0
,
(%
esi
)
movq
%
mm1
,
8
(%
esi
)
movq
%
mm2
,
16
(%
esi
)
movq
%
mm3
,
24
(%
esi
)
movq
%
mm4
,
32
(%
esi
)
movq
%
mm5
,
40
(%
esi
)
movq
%
mm6
,
48
(%
esi
)
movq
%
mm7
,
56
(%
esi
)
movq
64
(%
esi
),
%
mm0
movq
72
(%
esi
),
%
mm1
movq
80
(%
esi
),
%
mm2
movq
88
(%
esi
),
%
mm3
movq
96
(%
esi
),
%
mm4
movq
104
(%
esi
),
%
mm5
movq
112
(%
esi
),
%
mm6
movq
120
(%
esi
),
%
mm7
psllw
$
4
,
%
mm0
psllw
$
4
,
%
mm1
psllw
$
4
,
%
mm2
psllw
$
4
,
%
mm3
psllw
$
4
,
%
mm4
psllw
$
4
,
%
mm5
psllw
$
4
,
%
mm6
psllw
$
4
,
%
mm7
movq
%
mm0
,
64
(%
esi
)
movq
%
mm1
,
72
(%
esi
)
movq
%
mm2
,
80
(%
esi
)
movq
%
mm3
,
88
(%
esi
)
movq
%
mm4
,
96
(%
esi
)
movq
%
mm5
,
104
(%
esi
)
movq
%
mm6
,
112
(%
esi
)
movq
%
mm7
,
120
(%
esi
)
/*
column
0
:
even
part
*
use
V4
,
V12
,
V0
,
V8
to
produce
V22
..
V25
*/
movq
8
*
12
(%
ecx
),
%
mm0
/*
maybe
the
first
mul
can
be
done
together
*/
/*
with
the
dequantization
in
iHuff
module
*/
pmulhw
8
*
12
(%
esi
),
%
mm0
/*
V12
*/
movq
8
*
4
(%
ecx
),
%
mm1
pmulhw
8
*
4
(%
esi
),
%
mm1
/*
V4
*/
movq
(%
ecx
),
%
mm3
psraw
$
1
,
%
mm0
/*
t64
=
t66
*/
pmulhw
(%
esi
),
%
mm3
/*
V0
*/
movq
8
*
8
(%
ecx
),
%
mm5
/*
duplicate
V4
*/
movq
%
mm1
,
%
mm2
/*
added
11
/
1
/
96
*/
pmulhw
8
*
8
(%
esi
),%
mm5
/*
V8
*/
psubsw
%
mm0
,
%
mm1
/*
V16
*/
pmulhw
x5a825a825a825a82
,
%
mm1
/*
23170
->
V18
*/
paddsw
%
mm0
,
%
mm2
/*
V17
*/
movq
%
mm2
,
%
mm0
/*
duplicate
V17
*/
psraw
$
1
,
%
mm2
/*
t75
=
t82
*/
psraw
$
2
,
%
mm0
/*
t72
*/
movq
%
mm3
,
%
mm4
/*
duplicate
V0
*/
paddsw
%
mm5
,
%
mm3
/*
V19
*/
psubsw
%
mm5
,
%
mm4
/*
V20
;mm5 free */
/*
moved
from
the
block
below
*/
movq
8
*
10
(%
ecx
),
%
mm7
psraw
$
1
,
%
mm3
/*
t74
=
t81
*/
movq
%
mm3
,
%
mm6
/*
duplicate
t74
=
t81
*/
psraw
$
2
,
%
mm4
/*
t77
=
t79
*/
psubsw
%
mm0
,
%
mm1
/*
V21
; mm0 free */
paddsw
%
mm2
,
%
mm3
/*
V22
*/
movq
%
mm1
,
%
mm5
/*
duplicate
V21
*/
paddsw
%
mm4
,
%
mm1
/*
V23
*/
movq
%
mm3
,
8
*
4
(%
esi
)
/*
V22
*/
psubsw
%
mm5
,
%
mm4
/*
V24
; mm5 free */
movq
%
mm1
,
8
*
12
(%
esi
)
/*
V23
*/
psubsw
%
mm2
,
%
mm6
/*
V25
; mm2 free */
movq
%
mm4
,
(%
esi
)
/*
V24
*/
/*
keep
mm6
alive
all
along
the
next
block
*/
/*
movq
%
mm6
,
8
*
8
(%
esi
)
V25
*/
/*
column
0
:
odd
part
*
use
V2
,
V6
,
V10
,
V14
to
produce
V31
,
V39
,
V40
,
V41
*/
/*
moved
above
:
movq
8
*
10
(%
ecx
),
%
mm7
*/
pmulhw
8
*
10
(%
esi
),
%
mm7
/*
V10
*/
movq
8
*
6
(%
ecx
),
%
mm0
pmulhw
8
*
6
(%
esi
),
%
mm0
/*
V6
*/
movq
8
*
2
(%
ecx
),
%
mm5
movq
%
mm7
,
%
mm3
/*
duplicate
V10
*/
pmulhw
8
*
2
(%
esi
),
%
mm5
/*
V2
*/
movq
8
*
14
(%
ecx
),
%
mm4
psubsw
%
mm0
,
%
mm7
/*
V26
*/
pmulhw
8
*
14
(%
esi
),
%
mm4
/*
V14
*/
paddsw
%
mm0
,
%
mm3
/*
V29
; free mm0 */
movq
%
mm7
,
%
mm1
/*
duplicate
V26
*/
psraw
$
1
,
%
mm3
/*
t91
=
t94
*/
pmulhw
x539f539f539f539f
,%
mm7
/*
V33
*/
psraw
$
1
,
%
mm1
/*
t96
*/
movq
%
mm5
,
%
mm0
/*
duplicate
V2
*/
psraw
$
2
,
%
mm4
/*
t85
=
t87
*/
paddsw
%
mm4
,%
mm5
/*
V27
*/
psubsw
%
mm4
,
%
mm0
/*
V28
; free mm4 */
movq
%
mm0
,
%
mm2
/*
duplicate
V28
*/
psraw
$
1
,
%
mm5
/*
t90
=
t93
*/
pmulhw
x4546454645464546
,%
mm0
/*
V35
*/
psraw
$
1
,
%
mm2
/*
t97
*/
movq
%
mm5
,
%
mm4
/*
duplicate
t90
=
t93
*/
psubsw
%
mm2
,
%
mm1
/*
V32
; free mm2 */
pmulhw
x61f861f861f861f8
,%
mm1
/*
V36
*/
psllw
$
1
,
%
mm7
/*
t107
*/
paddsw
%
mm3
,
%
mm5
/*
V31
*/
psubsw
%
mm3
,
%
mm4
/*
V30
; free mm3 */
pmulhw
x5a825a825a825a82
,%
mm4
/*
V34
*/
nop
psubsw
%
mm1
,
%
mm0
/*
V38
*/
psubsw
%
mm7
,
%
mm1
/*
V37
; free mm7 */
psllw
$
1
,
%
mm1
/*
t114
*/
/*
move
from
the
next
block
*/
movq
%
mm6
,
%
mm3
/*
duplicate
V25
*/
/*
move
from
the
next
block
*/
movq
8
*
4
(%
esi
),
%
mm7
/*
V22
*/
psllw
$
1
,
%
mm0
/*
t110
*/
psubsw
%
mm5
,
%
mm0
/*
V39
(
mm5
needed
for
next
block
)
*/
psllw
$
2
,
%
mm4
/*
t112
*/
/*
moved
from
the
next
block
*/
movq
8
*
12
(%
esi
),
%
mm2
/*
V23
*/
psubsw
%
mm0
,
%
mm4
/*
V40
*/
paddsw
%
mm4
,
%
mm1
/*
V41
; free mm0 */
/*
moved
from
the
next
block
*/
psllw
$
1
,
%
mm2
/*
t117
=
t125
*/
/*
column
0
:
output
butterfly
*/
/*
moved
above
:
*
movq
%
mm6
,
%
mm3
duplicate
V25
*
movq
8
*
4
(%
esi
),
%
mm7
V22
*
movq
8
*
12
(%
esi
),
%
mm2
V23
*
psllw
$
1
,
%
mm2
t117
=
t125
*/
psubsw
%
mm1
,
%
mm6
/*
tm6
*/
paddsw
%
mm1
,
%
mm3
/*
tm8
; free mm1 */
movq
%
mm7
,
%
mm1
/*
duplicate
V22
*/
paddsw
%
mm5
,
%
mm7
/*
tm0
*/
movq
%
mm3
,
8
*
8
(%
esi
)
/*
tm8
; free mm3 */
psubsw
%
mm5
,
%
mm1
/*
tm14
; free mm5 */
movq
%
mm6
,
8
*
6
(%
esi
)
/*
tm6
; free mm6 */
movq
%
mm2
,
%
mm3
/*
duplicate
t117
=
t125
*/
movq
(%
esi
),
%
mm6
/*
V24
*/
paddsw
%
mm0
,
%
mm2
/*
tm2
*/
movq
%
mm7
,
(%
esi
)
/*
tm0
; free mm7 */
psubsw
%
mm0
,
%
mm3
/*
tm12
; free mm0 */
movq
%
mm1
,
8
*
14
(%
esi
)
/*
tm14
; free mm1 */
psllw
$
1
,
%
mm6
/*
t119
=
t123
*/
movq
%
mm2
,
8
*
2
(%
esi
)
/*
tm2
; free mm2 */
movq
%
mm6
,
%
mm0
/*
duplicate
t119
=
t123
*/
movq
%
mm3
,
8
*
12
(%
esi
)
/*
tm12
; free mm3 */
paddsw
%
mm4
,
%
mm6
/*
tm4
*/
/*
moved
from
next
block
*/
movq
8
*
5
(%
ecx
),
%
mm1
psubsw
%
mm4
,
%
mm0
/*
tm10
; free mm4 */
/*
moved
from
next
block
*/
pmulhw
8
*
5
(%
esi
),
%
mm1
/*
V5
*/
movq
%
mm6
,
8
*
4
(%
esi
)
/*
tm4
; free mm6 */
movq
%
mm0
,
8
*
10
(%
esi
)
/*
tm10
; free mm0 */
/*
column
1
:
even
part
*
use
V5
,
V13
,
V1
,
V9
to
produce
V56
..
V59
*/
/*
moved
to
prev
block
:
*
movq
8
*
5
(%
ecx
),
%
mm1
*
pmulhw
8
*
5
(%
esi
),
%
mm1
V5
*/
movq
8
*
13
(%
ecx
),
%
mm7
psllw
$
1
,
%
mm1
/*
t128
=
t130
*/
pmulhw
8
*
13
(%
esi
),
%
mm7
/*
V13
*/
movq
%
mm1
,
%
mm2
/*
duplicate
t128
=
t130
*/
movq
8
(%
ecx
),
%
mm3
pmulhw
8
(%
esi
),
%
mm3
/*
V1
*/
movq
8
*
9
(%
ecx
),
%
mm5
psubsw
%
mm7
,
%
mm1
/*
V50
*/
pmulhw
8
*
9
(%
esi
),
%
mm5
/*
V9
*/
paddsw
%
mm7
,
%
mm2
/*
V51
*/
pmulhw
x5a825a825a825a82
,
%
mm1
/*
23170
->
V52
*/
movq
%
mm2
,
%
mm6
/*
duplicate
V51
*/
psraw
$
1
,
%
mm2
/*
t138
=
t144
*/
movq
%
mm3
,
%
mm4
/*
duplicate
V1
*/
psraw
$
2
,
%
mm6
/*
t136
*/
paddsw
%
mm5
,
%
mm3
/*
V53
*/
psubsw
%
mm5
,
%
mm4
/*
V54
;mm5 free */
movq
%
mm3
,
%
mm7
/*
duplicate
V53
*/
/*
moved
from
next
block
*/
movq
8
*
11
(%
ecx
),
%
mm0
psraw
$
1
,
%
mm4
/*
t140
=
t142
*/
psubsw
%
mm6
,
%
mm1
/*
V55
; mm6 free */
paddsw
%
mm2
,
%
mm3
/*
V56
*/
movq
%
mm4
,
%
mm5
/*
duplicate
t140
=
t142
*/
paddsw
%
mm1
,
%
mm4
/*
V57
*/
movq
%
mm3
,
8
*
5
(%
esi
)
/*
V56
*/
psubsw
%
mm1
,
%
mm5
/*
V58
; mm1 free */
movq
%
mm4
,
8
*
13
(%
esi
)
/*
V57
*/
psubsw
%
mm2
,
%
mm7
/*
V59
; mm2 free */
movq
%
mm5
,
8
*
9
(%
esi
)
/*
V58
*/
/*
keep
mm7
alive
all
along
the
next
block
*
movq
%
mm7
,
8
(%
esi
)
V59
*
moved
above
*
movq
8
*
11
(%
ecx
),
%
mm0
*/
pmulhw
8
*
11
(%
esi
),
%
mm0
/*
V11
*/
movq
8
*
7
(%
ecx
),
%
mm6
pmulhw
8
*
7
(%
esi
),
%
mm6
/*
V7
*/
movq
8
*
15
(%
ecx
),
%
mm4
movq
%
mm0
,
%
mm3
/*
duplicate
V11
*/
pmulhw
8
*
15
(%
esi
),
%
mm4
/*
V15
*/
movq
8
*
3
(%
ecx
),
%
mm5
psllw
$
1
,
%
mm6
/*
t146
=
t152
*/
pmulhw
8
*
3
(%
esi
),
%
mm5
/*
V3
*/
paddsw
%
mm6
,
%
mm0
/*
V63
*/
/*
note
that
V15
computation
has
a
correction
step
:
*
this
is
a
'magic'
constant
that
rebiases
the
results
to
be
closer
to
the
*
expected
result
.
this
magic
constant
can
be
refined
to
reduce
the
error
*
even
more
by
doing
the
correction
step
in
a
later
stage
when
the
number
*
is
actually
multiplied
by
16
*/
paddw
x0005000200010001
,
%
mm4
psubsw
%
mm6
,
%
mm3
/*
V60
; free mm6 */
psraw
$
1
,
%
mm0
/*
t154
=
t156
*/
movq
%
mm3
,
%
mm1
/*
duplicate
V60
*/
pmulhw
x539f539f539f539f
,
%
mm1
/*
V67
*/
movq
%
mm5
,
%
mm6
/*
duplicate
V3
*/
psraw
$
2
,
%
mm4
/*
t148
=
t150
*/
paddsw
%
mm4
,
%
mm5
/*
V61
*/
psubsw
%
mm4
,
%
mm6
/*
V62
; free mm4 */
movq
%
mm5
,
%
mm4
/*
duplicate
V61
*/
psllw
$
1
,
%
mm1
/*
t169
*/
paddsw
%
mm0
,
%
mm5
/*
V65
->
result
*/
psubsw
%
mm0
,
%
mm4
/*
V64
; free mm0 */
pmulhw
x5a825a825a825a82
,
%
mm4
/*
V68
*/
psraw
$
1
,
%
mm3
/*
t158
*/
psubsw
%
mm6
,
%
mm3
/*
V66
*/
movq
%
mm5
,
%
mm2
/*
duplicate
V65
*/
pmulhw
x61f861f861f861f8
,
%
mm3
/*
V70
*/
psllw
$
1
,
%
mm6
/*
t165
*/
pmulhw
x4546454645464546
,
%
mm6
/*
V69
*/
psraw
$
1
,
%
mm2
/*
t172
*/
/*
moved
from
next
block
*/
movq
8
*
5
(%
esi
),
%
mm0
/*
V56
*/
psllw
$
1
,
%
mm4
/*
t174
*/
/*
moved
from
next
block
*/
psraw
$
1
,
%
mm0
/*
t177
=
t188
*/
nop
psubsw
%
mm3
,
%
mm6
/*
V72
*/
psubsw
%
mm1
,
%
mm3
/*
V71
; free mm1 */
psubsw
%
mm2
,
%
mm6
/*
V73
; free mm2 */
/*
moved
from
next
block
*/
psraw
$
1
,
%
mm5
/*
t178
=
t189
*/
psubsw
%
mm6
,
%
mm4
/*
V74
*/
/*
moved
from
next
block
*/
movq
%
mm0
,
%
mm1
/*
duplicate
t177
=
t188
*/
paddsw
%
mm4
,
%
mm3
/*
V75
*/
/*
moved
from
next
block
*/
paddsw
%
mm5
,
%
mm0
/*
tm1
*/
/*
location
*
5
-
V56
*
13
-
V57
*
9
-
V58
*
X
-
V59
,
mm7
*
X
-
V65
,
mm5
*
X
-
V73
,
mm6
*
X
-
V74
,
mm4
*
X
-
V75
,
mm3
*
free
mm0
,
mm1
&
mm2
*
moved
above
*
movq
8
*
5
(%
esi
),
%
mm0
V56
*
psllw
$
1
,
%
mm0
t177
=
t188
!
new
!!
*
psllw
$
1
,
%
mm5
t178
=
t189
!
new
!!
*
movq
%
mm0
,
%
mm1
duplicate
t177
=
t188
*
paddsw
%
mm5
,
%
mm0
tm1
*/
movq
8
*
13
(%
esi
),
%
mm2
/*
V57
*/
psubsw
%
mm5
,
%
mm1
/*
tm15
; free mm5 */
movq
%
mm0
,
8
(%
esi
)
/*
tm1
; free mm0 */
psraw
$
1
,
%
mm7
/*
t182
=
t184
!
new
!!
*/
/*
save
the
store
as
used
directly
in
the
transpose
*
movq
%
mm1
,
120
(%
esi
)
tm15
; free mm1
*/
movq
%
mm7
,
%
mm5
/*
duplicate
t182
=
t184
*/
psubsw
%
mm3
,
%
mm7
/*
tm7
*/
paddsw
%
mm3
,
%
mm5
/*
tm9
; free mm3 */
movq
8
*
9
(%
esi
),
%
mm0
/*
V58
*/
movq
%
mm2
,
%
mm3
/*
duplicate
V57
*/
movq
%
mm7
,
8
*
7
(%
esi
)
/*
tm7
; free mm7 */
psubsw
%
mm6
,
%
mm3
/*
tm13
*/
paddsw
%
mm6
,
%
mm2
/*
tm3
; free mm6 */
/*
moved
up
from
the
transpose
*/
movq
%
mm3
,
%
mm7
/*
moved
up
from
the
transpose
*/
punpcklwd
%
mm1
,
%
mm3
movq
%
mm0
,
%
mm6
/*
duplicate
V58
*/
movq
%
mm2
,
8
*
3
(%
esi
)
/*
tm3
; free mm2 */
paddsw
%
mm4
,
%
mm0
/*
tm5
*/
psubsw
%
mm4
,
%
mm6
/*
tm11
; free mm4 */
/*
moved
up
from
the
transpose
*/
punpckhwd
%
mm1
,
%
mm7
movq
%
mm0
,
8
*
5
(%
esi
)
/*
tm5
; free mm0 */
/*
moved
up
from
the
transpose
*/
movq
%
mm5
,
%
mm2
/*
transpose
-
M4
part
*
---------
---------
*
| M1 |
M2
| |
M1
'| M3'
|
*
---------
-->
---------
*
| M3 |
M4
| |
M2
'| M4'
|
*
---------
---------
*
Two
alternatives
:
use
full
mmword
approach
so
the
following
code
can
be
*
scheduled
before
the
transpose
is
done
without
stores
,
or
use
the
faster
*
half
mmword
stores
(
when
possible
)
*/
movd
%
mm3
,
8
*
9
+
4
(%
esi
)
/*
MS
part
of
tmt9
*/
punpcklwd
%
mm6
,
%
mm5
movd
%
mm7
,
8
*
13
+
4
(%
esi
)
/*
MS
part
of
tmt13
*/
punpckhwd
%
mm6
,
%
mm2
movd
%
mm5
,
8
*
9
(%
esi
)
/*
LS
part
of
tmt9
*/
punpckhdq
%
mm3
,
%
mm5
/*
free
mm3
*/
movd
%
mm2
,
8
*
13
(%
esi
)
/*
LS
part
of
tmt13
*/
punpckhdq
%
mm7
,
%
mm2
/*
free
mm7
*/
/*
moved
up
from
the
M3
transpose
*/
movq
8
*
8
(%
esi
),
%
mm0
/*
moved
up
from
the
M3
transpose
*/
movq
8
*
10
(%
esi
),
%
mm1
/*
moved
up
from
the
M3
transpose
*/
movq
%
mm0
,
%
mm3
/*
shuffle
the
rest
of
the
data
,
and
write
it
with
2
mmword
writes
*/
movq
%
mm5
,
8
*
11
(%
esi
)
/*
tmt11
*/
/*
moved
up
from
the
M3
transpose
*/
punpcklwd
%
mm1
,
%
mm0
movq
%
mm2
,
8
*
15
(%
esi
)
/*
tmt15
*/
/*
moved
up
from
the
M3
transpose
*/
punpckhwd
%
mm1
,
%
mm3
/*
transpose
-
M3
part
*
moved
up
to
previous
code
section
*
movq
8
*
8
(%
esi
),
%
mm0
*
movq
8
*
10
(%
esi
),
%
mm1
*
movq
%
mm0
,
%
mm3
*
punpcklwd
%
mm1
,
%
mm0
*
punpckhwd
%
mm1
,
%
mm3
*/
movq
8
*
12
(%
esi
),
%
mm6
movq
8
*
14
(%
esi
),
%
mm4
movq
%
mm6
,
%
mm2
/*
shuffle
the
data
and
write
the
lower
parts
of
the
transposed
in
4
dwords
*/
punpcklwd
%
mm4
,
%
mm6
movq
%
mm0
,
%
mm1
punpckhdq
%
mm6
,
%
mm1
movq
%
mm3
,
%
mm7
punpckhwd
%
mm4
,
%
mm2
/*
free
mm4
*/
punpckldq
%
mm6
,
%
mm0
/*
free
mm6
*/
/*
moved
from
next
block
*/
movq
8
*
13
(%
esi
),
%
mm4
/*
tmt13
*/
punpckldq
%
mm2
,
%
mm3
punpckhdq
%
mm2
,
%
mm7
/*
free
mm2
*/
/*
moved
from
next
block
*/
movq
%
mm3
,
%
mm5
/*
duplicate
tmt5
*/
/*
column
1
:
even
part
(
after
transpose
)
*
moved
above
*
movq
%
mm3
,
%
mm5
duplicate
tmt5
*
movq
8
*
13
(%
esi
),
%
mm4
tmt13
*/
psubsw
%
mm4
,
%
mm3
/*
V134
*/
pmulhw
x5a825a825a825a82
,
%
mm3
/*
23170
->
V136
*/
movq
8
*
9
(%
esi
),
%
mm6
/*
tmt9
*/
paddsw
%
mm4
,
%
mm5
/*
V135
; mm4 free */
movq
%
mm0
,
%
mm4
/*
duplicate
tmt1
*/
paddsw
%
mm6
,
%
mm0
/*
V137
*/
psubsw
%
mm6
,
%
mm4
/*
V138
; mm6 free */
psllw
$
2
,
%
mm3
/*
t290
*/
psubsw
%
mm5
,
%
mm3
/*
V139
*/
movq
%
mm0
,
%
mm6
/*
duplicate
V137
*/
paddsw
%
mm5
,
%
mm0
/*
V140
*/
movq
%
mm4
,
%
mm2
/*
duplicate
V138
*/
paddsw
%
mm3
,
%
mm2
/*
V141
*/
psubsw
%
mm3
,
%
mm4
/*
V142
; mm3 free */
movq
%
mm0
,
8
*
9
(%
esi
)
/*
V140
*/
psubsw
%
mm5
,
%
mm6
/*
V143
; mm5 free */
/*
moved
from
next
block
*/
movq
8
*
11
(%
esi
),
%
mm0
/*
tmt11
*/
movq
%
mm2
,
8
*
13
(%
esi
)
/*
V141
*/
/*
moved
from
next
block
*/
movq
%
mm0
,
%
mm2
/*
duplicate
tmt11
*/
/*
column
1
:
odd
part
(
after
transpose
)
*/
/*
moved
up
to
the
prev
block
*
movq
8
*
11
(%
esi
),
%
mm0
tmt11
*
movq
%
mm0
,
%
mm2
duplicate
tmt11
*/
movq
8
*
15
(%
esi
),
%
mm5
/*
tmt15
*/
psubsw
%
mm7
,
%
mm0
/*
V144
*/
movq
%
mm0
,
%
mm3
/*
duplicate
V144
*/
paddsw
%
mm7
,
%
mm2
/*
V147
; free mm7 */
pmulhw
x539f539f539f539f
,
%
mm0
/*
21407
->
V151
*/
movq
%
mm1
,
%
mm7
/*
duplicate
tmt3
*/
paddsw
%
mm5
,
%
mm7
/*
V145
*/
psubsw
%
mm5
,
%
mm1
/*
V146
; free mm5 */
psubsw
%
mm1
,
%
mm3
/*
V150
*/
movq
%
mm7
,
%
mm5
/*
duplicate
V145
*/
pmulhw
x4546454645464546
,
%
mm1
/*
17734
->
V153
*/
psubsw
%
mm2
,
%
mm5
/*
V148
*/
pmulhw
x61f861f861f861f8
,
%
mm3
/*
25080
->
V154
*/
psllw
$
2
,
%
mm0
/*
t311
*/
pmulhw
x5a825a825a825a82
,
%
mm5
/*
23170
->
V152
*/
paddsw
%
mm2
,
%
mm7
/*
V149
; free mm2 */
psllw
$
1
,
%
mm1
/*
t313
*/
nop
/*
without
the
nop
-
freeze
here
for
one
clock
*/
movq
%
mm3
,
%
mm2
/*
duplicate
V154
*/
psubsw
%
mm0
,
%
mm3
/*
V155
; free mm0 */
psubsw
%
mm2
,
%
mm1
/*
V156
; free mm2 */
/*
moved
from
the
next
block
*/
movq
%
mm6
,
%
mm2
/*
duplicate
V143
*/
/*
moved
from
the
next
block
*/
movq
8
*
13
(%
esi
),
%
mm0
/*
V141
*/
psllw
$
1
,
%
mm1
/*
t315
*/
psubsw
%
mm7
,
%
mm1
/*
V157
(
keep
V149
)
*/
psllw
$
2
,
%
mm5
/*
t317
*/
psubsw
%
mm1
,
%
mm5
/*
V158
*/
psllw
$
1
,
%
mm3
/*
t319
*/
paddsw
%
mm5
,
%
mm3
/*
V159
*/
/*
column
1
:
output
butterfly
(
after
transform
)
*
moved
to
the
prev
block
*
movq
%
mm6
,
%
mm2
duplicate
V143
*
movq
8
*
13
(%
esi
),
%
mm0
V141
*/
psubsw
%
mm3
,
%
mm2
/*
V163
*/
paddsw
%
mm3
,
%
mm6
/*
V164
; free mm3 */
movq
%
mm4
,
%
mm3
/*
duplicate
V142
*/
psubsw
%
mm5
,
%
mm4
/*
V165
; free mm5 */
movq
%
mm2
,
scratch7
/*
out7
*/
psraw
$
4
,
%
mm6
psraw
$
4
,
%
mm4
paddsw
%
mm5
,
%
mm3
/*
V162
*/
movq
8
*
9
(%
esi
),
%
mm2
/*
V140
*/
movq
%
mm0
,
%
mm5
/*
duplicate
V141
*/
/*
in
order
not
to
percolate
this
line
up
,
*
we
read
72
(%
esi
)
very
near
to
this
location
*/
movq
%
mm6
,
8
*
9
(%
esi
)
/*
out9
*/
paddsw
%
mm1
,
%
mm0
/*
V161
*/
movq
%
mm3
,
scratch5
/*
out5
*/
psubsw
%
mm1
,
%
mm5
/*
V166
; free mm1 */
movq
%
mm4
,
8
*
11
(%
esi
)
/*
out11
*/
psraw
$
4
,
%
mm5
movq
%
mm0
,
scratch3
/*
out3
*/
movq
%
mm2
,
%
mm4
/*
duplicate
V140
*/
movq
%
mm5
,
8
*
13
(%
esi
)
/*
out13
*/
paddsw
%
mm7
,
%
mm2
/*
V160
*/
/*
moved
from
the
next
block
*/
movq
8
(%
esi
),
%
mm0
psubsw
%
mm7
,
%
mm4
/*
V167
; free mm7 */
/*
moved
from
the
next
block
*/
movq
8
*
3
(%
esi
),
%
mm7
psraw
$
4
,
%
mm4
movq
%
mm2
,
scratch1
/*
out1
*/
/*
moved
from
the
next
block
*/
movq
%
mm0
,
%
mm1
movq
%
mm4
,
8
*
15
(%
esi
)
/*
out15
*/
/*
moved
from
the
next
block
*/
punpcklwd
%
mm7
,
%
mm0
/*
transpose
-
M2
parts
*
moved
up
to
the
prev
block
*
movq
8
(%
esi
),
%
mm0
*
movq
8
*
3
(%
esi
),
%
mm7
*
movq
%
mm0
,
%
mm1
*
punpcklwd
%
mm7
,
%
mm0
*/
movq
8
*
5
(%
esi
),
%
mm5
punpckhwd
%
mm7
,
%
mm1
movq
8
*
7
(%
esi
),
%
mm4
movq
%
mm5
,
%
mm3
/*
shuffle
the
data
and
write
the
lower
parts
of
the
transposed
in
4
dwords
*/
movd
%
mm0
,
8
*
8
(%
esi
)
/*
LS
part
of
tmt8
*/
punpcklwd
%
mm4
,
%
mm5
movd
%
mm1
,
8
*
12
(%
esi
)
/*
LS
part
of
tmt12
*/
punpckhwd
%
mm4
,
%
mm3
movd
%
mm5
,
8
*
8
+
4
(%
esi
)
/*
MS
part
of
tmt8
*/
punpckhdq
%
mm5
,
%
mm0
/*
tmt10
*/
movd
%
mm3
,
8
*
12
+
4
(%
esi
)
/*
MS
part
of
tmt12
*/
punpckhdq
%
mm3
,
%
mm1
/*
tmt14
*/
/*
transpose
-
M1
parts
*/
movq
(%
esi
),
%
mm7
movq
8
*
2
(%
esi
),
%
mm2
movq
%
mm7
,
%
mm6
movq
8
*
4
(%
esi
),
%
mm5
punpcklwd
%
mm2
,
%
mm7
movq
8
*
6
(%
esi
),
%
mm4
punpckhwd
%
mm2
,
%
mm6
/*
free
mm2
*/
movq
%
mm5
,
%
mm3
punpcklwd
%
mm4
,
%
mm5
punpckhwd
%
mm4
,
%
mm3
/*
free
mm4
*/
movq
%
mm7
,
%
mm2
movq
%
mm6
,
%
mm4
punpckldq
%
mm5
,
%
mm7
/*
tmt0
*/
punpckhdq
%
mm5
,
%
mm2
/*
tmt2
; free mm5 */
/*
shuffle
the
rest
of
the
data
,
and
write
it
with
2
mmword
writes
*/
punpckldq
%
mm3
,
%
mm6
/*
tmt4
*/
/*
moved
from
next
block
*/
movq
%
mm2
,
%
mm5
/*
duplicate
tmt2
*/
punpckhdq
%
mm3
,
%
mm4
/*
tmt6
; free mm3 */
/*
moved
from
next
block
*/
movq
%
mm0
,
%
mm3
/*
duplicate
tmt10
*/
/*
column
0
:
odd
part
(
after
transpose
)
*
moved
up
to
prev
block
*
movq
%
mm0
,
%
mm3
duplicate
tmt10
*
movq
%
mm2
,
%
mm5
duplicate
tmt2
*/
psubsw
%
mm4
,
%
mm0
/*
V110
*/
paddsw
%
mm4
,
%
mm3
/*
V113
; free mm4 */
movq
%
mm0
,
%
mm4
/*
duplicate
V110
*/
paddsw
%
mm1
,
%
mm2
/*
V111
*/
pmulhw
x539f539f539f539f
,
%
mm0
/*
21407
->
V117
*/
psubsw
%
mm1
,
%
mm5
/*
V112
; free mm1 */
psubsw
%
mm5
,
%
mm4
/*
V116
*/
movq
%
mm2
,
%
mm1
/*
duplicate
V111
*/
pmulhw
x4546454645464546
,
%
mm5
/*
17734
->
V119
*/
psubsw
%
mm3
,
%
mm2
/*
V114
*/
pmulhw
x61f861f861f861f8
,
%
mm4
/*
25080
->
V120
*/
paddsw
%
mm3
,
%
mm1
/*
V115
; free mm3 */
pmulhw
x5a825a825a825a82
,
%
mm2
/*
23170
->
V118
*/
psllw
$
2
,
%
mm0
/*
t266
*/
movq
%
mm1
,
(%
esi
)
/*
save
V115
*/
psllw
$
1
,
%
mm5
/*
t268
*/
psubsw
%
mm4
,
%
mm5
/*
V122
*/
psubsw
%
mm0
,
%
mm4
/*
V121
; free mm0 */
psllw
$
1
,
%
mm5
/*
t270
*/
psubsw
%
mm1
,
%
mm5
/*
V123
; free mm1 */
psllw
$
2
,
%
mm2
/*
t272
*/
psubsw
%
mm5
,
%
mm2
/*
V124
(
keep
V123
)
*/
psllw
$
1
,
%
mm4
/*
t274
*/
movq
%
mm5
,
8
*
2
(%
esi
)
/*
save
V123
; free mm5 */
paddsw
%
mm2
,
%
mm4
/*
V125
(
keep
V124
)
*/
/*
column
0
:
even
part
(
after
transpose
)
*/
movq
8
*
12
(%
esi
),
%
mm0
/*
tmt12
*/
movq
%
mm6
,
%
mm3
/*
duplicate
tmt4
*/
psubsw
%
mm0
,
%
mm6
/*
V100
*/
paddsw
%
mm0
,
%
mm3
/*
V101
; free mm0 */
pmulhw
x5a825a825a825a82
,
%
mm6
/*
23170
->
V102
*/
movq
%
mm7
,
%
mm5
/*
duplicate
tmt0
*/
movq
8
*
8
(%
esi
),
%
mm1
/*
tmt8
*/
paddsw
%
mm1
,
%
mm7
/*
V103
*/
psubsw
%
mm1
,
%
mm5
/*
V104
; free mm1 */
movq
%
mm7
,
%
mm0
/*
duplicate
V103
*/
psllw
$
2
,
%
mm6
/*
t245
*/
paddsw
%
mm3
,
%
mm7
/*
V106
*/
movq
%
mm5
,
%
mm1
/*
duplicate
V104
*/
psubsw
%
mm3
,
%
mm6
/*
V105
*/
psubsw
%
mm3
,
%
mm0
/*
V109
; free mm3 */
paddsw
%
mm6
,
%
mm5
/*
V107
*/
psubsw
%
mm6
,
%
mm1
/*
V108
; free mm6 */
/*
column
0
:
output
butterfly
(
after
transform
)
*/
movq
%
mm1
,
%
mm3
/*
duplicate
V108
*/
paddsw
%
mm2
,
%
mm1
/*
out4
*/
psraw
$
4
,
%
mm1
psubsw
%
mm2
,
%
mm3
/*
out10
; free mm2 */
psraw
$
4
,
%
mm3
movq
%
mm0
,
%
mm6
/*
duplicate
V109
*/
movq
%
mm1
,
8
*
4
(%
esi
)
/*
out4
; free mm1 */
psubsw
%
mm4
,
%
mm0
/*
out6
*/
movq
%
mm3
,
8
*
10
(%
esi
)
/*
out10
; free mm3 */
psraw
$
4
,
%
mm0
paddsw
%
mm4
,
%
mm6
/*
out8
; free mm4 */
movq
%
mm7
,
%
mm1
/*
duplicate
V106
*/
movq
%
mm0
,
8
*
6
(%
esi
)
/*
out6
; free mm0 */
psraw
$
4
,
%
mm6
movq
(%
esi
),
%
mm4
/*
V115
*/
movq
%
mm6
,
8
*
8
(%
esi
)
/*
out8
; free mm6 */
movq
%
mm5
,
%
mm2
/*
duplicate
V107
*/
movq
8
*
2
(%
esi
),
%
mm3
/*
V123
*/
paddsw
%
mm4
,
%
mm7
/*
out0
*/
/*
moved
up
from
next
block
*/
movq
scratch3
,
%
mm0
psraw
$
4
,
%
mm7
/*
moved
up
from
next
block
*/
movq
scratch5
,
%
mm6
psubsw
%
mm4
,
%
mm1
/*
out14
; free mm4 */
paddsw
%
mm3
,
%
mm5
/*
out2
*/
psraw
$
4
,
%
mm1
movq
%
mm7
,
(%
esi
)
/*
out0
; free mm7 */
psraw
$
4
,
%
mm5
movq
%
mm1
,
8
*
14
(%
esi
)
/*
out14
; free mm1 */
psubsw
%
mm3
,
%
mm2
/*
out12
; free mm3 */
movq
%
mm5
,
8
*
2
(%
esi
)
/*
out2
; free mm5 */
psraw
$
4
,
%
mm2
/*
moved
up
to
the
prev
block
*/
movq
scratch7
,
%
mm4
/*
moved
up
to
the
prev
block
*/
psraw
$
4
,
%
mm0
movq
%
mm2
,
8
*
12
(%
esi
)
/*
out12
; free mm2 */
/*
moved
up
to
the
prev
block
*/
psraw
$
4
,
%
mm6
/*
move
back
the
data
to
its
correct
place
*
moved
up
to
the
prev
block
*
movq
scratch3
,
%
mm0
*
movq
scratch5
,
%
mm6
*
movq
scratch7
,
%
mm4
*
psraw
$
4
,
%
mm0
*
psraw
$
4
,
%
mm6
*/
movq
scratch1
,
%
mm1
psraw
$
4
,
%
mm4
movq
%
mm0
,
8
*
3
(%
esi
)
/*
out3
*/
psraw
$
4
,
%
mm1
movq
%
mm6
,
8
*
5
(%
esi
)
/*
out5
*/
movq
%
mm4
,
8
*
7
(%
esi
)
/*
out7
*/
movq
%
mm1
,
8
(%
esi
)
/*
out1
*/
/*
transpose
matrix
*/
movl
$
8
,
%
ebx
/*
ebx
is
x_size
*/
movl
%
esi
,
%
edi
/*
pointer
to
the
matrix
*/
movl
%
ebx
,
%
ecx
sal
$
2
,
%
ecx
movl
%
ebx
,
%
eax
addl
%
ebx
,
%
ecx
subl
$
4
,
%
eax
/*
eax
is
inner
loop
variable
*/
addl
%
ebx
,
%
ecx
/*
ecx
is
6
*
row
size
*/
movl
%
eax
,
%
edx
/*
edx
is
the
outer
loop
variable
*/
.
L1
:
movq
(%
esi
),
%
mm0
/*
first
line
*/
movq
(%
esi
,%
ebx
,
4
),
%
mm2
/*
third
line
*/
movq
%
mm0
,
%
mm6
/*
copy
first
line
*/
punpcklwd
(%
esi
,%
ebx
,
2
),
%
mm0
/*
interleave
first
and
second
lines
*/
movq
%
mm2
,
%
mm7
/*
copy
third
line
*/
punpcklwd
(%
esi
,%
ecx
),
%
mm2
/*
interleave
third
and
fourth
lines
*/
movq
%
mm0
,
%
mm4
/*
copy
first
intermediate
result
*/
movq
(%
esi
,%
ebx
,
2
),
%
mm1
/*
second
line
*/
/*
the
next
line
'punpcklwd %mm2, %mm0'
inverted
two
pixels
.
*/
/*
punpckldq
make
printing
cleaner
*/
punpckldq
%
mm2
,
%
mm0
/*
interleave
to
produce
result
1
*/
movq
(%
esi
,%
ecx
),
%
mm3
/*
fourth
line
*/
punpckhdq
%
mm2
,
%
mm4
/*
interleave
to
produce
result
2
*/
movq
%
mm0
,
(%
esi
)
/*
write
result
1
*/
punpckhwd
%
mm1
,
%
mm6
/*
interleave
first
and
second
lines
*/
movq
%
mm4
,
(%
esi
,%
ebx
,
2
)
/*
write
result
2
*/
punpckhwd
%
mm3
,
%
mm7
/*
interleave
3
rd
and
4
th
lines
*/
movq
%
mm6
,
%
mm5
/*
copy
first
intermediate
result
*/
punpckldq
%
mm7
,
%
mm6
/*
interleave
to
produce
result
3
*/
leal
(%
edi
,%
ebx
,
8
),
%
edi
/*
point
to
4
x4
set
4
rows
down
*/
punpckhdq
%
mm7
,
%
mm5
/*
interleave
to
produce
result
4
*/
movq
%
mm6
,
(%
esi
,%
ebx
,
4
)
/*
write
result
3
*/
movq
%
mm5
,
(%
esi
,%
ecx
)
/*
write
result
4
*/
/*
check
to
see
if
number
of
rows
left
is
zero
*/
cmpl
$
0
,
%
edx
/*
last
time
through
you
are
done
and
ready
to
exit
*/
je
.
L3
.
L2
:
movq
8
(%
esi
),
%
mm0
/*
first
line
*/
movq
8
(%
esi
,%
ebx
,
4
),
%
mm2
/*
third
line
*/
movq
%
mm0
,
%
mm6
/*
copy
first
line
*/
punpcklwd
8
(%
esi
,%
ebx
,
2
),
%
mm0
/*
interleave
first
and
second
lines
*/
movq
%
mm2
,
%
mm7
/*
copy
third
line
*/
punpcklwd
8
(%
esi
,%
ecx
),
%
mm2
/*
interleave
3
rd
and
4
th
lines
*/
movq
%
mm0
,
%
mm4
/*
copy
first
intermediate
*/
movq
(%
edi
),
%
mm1
/*
first
line
*/
punpckldq
%
mm2
,
%
mm0
/*
interleave
to
produce
1
st
result
*/
movq
(%
edi
,%
ebx
,
4
),
%
mm3
/*
third
line
*/
punpckhdq
%
mm2
,
%
mm4
/*
interleave
to
produce
2
nd
result
*/
punpckhwd
8
(%
esi
,%
ebx
,
2
),
%
mm6
/*
interleave
1
st
and
2
nd
lines
*/
movq
%
mm1
,
%
mm2
/*
copy
first
line
*/
punpckhwd
8
(%
esi
,%
ecx
),
%
mm7
/*
interleave
3
rd
and
4
th
lines
*/
movq
%
mm6
,
%
mm5
/*
copy
first
intermediate
*/
movq
%
mm0
,
(%
edi
)
/*
write
result
1
*/
punpckhdq
%
mm7
,
%
mm5
/*
produce
third
result
*/
punpcklwd
(%
edi
,%
ebx
,
2
),
%
mm1
/*
interleave
1
st
and
2
nd
lines
*/
movq
%
mm3
,
%
mm0
/*
copy
third
line
*/
punpckhwd
(%
edi
,%
ebx
,
2
),
%
mm2
/*
interleave
1
st
and
2
nd
lines
*/
movq
%
mm4
,
(%
edi
,%
ebx
,
2
)
/*
write
result
2
*/
punpckldq
%
mm7
,
%
mm6
/*
produce
fourth
result
*/
punpcklwd
(%
edi
,%
ecx
),
%
mm3
/*
interleave
3
rd
and
4
th
lines
*/
movq
%
mm1
,
%
mm4
/*
copy
first
intermediate
*/
movq
%
mm6
,
(%
edi
,%
ebx
,
4
)
/*
write
result
3
*/
punpckldq
%
mm3
,
%
mm1
punpckhwd
(%
edi
,%
ecx
),
%
mm0
/*
interleave
3
rd
and
4
th
lines
*/
movq
%
mm2
,
%
mm6
/*
copy
second
intermediate
*/
movq
%
mm5
,
(%
edi
,%
ecx
)
/*
write
result
4
*/
punpckhdq
%
mm3
,
%
mm4
/*
produce
second
result
*/
movq
%
mm1
,
8
(%
esi
)
/*
write
result
5
*/
punpckldq
%
mm0
,
%
mm2
/*
produce
third
result
*/
movq
%
mm4
,
8
(%
esi
,%
ebx
,
2
)
/*
write
result
6
*/
punpckhdq
%
mm0
,
%
mm6
/*
produce
fourth
result
*/
movq
%
mm2
,
8
(%
esi
,%
ebx
,
4
)
/*
write
result
7
*/
movq
%
mm6
,
8
(%
esi
,%
ecx
)
/*
write
result
8
*/
/*
increment
%
esi
to
point
to
next
4
x4
block
in
same
row
*/
addl
$
8
,
%
esi
/*
increment
%
edi
to
point
to
next
4
x4
block
below
current
*/
leal
(%
edi
,%
ebx
,
8
),
%
edi
sub
$
4
,
%
eax
/*
decrement
inner
loop
var
*/
jnz
.
L2
/*
%
edi
points
to
start
of
second
row
in
block
just
finished
*/
sal
$
1
,
%
edx
leal
8
(%
esi
,%
ebx
,
8
),
%
esi
subl
%
edx
,
%
esi
/*
subtract
the
number
of
bytes
in
last
row
*/
/*
now
we
point
to
spot
where
row
=
col
*/
subl
$
8
,
%
edx
/*
sub
4
from
row
number
*/
sarl
$
1
,
%
edx
mov
%
esi
,
%
edi
mov
%
edx
,
%
eax
/*
reset
x_size
to
outer
loop
variable
to
start
new
row
*/
jmp
.
L1
.
L3
:
emms
popl
%
edi
popl
%
esi
popl
%
edx
popl
%
ecx
popl
%
ebx
movl
%
ebp
,%
esp
popl
%
ebp
ret
.
Lfe1
:
.
size
vdec_IDCT
,
.
Lfe1
-
vdec_IDCT
plugins/idct/idctmmxext.c
View file @
cf0b7cf9
...
...
@@ -2,9 +2,13 @@
* idctmmxext.c : MMX EXT IDCT module
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idctmmxext.c,v 1.
3 2001/01/16 16:09:52 sam
Exp $
* $Id: idctmmxext.c,v 1.
4 2001/01/17 18:17:30 massiot
Exp $
*
* Authors:
* Authors: Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
* Michel Lespinasse <walken@zoy.org>
* Peter Gubanov <peter@elecard.net.ru>
* (from the LiViD project)
* Christophe Massiot <massiot@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
...
...
@@ -46,18 +50,15 @@
#include "idct.h"
#include "attributes.h"
#include "mmx.h"
/*****************************************************************************
* Local prototypes.
*****************************************************************************/
static
void
idct_getfunctions
(
function_list_t
*
p_function_list
);
static
int
idct_Probe
(
probedata_t
*
p_data
);
void
vdec_InitIDCT
(
vdec_thread_t
*
p_vdec
);
void
vdec_SparseIDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_sparse_pos
);
void
vdec_IDCT
(
vdec_thread_t
*
p_vdec
,
dctelem_t
*
p_block
,
int
i_idontcare
);
static
void
vdec_NormScan
(
u8
ppi_scan
[
2
][
64
]
);
/*****************************************************************************
...
...
@@ -136,6 +137,7 @@ static void idct_getfunctions( function_list_t * p_function_list )
p_function_list
->
functions
.
idct
.
pf_init
=
vdec_InitIDCT
;
p_function_list
->
functions
.
idct
.
pf_sparse_idct
=
vdec_SparseIDCT
;
p_function_list
->
functions
.
idct
.
pf_idct
=
vdec_IDCT
;
p_function_list
->
functions
.
idct
.
pf_norm_scan
=
vdec_NormScan
;
}
/*****************************************************************************
...
...
@@ -143,16 +145,16 @@ static void idct_getfunctions( function_list_t * p_function_list )
*****************************************************************************/
static
int
idct_Probe
(
probedata_t
*
p_data
)
{
if
(
0
/*TestCPU( CPU_CAPABILITY_MMXEXT )*/
)
if
(
TestCPU
(
CPU_CAPABILITY_MMXEXT
)
)
{
if
(
TestMethod
(
IDCT_METHOD_VAR
,
"idctmmxext"
)
)
{
return
(
999
);
}
else
else
{
return
(
200
);
}
}
}
else
{
...
...
@@ -160,3 +162,332 @@ static int idct_Probe( probedata_t *p_data )
}
}
/*****************************************************************************
* vdec_NormScan : This IDCT uses reordered coeffs, so we patch the scan table
*****************************************************************************/
static void vdec_NormScan( u8 ppi_scan[2][64] )
{
    /* Rewrite, in place, every entry of both scan tables.
     *
     * Each entry is a 6-bit coefficient index.  Bits 3..5 are kept as they
     * are; the low three bits are rotated right by one position, so the
     * values 0,1,2,3,4,5,6,7 become 0,4,1,5,2,6,3,7.  Within each group of
     * eight, even positions therefore land in slots 0..3 and odd positions
     * in slots 4..7. */
    int i_table, i_pos;

    for( i_pos = 0; i_pos < 64; i_pos++ )
    {
        for( i_table = 0; i_table < 2; i_table++ )
        {
            int i_old = ppi_scan[i_table][i_pos];

            ppi_scan[i_table][i_pos] = ( i_old & 0x38 )
                                     | ( ( i_old & 6 ) >> 1 )
                                     | ( ( i_old & 1 ) << 2 );
        }
    }
}
/*****************************************************************************
* vdec_IDCT :
*****************************************************************************/
/* Right shift applied (psrad) to every 32-bit row result in Row(), RowTail()
 * and RowMid(); also the number of fractional bits used by round() below. */
#define ROW_SHIFT 11
/* Not referenced in the visible part of this file.
 * NOTE(review): presumably the shift used by the column pass - confirm. */
#define COL_SHIFT 6
/* Fixed-point rounding constant: (bias + 0.5) scaled by 2^ROW_SHIFT and
 * truncated to int.
 * NOTE(review): the macro name is the same as C99's round() from <math.h>;
 * confirm <math.h> is never included after this point. */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
/* Brace initializer holding the same rounding constant twice.
 * NOTE(review): presumably sized so that one 64-bit paddd_m2r in Row() adds
 * it to both 32-bit lanes - confirm against the s32 typedef. */
#define rounder(bias) {round (bias), round (bias)}
/* Expands seven coefficients c1..c7 into a 32-entry initializer.  The entries
 * are grouped in runs of four, matching the offsets read by RowHead()/RowMid()
 * (table, table+4) and Row() (table+8, +12, +16, +20, +24, +28). */
#define table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
c4, c6, c4, c6, \
c1, c3, -c1, -c5, \
c5, c7, c3, -c7, \
c4, -c6, c4, -c6, \
-c4, c2, c4, -c2, \
c5, -c1, c3, -c1, \
c7, c3, c7, -c5 }
/* RowHead: prologue of the row pass.  Loads one row of input and the first
 * two coefficient groups into MMX registers and starts the first multiply.
 *
 *   row    - base pointer to the coefficient block
 *   offset - element index of the row to load (row+offset and row+offset+4
 *            are read)
 *   table  - coefficient table for this row (table and table+4 are read)
 *
 * Nothing is written to memory.  On exit, as established by the instructions
 * below:
 *   mm0 = first quadword of the row (unshuffled copy)
 *   mm2 = the same quadword with its two 32-bit halves swapped
 *   mm5 = mm6 = second quadword of the row
 *   mm3 = pmaddwd of mm0 with table[0..3]
 *   mm4 = table[4..7]
 * Row() must be the next helper executed, since it consumes these registers.
 *
 * Lane comments list words from most to least significant.  The "x6 x4 x2 x0"
 * labels assume the coefficients were stored in the order produced by
 * vdec_NormScan and that dctelem_t is 16 bits wide - TODO confirm the latter.
 */
static __inline__ void RowHead( dctelem_t * row, int offset, dctelem_t * table )
{
    movq_m2r( *(row+offset), mm2 );     // mm2 = x6 x4 x2 x0

    movq_m2r( *(row+offset+4), mm5 );   // mm5 = x7 x5 x3 x1
    movq_r2r( mm2, mm0 );               // mm0 = x6 x4 x2 x0

    movq_m2r( *table, mm3 );            // mm3 = -C2 -C4 C2 C4
    movq_r2r( mm5, mm6 );               // mm6 = x7 x5 x3 x1

    movq_m2r( *(table+4), mm4 );        // mm4 = C6 C4 C6 C4
    pmaddwd_r2r( mm0, mm3 );            // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2

    /* 0x4e selects words 2,3,0,1: swaps the two 32-bit halves of mm2 */
    pshufw_r2r( mm2, mm2, 0x4e );       // mm2 = x2 x0 x6 x4
}
/* Row: core of the row pass.  Works entirely on the register state left by
 * RowHead() or RowMid() (mm0, mm2..mm6) plus the remaining coefficient groups.
 *
 *   table   - coefficient table for this row; groups at table+8, +12, +16,
 *             +20, +24 and +28 are read here
 *   rounder - pointer to the rounding constant; one quadword is added to the
 *             "a" sums in mm3 and in mm0
 *
 * Nothing is written to memory.  On exit:
 *   mm1 = (a1+b1, a0+b0) already shifted right by ROW_SHIFT  -> y1 y0
 *   mm3 = (a1-b1, a0-b0) already shifted right by ROW_SHIFT  -> y6 y7
 *   mm0 = (a3+b3, a2+b2) + rounder, NOT yet shifted
 *   mm4 = (a3-b3, a2-b2) + rounder, NOT yet shifted
 * RowTail() or RowMid() must follow: they apply the two missing shifts, pack
 * and store the results.
 *
 * The instructions are interleaved in pairs for scheduling; every register is
 * reused as soon as its previous value is dead, so the statement order must
 * not be changed.
 */
static __inline__ void Row( dctelem_t * table, s32 * rounder )
{
    movq_m2r( *(table+8), mm1 );        // mm1 = -C5 -C1 C3 C1
    pmaddwd_r2r( mm2, mm4 );            // mm4 = C4*x0+C6*x2 C4*x4+C6*x6

    pmaddwd_m2r( *(table+16), mm0 );    // mm0 = C4*x4-C6*x6 C4*x0-C6*x2
    pshufw_r2r( mm6, mm6, 0x4e );       // mm6 = x3 x1 x7 x5

    movq_m2r( *(table+12), mm7 );       // mm7 = -C7 C3 C7 C5
    pmaddwd_r2r( mm5, mm1 );            // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3

    paddd_m2r( *rounder, mm3 );         // mm3 += rounder
    pmaddwd_r2r( mm6, mm7 );            // mm7 = C3*x1-C7*x3 C5*x5+C7*x7

    pmaddwd_m2r( *(table+20), mm2 );    // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
    paddd_r2r( mm4, mm3 );              // mm3 = a1 a0 + rounder

    pmaddwd_m2r( *(table+24), mm5 );    // mm5 = C3*x5-C1*x7 C5*x1-C1*x3
    movq_r2r( mm3, mm4 );               // mm4 = a1 a0 + rounder

    pmaddwd_m2r( *(table+28), mm6 );    // mm6 = C7*x1-C5*x3 C7*x5+C3*x7
    paddd_r2r( mm7, mm1 );              // mm1 = b1 b0

    paddd_m2r( *rounder, mm0 );         // mm0 += rounder
    psubd_r2r( mm1, mm3 );              // mm3 = a1-b1 a0-b0 + rounder

    psrad_i2r( ROW_SHIFT, mm3 );        // mm3 = y6 y7
    paddd_r2r( mm4, mm1 );              // mm1 = a1+b1 a0+b0 + rounder

    paddd_r2r( mm2, mm0 );              // mm0 = a3 a2 + rounder
    psrad_i2r( ROW_SHIFT, mm1 );        // mm1 = y1 y0

    paddd_r2r( mm6, mm5 );              // mm5 = b3 b2
    movq_r2r( mm0, mm4 );               // mm4 = a3 a2 + rounder

    paddd_r2r( mm5, mm0 );              // mm0 = a3+b3 a2+b2 + rounder
    psubd_r2r( mm5, mm4 );              // mm4 = a3-b3 a2-b2 + rounder
}
/* RowTail: epilogue of the row pass.  Finishes the results left in registers
 * by Row() and writes one output row.
 *
 *   row   - base pointer to the coefficient block
 *   store - element index of the row to write (row+store and row+store+4
 *           are written, one quadword each)
 *
 * Expects mm1 and mm3 already shifted, and mm0 and mm4 still unshifted, which
 * is the state Row() ends in.  The 32-bit results are narrowed to 16 bits by
 * packssdw.
 */
static __inline__ void RowTail( dctelem_t * row, int store )
{
    psrad_i2r( ROW_SHIFT, mm0 );        // mm0 = y3 y2

    psrad_i2r( ROW_SHIFT, mm4 );        // mm4 = y4 y5

    packssdw_r2r( mm0, mm1 );           // mm1 = y3 y2 y1 y0

    packssdw_r2r( mm3, mm4 );           // mm4 = y6 y7 y4 y5

    movq_r2m( mm1, *(row+store) );      // save y3 y2 y1 y0
    /* 0xb1 selects words 1,0,3,2: swaps the words inside each 32-bit half,
     * putting y4..y7 into ascending order */
    pshufw_r2r( mm4, mm4, 0xb1 );       // mm4 = y7 y6 y5 y4

    // slot

    movq_r2m( mm4, *(row+store+4) );    // save y7 y6 y5 y4
}
/* RowMid: epilogue of one row fused with the prologue of the next.  It issues
 * exactly the instructions of RowTail(row, store) and RowHead(row, offset,
 * table), interleaved so that the loads for the next row overlap the
 * shift/pack/store of the previous one.
 *
 *   row    - base pointer to the coefficient block
 *   store  - element index of the finished row to write (row+store and
 *            row+store+4 are written)
 *   offset - element index of the next row to load (row+offset and
 *            row+offset+4 are read)
 *   table  - coefficient table for the next row (table and table+4 are read)
 *
 * Entry state is the one Row() ends in; exit state is the one RowHead() ends
 * in, so Row() must be the next helper executed.  The order matters: mm0, mm3
 * and mm4 are each reloaded for the next row only after their old value has
 * been packed or stored.
 */
static __inline__ void RowMid( dctelem_t * row, int store, int offset, dctelem_t * table )
{
    movq_m2r( *(row+offset), mm2 );     // mm2 = x6 x4 x2 x0
    psrad_i2r( ROW_SHIFT, mm0 );        // mm0 = y3 y2

    movq_m2r( *(row+offset+4), mm5 );   // mm5 = x7 x5 x3 x1
    psrad_i2r( ROW_SHIFT, mm4 );        // mm4 = y4 y5

    packssdw_r2r( mm0, mm1 );           // mm1 = y3 y2 y1 y0
    movq_r2r( mm5, mm6 );               // mm6 = x7 x5 x3 x1

    packssdw_r2r( mm3, mm4 );           // mm4 = y6 y7 y4 y5
    movq_r2r( mm2, mm0 );               // mm0 = x6 x4 x2 x0

    movq_r2m( mm1, *(row+store) );      // save y3 y2 y1 y0
    pshufw_r2r( mm4, mm4, 0xb1 );       // mm4 = y7 y6 y5 y4

    movq_m2r( *table, mm3 );            // mm3 = -C2 -C4 C2 C4
    movq_r2m( mm4, *(row+store+4) );    // save y7 y6 y5 y4

    pmaddwd_r2r( mm0, mm3 );            // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2

    movq_m2r( *(table+4), mm4 );        // mm4 = C6 C4 C6 C4
    pshufw_r2r( mm2, mm2, 0x4e );       // mm2 = x2 x0 x6 x4
}
/*
 * Col : column pass of the IDCT, performed in place.
 *
 *   col    : base pointer of the coefficient block
 *   offset : element offset of the first column handled by this call
 *
 * Every load and store addresses col+offset+k*8 for k = 0..7, i.e. the same
 * column position in each of the eight rows, one 64-bit movq at a time.
 * vdec_IDCT() calls this twice, with offset 0 and offset 4.
 * NOTE(review): that covers all eight columns only if dctelem_t is 16 bits
 * wide (four elements per movq) -- the typedef is not visible here, confirm.
 *
 * NOTE(review): scratch0/scratch1 are function-local statics, so they are
 * shared by every caller.  If several decoder threads can run this
 * concurrently the spills below would race -- confirm the threading model.
 *
 * NOTE(review): T1, T2, T3 and C4 are #defined inside the body and never
 * #undef'd here, so they remain defined for the rest of the translation
 * unit.
 */
static __inline__ void Col( dctelem_t * col, int offset )
{
/* These values match tan(k*pi/16) * 65536 for k = 1, 2, 3, and
 * cos(pi/4) * 32768 for C4. */
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170

    /* Each constant replicated into four 16-bit lanes.
     * T3 is larger than 32767; with a 16-bit short it is stored as
     * T3 - 65536.  The "(T3-1)*x" comments below and the paddsw that adds
     * x back after each pmulhw are consistent with that being intended. */
    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};

    /* Spill slots: scratch0 holds b3, scratch1 holds b0. */
    static mmx_t scratch0, scratch1;

    /* column code adapted from peter gubanov */
    /* http://www.elecard.com/peter/idct.shtml */

    /* --- odd part: inputs x1, x3, x5, x7 -> b0..b3 --- */
    movq_m2r( *_T1, mm0 );                  // mm0 = T1

    movq_m2r( *(col+offset+1*8), mm1 );     // mm1 = x1
    movq_r2r( mm0, mm2 );                   // mm2 = T1

    movq_m2r( *(col+offset+7*8), mm4 );     // mm4 = x7
    pmulhw_r2r( mm1, mm0 );                 // mm0 = T1*x1

    movq_m2r( *_T3, mm5 );                  // mm5 = T3
    pmulhw_r2r( mm4, mm2 );                 // mm2 = T1*x7

    movq_m2r( *(col+offset+5*8), mm6 );     // mm6 = x5
    movq_r2r( mm5, mm7 );                   // mm7 = T3-1

    movq_m2r( *(col+offset+3*8), mm3 );     // mm3 = x3
    psubsw_r2r( mm4, mm0 );                 // mm0 = v17

    movq_m2r( *_T2, mm4 );                  // mm4 = T2
    pmulhw_r2r( mm3, mm5 );                 // mm5 = (T3-1)*x3

    paddsw_r2r( mm2, mm1 );                 // mm1 = u17
    pmulhw_r2r( mm6, mm7 );                 // mm7 = (T3-1)*x5

    //slot

    movq_r2r( mm4, mm2 );                   // mm2 = T2
    paddsw_r2r( mm3, mm5 );                 // mm5 = T3*x3

    pmulhw_m2r( *(col+offset+2*8), mm4 );   // mm4 = T2*x2
    paddsw_r2r( mm6, mm7 );                 // mm7 = T3*x5

    psubsw_r2r( mm6, mm5 );                 // mm5 = v35
    paddsw_r2r( mm3, mm7 );                 // mm7 = u35

    movq_m2r( *(col+offset+6*8), mm3 );     // mm3 = x6
    movq_r2r( mm0, mm6 );                   // mm6 = v17

    pmulhw_r2r( mm3, mm2 );                 // mm2 = T2*x6
    psubsw_r2r( mm5, mm0 );                 // mm0 = b3

    psubsw_r2r( mm3, mm4 );                 // mm4 = v26
    paddsw_r2r( mm6, mm5 );                 // mm5 = v12

    movq_r2m( mm0, scratch0 );              // save b3
    movq_r2r( mm1, mm6 );                   // mm6 = u17

    paddsw_m2r( *(col+offset+2*8), mm2 );   // mm2 = u26
    paddsw_r2r( mm7, mm6 );                 // mm6 = b0

    psubsw_r2r( mm7, mm1 );                 // mm1 = u12
    movq_r2r( mm1, mm7 );                   // mm7 = u12

    movq_m2r( *(col+offset+0*8), mm3 );     // mm3 = x0
    paddsw_r2r( mm5, mm1 );                 // mm1 = u12+v12

    movq_m2r( *_C4, mm0 );                  // mm0 = C4/2
    psubsw_r2r( mm5, mm7 );                 // mm7 = u12-v12

    movq_r2m( mm6, scratch1 );              // save b0
    pmulhw_r2r( mm0, mm1 );                 // mm1 = b1/2

    movq_r2r( mm4, mm6 );                   // mm6 = v26
    pmulhw_r2r( mm0, mm7 );                 // mm7 = b2/2

    /* --- even part: inputs x0, x2, x4, x6 -> a0..a3 --- */
    movq_m2r( *(col+offset+4*8), mm5 );     // mm5 = x4
    movq_r2r( mm3, mm0 );                   // mm0 = x0

    psubsw_r2r( mm5, mm3 );                 // mm3 = v04
    paddsw_r2r( mm5, mm0 );                 // mm0 = u04

    paddsw_r2r( mm3, mm4 );                 // mm4 = a1
    movq_r2r( mm0, mm5 );                   // mm5 = u04

    psubsw_r2r( mm6, mm3 );                 // mm3 = a2
    paddsw_r2r( mm2, mm5 );                 // mm5 = a0

    /* b1 and b2 were computed as halves above; doubling restores them. */
    paddsw_r2r( mm1, mm1 );                 // mm1 = b1
    psubsw_r2r( mm2, mm0 );                 // mm0 = a3

    paddsw_r2r( mm7, mm7 );                 // mm7 = b2
    movq_r2r( mm3, mm2 );                   // mm2 = a2

    /* --- butterflies, COL_SHIFT scaling and stores --- */
    movq_r2r( mm4, mm6 );                   // mm6 = a1
    paddsw_r2r( mm7, mm3 );                 // mm3 = a2+b2

    psraw_i2r( COL_SHIFT, mm3 );            // mm3 = y2
    paddsw_r2r( mm1, mm4 );                 // mm4 = a1+b1

    psraw_i2r( COL_SHIFT, mm4 );            // mm4 = y1
    psubsw_r2r( mm1, mm6 );                 // mm6 = a1-b1

    movq_m2r( scratch1, mm1 );              // mm1 = b0
    psubsw_r2r( mm7, mm2 );                 // mm2 = a2-b2

    psraw_i2r( COL_SHIFT, mm6 );            // mm6 = y6
    movq_r2r( mm5, mm7 );                   // mm7 = a0

    movq_r2m( mm4, *(col+offset+1*8) );     // save y1
    psraw_i2r( COL_SHIFT, mm2 );            // mm2 = y5

    movq_r2m( mm3, *(col+offset+2*8) );     // save y2
    paddsw_r2r( mm1, mm5 );                 // mm5 = a0+b0

    movq_m2r( scratch0, mm4 );              // mm4 = b3
    psubsw_r2r( mm1, mm7 );                 // mm7 = a0-b0

    psraw_i2r( COL_SHIFT, mm5 );            // mm5 = y0
    movq_r2r( mm0, mm3 );                   // mm3 = a3

    movq_r2m( mm2, *(col+offset+5*8) );     // save y5
    psubsw_r2r( mm4, mm3 );                 // mm3 = a3-b3

    psraw_i2r( COL_SHIFT, mm7 );            // mm7 = y7
    paddsw_r2r( mm0, mm4 );                 // mm4 = a3+b3

    movq_r2m( mm5, *(col+offset+0*8) );     // save y0
    psraw_i2r( COL_SHIFT, mm3 );            // mm3 = y4

    movq_r2m( mm6, *(col+offset+6*8) );     // save y6
    psraw_i2r( COL_SHIFT, mm4 );            // mm4 = y3

    movq_r2m( mm7, *(col+offset+7*8) );     // save y7

    movq_r2m( mm3, *(col+offset+4*8) );     // save y4

    movq_r2m( mm4, *(col+offset+3*8) );     // save y3
}
/*
 * Per-row rounding constants.  vdec_IDCT() passes rounderN to the Row() call
 * that processes row N, and Row() adds *rounder to mm0 ("mm0 += rounder").
 *
 * NOTE(review): the rounder() macro is defined outside this excerpt; it
 * presumably expands its argument into a brace-enclosed s32 initializer --
 * confirm against its definition.
 * NOTE(review): the (1 << (COL_SHIFT - 1)) term in rounder0 looks like the
 * rounding bias of the later column shift folded into row 0 -- confirm.
 *
 * The trailing formulas are the original author's derivations of each bias.
 */
static s32 rounder0[] ATTR_ALIGN(8) =
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
static s32 rounder4[] ATTR_ALIGN(8) = rounder (0);
static s32 rounder1[] ATTR_ALIGN(8) =
    rounder (1.25683487303);        // C1*(C1/C4+C1+C7)/2
static s32 rounder7[] ATTR_ALIGN(8) =
    rounder (-0.25);                // C1*(C7/C4+C7-C1)/2
static s32 rounder2[] ATTR_ALIGN(8) =
    rounder (0.60355339059);        // C2 * (C6+C2)/2
static s32 rounder6[] ATTR_ALIGN(8) =
    rounder (-0.25);                // C2 * (C6-C2)/2
static s32 rounder3[] ATTR_ALIGN(8) =
    rounder (0.087788325588);       // C3*(-C3/C4+C3+C5)/2
static s32 rounder5[] ATTR_ALIGN(8) =
    rounder (-0.441341716183);      // C3*(-C5/C4+C5-C3)/2
/*
 * vdec_IDCT : full 8x8 inverse DCT of one block, performed in place.
 *
 *   p_vdec      : not referenced in this body
 *   p_block     : the 64 coefficients; overwritten with the result
 *   i_idontcare : not referenced in this body
 *
 * The row pass handles the rows in the order 0, 4, 1, 7, 2, 6, 3, 5 so that
 * two consecutive rows always share one coefficient table (table04, table17,
 * table26, table35).  RowHead() starts the first row, each RowMid() stores
 * the row just computed and starts the next, and RowTail() stores the last.
 * The helpers pass intermediate values to each other in MMX registers, so
 * the call order below must be kept exactly as is.
 *
 * The column pass then runs twice, at element offsets 0 and 4.
 *
 * NOTE(review): no emms is issued anywhere in this function; presumably the
 * caller is responsible for restoring the FPU state -- confirm.
 * NOTE(review): table() is a macro defined outside this excerpt; it
 * presumably expands seven constants into the layout Row() expects.
 */
void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
                int i_idontcare )
{
    static dctelem_t table04[] ATTR_ALIGN(16) =
        table (22725, 21407, 19266, 16384, 12873,  8867, 4520);
    static dctelem_t table17[] ATTR_ALIGN(16) =
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);
    static dctelem_t table26[] ATTR_ALIGN(16) =
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);
    static dctelem_t table35[] ATTR_ALIGN(16) =
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);

    /* Row pass: rows 0 and 4 */
    RowHead( p_block, 0*8, table04 );
    Row( table04, rounder0 );
    RowMid( p_block, 0*8, 4*8, table04 );
    Row( table04, rounder4 );

    /* rows 1 and 7 */
    RowMid( p_block, 4*8, 1*8, table17 );
    Row( table17, rounder1 );
    RowMid( p_block, 1*8, 7*8, table17 );
    Row( table17, rounder7 );

    /* rows 2 and 6 */
    RowMid( p_block, 7*8, 2*8, table26 );
    Row( table26, rounder2 );
    RowMid( p_block, 2*8, 6*8, table26 );
    Row( table26, rounder6 );

    /* rows 3 and 5 */
    RowMid( p_block, 6*8, 3*8, table35 );
    Row( table35, rounder3 );
    RowMid( p_block, 3*8, 5*8, table35 );
    Row( table35, rounder5 );
    RowTail( p_block, 5*8 );

    /* Column pass */
    Col( p_block, 0 );
    Col( p_block, 4 );
}
src/video_decoder/vdec_idct.h
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* vdec_idct.h : types for the inverse discrete cosine transform
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: vdec_idct.h,v 1.
3 2001/01/13 12:57:47 sam
Exp $
* $Id: vdec_idct.h,v 1.
4 2001/01/17 18:17:30 massiot
Exp $
*
* Authors: Gaël Hendryckx <jimmy@via.ecp.fr>
* Christophe Massiot <massiot@via.ecp.fr>
...
...
@@ -26,4 +26,4 @@ struct vdec_thread_s;
/* Initialization hook of an IDCT implementation; receives the decoder
 * thread. */
typedef void (*idct_init_t) ( struct vdec_thread_s * );
/* IDCT entry point: decoder thread, coefficient block, and an int argument
 * whose meaning is not visible from this header. */
typedef void (*f_idct_t) ( struct vdec_thread_s *, dctelem_t *, int );
/* Hook that is handed both 64-entry scan tables; vpar_InitScanTable() calls
 * it on the parser's private copy and then treats the tables as possibly
 * changed. */
typedef void (*norm_scan_t) ( u8 ppi_scan[2][64] );
src/video_decoder/video_parser.h
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* video_parser.h : video parser thread
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: video_parser.h,v 1.
4 2001/01/15 13:25:09
massiot Exp $
* $Id: video_parser.h,v 1.
5 2001/01/17 18:17:30
massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
*
...
...
@@ -125,14 +125,20 @@ typedef struct vpar_thread_s
lookup_t
*
pl_coded_pattern
;
/* variable length codes for the structure dct_dc_size for intra blocks */
lookup_t
*
pppl_dct_dc_size
[
2
][
2
];
/* Structure to store the tables B14 & B15 (ISO/
CEI
13818-2 B.4) */
/* Structure to store the tables B14 & B15 (ISO/
IEC
13818-2 B.4) */
dct_lookup_t
ppl_dct_coef
[
2
][
16384
];
/* Scan table */
u8
ppi_scan
[
2
][
64
];
/* Default quantization matrices */
u8
pi_default_intra_quant
[
64
];
u8
pi_default_nonintra_quant
[
64
];
/* IDCT plugin used and shortcuts to access its capabilities */
struct
module_s
*
p_module
;
idct_init_t
pf_init
;
f_idct_t
pf_sparse_idct
;
f_idct_t
pf_idct
;
norm_scan_t
pf_norm_scan
;
#ifdef STATS
/* Statistics */
...
...
src/video_decoder/vpar_blocks.h
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* vpar_blocks.h : video parser blocks management
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: vpar_blocks.h,v 1.
1 2000/12/21 17:19:52
massiot Exp $
* $Id: vpar_blocks.h,v 1.
2 2001/01/17 18:17:30
massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
* Jean-Marc Dressler <polux@via.ecp.fr>
...
...
@@ -157,8 +157,8 @@ typedef struct dct_lookup_s
/*****************************************************************************
* Constants
*****************************************************************************/
extern
int
pi_default_intra_quant
[
];
extern
int
pi_default_nonintra_quant
[
];
extern
u8
pi_default_intra_quant
[
64
];
extern
u8
pi_default_nonintra_quant
[
64
];
extern
u8
pi_scan
[
2
][
64
];
/*****************************************************************************
...
...
@@ -170,4 +170,5 @@ void vpar_InitPMBType( struct vpar_thread_s * p_vpar );
void
vpar_InitBMBType
(
struct
vpar_thread_s
*
p_vpar
);
void
vpar_InitCodedPattern
(
struct
vpar_thread_s
*
p_vpar
);
void
vpar_InitDCTTables
(
struct
vpar_thread_s
*
p_vpar
);
void
vpar_InitScanTable
(
struct
vpar_thread_s
*
p_vpar
);
void
vpar_PictureData
(
struct
vpar_thread_s
*
p_vpar
,
int
i_mb_base
);
src/video_decoder/vpar_headers.h
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* vpar_headers.h : video parser : headers parsing
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: vpar_headers.h,v 1.
3 2001/01/15 18:02:48
massiot Exp $
* $Id: vpar_headers.h,v 1.
4 2001/01/17 18:17:30
massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
* Stéphane Borel <stef@via.ecp.fr>
...
...
@@ -40,7 +40,7 @@
*****************************************************************************/
typedef
struct
quant_matrix_s
{
int
*
pi_matrix
;
u8
*
pi_matrix
;
boolean_t
b_allocated
;
/* Has the matrix been allocated by vpar_headers ? */
}
quant_matrix_t
;
...
...
src/video_parser/video_parser.c
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* video_parser.c : video parser thread
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: video_parser.c,v 1.6
7 2001/01/15 13:25:09
massiot Exp $
* $Id: video_parser.c,v 1.6
8 2001/01/17 18:17:31
massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
* Samuel Hocevar <sam@via.ecp.fr>
...
...
@@ -116,6 +116,7 @@ vlc_thread_t vpar_CreateThread( vdec_config_t * p_config )
p_vpar
->
pf_init
=
idct_functions
.
pf_init
;
p_vpar
->
pf_sparse_idct
=
idct_functions
.
pf_sparse_idct
;
p_vpar
->
pf_idct
=
idct_functions
.
pf_idct
;
p_vpar
->
pf_norm_scan
=
idct_functions
.
pf_norm_scan
;
#undef idct_functions
/* Spawn the video parser thread */
...
...
@@ -228,6 +229,7 @@ static int InitThread( vpar_thread_t *p_vpar )
vpar_InitPMBType
(
p_vpar
);
vpar_InitBMBType
(
p_vpar
);
vpar_InitDCTTables
(
p_vpar
);
vpar_InitScanTable
(
p_vpar
);
/*
* Initialize the synchro properties
...
...
src/video_parser/vpar_blocks.c
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* vpar_blocks.c : blocks parsing
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: vpar_blocks.c,v 1.7
0 2001/01/13 12:57:21 sam
Exp $
* $Id: vpar_blocks.c,v 1.7
1 2001/01/17 18:17:31 massiot
Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
* Jean-Marc Dressler <polux@via.ecp.fr>
...
...
@@ -609,6 +609,26 @@ void vpar_InitDCTTables( vpar_thread_t * p_vpar )
FillDCTTable
(
p_vpar
->
ppl_dct_coef
[
1
],
pl_DCT_tab6
,
1
,
16
,
16
);
}
/*****************************************************************************
 * vpar_InitScanTable : build the per-thread scan and default quant tables
 *****************************************************************************
 * The reference scan patterns are copied into the parser thread and handed
 * to the pf_norm_scan hook, which may rewrite them.  The thread's default
 * quantization matrices are then filled so that each default value sits at
 * the position the (possibly rewritten) scan table assigns to it.
 *****************************************************************************/
void vpar_InitScanTable( vpar_thread_t * p_vpar )
{
    int i_coef;

    /* Private copy of the reference patterns, then let the hook adjust it. */
    memcpy( p_vpar->ppi_scan, pi_scan, sizeof( pi_scan ) );
    p_vpar->pf_norm_scan( p_vpar->ppi_scan );

    /* If scan table has changed, we must change the quantization matrices:
     * move every default entry from its reference position to the position
     * the thread's own table now uses for the same scan index. */
    for( i_coef = 0; i_coef < 64; i_coef++ )
    {
        const int i_src = pi_scan[0][i_coef];
        const int i_dst = p_vpar->ppi_scan[0][i_coef];

        p_vpar->pi_default_intra_quant[i_dst] =
            pi_default_intra_quant[i_src];
        p_vpar->pi_default_nonintra_quant[i_dst] =
            pi_default_nonintra_quant[i_src];
    }
}
/*
* Block parsing
...
...
@@ -686,11 +706,6 @@ static __inline__ void DecodeMPEG1NonIntra( vpar_thread_t * p_vpar,
break
;
case
DCT_EOB
:
#ifdef HAVE_MMX
/* The MMX IDCT has a precision problem with non-intra
* blocks. */
p_mb
->
ppi_blocks
[
i_b
][
0
]
+=
4
;
#endif
if
(
i_nc
<=
1
)
{
p_mb
->
pf_idct
[
i_b
]
=
p_vpar
->
pf_sparse_idct
;
...
...
@@ -715,7 +730,7 @@ static __inline__ void DecodeMPEG1NonIntra( vpar_thread_t * p_vpar,
break
;
}
i_pos
=
pi_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_pos
=
p
_vpar
->
pp
i_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_level
=
(
((
i_level
<<
1
)
+
1
)
*
p_vpar
->
mb
.
i_quantizer_scale
*
p_vpar
->
sequence
.
nonintra_quant
.
pi_matrix
[
i_pos
]
)
>>
4
;
...
...
@@ -870,7 +885,7 @@ static __inline__ void DecodeMPEG1Intra( vpar_thread_t * p_vpar,
}
/* Determine the position of the block in the frame */
i_pos
=
pi_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_pos
=
p
_vpar
->
pp
i_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_level
=
(
i_level
*
p_vpar
->
mb
.
i_quantizer_scale
*
p_vpar
->
sequence
.
intra_quant
.
pi_matrix
[
i_pos
]
)
>>
3
;
...
...
@@ -908,7 +923,7 @@ static __inline__ void DecodeMPEG2NonIntra( vpar_thread_t * p_vpar,
boolean_t
b_dc
;
boolean_t
b_sign
;
boolean_t
b_chroma
;
int
*
pi_quant
;
u8
*
pi_quant
;
/* Give the chromatic component (0, 1, 2) */
i_cc
=
pi_cc_index
[
i_b
];
...
...
@@ -988,7 +1003,7 @@ static __inline__ void DecodeMPEG2NonIntra( vpar_thread_t * p_vpar,
break
;
}
i_pos
=
pi_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_pos
=
p
_vpar
->
pp
i_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_level
=
(
((
i_level
<<
1
)
+
1
)
*
p_vpar
->
mb
.
i_quantizer_scale
*
pi_quant
[
i_pos
]
)
>>
5
;
p_mb
->
ppi_blocks
[
i_b
][
i_pos
]
=
b_sign
?
-
i_level
:
i_level
;
...
...
@@ -1019,7 +1034,7 @@ static __inline__ void DecodeMPEG2Intra( vpar_thread_t * p_vpar,
boolean_t
b_vlc_intra
;
boolean_t
b_sign
;
boolean_t
b_chroma
;
int
*
pi_quant
;
u8
*
pi_quant
;
/* Give the chromatic component (0, 1, 2) */
i_cc
=
pi_cc_index
[
i_b
];
...
...
@@ -1132,7 +1147,7 @@ static __inline__ void DecodeMPEG2Intra( vpar_thread_t * p_vpar,
}
/* Determine the position of the block in the frame */
i_pos
=
pi_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_pos
=
p
_vpar
->
pp
i_scan
[
p_vpar
->
picture
.
b_alternate_scan
][
i_parse
];
i_level
=
(
i_level
*
p_vpar
->
mb
.
i_quantizer_scale
*
pi_quant
[
i_pos
]
)
>>
4
;
...
...
src/video_parser/vpar_headers.c
View file @
cf0b7cf9
...
...
@@ -2,7 +2,7 @@
* vpar_headers.c : headers parsing
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: vpar_headers.c,v 1.7
2 2001/01/16 18:06:0
1 massiot Exp $
* $Id: vpar_headers.c,v 1.7
3 2001/01/17 18:17:3
1 massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
* Stéphane Borel <stef@via.ecp.fr>
...
...
@@ -76,8 +76,7 @@ static void CopyrightExtension( vpar_thread_t * p_vpar );
/*****************************************************************************
* pi_default_intra_quant : default quantization matrix
*****************************************************************************/
#ifndef VDEC_DFT
int
pi_default_intra_quant
[]
=
u8
pi_default_intra_quant
[]
=
{
8
,
16
,
19
,
22
,
26
,
27
,
29
,
34
,
16
,
16
,
22
,
24
,
27
,
29
,
34
,
37
,
...
...
@@ -88,25 +87,11 @@ int pi_default_intra_quant[] =
26
,
27
,
29
,
34
,
38
,
46
,
56
,
69
,
27
,
29
,
35
,
38
,
46
,
56
,
69
,
83
};
#else
int
pi_default_intra_quant
[]
=
{
2048
,
5681
,
6355
,
6623
,
6656
,
5431
,
4018
,
2401
,
5681
,
7880
,
10207
,
10021
,
9587
,
8091
,
6534
,
3625
,
6355
,
10207
,
11363
,
10619
,
9700
,
8935
,
6155
,
3507
,
6623
,
9186
,
10226
,
9557
,
8730
,
8041
,
6028
,
3322
,
5632
,
9232
,
9031
,
8730
,
8192
,
7040
,
5542
,
3390
,
5230
,
7533
,
7621
,
7568
,
7040
,
6321
,
5225
,
3219
,
3602
,
5189
,
5250
,
5539
,
5265
,
5007
,
4199
,
2638
,
1907
,
2841
,
3230
,
3156
,
3249
,
3108
,
2638
,
1617
};
#endif
/*****************************************************************************
* pi_default_nonintra_quant : default quantization matrix
*****************************************************************************/
#ifndef VDEC_DFT
int
pi_default_nonintra_quant
[]
=
u8
pi_default_nonintra_quant
[]
=
{
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
...
...
@@ -117,19 +102,6 @@ int pi_default_nonintra_quant[] =
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
};
#else
int
pi_default_nonintra_quanit
[]
=
{
4096
,
5680
,
5344
,
4816
,
4096
,
3216
,
2224
,
1136
,
5680
,
7888
,
7424
,
6688
,
5680
,
4464
,
3072
,
1568
,
5344
,
7424
,
6992
,
6288
,
5344
,
4208
,
2896
,
1472
,
4816
,
6688
,
6288
,
5664
,
4816
,
3792
,
2608
,
1328
,
4096
,
5680
,
5344
,
4816
,
4096
,
3216
,
2224
,
1136
,
3216
,
4464
,
4208
,
3792
,
3216
,
2528
,
1744
,
880
,
2224
,
3072
,
2896
,
2608
,
2224
,
1744
,
1200
,
608
,
1136
,
1568
,
1472
,
1328
,
1136
,
880
,
608
,
304
};
#endif
/*****************************************************************************
* pi_scan : zig-zag and alternate scan patterns
...
...
@@ -211,14 +183,15 @@ static void __inline__ ReferenceReplace( vpar_thread_t * p_vpar,
/*****************************************************************************
* LoadMatrix : Load a quantization matrix
*****************************************************************************/
static
__inline__
void
LoadMatrix
(
vpar_thread_t
*
p_vpar
,
quant_matrix_t
*
p_matrix
)
static
__inline__
void
LoadMatrix
(
vpar_thread_t
*
p_vpar
,
quant_matrix_t
*
p_matrix
)
{
int
i_dummy
;
if
(
!
p_matrix
->
b_allocated
)
{
/* Allocate a piece of memory to load the matrix. */
if
(
(
p_matrix
->
pi_matrix
=
(
int
*
)
malloc
(
64
*
sizeof
(
int
)
))
==
NULL
)
if
(
(
p_matrix
->
pi_matrix
=
(
u8
*
)
malloc
(
64
*
sizeof
(
u8
)
))
==
NULL
)
{
intf_ErrMsg
(
"vpar error: allocation error in LoadMatrix()"
);
p_vpar
->
p_fifo
->
b_error
=
1
;
...
...
@@ -229,7 +202,7 @@ static __inline__ void LoadMatrix( vpar_thread_t * p_vpar, quant_matrix_t * p_ma
for
(
i_dummy
=
0
;
i_dummy
<
64
;
i_dummy
++
)
{
p_matrix
->
pi_matrix
[
pi_scan
[
SCAN_ZIGZAG
][
i_dummy
]]
p_matrix
->
pi_matrix
[
p
_vpar
->
pp
i_scan
[
SCAN_ZIGZAG
][
i_dummy
]]
=
GetBits
(
&
p_vpar
->
bit_stream
,
8
);
}
...
...
@@ -243,7 +216,7 @@ static __inline__ void LoadMatrix( vpar_thread_t * p_vpar, quant_matrix_t * p_ma
/*****************************************************************************
* LinkMatrix : Link a quantization matrix to another
*****************************************************************************/
static
__inline__
void
LinkMatrix
(
quant_matrix_t
*
p_matrix
,
int
*
pi_array
)
static
__inline__
void
LinkMatrix
(
quant_matrix_t
*
p_matrix
,
u8
*
pi_array
)
{
if
(
p_matrix
->
b_allocated
)
{
...
...
@@ -366,7 +339,8 @@ static void SequenceHeader( vpar_thread_t * p_vpar )
else
{
/* Use default matrix. */
LinkMatrix
(
&
p_vpar
->
sequence
.
intra_quant
,
pi_default_intra_quant
);
LinkMatrix
(
&
p_vpar
->
sequence
.
intra_quant
,
p_vpar
->
pi_default_intra_quant
);
}
if
(
GetBits
(
&
p_vpar
->
bit_stream
,
1
)
)
/* load_non_intra_quantizer_matrix */
...
...
@@ -376,7 +350,8 @@ static void SequenceHeader( vpar_thread_t * p_vpar )
else
{
/* Use default matrix. */
LinkMatrix
(
&
p_vpar
->
sequence
.
nonintra_quant
,
pi_default_nonintra_quant
);
LinkMatrix
(
&
p_vpar
->
sequence
.
nonintra_quant
,
p_vpar
->
pi_default_nonintra_quant
);
}
/* Unless later overwritten by a matrix extension, we have the same
...
...
@@ -905,7 +880,7 @@ static void QuantMatrixExtension( vpar_thread_t * p_vpar )
{
/* Use the default matrix. */
LinkMatrix
(
&
p_vpar
->
sequence
.
intra_quant
,
pi_default_intra_quant
);
p
_vpar
->
p
i_default_intra_quant
);
}
if
(
GetBits
(
&
p_vpar
->
bit_stream
,
1
)
)
{
...
...
@@ -916,7 +891,7 @@ static void QuantMatrixExtension( vpar_thread_t * p_vpar )
{
/* Use the default matrix. */
LinkMatrix
(
&
p_vpar
->
sequence
.
nonintra_quant
,
pi_default_nonintra_quant
);
p
_vpar
->
p
i_default_nonintra_quant
);
}
if
(
GetBits
(
&
p_vpar
->
bit_stream
,
1
)
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment