Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc-1.1
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc-1.1
Commits
62107e56
Commit
62107e56
authored
Nov 17, 2009
by
Laurent Aimar
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Improved performance when copying video surface in dxva2.
parent
106208c9
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
329 additions
and
112 deletions
+329
-112
modules/codec/avcodec/dxva2.c
modules/codec/avcodec/dxva2.c
+329
-112
No files found.
modules/codec/avcodec/dxva2.c
View file @
62107e56
...
...
@@ -240,6 +240,9 @@ typedef struct
/* Option conversion */
D3DFORMAT
output
;
uint8_t
*
surface_cache_base
;
uint8_t
*
surface_cache
;
size_t
surface_cache_size
;
/* */
struct
dxva_context
hw
;
...
...
@@ -282,6 +285,13 @@ static int DxResetVideoDecoder(vlc_va_dxva2_t *);
static
void
DxCreateVideoConversion
(
vlc_va_dxva2_t
*
);
static
void
DxDestroyVideoConversion
(
vlc_va_dxva2_t
*
);
static
void
CopyFromNv12
(
picture_t
*
dst
,
const
D3DLOCKED_RECT
*
src
,
uint8_t
*
cache
,
size_t
cache_size
,
unsigned
width
,
unsigned
height
);
static
void
CopyFromYv12
(
picture_t
*
dst
,
const
D3DLOCKED_RECT
*
src
,
uint8_t
*
cache
,
size_t
cache_size
,
unsigned
width
,
unsigned
height
);
/* */
static
int
Setup
(
vlc_va_t
*
external
,
void
**
hw
,
vlc_fourcc_t
*
chroma
,
int
width
,
int
height
)
...
...
@@ -330,86 +340,15 @@ ok:
return
VLC_SUCCESS
;
}
/* Deinterleave a packed two-component plane into two separate planes.
 *
 * For every output row, byte 2*i of the source row is written to dstu[i]
 * and byte 2*i+1 to dstv[i], for i in [0, w).
 *
 * dstu, dstu_pitch  first destination plane and its row stride in bytes
 * dstv, dstv_pitch  second destination plane and its row stride in bytes
 * src, src_pitch    interleaved source plane and its row stride in bytes
 * w, h              number of byte pairs per row and number of rows
 */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned w, unsigned h)
{
    unsigned rows_left = h;

    while (rows_left-- > 0) {
        const uint8_t *in = src;
        uint8_t *out_u = dstu;
        uint8_t *out_v = dstv;

        /* Each pair of input bytes feeds one byte of each output row. */
        for (unsigned pairs = w; pairs > 0; pairs--) {
            *out_u++ = *in++;
            *out_v++ = *in++;
        }

        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
/* FIXME CAN_COMPILE_SSSE3 seems undefined for me */
#if 0 && defined(CAN_COMPILE_SSSE3) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))
//#define HAVE_SPLITUV_SSSE3
/* SSSE3 variant of SplitUV: deinterleave a packed two-component plane so
 * that even source bytes land in dstu and odd source bytes in dstv.
 *
 * w is the number of byte pairs per row, h the number of rows; the pitches
 * are row strides in bytes.
 * This function sits inside an "#if 0" region and is therefore not compiled.
 */
static void SplitUVSSSE3(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch,
const uint8_t *src, size_t src_pitch,
unsigned w, unsigned h)
{
/* pshufb control vector: gathers the even-indexed bytes of a 16-byte
 * register into its low 8 bytes and the odd-indexed bytes into its
 * high 8 bytes. */
const uint8_t maskm[] = { 0, 2, 4, 6, 8, 10, 12, 14,
1, 3, 5, 7, 9, 11, 13, 15 };
for (unsigned y = 0; y < h; y++) {
unsigned x;
/* Prefetch the line: one prefetchnta every 64 bytes over the 2*w source
 * bytes, starting from src rounded down to a 16-byte boundary. */
for (x = 0; x < 2*w; x += 64) {
__asm__ volatile (
"prefetchnta 0(%[src])\n"
: : [src]"r"(((intptr_t)src & ~15)+ x));
}
/* Main loop: each iteration reads 64 source bytes (unaligned loads) and
 * produces 32 bytes for each destination. After pshufb, movq stores the
 * low half of a register (even bytes) to dstu and movhpd stores the high
 * half (odd bytes) to dstv.
 * NOTE(review): xmm0-xmm4 are modified but only "memory" is listed as a
 * clobber. */
for (x = 0; x < (w & ~31); x += 32) {
__asm__ volatile (
"movdqu (%[mask]), %%xmm0\n"
"movdqu 0(%[src]), %%xmm1\n"
"movdqu 16(%[src]), %%xmm2\n"
"movdqu 32(%[src]), %%xmm3\n"
"movdqu 48(%[src]), %%xmm4\n"
"pshufb %%xmm0, %%xmm1\n"
"pshufb %%xmm0, %%xmm2\n"
"pshufb %%xmm0, %%xmm3\n"
"pshufb %%xmm0, %%xmm4\n"
"movq %%xmm1, 0(%[dstu])\n"
"movq %%xmm2, 8(%[dstu])\n"
"movhpd %%xmm1, 0(%[dstv])\n"
"movhpd %%xmm2, 8(%[dstv])\n"
"movq %%xmm3, 16(%[dstu])\n"
"movq %%xmm4, 24(%[dstu])\n"
"movhpd %%xmm3, 16(%[dstv])\n"
"movhpd %%xmm4, 24(%[dstv])\n"
: : [dstu]"r"(&dstu[x]), [dstv]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(maskm) : "memory");
}
/* Remaining: scalar copy of the last (w % 32) pairs of the row. */
for (; x < w; x++) {
dstu[x] = src[2*x+0];
dstv[x] = src[2*x+1];
}
/* Advance all three planes to the next row. */
src += src_pitch;
dstu += dstu_pitch;
dstv += dstv_pitch;
}
/* NOTE(review): emms resets the x87/MMX state; the asm above only uses xmm
 * registers, so this looks unnecessary -- confirm before re-enabling. */
__asm__ volatile ( "emms" );
}
#endif
static
int
Extract
(
vlc_va_t
*
external
,
picture_t
*
picture
,
AVFrame
*
ff
)
{
vlc_va_dxva2_t
*
va
=
vlc_va_dxva2_Get
(
external
);
LPDIRECT3DSURFACE9
d3d
=
(
LPDIRECT3DSURFACE9
)(
uintptr_t
)
ff
->
data
[
3
];
if
(
!
va
->
surface_cache
)
return
VLC_EGENERIC
;
/* */
assert
(
va
->
render
==
MAKEFOURCC
(
'Y'
,
'V'
,
'1'
,
'2'
)
||
va
->
render
==
MAKEFOURCC
(
'N'
,
'V'
,
'1'
,
'2'
));
assert
(
va
->
output
==
MAKEFOURCC
(
'Y'
,
'V'
,
'1'
,
'2'
));
/* */
...
...
@@ -419,45 +358,15 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
return
VLC_EGENERIC
;
}
/* Copy the Y plane */
plane_t
src
;
memset
(
&
src
,
0
,
sizeof
(
src
));
src
.
p_pixels
=
lock
.
pBits
;
src
.
i_pitch
=
lock
.
Pitch
;
src
.
i_lines
=
va
->
surface_height
;
src
.
i_visible_pitch
=
va
->
surface_width
;
src
.
i_visible_lines
=
va
->
surface_height
;
src
.
i_pixel_pitch
=
1
;
plane_CopyPixels
(
&
picture
->
p
[
0
],
&
src
);
/* */
src
.
p_pixels
+=
src
.
i_pitch
*
src
.
i_lines
*
src
.
i_pixel_pitch
;
/* */
if
(
va
->
render
==
MAKEFOURCC
(
'Y'
,
'V'
,
'1'
,
'2'
))
{
src
.
i_pitch
/=
2
;
src
.
i_lines
/=
2
;
src
.
i_visible_pitch
/=
2
;
src
.
i_visible_lines
/=
2
;
for
(
unsigned
n
=
1
;
n
<
3
;
n
++
)
{
plane_CopyPixels
(
&
picture
->
p
[
n
],
&
src
);
src
.
p_pixels
+=
src
.
i_pitch
*
src
.
i_lines
*
src
.
i_pixel_pitch
;
}
CopyFromYv12
(
picture
,
&
lock
,
va
->
surface_cache
,
va
->
surface_cache_size
,
va
->
surface_width
,
va
->
surface_height
);
}
else
{
plane_t
*
u
=
&
picture
->
p
[
2
];
plane_t
*
v
=
&
picture
->
p
[
1
];
void
(
*
split
)(
uint8_t
*
,
size_t
,
uint8_t
*
,
size_t
,
const
uint8_t
*
,
size_t
,
unsigned
,
unsigned
);
#ifdef HAVE_SPLITUV_SSSE3
if
(
vlc_CPU
()
&
CPU_CAPABILITY_SSSE3
)
split
=
SplitUVSSSE3
;
else
#endif
split
=
SplitUV
;
split
(
u
->
p_pixels
,
u
->
i_pitch
,
v
->
p_pixels
,
v
->
i_pitch
,
src
.
p_pixels
,
src
.
i_pitch
,
va
->
surface_width
/
2
,
va
->
surface_height
/
2
);
assert
(
va
->
render
==
MAKEFOURCC
(
'N'
,
'V'
,
'1'
,
'2'
));
CopyFromNv12
(
picture
,
&
lock
,
va
->
surface_cache
,
va
->
surface_cache_size
,
va
->
surface_width
,
va
->
surface_height
);
}
/* */
...
...
@@ -1013,11 +922,319 @@ static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
va
->
output
=
va
->
render
;
break
;
}
va
->
surface_cache_size
=
__MAX
((
va
->
surface_width
+
0x0f
)
&
~
0x0f
,
4096
);
va
->
surface_cache_base
=
malloc
(
16
+
va
->
surface_cache_size
);
va
->
surface_cache
=
&
va
->
surface_cache_base
[
16
-
((
intptr_t
)
va
->
surface_cache_base
&
0x0f
)];
}
/**
 * Release the intermediate copy buffer held by the conversion state.
 *
 * Frees surface_cache_base and resets surface_cache_base, surface_cache and
 * surface_cache_size, so a later call sees NULL pointers and a zero size.
 * Calling it again after that is harmless because free(NULL) is a no-op.
 */
static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
{
    /* Only the base pointer is freed; surface_cache is the second field
     * cleared below and is not freed separately. */
    free(va->surface_cache_base);

    va->surface_cache_base = NULL;
    va->surface_cache      = NULL;
    va->surface_cache_size = 0;
}
/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 * xmm1-xmm4 are used as temporaries.
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load " 0(%[src]), %%xmm1\n"     \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1, 0(%[dst])\n"    \
        store " %%xmm2, 16(%[dst])\n"   \
        store " %%xmm3, 32(%[dst])\n"   \
        store " %%xmm4, 48(%[dst])\n"   \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")

/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do {          \
    if (cpu & CPU_CAPABILITY_SSE2)       \
        asm volatile (op);               \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * dst, dst_pitch  destination; both must be 16-byte aligned (asserted)
 * src, src_pitch  source rows; any alignment
 * unaligned       kept for interface compatibility only: the number of
 *                 leading bytes to copy one by one is derived from the
 *                 address of each source row instead
 * width, height   bytes per row and number of rows
 * cpu             CPU capability flags (CPU_CAPABILITY_*)
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    /* The low address bits of src are not the distance to the next 16-byte
     * boundary, and they may differ from row to row when src_pitch is not
     * a multiple of 16; recompute per row rather than trusting the caller. */
    (void)unaligned;

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Bytes needed to bring &src[x] up to a 16-byte boundary, as
         * required by the aligned loads (movntdqa/movdqa) below. */
        unsigned head = (unsigned)((-(uintptr_t)src) & 0x0f);
        if (head > width)
            head = width; /* never write past the end of a narrow row */

        unsigned x;
        for (x = 0; x < head; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!head) {
                /* &dst[x] stays 16-byte aligned: aligned stores are safe */
                for (; x + 63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x + 63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!head) {
                for (; x + 63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x + 63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        } else
#endif
        {
            /* No SIMD path taken. The empty block terminates the else
             * chain so the tail loop below always runs, whichever
             * CAN_COMPILE_* macros are defined. */
        }

        /* Whatever is left (less than 64 bytes when a SIMD path ran) */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
/* Copy a width x height byte rectangle from an aligned source to any
 * destination.
 *
 * src and src_pitch must be 16-byte aligned (asserted). When SSE2 is
 * compiled in and reported by cpu, rows are moved 64 bytes at a time:
 * with movntdq stores if the destination row is 16-byte aligned, with
 * movdqu stores otherwise. The end of each row is copied byte by byte.
 */
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned row = 0; row < height;
         row++, src += src_pitch, dst += dst_pitch) {
        unsigned col = 0;
        /* Re-evaluated for every row: dst_pitch is not constrained. */
        bool dst_misaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (dst_misaligned) {
                while (col + 63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movdqu");
                    col += 64;
                }
            } else {
                while (col + 63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movntdq");
                    col += 64;
                }
            }
        }
#endif

        /* Scalar tail (or the whole row without SSE2) */
        while (col < width) {
            dst[col] = src[col];
            col++;
        }
    }
}
/* Deinterleave a packed two-component plane held in an aligned buffer.
 *
 * Byte 2*i of each source row goes to dstu[i] and byte 2*i+1 to dstv[i],
 * for i in [0, width). src and src_pitch must be 16-byte aligned
 * (asserted). Blocks of 32 pairs are handled with SSSE3 (pshufb) or SSE2
 * (mask/shift/pack) when compiled in and reported by cpu; the rest of the
 * row is handled byte by byte.
 */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    /* pshufb control: even bytes to the low half, odd bytes to the high half */
    const uint8_t deinterleave_idx[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                         1, 3, 5, 7, 9, 11, 13, 15 };
    /* keeps the low byte of every 16-bit word */
    const uint8_t low_byte_mask[] = { 0xff, 0x00, 0xff, 0x00,
                                      0xff, 0x00, 0xff, 0x00,
                                      0xff, 0x00, 0xff, 0x00,
                                      0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

/* Load 64 aligned source bytes into xmm0-xmm3 */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Store the low halves of xmm0-xmm3 to dst1 and the high halves to dst2 */
#define STORE2X32 \
    "movq   %%xmm0,  0(%[dst1])\n" \
    "movq   %%xmm1,  8(%[dst1])\n" \
    "movhpd %%xmm0,  0(%[dst2])\n" \
    "movhpd %%xmm1,  8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

    for (unsigned row = 0; row < height; row++) {
        unsigned col = 0;

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (col = 0; col < (width & ~31); col += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb %%xmm7, %%xmm0\n"
                    "pshufb %%xmm7, %%xmm1\n"
                    "pshufb %%xmm7, %%xmm2\n"
                    "pshufb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[col]), [dst2]"r"(&dstv[col]),
                        [src]"r"(&src[2*col]),
                        [shuffle]"r"(deinterleave_idx)
                    : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            /* Here the shifted (odd) bytes end up in the low halves, hence
             * dst1/dst2 are bound the other way round. */
            for (col = 0; col < (width & ~31); col += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[col]), [dst1]"r"(&dstv[col]),
                        [src]"r"(&src[2*col]),
                        [mask]"r"(low_byte_mask)
                    : "memory");
            }
        }
#endif

        /* Scalar tail (or the whole row when no SIMD path ran) */
        for (; col < width; col++) {
            dstu[col] = src[2*col+0];
            dstv[col] = src[2*col+1];
        }

        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }

#undef STORE2X32
#undef LOAD64
}
/* Copy one plane from src to dst through the intermediate buffer cache.
 *
 * The plane is processed in bands of as many rows as fit in cache_size
 * bytes, each row taking width rounded up to a multiple of 16 bytes.
 * Every band is first pulled into cache with CopyFromUswc() and then
 * written to dst with Copy2d().
 */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned cache_pitch = (width + 15) & ~15;
    const unsigned band_rows   = cache_size / cache_pitch;
    assert(band_rows > 0);

    unsigned first_row = 0;
    while (first_row < height) {
        /* The last band may be shorter than the others. */
        const unsigned rows      = __MIN(band_rows, height - first_row);
        /* Low four address bits of the band's first source row */
        const unsigned unaligned = (intptr_t)src & 0x0f;

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, cache_pitch,
                     src, src_pitch,
                     unaligned, width, rows, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, cache_pitch,
               width, rows, cpu);

        src += src_pitch * rows;
        dst += dst_pitch * rows;
        first_row += band_rows;
    }

    ASM_SSE2(cpu, "mfence");
}
/* Deinterleave a packed two-component plane into dstu and dstv through the
 * intermediate buffer cache.
 *
 * width counts byte pairs, so each source row carries 2*width bytes. The
 * plane is processed in bands of as many rows as fit in cache_size bytes
 * (rows padded to a multiple of 16 bytes): each band is pulled into cache
 * with CopyFromUswc() and then split with SplitUV().
 */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned cache_pitch = (2 * width + 15) & ~15;
    const unsigned band_rows   = cache_size / cache_pitch;
    assert(band_rows > 0);

    unsigned first_row = 0;
    while (first_row < height) {
        /* The last band may be shorter than the others. */
        const unsigned rows      = __MIN(band_rows, height - first_row);
        /* Low four address bits of the band's first source row */
        const unsigned unaligned = (intptr_t)src & 0x0f;

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, cache_pitch,
                     src, src_pitch,
                     unaligned, 2 * width, rows, cpu);

        /* Split from our cache into the two destinations */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, cache_pitch,
                width, rows, cpu);

        src  += src_pitch * rows;
        dstu += dstu_pitch * rows;
        dstv += dstv_pitch * rows;
        first_row += band_rows;
    }

    ASM_SSE2(cpu, "mfence");
}
/* Copy a locked NV12 surface into a three-plane picture.
 *
 * The first height rows of the surface are copied as-is to plane 0. The
 * rows that follow are read as height/2 rows of width/2 interleaved byte
 * pairs; the first byte of every pair goes to plane 2 and the second one
 * to plane 1.
 * cache/cache_size describe the intermediate buffer used for the copies.
 */
static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
                         uint8_t *cache, size_t cache_size,
                         unsigned width, unsigned height)
{
    const unsigned cpu = vlc_CPU();

    /* Full-resolution plane at the start of the surface */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src->pBits, src->Pitch,
              cache, cache_size,
              width, height, cpu);

    /* Interleaved plane, located right after the first height rows */
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
                cache, cache_size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}
/* Copy a locked YV12 surface into a three-plane picture.
 *
 * The three source planes are read back to back from the surface: plane 0
 * at full size, then planes 1 and 2 with pitch, width and height all
 * halved. Source plane n is copied to picture plane n.
 * cache/cache_size describe the intermediate buffer used for the copies.
 */
static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
                         uint8_t *cache, size_t cache_size,
                         unsigned width, unsigned height)
{
    /* Size divisor applied to each of the three planes */
    static const unsigned divisor[3] = { 1, 2, 2 };

    const unsigned cpu = vlc_CPU();
    unsigned offset = 0; /* byte offset of the current plane in the surface */

    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = divisor[n];

        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  (const uint8_t*)src->pBits + offset, src->Pitch/d,
                  cache, cache_size,
                  width/d, height/d, cpu);

        /* The next plane starts right after this one. */
        offset += (src->Pitch/d) * (height/d);
    }

    ASM_SSE2(cpu, "emms");
}
#undef ASM_SSE2
#undef COPY64
#else
vlc_va_t
*
vlc_va_NewDxva2
(
vlc_object_t
*
log
,
int
codec_id
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment