Commit 0777bcbf authored May 08, 2010 by Laurent Aimar

Moved out optimized VA nv12/yv12 copy functions from dxva2.

parent 287ccf14

Showing 4 changed files with 431 additions and 326 deletions:

  modules/codec/avcodec/Modules.am   +2    -0
  modules/codec/avcodec/copy.c       +355  -0
  modules/codec/avcodec/copy.h       +44   -0
  modules/codec/avcodec/dxva2.c      +30   -326
modules/codec/avcodec/Modules.am
...
@@ -10,6 +10,8 @@ libavcodec_plugin_la_SOURCES = \
 	chroma.c \
 	vaapi.c \
 	dxva2.c \
+	copy.c \
+	copy.h \
 	va.h \
 	$(NULL)
 if ENABLE_SOUT
...
modules/codec/avcodec/copy.c
0 → 100644
/*****************************************************************************
* copy.c: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2010 Laurent Aimar
* $Id$
*
* Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>
#include "copy.h"
/* Copy 64 bytes from srcp to dsp loading data with the SSE>=2 instruction load and
* storing data with the SSE>=2 instruction store.
*/
#define COPY64(dstp, srcp, load, store) \
asm volatile ( \
load " 0(%[src]), %%xmm1\n" \
load " 16(%[src]), %%xmm2\n" \
load " 32(%[src]), %%xmm3\n" \
load " 48(%[src]), %%xmm4\n" \
store " %%xmm1, 0(%[dst])\n" \
store " %%xmm2, 16(%[dst])\n" \
store " %%xmm3, 32(%[dst])\n" \
store " %%xmm4, 48(%[dst])\n" \
: : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do { \
if (cpu & CPU_CAPABILITY_SSE2) \
asm volatile (op); \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
* as used by some video surface.
* XXX It is really efficient only when SSE4.1 is available.
*/
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        unsigned x;

        for (x = 0; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height,
                    unsigned cpu)
{
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,  0(%[dst1])\n" \
    "movq   %%xmm1,  8(%[dst1])\n" \
    "movhpd %%xmm0,  0(%[dst2])\n" \
    "movhpd %%xmm1,  8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                        [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                        [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned unaligned = (intptr_t)src & 0x0f;
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     unaligned,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned unaligned = (intptr_t)src & 0x0f;
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     unaligned,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}

int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->base = malloc(16 + cache->size);
    if (cache->base == NULL) {
        cache->buffer = NULL;
        return VLC_EGENERIC;
    }
    cache->buffer = &cache->base[16 - ((intptr_t)cache->base & 0x0f)];
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
    free(cache->base);

    cache->base   = NULL;
    cache->buffer = NULL;
    cache->size   = 0;
}

void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}

void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    }
    ASM_SSE2(cpu, "emms");
}
#undef ASM_SSE2
#undef COPY64
modules/codec/avcodec/copy.h
0 → 100644
/*****************************************************************************
* copy.h: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2009 Laurent Aimar
* $Id$
*
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifndef _VLC_AVCODEC_COPY_H
#define _VLC_AVCODEC_COPY_H 1
typedef struct {
    uint8_t *base;
    uint8_t *buffer;
    size_t  size;
} copy_cache_t;

int  CopyInitCache(copy_cache_t *cache, unsigned width);
void CopyCleanCache(copy_cache_t *cache);

void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache);
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache);
#endif
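For orientation, here is a minimal sketch of how a caller is expected to drive this API, modeled on the dxva2.c changes below: initialize the cache once for the surface width, pass per-plane pointers and pitches to the copy helper, then release the cache. The copy_surface wrapper, its surface/pitch parameters, and the assumption that the NV12 chroma plane shares the luma pitch are illustrative and are not part of this commit.

#include <vlc_common.h>
#include <vlc_picture.h>
#include "copy.h"

/* Hypothetical helper: copy one NV12 surface into a picture_t.
 * 'surface' and 'pitch' describe the source buffer and are placeholders. */
static int copy_surface(picture_t *dst, uint8_t *surface, size_t pitch,
                        unsigned width, unsigned height)
{
    copy_cache_t cache;
    if (CopyInitCache(&cache, width))   /* allocates the 16-byte aligned cache */
        return VLC_EGENERIC;

    /* NV12 layout: luma plane followed by one interleaved UV plane;
     * the UV pitch is assumed equal to the luma pitch here. */
    uint8_t *plane[2]     = { surface, surface + pitch * height };
    size_t   src_pitch[2] = { pitch, pitch };

    CopyFromNv12(dst, plane, src_pitch, width, height, &cache);

    CopyCleanCache(&cache);             /* frees the cache buffer */
    return VLC_SUCCESS;
}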
modules/codec/avcodec/dxva2.c
...
...
@@ -47,6 +47,7 @@
 #include "avcodec.h"
 #include "va.h"
+#include "copy.h"

 #ifdef HAVE_AVCODEC_DXVA2
...
...
@@ -252,9 +253,7 @@ typedef struct
     /* Option conversion */
     D3DFORMAT                    output;
-    uint8_t                      *surface_cache_base;
-    uint8_t                      *surface_cache;
-    size_t                       surface_cache_size;
+    copy_cache_t                 surface_cache;

     /* */
     struct dxva_context hw;
...
...
@@ -297,13 +296,6 @@ static int DxResetVideoDecoder(vlc_va_dxva2_t *);
 static void DxCreateVideoConversion(vlc_va_dxva2_t *);
 static void DxDestroyVideoConversion(vlc_va_dxva2_t *);
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height);
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height);

 /* */
 static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
                  int width, int height)
...
...
@@ -356,7 +348,7 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];

-    if (!va->surface_cache)
+    if (!va->surface_cache.buffer)
         return VLC_EGENERIC;

     /* */
...
...
@@ -370,14 +362,33 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
     }
     if (va->render == MAKEFOURCC('Y','V','1','2')) {
-        CopyFromYv12(picture, &lock,
-                     va->surface_cache, va->surface_cache_size,
-                     va->surface_width, va->surface_height);
+        uint8_t *plane[3] = {
+            lock.pBits,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+                                 + (lock.Pitch/2) * (va->surface_height/2)
+        };
+        size_t  pitch[3] = {
+            lock.Pitch,
+            lock.Pitch / 2,
+            lock.Pitch / 2,
+        };
+        CopyFromYv12(picture, plane, pitch,
+                     va->surface_width, va->surface_height,
+                     &va->surface_cache);
     } else {
         assert(va->render == MAKEFOURCC('N','V','1','2'));
-        CopyFromNv12(picture, &lock,
-                     va->surface_cache, va->surface_cache_size,
-                     va->surface_width, va->surface_height);
+        uint8_t *plane[2] = {
+            lock.pBits,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+        };
+        size_t  pitch[2] = {
+            lock.Pitch,
+            lock.Pitch / 2,
+        };
+        CopyFromNv12(picture, plane, pitch,
+                     va->surface_width, va->surface_height,
+                     &va->surface_cache);
     }

     /* */
...
...
@@ -954,319 +965,12 @@ static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
         va->output = va->render;
         break;
     }
-    va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~ 0x0f, 4096);
-    va->surface_cache_base = malloc(16 + va->surface_cache_size);
-    va->surface_cache      = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
+    CopyInitCache(&va->surface_cache, va->surface_width);
 }
 static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
 {
-    free(va->surface_cache_base);
-    va->surface_cache_base = NULL;
-    va->surface_cache      = NULL;
-    va->surface_cache_size = 0;
+    CopyCleanCache(&va->surface_cache);
 }
...
(deleted here: COPY64, ASM_SSE2, CopyFromUswc, Copy2d, SplitUV, CopyPlane and SplitPlanes, identical to the code added in copy.c above)
...
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height)
-{
-    const unsigned cpu = vlc_CPU();
-
-    /* */
-    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
-              src->pBits, src->Pitch,
-              cache, cache_size,
-              width, height, cpu);
-    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
-                dst->p[1].p_pixels, dst->p[1].i_pitch,
-                (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
-                cache, cache_size,
-                width/2, height/2, cpu);
-
-    ASM_SSE2(cpu, "emms");
-}
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height)
-{
-    const unsigned cpu = vlc_CPU();
-
-    /* */
-    for (unsigned n = 0, offset = 0; n < 3; n++) {
-        const unsigned d = n > 0 ? 2 : 1;
-        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
-                  (const uint8_t*)src->pBits + offset, src->Pitch/d,
-                  cache, cache_size,
-                  width/d, height/d, cpu);
-        offset += (src->Pitch/d) * (height/d);
-    }
-    ASM_SSE2(cpu, "emms");
-}
-#undef ASM_SSE2
-#undef COPY64
 #else
 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
 {
...
...