Commit 0777bcbf authored May 08, 2010 by Laurent Aimar
Moved out optimized VA nv12/yv12 copy functions from dxva2.
parent 287ccf14
Showing 4 changed files with 431 additions and 326 deletions.
modules/codec/avcodec/Modules.am  +2   -0
modules/codec/avcodec/copy.c      +355 -0
modules/codec/avcodec/copy.h      +44  -0
modules/codec/avcodec/dxva2.c     +30  -326
modules/codec/avcodec/Modules.am
@@ -10,6 +10,8 @@ libavcodec_plugin_la_SOURCES = \
 	chroma.c \
 	vaapi.c \
 	dxva2.c \
+	copy.c \
+	copy.h \
 	va.h \
 	$(NULL)

 if ENABLE_SOUT
modules/codec/avcodec/copy.c
0 → 100644
/*****************************************************************************
* copy.c: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2010 Laurent Aimar
* $Id$
*
* Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>
#include "copy.h"
/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction load and
* storing data with the SSE>=2 instruction store.
*/
#define COPY64(dstp, srcp, load, store) \
asm volatile ( \
load " 0(%[src]), %%xmm1\n" \
load " 16(%[src]), %%xmm2\n" \
load " 32(%[src]), %%xmm3\n" \
load " 48(%[src]), %%xmm4\n" \
store " %%xmm1, 0(%[dst])\n" \
store " %%xmm2, 16(%[dst])\n" \
store " %%xmm3, 32(%[dst])\n" \
store " %%xmm4, 48(%[dst])\n" \
: : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do { \
if (cpu & CPU_CAPABILITY_SSE2) \
asm volatile (op); \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
* XXX It is really efficient only when SSE4.1 is available.
*/
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        unsigned x;

        for (x = 0; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,  0(%[dst1])\n" \
    "movq   %%xmm1,  8(%[dst1])\n" \
    "movhpd %%xmm0,  0(%[dst2])\n" \
    "movhpd %%xmm1,  8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                        [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                    : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                        [src]"r"(&src[2*x]), [mask]"r"(mask)
                    : "memory");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned unaligned = (intptr_t)src & 0x0f;
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     unaligned,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned unaligned = (intptr_t)src & 0x0f;
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16, src, src_pitch,
                     unaligned,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                cache, w2_16, width, hblock, cpu);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~0x0f, 4096);
    cache->base = malloc(16 + cache->size);
    if (cache->base == NULL) {
        cache->buffer = NULL;
        return VLC_EGENERIC;
    }
    cache->buffer = &cache->base[16 - ((intptr_t)cache->base & 0x0f)];
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
    free(cache->base);

    cache->base   = NULL;
    cache->buffer = NULL;
    cache->size   = 0;
}
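/* Note: malloc() gives no 16-byte alignment guarantee, so CopyInitCache
 * over-allocates by 16 bytes and points `buffer` at the next 16-byte
 * boundary inside `base` (e.g. a base ending in 0x08 yields buffer = base + 8).
 * That alignment is what the movdqa loads in Copy2d and the asserted
 * dst alignment in CopyFromUswc rely on. */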
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    }
    ASM_SSE2(cpu, "emms");
}
#undef ASM_SSE2
#undef COPY64
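For reference, here is roughly what one COPY64 invocation from the SSE4.1 path of
CopyFromUswc above expands to after macro substitution (this is a direct expansion
of the macro defined in this file, assuming GCC-style inline assembly on x86):

    /* COPY64(&dst[x], &src[x], "movntdqa", "movdqa") expands to: */
    asm volatile (
        "movntdqa  0(%[src]), %%xmm1\n"  /* streaming loads from USWC memory */
        "movntdqa 16(%[src]), %%xmm2\n"
        "movntdqa 32(%[src]), %%xmm3\n"
        "movntdqa 48(%[src]), %%xmm4\n"
        "movdqa   %%xmm1,  0(%[dst])\n"  /* aligned stores into the cache buffer */
        "movdqa   %%xmm2, 16(%[dst])\n"
        "movdqa   %%xmm3, 32(%[dst])\n"
        "movdqa   %%xmm4, 48(%[dst])\n"
        : : [dst]"r"(&dst[x]), [src]"r"(&src[x]) : "memory");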
modules/codec/avcodec/copy.h
0 → 100644
/*****************************************************************************
* copy.h: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2009 Laurent Aimar
* $Id$
*
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifndef _VLC_AVCODEC_COPY_H
#define _VLC_AVCODEC_COPY_H 1
typedef struct {
    uint8_t *base;
    uint8_t *buffer;
    size_t  size;
} copy_cache_t;

int  CopyInitCache(copy_cache_t *cache, unsigned width);
void CopyCleanCache(copy_cache_t *cache);

void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache);
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache);

#endif
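For orientation, a minimal sketch (not part of this commit) of how a caller drives
this API, mirroring the Extract() changes in dxva2.c below. The names CopySurface,
surface_bits and surface_pitch are hypothetical stand-ins; a real decoder such as
dxva2.c keeps the cache in its state and initializes it once, not per frame:

    /* Hypothetical helper: copy one NV12 hardware surface into a picture_t. */
    static int CopySurface(picture_t *picture, uint8_t *surface_bits,
                           size_t surface_pitch, unsigned w, unsigned h)
    {
        copy_cache_t cache;
        if (CopyInitCache(&cache, w))
            return VLC_EGENERIC;

        uint8_t *plane[2] = {
            surface_bits,                     /* Y plane */
            surface_bits + surface_pitch * h, /* packed UV plane follows Y */
        };
        size_t pitch[2] = {
            surface_pitch,
            surface_pitch / 2,                /* as passed by Extract() below */
        };
        CopyFromNv12(picture, plane, pitch, w, h, &cache);

        CopyCleanCache(&cache);
        return VLC_SUCCESS;
    }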
modules/codec/avcodec/dxva2.c
@@ -47,6 +47,7 @@
 #include "avcodec.h"
 #include "va.h"
+#include "copy.h"

 #ifdef HAVE_AVCODEC_DXVA2
@@ -252,9 +253,7 @@ typedef struct
     /* Option conversion */
     D3DFORMAT                    output;
-    uint8_t                      *surface_cache_base;
-    uint8_t                      *surface_cache;
-    size_t                       surface_cache_size;
+    copy_cache_t                 surface_cache;

     /* */
     struct dxva_context hw;
@@ -297,13 +296,6 @@ static int DxResetVideoDecoder(vlc_va_dxva2_t *);
 static void DxCreateVideoConversion(vlc_va_dxva2_t *);
 static void DxDestroyVideoConversion(vlc_va_dxva2_t *);

-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height);
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height);
-
 /* */
 static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
                  int width, int height)
@@ -356,7 +348,7 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
-    if (!va->surface_cache)
+    if (!va->surface_cache.buffer)
         return VLC_EGENERIC;

     /* */
@@ -370,14 +362,33 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
     }
     if (va->render == MAKEFOURCC('Y','V','1','2')) {
-        CopyFromYv12(picture, &lock,
-                     va->surface_cache, va->surface_cache_size,
-                     va->surface_width, va->surface_height);
+        uint8_t *plane[3] = {
+            lock.pBits,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+                                 + (lock.Pitch/2) * (va->surface_height/2)
+        };
+        size_t  pitch[3] = {
+            lock.Pitch,
+            lock.Pitch / 2,
+            lock.Pitch / 2,
+        };
+        CopyFromYv12(picture, plane, pitch,
+                     va->surface_width, va->surface_height,
+                     &va->surface_cache);
     } else {
         assert(va->render == MAKEFOURCC('N','V','1','2'));
-        CopyFromNv12(picture, &lock,
-                     va->surface_cache, va->surface_cache_size,
-                     va->surface_width, va->surface_height);
+        uint8_t *plane[2] = {
+            lock.pBits,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+        };
+        size_t  pitch[2] = {
+            lock.Pitch,
+            lock.Pitch / 2,
+        };
+        CopyFromNv12(picture, plane, pitch,
+                     va->surface_width, va->surface_height,
+                     &va->surface_cache);
     }

     /* */
@@ -954,319 +965,12 @@ static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
         va->output = va->render;
         break;
     }
-    va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~0x0f, 4096);
-    va->surface_cache_base = malloc(16 + va->surface_cache_size);
-    va->surface_cache      = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
+    CopyInitCache(&va->surface_cache, va->surface_width);
 }
 static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
 {
-    free(va->surface_cache_base);
-    va->surface_cache_base = NULL;
-    va->surface_cache      = NULL;
-    va->surface_cache_size = 0;
+    CopyCleanCache(&va->surface_cache);
 }
... (deleted here: the COPY64 and ASM_SSE2 macros and the CopyFromUswc, Copy2d, SplitUV, CopyPlane and SplitPlanes helpers, identical to the code added in copy.c above) ...
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height)
-{
-    const unsigned cpu = vlc_CPU();
-
-    /* */
-    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
-              src->pBits, src->Pitch,
-              cache, cache_size,
-              width, height, cpu);
-    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
-                dst->p[1].p_pixels, dst->p[1].i_pitch,
-                (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
-                cache, cache_size,
-                width/2, height/2, cpu);
-
-    ASM_SSE2(cpu, "emms");
-}
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height)
-{
-    const unsigned cpu = vlc_CPU();
-
-    /* */
-    for (unsigned n = 0, offset = 0; n < 3; n++) {
-        const unsigned d = n > 0 ? 2 : 1;
-        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
-                  (const uint8_t*)src->pBits + offset, src->Pitch/d,
-                  cache, cache_size,
-                  width/d, height/d, cpu);
-        offset += (src->Pitch/d) * (height/d);
-    }
-    ASM_SSE2(cpu, "emms");
-}
-
-#undef ASM_SSE2
-#undef COPY64
 #else
 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
 {
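To make the plane arithmetic in the Extract() hunk above concrete, a small sketch
(the helper name Yv12Planes is hypothetical; the offset and pitch expressions are
taken verbatim from the hunk) of how the three YV12 plane pointers are derived from
a locked surface:

    /* Hypothetical helper: the YV12 plane layout computed in Extract() above,
     * from a surface locked as lock.pBits / lock.Pitch. */
    static void Yv12Planes(uint8_t *pBits, size_t Pitch, unsigned height,
                           uint8_t *plane[3], size_t pitch[3])
    {
        plane[0] = pBits;                                 /* Y plane            */
        plane[1] = pBits + Pitch * height;                /* first chroma plane */
        plane[2] = plane[1] + (Pitch / 2) * (height / 2); /* second chroma      */
        pitch[0] = Pitch;
        pitch[1] = pitch[2] = Pitch / 2;                  /* chroma rows are half-pitch */
    }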