videolan / vlc · Commits

Commit 262b177b
authored Aug 10, 2007 by Damien Fouilleul

i422_yuy2: SSE2 improvements

parent 43bd7bed
Showing 4 changed files with 327 additions and 23 deletions.

    configure.ac                          +3   -3
    modules/video_chroma/Modules.am       +5   -0
    modules/video_chroma/i422_yuy2.c    +190  -18
    modules/video_chroma/i422_yuy2.h    +129   -2
configure.ac

@@ -1257,7 +1257,7 @@ MMXEXT_MODULES="memcpymmxext"
#MMXEXT_MODULES="${MMXEXT_MODULES} idctmmxext motionmmxext"
THREEDNOW_MODULES="memcpy3dn"
SSE_MODULES=""
-SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2"
+SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2 i422_yuy2_sse2"
ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec"
#ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec"

@@ -1283,7 +1283,7 @@ AC_CACHE_CHECK([if \$CC groks MMX intrinsics],
    [ac_cv_c_mmx_intrinsics=no])])
if test "${ac_cv_c_mmx_intrinsics}" != "no"; then
  AC_DEFINE(HAVE_MMX_INTRINSICS, 1, Define if MMX intrinsics are available.)
-  VLC_ADD_CFLAGS([i420_rgb_mmx],[-mmmx])
+  VLC_ADD_CFLAGS([${MMX_MODULES}],[-mmmx])
fi
dnl Check for fully working SSE2 intrinsics

@@ -1308,7 +1308,7 @@ AC_CACHE_CHECK([if \$CC groks SSE2 intrinsics],
    [ac_cv_c_sse2_intrinsics=no])])
if test "${ac_cv_c_sse2_intrinsics}" != "no"; then
  AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.)
-  VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2])
+  VLC_ADD_CFLAGS([${SSE2_MODULES}],[-msse2])
fi
AC_CACHE_CHECK([if \$CC groks MMX inline assembly],
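The two VLC_ADD_CFLAGS changes make every module listed in MMX_MODULES and SSE2_MODULES build with -mmmx / -msse2 once the corresponding AC_CACHE_CHECK succeeds, instead of hard-coding a single module name. The check itself passes when a small intrinsics program compiles. A standalone probe in the same spirit (illustrative only; the actual test body in configure.ac is elided from this diff):

#include <emmintrin.h>   /* SSE2 intrinsics */

/* If this translation unit compiles with -msse2, the compiler
 * "groks" SSE2 intrinsics in the sense of the check above. */
int sse2_probe( void )
{
    __m128i a = _mm_set1_epi8( 0x40 );
    __m128i b = _mm_setzero_si128();
    a = _mm_unpacklo_epi8( a, b );   /* the byte interleave this module relies on */
    return _mm_cvtsi128_si32( a );   /* force a use of the result */
}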
modules/video_chroma/Modules.am

@@ -50,6 +50,11 @@ SOURCES_i422_yuy2_mmx = \
 	i422_yuy2.h \
 	$(NULL)

+SOURCES_i422_yuy2_sse2 = \
+	i422_yuy2.c \
+	i422_yuy2.h \
+	$(NULL)
+
 SOURCES_i420_ymga = \
 	i420_ymga.c \
 	$(NULL)
modules/video_chroma/i422_yuy2.c

@@ -67,6 +67,10 @@ vlc_module_begin();
     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
     set_capability( "chroma", 100 );
     add_requirement( MMX );
+#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
+    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+    set_capability( "chroma", 120 );
+    add_requirement( MMX );
 #endif
     set_callbacks( Activate, NULL );
 vlc_module_end();
@@ -143,17 +147,66 @@ static int Activate( vlc_object_t *p_this )
static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
                       picture_t *p_dest )
{
    uint8_t *p_pixels = p_dest->p->p_pixels;
    int i_pitch = p_dest->p->i_pitch;
    uint8_t *p_line = p_dest->p->p_pixels;

    uint8_t *p_y = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch | p_dest->p->i_pitch |
                    ((int)p_line | (int)p_y))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height ; i_y-- ; )
        {
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV422_YUYV_ALIGNED );
            }
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV422_YUYV( p_line, p_y, p_u, p_v );
            }
            p_y += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height ; i_y-- ; )
        {
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV422_YUYV_UNALIGNED );
            }
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV422_YUYV( p_line, p_y, p_u, p_v );
            }
            p_y += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line += i_dest_margin;
        }
    }
    SSE2_END;
#else
    for( i_y = p_vout->render.i_height ; i_y-- ; )
    {
        uint8_t *p_line = p_pixels;
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
        {
#if defined (MODULE_NAME_IS_i422_yuy2)
@@ -165,12 +218,19 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
            MMX_CALL( MMX_YUV422_YUYV );
#endif
        }
        p_pixels += i_pitch;
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV422_YUYV( p_line, p_y, p_u, p_v );
        }
        p_y += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line += i_dest_margin;
    }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
    MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
    SSE2_END;
#endif
#endif
}
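The alignment guard above ORs the Y-plane pitch, the destination pitch, and the two working pointers together and tests the low four bits, so the aligned path runs only when all of them are multiples of 16 (note the (int) pointer casts, which assume a pointer fits in an int). The same test in isolation, with hypothetical names and a uintptr_t cast instead:

#include <stdint.h>

/* OR-ing merges the set bits of every operand, so masking with 15
 * yields zero only if no operand has any of its low four bits set,
 * i.e. everything is 16-byte aligned. */
static int all_aligned_16( const uint8_t *p_dst, const uint8_t *p_src,
                           int i_dst_pitch, int i_src_pitch )
{
    return 0 == (15 & ( (uintptr_t)p_dst | (uintptr_t)p_src |
                        (uintptr_t)i_dst_pitch | (uintptr_t)i_src_pitch ));
}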
@@ -180,17 +240,66 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
                       picture_t *p_dest )
{
    uint8_t *p_pixels = p_dest->p->p_pixels;
    int i_pitch = p_dest->p->i_pitch;
    uint8_t *p_line = p_dest->p->p_pixels;

    uint8_t *p_y = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch | p_dest->p->i_pitch |
                    ((int)p_line | (int)p_y))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height ; i_y-- ; )
        {
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV422_YVYU_ALIGNED );
            }
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV422_YVYU( p_line, p_y, p_u, p_v );
            }
            p_y += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height ; i_y-- ; )
        {
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV422_YVYU_UNALIGNED );
            }
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV422_YVYU( p_line, p_y, p_u, p_v );
            }
            p_y += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line += i_dest_margin;
        }
    }
    SSE2_END;
#else
    for( i_y = p_vout->render.i_height ; i_y-- ; )
    {
        uint8_t *p_line = p_pixels;
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
        {
#if defined (MODULE_NAME_IS_i422_yuy2)
@@ -202,12 +311,19 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
            MMX_CALL( MMX_YUV422_YVYU );
#endif
        }
        p_pixels += i_pitch;
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV422_YVYU( p_line, p_y, p_u, p_v );
        }
        p_y += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line += i_dest_margin;
    }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
    MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
    SSE2_END;
#endif
#endif
}
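The aligned SSE2 paths store with movntdq, a non-temporal store that bypasses the cache; that is why each function ends with SSE2_END, an sfence that orders the streaming stores before the converted picture is handed on. The same store-then-fence pattern with compiler intrinsics (a sketch, not this module's code, which uses inline assembly):

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Fill a 16-byte-aligned buffer with streaming stores, then fence so
 * the data is visible before anyone consumes the buffer.
 * i_size is assumed to be a multiple of 16. */
static void stream_fill( uint8_t *p_dst, size_t i_size, __m128i value )
{
    size_t i;
    for( i = 0; i < i_size; i += 16 )
        _mm_stream_si128( (__m128i *)(p_dst + i), value );  /* like movntdq */
    _mm_sfence();                                           /* what SSE2_END expands to */
}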
@@ -217,17 +333,66 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source,
                       picture_t *p_dest )
{
    uint8_t *p_pixels = p_dest->p->p_pixels;
    int i_pitch = p_dest->p->i_pitch;
    uint8_t *p_line = p_dest->p->p_pixels;

    uint8_t *p_y = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch | p_dest->p->i_pitch |
                    ((int)p_line | (int)p_y))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = p_vout->render.i_height ; i_y-- ; )
        {
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED );
            }
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
            }
            p_y += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = p_vout->render.i_height ; i_y-- ; )
        {
            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED );
            }
            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
            {
                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
            }
            p_y += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line += i_dest_margin;
        }
    }
    SSE2_END;
#else
    for( i_y = p_vout->render.i_height ; i_y-- ; )
    {
        uint8_t *p_line = p_pixels;
        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
        {
#if defined (MODULE_NAME_IS_i422_yuy2)
@@ -239,12 +404,19 @@ static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source,
            MMX_CALL( MMX_YUV422_UYVY );
#endif
        }
        p_pixels += i_pitch;
        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
        {
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
        }
        p_y += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line += i_dest_margin;
    }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
    MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
    SSE2_END;
#endif
#endif
}
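I422_YUY2, I422_YVYU, and I422_UYVY are otherwise identical; only the byte order of the packed 4:2:2 output differs. For reference, a scalar sketch of one two-pixel group in each layout (the same packing the C_YUV422_* fallback macros in i422_yuy2.h perform):

#include <stdint.h>

/* In 4:2:2, two horizontally adjacent luma samples share one Cb/Cr pair. */
static void pack_yuyv( uint8_t *p, uint8_t y0, uint8_t y1, uint8_t u, uint8_t v )
{ p[0] = y0; p[1] = u;  p[2] = y1; p[3] = v;  }

static void pack_yvyu( uint8_t *p, uint8_t y0, uint8_t y1, uint8_t u, uint8_t v )
{ p[0] = y0; p[1] = v;  p[2] = y1; p[3] = u;  }

static void pack_uyvy( uint8_t *p, uint8_t y0, uint8_t y1, uint8_t u, uint8_t v )
{ p[0] = u;  p[1] = y0; p[2] = v;  p[3] = y1; }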
modules/video_chroma/i422_yuy2.h

@@ -87,8 +87,49 @@ movq %%mm1, 8(%0) # Store high UYVY \n\
#include <mmintrin.h>

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
        __m64 mm0, mm1, mm2;                \
        MMX_INSTRUCTIONS                    \
        p_line += 16; p_y += 8;             \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END _mm_empty()

#define MMX_YUV422_YUYV                     \
    mm0 = (__m64)*(uint64_t*)p_y;           \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line = (uint64_t)mm2;     \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;

#define MMX_YUV422_YVYU                     \
    mm0 = (__m64)*(uint64_t*)p_y;           \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line = (uint64_t)mm2;     \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;

#define MMX_YUV422_UYVY                     \
    mm0 = (__m64)*(uint64_t*)p_y;           \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm1;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);       \
    *(uint64_t*)p_line = (uint64_t)mm2;     \
    mm1 = _mm_unpackhi_pi8(mm1, mm0);       \
    *(uint64_t*)(p_line+8) = (uint64_t)mm1;

#endif

#elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )
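MMX_END expands to _mm_empty(), the emms instruction: MMX registers alias the x87 floating-point stack, so MMX code must release that state before any later floating-point code runs. A minimal sketch of the rule (hypothetical function; the integer/__m64 casts mirror the style of the macros above):

#include <mmintrin.h>
#include <stdint.h>

static uint64_t add_bytes_mmx( uint64_t a, uint64_t b )
{
    __m64 r = _mm_add_pi8( (__m64)a, (__m64)b );  /* eight per-byte adds */
    uint64_t result = (uint64_t)r;
    _mm_empty();  /* emms: hand the register file back to the x87 FPU */
    return result;
}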
@@ -97,8 +138,95 @@ movq %%mm1, 8(%0) # Store high UYVY \n\
/* SSE2 assembly */

#define SSE2_CALL(MMX_INSTRUCTIONS)         \
    do {                                    \
        __asm__ __volatile__(               \
            ".p2align 3 \n\t"               \
            MMX_INSTRUCTIONS                \
            :                               \
            : "r" (p_line), "r" (p_y),      \
              "r" (p_u), "r" (p_v) );       \
        p_line += 32; p_y += 16;            \
        p_u += 8; p_v += 8;                 \
    } while(0)

#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )

#define SSE2_YUV422_YUYV_ALIGNED "                                      \n\
movdqa    (%1), %%xmm0    # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq      (%2), %%xmm1    # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movq      (%3), %%xmm2    # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
punpcklbw %%xmm2, %%xmm1  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm0, %%xmm2  #                 y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #                 v1 y3 u1 y2 v0 y1 u0 y0     \n\
movntdq   %%xmm2, (%0)    # Store low YUYV                              \n\
punpckhbw %%xmm1, %%xmm0  #                 v3 y7 u3 y6 v2 y5 u2 y4     \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                             \n\
"

#define SSE2_YUV422_YUYV_UNALIGNED "                                    \n\
movdqu    (%1), %%xmm0    # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq      (%2), %%xmm1    # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movq      (%3), %%xmm2    # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
prefetchnta (%0)          # Tell CPU not to cache output YUYV data      \n\
punpcklbw %%xmm2, %%xmm1  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm0, %%xmm2  #                 y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #                 v1 y3 u1 y2 v0 y1 u0 y0     \n\
movdqu    %%xmm2, (%0)    # Store low YUYV                              \n\
punpckhbw %%xmm1, %%xmm0  #                 v3 y7 u3 y6 v2 y5 u2 y4     \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                             \n\
"

#define SSE2_YUV422_YVYU_ALIGNED "                                      \n\
movdqa    (%1), %%xmm0    # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq      (%2), %%xmm2    # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movq      (%3), %%xmm1    # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
punpcklbw %%xmm2, %%xmm1  #                 u3 v3 u2 v2 u1 v1 u0 v0     \n\
movdqa    %%xmm0, %%xmm2  #                 y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #                 u1 y3 v1 y2 u0 y1 v0 y0     \n\
movntdq   %%xmm2, (%0)    # Store low YUYV                              \n\
punpckhbw %%xmm1, %%xmm0  #                 u3 y7 v3 y6 u2 y5 v2 y4     \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                             \n\
"

#define SSE2_YUV422_YVYU_UNALIGNED "                                    \n\
movdqu    (%1), %%xmm0    # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq      (%2), %%xmm2    # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movq      (%3), %%xmm1    # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
prefetchnta (%0)          # Tell CPU not to cache output YUYV data      \n\
punpcklbw %%xmm2, %%xmm1  #                 u3 v3 u2 v2 u1 v1 u0 v0     \n\
movdqa    %%xmm0, %%xmm2  #                 y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #                 u1 y3 v1 y2 u0 y1 v0 y0     \n\
movdqu    %%xmm2, (%0)    # Store low YUYV                              \n\
punpckhbw %%xmm1, %%xmm0  #                 u3 y7 v3 y6 u2 y5 v2 y4     \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                             \n\
"

#define SSE2_YUV422_UYVY_ALIGNED "                                      \n\
movdqa    (%1), %%xmm0    # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq      (%2), %%xmm1    # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movq      (%3), %%xmm2    # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
punpcklbw %%xmm2, %%xmm1  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm1, %%xmm2  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
punpcklbw %%xmm0, %%xmm2  #                 y3 v1 y2 u1 y1 v0 y0 u0     \n\
movntdq   %%xmm2, (%0)    # Store low UYVY                              \n\
punpckhbw %%xmm0, %%xmm1  #                 y7 v3 y6 u3 y5 v2 y4 u2     \n\
movntdq   %%xmm1, 16(%0)  # Store high UYVY                             \n\
"

#define SSE2_YUV422_UYVY_UNALIGNED "                                    \n\
movdqu    (%1), %%xmm0    # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq      (%2), %%xmm1    # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movq      (%3), %%xmm2    # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
prefetchnta (%0)          # Tell CPU not to cache output YUYV data      \n\
punpcklbw %%xmm2, %%xmm1  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm1, %%xmm2  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
punpcklbw %%xmm0, %%xmm2  #                 y3 v1 y2 u1 y1 v0 y0 u0     \n\
movdqu    %%xmm2, (%0)    # Store low UYVY                              \n\
punpckhbw %%xmm0, %%xmm1  #                 y7 v3 y6 u3 y5 v2 y4 u2     \n\
movdqu    %%xmm1, 16(%0)  # Store high UYVY                             \n\
"

#elif defined(HAVE_SSE2_INTRINSICS)
/* SSE2 intrinsics */
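The HAVE_SSE2_INTRINSICS branch is collapsed in this view. Purely as an illustration of the data movement in SSE2_YUV422_YUYV_ALIGNED above (not the header's actual intrinsics code), the same pack can be phrased with compiler intrinsics as:

#include <emmintrin.h>
#include <stdint.h>

/* Interleave 16 Y with 8 Cb and 8 Cr into 32 bytes of YUYV, mirroring
 * the movdqa / punpcklbw / movntdq sequence of the aligned asm variant. */
static void yuv422_to_yuyv_aligned( uint8_t *p_line, const uint8_t *p_y,
                                    const uint8_t *p_u, const uint8_t *p_v )
{
    __m128i y  = _mm_load_si128( (const __m128i *)p_y );   /* y15 .. y0 */
    __m128i u  = _mm_loadl_epi64( (const __m128i *)p_u );  /* u7 .. u0  */
    __m128i v  = _mm_loadl_epi64( (const __m128i *)p_v );  /* v7 .. v0  */
    __m128i uv = _mm_unpacklo_epi8( u, v );                /* v7 u7 .. v0 u0 */
    _mm_stream_si128( (__m128i *)p_line,
                      _mm_unpacklo_epi8( y, uv ) );        /* low 8 pixels  */
    _mm_stream_si128( (__m128i *)(p_line + 16),
                      _mm_unpackhi_epi8( y, uv ) );        /* high 8 pixels */
}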
@@ -110,7 +238,7 @@ movq %%mm1, 8(%0) # Store high UYVY \n\
#endif

#elif defined (MODULE_NAME_IS_i422_yuy2)
#endif

#define C_YUV422_YUYV( p_line, p_y, p_u, p_v )                              \
    *(p_line)++ = *(p_y)++;                                                 \

@@ -136,5 +264,4 @@ movq %%mm1, 8(%0) # Store high UYVY \n\
    *(p_line)++ = *(p_y); p_y += 2;                                         \
    *(p_line)++ = *(p_v) - 0x80; p_v += 2;                                  \

#endif