Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc-gpu
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc-gpu
Commits
5e4dc54c
Commit
5e4dc54c
authored
Aug 02, 2007
by
Damien Fouilleul
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
chromas: more SSE2/MMX fixes, added I420_RGBA conversion
parent
c23c9ae9
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
575 additions
and
171 deletions
+575
-171
modules/video_chroma/i420_rgb.c
modules/video_chroma/i420_rgb.c
+1
-2
modules/video_chroma/i420_rgb.h
modules/video_chroma/i420_rgb.h
+1
-0
modules/video_chroma/i420_rgb16.c
modules/video_chroma/i420_rgb16.c
+239
-0
modules/video_chroma/i420_rgb_mmx.h
modules/video_chroma/i420_rgb_mmx.h
+316
-151
modules/video_chroma/i420_yuy2.h
modules/video_chroma/i420_yuy2.h
+18
-18
No files found.
modules/video_chroma/i420_rgb.c
View file @
5e4dc54c
...
...
@@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this )
{
/* R8G8B8A8 pixel format */
msg_Dbg
(
p_this
,
"RGB pixel format is R8G8B8A8"
);
//p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
return
-
1
;
p_vout
->
chroma
.
pf_convert
=
E_
(
I420_R8G8B8A8
);
}
else
if
(
p_vout
->
output
.
i_rmask
==
0x0000ff00
&&
p_vout
->
output
.
i_gmask
==
0x00ff0000
...
...
modules/video_chroma/i420_rgb.h
View file @
5e4dc54c
...
...
@@ -64,6 +64,7 @@ void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * );
/*
 * Prototypes of the I420 -> packed RGB conversion entry points.
 * E_() is a name-wrapping macro defined elsewhere (not visible here).
 * NOTE(review): argument order is presumably (vout thread, source picture,
 * destination picture), as in the I420_R8G8B8A8 definition -- confirm for
 * the other converters.
 */
void
E_
(
I420_R5G5B5
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_R5G6B5
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_A8R8G8B8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
/* 32-bit conversion whose pixels are packed by the *_UNPACK_32_RGBA macros. */
void
E_
(
I420_R8G8B8A8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_B8G8R8A8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_A8B8G8R8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
#endif
...
...
modules/video_chroma/i420_rgb16.c
View file @
5e4dc54c
...
...
@@ -1140,6 +1140,245 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
#endif
}
/**
 * Convert a planar YUV 4:2:0 picture into 32-bit packed pixels, using the
 * SSE2 or MMX *_UNPACK_32_RGBA macros depending on the module being built.
 *
 * \param p_vout  video output; supplies render/output sizes and the chroma
 *                module's scratch buffer and offset table
 * \param p_src   source picture (Y, U and V planes are read)
 * \param p_dest  destination picture (first plane is written)
 *
 * Each source line is converted in blocks of 16 (SSE2) or 8 (MMX) pixels.
 * If the width is not a multiple of the block size, the pointers are moved
 * back by i_rewind pixels and one extra, overlapping block is converted so
 * that the last pixels of the line are covered.
 *
 * NOTE(review): SCALE_WIDTH and SCALE_HEIGHT are macros defined elsewhere;
 * they presumably use p_pic_start, p_offset, i_scale_count, i_chroma_width
 * and i_right_margin and advance p_pic, so those locals must stay even
 * though this body never reads them directly.
 */
void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
                        picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int          i_right_margin;
    int          i_rewind;      /* pixels to step back for the last block    */
    int          i_scale_count;                      /* scale modulo counter */
    int          i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *   p_pic_start;      /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint32_t *   p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *   p_buffer;

    /* Offset array pointer */
    int *        p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *        p_offset;

    /* Padding bytes at the end of each luma / chroma source line */
    const int i_source_margin   = p_src->p[0].i_pitch
                                   - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                   - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    /* With horizontal scaling the line is converted into the scratch buffer,
     * otherwise straight into the destination picture. */
    p_buffer = b_hscale ? p_buffer_start : p_pic;

    /* Only the low four bits of the pitches and pointers matter here. */
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((int)p_y)|
                    ((int)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for( i_x = p_vout->render.i_width / 16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_32_ALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_ALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
                SSE2_CALL (
                    SSE2_INIT_32_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_UNALIGNED
                );
                /* A full 16-pixel block was consumed: 16 luma and 8 chroma
                 * samples, exactly as in the main loop and in the unaligned
                 * path below.  (This used to advance the chroma pointers by
                 * 4 only, leaving them short at the end of the line.) */
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            /* Skip the source line padding; the chroma padding is skipped
             * on odd lines only. */
            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for( i_x = p_vout->render.i_width / 16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_32_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_UNALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
                SSE2_CALL (
                    SSE2_INIT_32_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_UNALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            MMX_CALL (
                MMX_INIT_32
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_32_RGBA
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
            MMX_CALL (
                MMX_INIT_32
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_32_RGBA
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }

    /* re-enable FPU registers */
    MMX_END;

#endif
}
void
E_
(
I420_B8G8R8A8
)(
vout_thread_t
*
p_vout
,
picture_t
*
p_src
,
picture_t
*
p_dest
)
{
...
...
modules/video_chroma/i420_rgb_mmx.h
View file @
5e4dc54c
...
...
@@ -300,6 +300,26 @@ punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
"
/*
 * Inline-assembly fragment (a string literal) that packs eight pixels into
 * 32-bit words and stores 32 bytes at the address held in operand %3.
 * Per the lane annotations, mm0 holds B, mm1 holds R and mm2 holds G on
 * entry; each stored pixel is the byte sequence 00, B, G, R in memory.
 * mm3, mm4, mm5 and mm6 are used as scratch and mm0/mm2 are overwritten.
 * NOTE(review): the annotation on the "movq %%mm3, %%mm5" line reads
 * "R3 00 ..." but the value copied is the B/00 interleave built on the
 * line before it.
 */
#define MMX_UNPACK_32_RGBA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\
movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\
punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\
pxor %%mm6, %%mm6 # zero mm6 \n\
punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\
movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
"
#define MMX_UNPACK_32_BGRA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
...
...
@@ -356,15 +376,15 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
#define MMX_END _mm_empty()
#define MMX_INIT_16 \
mm0 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm1 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm0 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y
mm6 = (__m64)*(uint64_t *)p_y
;
#define MMX_INIT_32 \
mm0 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm0 = _mm_cvtsi32_si64(
*(int*)p_u);
\
*(uint16_t *)p_buffer = 0; \
mm1 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y;
...
...
@@ -483,6 +503,25 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
mm0 = _mm_unpackhi_pi16(mm0, mm1); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
/*
 * Intrinsics form of the RGBA packing step.  Expects __m64 variables
 * mm0..mm6 and a pointer p_buffer to exist at the expansion site.
 * Interleaves mm2 with mm1 and zero with mm0 byte-wise, then interleaves
 * the resulting 16-bit pairs, so each pixel is the byte sequence
 * 00, mm0, mm2, mm1 in memory.  Four 64-bit stores are made at p_buffer,
 * p_buffer + 2, p_buffer + 4 and p_buffer + 6.
 * NOTE(review): presumably mm0 = B, mm1 = R, mm2 = G as annotated in the
 * inline-assembly variant, and p_buffer is a uint32_t * (two pixels per
 * store) -- confirm against MMX_YUV_ADD and the callers.
 */
#define MMX_UNPACK_32_RGBA \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
mm3 = _mm_unpacklo_pi8(mm3, mm0); \
mm5 = mm3; \
mm3 = _mm_unpacklo_pi16(mm3, mm4); \
*(uint64_t *)p_buffer = (uint64_t)mm3; \
mm5 = _mm_unpackhi_pi16(mm5, mm4); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
mm6 = _mm_setzero_si64(); \
mm2 = _mm_unpackhi_pi8(mm2, mm1); \
mm6 = _mm_unpackhi_pi8(mm6, mm0); \
mm0 = mm6; \
mm6 = _mm_unpacklo_pi16(mm6, mm2); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
mm0 = _mm_unpackhi_pi16(mm0, mm2); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_BGRA \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
...
...
@@ -503,7 +542,23 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_ABGR \
;
mm3 = _mm_setzero_si64(); \
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm2); \
mm5 = mm0; \
mm5 = _mm_unpacklo_pi8(mm5, mm3); \
mm6 = mm4; \
mm4 = _mm_unpacklo_pi16(mm4, mm5); \
*(uint64_t *)p_buffer = (uint64_t)mm4; \
mm6 = _mm_unpackhi_pi16(mm6, mm5); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
mm1 = _mm_unpackhi_pi8(mm1, mm2); \
mm0 = _mm_unpackhi_pi8(mm0, mm3); \
mm2 = mm1; \
mm1 = _mm_unpacklo_pi16(mm1, mm0); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
mm2 = _mm_unpackhi_pi16(mm2, mm0); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
#endif
...
...
@@ -795,6 +850,46 @@ punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
/*
 * Inline-assembly fragment (a string literal) that packs sixteen pixels
 * into 32-bit words and writes 64 bytes at the address in operand %3 using
 * four movntdq (non-temporal) stores, which require a 16-byte aligned
 * destination.  Per the lane annotations xmm0 holds B, xmm1 holds R and
 * xmm2 holds G on entry; xmm3..xmm6 are scratch, xmm0/xmm2 are overwritten.
 * NOTE(review): the lane annotations name only eight lanes per register
 * although each xmm register carries sixteen bytes; the "R0 B0 G0 00" and
 * "BGRA11 BGRA10 BGRA9" annotations are inconsistent with the neighbouring
 * lines, which all describe R, G, B, 00 / RGBA.
 */
#define SSE2_UNPACK_32_RGBA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 RGBA8 \n\
punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
"
/*
 * Same packing sequence as SSE2_UNPACK_32_RGBA_ALIGNED, but the four
 * 16-byte stores at 0, 16, 32 and 48(%3) use movdqu, so the destination
 * does not have to be 16-byte aligned.
 * NOTE(review): the lane annotations name only eight lanes per register
 * although each xmm register carries sixteen bytes, and the "R0 B0 G0 00"
 * annotation is inconsistent with the R, G, B, 00 order on the other lines.
 */
#define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
"
#define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
...
...
@@ -881,11 +976,11 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
#include <emmintrin.h>
#define SSE2_CALL(SSE2_INSTRUCTIONS)
\
do {
\
__m128i xmm0, xmm1, xmm2, xmm3,
\
xmm4, xmm5, xmm6, xmm7;
\
SSE2_INSTRUCTIONS
\
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
__m128i xmm0, xmm1, xmm2, xmm3, \
xmm4, xmm5, xmm6, xmm7; \
SSE2_INSTRUCTIONS \
} while(0)
#define SSE2_END _mm_sfence()
...
...
@@ -971,179 +1066,249 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
#define SSE2_UNPACK_15_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_15_ALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm1 = _mm_srli_epi16(xmm1, 1);
\
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0;
\
xmm7 = xmm2;
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 2);
\
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_stream_si128((__m128i*)p_buffer, xmm0);
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 2);
\
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_15_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_15_UNALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm1 = _mm_srli_epi16(xmm1, 1);
\
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0;
\
xmm7 = xmm2;
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 2);
\
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_storeu_si128((__m128i*)p_buffer, xmm0);
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 2);
\
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_storeu_si128((__m128i*)(p_buffer+16), xmm5);
#define SSE2_UNPACK_16_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_16_ALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);
\
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0;
\
xmm7 = xmm2;
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 3);
\
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_stream_si128((__m128i*)p_buffer, xmm0);
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 3);
\
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_16_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
#define SSE2_UNPACK_16_UNALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);
\
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0;
\
xmm7 = xmm2;
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 3);
\
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_storeu_si128((__m128i*)p_buffer, xmm0);
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 3);
\
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_32_ARGB_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm1; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
#define SSE2_UNPACK_32_ARGB_ALIGNED
\
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm0;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);
\
xmm5 = xmm1;
\
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);
\
xmm6 = xmm4;
\
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);
\
_mm_stream_si128((__m128i*)(p_buffer), xmm4);
\
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);
\
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);
\
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);
\
xmm5 = xmm0;
\
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);
\
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ARGB_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm1; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
#define SSE2_UNPACK_32_ARGB_UNALIGNED
\
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm0;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);
\
xmm5 = xmm1;
\
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);
\
xmm6 = xmm4;
\
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);
\
_mm_storeu_si128((__m128i*)(p_buffer), xmm4);
\
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);
\
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);
\
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);
\
xmm5 = xmm0;
\
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);
\
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);
\
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_
BGRA_ALIGNED
\
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
0);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
1);
\
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
#define SSE2_UNPACK_32_
RGBA_ALIGNED
\
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm2;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
1);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
0);
\
xmm5 = xmm3;
\
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);
\
_mm_stream_si128((__m128i*)(p_buffer), xmm3);
\
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);
\
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
0);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
1);
\
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
xmm6 = _mm_setzero_si128();
\
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
1);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
0);
\
xmm0 = xmm6;
\
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);
\
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_
BGRA_UNALIGNED
\
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
0);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
1);
\
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
#define SSE2_UNPACK_32_
RGBA_UNALIGNED
\
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm2;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
1);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
0);
\
xmm5 = xmm3;
\
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);
\
_mm_storeu_si128((__m128i*)(p_buffer), xmm3);
\
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);
\
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
0);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
1);
\
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
xmm6 = _mm_setzero_si128();
\
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
1);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
0);
\
xmm0 = xmm6;
\
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);
\
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);
\
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_ALIGNED \
;
/*
 * Intrinsics packing step for the BGRA layout, aligned variant.  Expects
 * __m128i variables xmm0..xmm6 and a pointer p_buffer at the expansion
 * site.  Each pixel is written as the byte sequence 00, xmm1, xmm2, xmm0
 * in memory; the four results go out through _mm_stream_si128
 * (non-temporal store, 16-byte aligned destination required) at p_buffer,
 * +4, +8 and +12.
 * NOTE(review): presumably xmm0 = B, xmm1 = R, xmm2 = G and p_buffer is a
 * uint32_t * (four pixels per store) -- confirm against SSE2_YUV_ADD.
 */
#define SSE2_UNPACK_32_BGRA_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
/*
 * Intrinsics packing step for the BGRA layout, unaligned variant: the same
 * shuffle as SSE2_UNPACK_32_BGRA_ALIGNED (bytes 00, xmm1, xmm2, xmm0 per
 * pixel) but stored with _mm_storeu_si128, so p_buffer need not be
 * 16-byte aligned.
 * NOTE(review): presumably xmm0 = B, xmm1 = R, xmm2 = G -- confirm.
 */
#define SSE2_UNPACK_32_BGRA_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
;
/*
 * Intrinsics packing step for the ABGR layout, aligned variant.  Expects
 * __m128i variables xmm0..xmm6 and a pointer p_buffer at the expansion
 * site.  Each pixel is written as the byte sequence xmm1, xmm2, xmm0, 00
 * in memory; the four results go out through _mm_stream_si128
 * (non-temporal store, 16-byte aligned destination required) at p_buffer,
 * +4, +8 and +12.  xmm0, xmm1 and xmm2 are overwritten.
 * NOTE(review): presumably xmm0 = B, xmm1 = R, xmm2 = G and p_buffer is a
 * uint32_t * (four pixels per store) -- confirm against SSE2_YUV_ADD.
 */
#define SSE2_UNPACK_32_ABGR_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
xmm2 = xmm1; \
xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm1); \
xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
/*
 * Intrinsics packing step for the ABGR layout, unaligned variant: the same
 * shuffle as SSE2_UNPACK_32_ABGR_ALIGNED (bytes xmm1, xmm2, xmm0, 00 per
 * pixel) but stored with _mm_storeu_si128, so p_buffer need not be
 * 16-byte aligned.
 * NOTE(review): presumably xmm0 = B, xmm1 = R, xmm2 = G -- confirm.
 */
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
xmm2 = xmm1; \
xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); \
xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
#endif
...
...
modules/video_chroma/i420_yuy2.h
View file @
5e4dc54c
...
...
@@ -138,56 +138,56 @@ movq %%mm1, (%1) # Store YUYV \n\
#define MMX_END _mm_empty()
#define MMX_YUV420_YUYV \
mm1 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm2 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm2 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin
64_t)p_line1 = (uint64)mm2;
\
*(uin
t64_t*)p_line1 = (uint64_t)mm2;
\
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin
64_t)(p_line1 + 4) = (uint64)mm0;
\
*(uin
t64_t*)(p_line1+8) = (uint64_t)mm0;
\
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin
64_t)p_line2 = (uint64)mm4;
\
*(uin
t64_t*)p_line2 = (uint64_t)mm4;
\
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin
64_t)(p_line2 + 4) = (uint64)mm4
;
*(uin
t64_t*)(p_line2+8) = (uint64_t)mm3
;
#define MMX_YUV420_YVYU \
mm2 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm1 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm2 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin
64_t)p_line1 = (uint64)mm2;
\
*(uin
t64_t*)p_line1 = (uint64_t)mm2;
\
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin
64_t)(p_line1 + 4) = (uint64)mm0;
\
*(uin
t64_t*)(p_line1+8) = (uint64_t)mm0;
\
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin
64_t)p_line2 = (uint64)mm4;
\
*(uin
t64_t*)p_line2 = (uint64_t)mm4;
\
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin
64_t)(p_line2 + 4) = (uint64)mm4
;
*(uin
t64_t*)(p_line2+8) = (uint64_t)mm3
;
#define MMX_YUV420_UYVY \
mm1 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm2 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm2 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm1; \
mm2 = _mm_unpacklo_pi8(mm2, mm0); \
*(uin
64_t)p_line1 = (uint64)mm2;
\
*(uin
t64_t*)p_line1 = (uint64_t)mm2;
\
mm2 = mm1; \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
*(uin
64_t)(p_line1 + 4) = (uint64)mm2;
\
*(uin
t64_t*)(p_line1+8) = (uint64_t)mm2;
\
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm3); \
*(uin
64_t)p_line2 = (uint64)mm4;
\
*(uin
t64_t*)p_line2 = (uint64_t)mm4;
\
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
*(uin
64_t)(p_line2 + 4) = (uint64
)mm1;
*(uin
t64_t*)(p_line2+8) = (uint64_t
)mm1;
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment