Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc
Commits
5e4dc54c
Commit
5e4dc54c
authored
Aug 02, 2007
by
Damien Fouilleul
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
chromas: more SSE2/MMX fixes, added I420_RGBA conversion
parent
c23c9ae9
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
575 additions
and
171 deletions
+575
-171
modules/video_chroma/i420_rgb.c
modules/video_chroma/i420_rgb.c
+1
-2
modules/video_chroma/i420_rgb.h
modules/video_chroma/i420_rgb.h
+1
-0
modules/video_chroma/i420_rgb16.c
modules/video_chroma/i420_rgb16.c
+239
-0
modules/video_chroma/i420_rgb_mmx.h
modules/video_chroma/i420_rgb_mmx.h
+316
-151
modules/video_chroma/i420_yuy2.h
modules/video_chroma/i420_yuy2.h
+18
-18
No files found.
modules/video_chroma/i420_rgb.c
View file @
5e4dc54c
...
@@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this )
...
@@ -161,8 +161,7 @@ static int Activate( vlc_object_t *p_this )
{
{
/* R8G8B8A8 pixel format */
/* R8G8B8A8 pixel format */
msg_Dbg
(
p_this
,
"RGB pixel format is R8G8B8A8"
);
msg_Dbg
(
p_this
,
"RGB pixel format is R8G8B8A8"
);
//p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
p_vout
->
chroma
.
pf_convert
=
E_
(
I420_R8G8B8A8
);
return
-
1
;
}
}
else
if
(
p_vout
->
output
.
i_rmask
==
0x0000ff00
else
if
(
p_vout
->
output
.
i_rmask
==
0x0000ff00
&&
p_vout
->
output
.
i_gmask
==
0x00ff0000
&&
p_vout
->
output
.
i_gmask
==
0x00ff0000
...
...
modules/video_chroma/i420_rgb.h
View file @
5e4dc54c
...
@@ -64,6 +64,7 @@ void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * );
...
@@ -64,6 +64,7 @@ void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * );
void
E_
(
I420_R5G5B5
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_R5G5B5
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_R5G6B5
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_R5G6B5
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_A8R8G8B8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_A8R8G8B8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_R8G8B8A8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_B8G8R8A8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_B8G8R8A8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_A8B8G8R8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
void
E_
(
I420_A8B8G8R8
)
(
vout_thread_t
*
,
picture_t
*
,
picture_t
*
);
#endif
#endif
...
...
modules/video_chroma/i420_rgb16.c
View file @
5e4dc54c
...
@@ -1140,6 +1140,245 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
...
@@ -1140,6 +1140,245 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
#endif
#endif
}
}
/*****************************************************************************
 * I420_R8G8B8A8: convert a planar I420 picture to 32 bpp RGB (R8G8B8A8)
 *****************************************************************************
 * p_vout: video output thread; supplies render/output dimensions and the
 *         chroma private data (conversion buffer and offset table)
 * p_src:  source picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest: destination picture, written through p_dest->p->p_pixels
 *
 * Pixels are converted 16 at a time (SSE2 build) or 8 at a time (MMX build).
 * When the render width is not a multiple of the group size, the last group
 * of each row is converted again after stepping the pointers back by
 * i_rewind pixels, so every pixel of the row gets written.
 *
 * NOTE: the local variable names are part of the contract with the
 * SSE2_xxx / MMX_xxx / SCALE_WIDTH / SCALE_HEIGHT macros, which reference
 * them by name (p_y, p_u, p_v, p_buffer, p_pic, p_pic_start, p_offset,
 * i_x, i_scale_count, i_chroma_width, ...). Do not rename or remove them
 * without checking those macros.
 *****************************************************************************/
void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src,
                                            picture_t *p_dest )
{
    /* We got this one from the old arguments */
    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
    uint8_t  *p_y   = p_src->Y_PIXELS;
    uint8_t  *p_u   = p_src->U_PIXELS;
    uint8_t  *p_v   = p_src->V_PIXELS;

    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
    unsigned int i_vscale;                          /* vertical scaling type */
    unsigned int i_x, i_y;                /* horizontal and vertical indexes */

    int         i_right_margin;
    int         i_rewind;
    int         i_scale_count;                       /* scale modulo counter */
    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
    uint32_t *  p_pic_start;       /* beginning of the current line for copy */

    /* Conversion buffer pointer */
    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
    uint32_t *  p_buffer;

    /* Offset array pointer */
    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
    int *       p_offset;

    /* Padding bytes at the end of each source luma / chroma row */
    const int i_source_margin = p_src->p[0].i_pitch
                                 - p_src->p[0].i_visible_pitch;
    const int i_source_margin_c = p_src->p[1].i_pitch
                                 - p_src->p[1].i_visible_pitch;

    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;

    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
               p_vout->output.i_width, p_vout->output.i_height,
               &b_hscale, &i_vscale, p_offset_start );

    /*
     * Perform conversion
     */
    i_scale_count = ( i_vscale == 1 ) ?
                    p_vout->output.i_height : p_vout->render.i_height;

#if defined (MODULE_NAME_IS_i420_rgb_sse2)

    /* Number of pixels to step back so that the last 16-pixel group ends
     * exactly on the last pixel of the row */
    if( p_vout->render.i_width & 15 )
    {
        i_rewind = 16 - ( p_vout->render.i_width & 15 );
    }
    else
    {
        i_rewind = 0;
    }

    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    p_buffer = b_hscale ? p_buffer_start : p_pic;

    /* Only the low 4 bits matter here; intptr_t avoids truncating the
     * pointers through int on targets where pointers are wider than int */
    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                    p_dest->p->i_pitch|
                    ((intptr_t)p_y)|
                    ((intptr_t)p_buffer))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_32_ALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_ALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
                SSE2_CALL (
                    SSE2_INIT_32_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_UNALIGNED
                );
                p_y += 16;
                /* 16 luma samples consume 8 chroma samples per plane, as in
                 * the main loop and in the unaligned branch below. This was
                 * 4, which left p_u / p_v four bytes short after every row
                 * whose width is not a multiple of 16. */
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            /* Skip the source row padding; the chroma planes only move on
             * after odd rows, since two luma rows share one chroma row */
            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
        {
            p_pic_start = p_pic;
            p_buffer = b_hscale ? p_buffer_start : p_pic;

            for ( i_x = p_vout->render.i_width / 16; i_x--; )
            {
                SSE2_CALL (
                    SSE2_INIT_32_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_UNALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
                p_buffer += 16;
            }

            /* Here we do some unaligned reads and duplicate conversions, but
             * at least we have all the pixels */
            if( i_rewind )
            {
                p_y -= i_rewind;
                p_u -= i_rewind >> 1;
                p_v -= i_rewind >> 1;
                p_buffer -= i_rewind;
                SSE2_CALL (
                    SSE2_INIT_32_UNALIGNED
                    SSE2_YUV_MUL
                    SSE2_YUV_ADD
                    SSE2_UNPACK_32_RGBA_UNALIGNED
                );
                p_y += 16;
                p_u += 8;
                p_v += 8;
            }
            SCALE_WIDTH;
            SCALE_HEIGHT( 420, 4 );

            p_y += i_source_margin;
            if( i_y % 2 )
            {
                p_u += i_source_margin_c;
                p_v += i_source_margin_c;
            }
            p_buffer = b_hscale ? p_buffer_start : p_pic;
        }
    }

    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#else // defined (MODULE_NAME_IS_i420_rgb_mmx)

    /* Number of pixels to step back so that the last 8-pixel group ends
     * exactly on the last pixel of the row */
    if( p_vout->render.i_width & 7 )
    {
        i_rewind = 8 - ( p_vout->render.i_width & 7 );
    }
    else
    {
        i_rewind = 0;
    }

    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
    {
        p_pic_start = p_pic;
        p_buffer = b_hscale ? p_buffer_start : p_pic;

        for ( i_x = p_vout->render.i_width / 8; i_x--; )
        {
            MMX_CALL (
                MMX_INIT_32
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_32_RGBA
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }

        /* Here we do some unaligned reads and duplicate conversions, but
         * at least we have all the pixels */
        if( i_rewind )
        {
            p_y -= i_rewind;
            p_u -= i_rewind >> 1;
            p_v -= i_rewind >> 1;
            p_buffer -= i_rewind;
            MMX_CALL (
                MMX_INIT_32
                MMX_YUV_MUL
                MMX_YUV_ADD
                MMX_UNPACK_32_RGBA
            );
            p_y += 8;
            p_u += 4;
            p_v += 4;
            p_buffer += 8;
        }
        SCALE_WIDTH;
        SCALE_HEIGHT( 420, 4 );

        p_y += i_source_margin;
        if( i_y % 2 )
        {
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
        }
    }

    /* re-enable FPU registers */
    MMX_END;

#endif
}
void
E_
(
I420_B8G8R8A8
)(
vout_thread_t
*
p_vout
,
picture_t
*
p_src
,
void
E_
(
I420_B8G8R8A8
)(
vout_thread_t
*
p_vout
,
picture_t
*
p_src
,
picture_t
*
p_dest
)
picture_t
*
p_dest
)
{
{
...
...
modules/video_chroma/i420_rgb_mmx.h
View file @
5e4dc54c
...
@@ -300,6 +300,26 @@ punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
...
@@ -300,6 +300,26 @@ punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
"
"
/*
 * MMX_UNPACK_32_RGBA (inline-assembly variant)
 *
 * String-literal fragment of AT&T-syntax MMX assembly that interleaves three
 * registers of 8 packed colour bytes into eight 32-bit pixels and stores them
 * with four movq writes at offsets 0, 8, 16 and 24 from asm operand %3
 * (32 bytes in total).
 *
 * Going by the per-line register comments, on entry mm0 holds B7..B0,
 * mm1 holds R7..R0 and mm2 holds G7..G0. Each stored pixel is laid out in
 * memory, lowest address first, as 00, B, G, R (the comments list the bytes
 * most-significant first: "R G B 00").
 *
 * Registers written: mm0, mm2, mm3, mm4, mm5, mm6.
 *
 * NOTE(review): the in-string comment on the "movq %%mm3, %%mm5" line reads
 * "R3 00 R2 00 ..." but mm3 holds the "B3 00 B2 00 B1 00 B0 00" interleave
 * produced by the instruction just before it. The instruction is correct;
 * only that comment is misleading.
 * NOTE(review): operand %3 is presumably bound to p_buffer by the enclosing
 * MMX_CALL asm statement, which is not visible here - confirm.
 */
#define MMX_UNPACK_32_RGBA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\
movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\
punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\
pxor %%mm6, %%mm6 # zero mm6 \n\
punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\
movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
"
#define MMX_UNPACK_32_BGRA " \n\
#define MMX_UNPACK_32_BGRA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
...
@@ -356,15 +376,15 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
...
@@ -356,15 +376,15 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
#define MMX_END _mm_empty()
#define MMX_END _mm_empty()
#define MMX_INIT_16 \
#define MMX_INIT_16 \
mm0 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm0 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm1 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm4 = _mm_setzero_si64(); \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y
mm6 = (__m64)*(uint64_t *)p_y
;
#define MMX_INIT_32 \
#define MMX_INIT_32 \
mm0 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm0 = _mm_cvtsi32_si64(
*(int*)p_u);
\
*(uint16_t *)p_buffer = 0; \
*(uint16_t *)p_buffer = 0; \
mm1 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm4 = _mm_setzero_si64(); \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y;
mm6 = (__m64)*(uint64_t *)p_y;
...
@@ -483,6 +503,25 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
...
@@ -483,6 +503,25 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
mm0 = _mm_unpackhi_pi16(mm0, mm1); \
mm0 = _mm_unpackhi_pi16(mm0, mm1); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
/*
 * MMX_UNPACK_32_RGBA (compiler-intrinsics variant)
 *
 * Interleaves the packed colour bytes held in mm0, mm1 and mm2 with a zero
 * byte to form eight 32-bit pixels, and writes them as four 64-bit stores to
 * p_buffer, p_buffer + 2, p_buffer + 4 and p_buffer + 6.
 *
 * Per pixel the bytes are emitted, lowest address first, as:
 *   00, byte from mm0, byte from mm2, byte from mm1.
 * NOTE(review): the inline-asm variant of this macro labels mm0 as B,
 * mm1 as R and mm2 as G, which would make this 00,B,G,R - confirm against
 * the MMX_YUV_ADD macro, which is not visible here.
 *
 * The macro expands to bare statements and expects mm0..mm6 and p_buffer to
 * be declared at the point of use. It overwrites mm0, mm2, mm3, mm4, mm5
 * and mm6. The "+ 2" element steps assume p_buffer points to 32-bit
 * elements, so each step is 8 bytes.
 */
#define MMX_UNPACK_32_RGBA \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
mm3 = _mm_unpacklo_pi8(mm3, mm0); \
mm5 = mm3; \
mm3 = _mm_unpacklo_pi16(mm3, mm4); \
*(uint64_t *)p_buffer = (uint64_t)mm3; \
mm5 = _mm_unpackhi_pi16(mm5, mm4); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
mm6 = _mm_setzero_si64(); \
mm2 = _mm_unpackhi_pi8(mm2, mm1); \
mm6 = _mm_unpackhi_pi8(mm6, mm0); \
mm0 = mm6; \
mm6 = _mm_unpacklo_pi16(mm6, mm2); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
mm0 = _mm_unpackhi_pi16(mm0, mm2); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_BGRA \
#define MMX_UNPACK_32_BGRA \
mm3 = _mm_setzero_si64(); \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
mm4 = mm2; \
...
@@ -503,7 +542,23 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
...
@@ -503,7 +542,23 @@ movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_ABGR \
#define MMX_UNPACK_32_ABGR \
;
mm3 = _mm_setzero_si64(); \
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm2); \
mm5 = mm0; \
mm5 = _mm_unpacklo_pi8(mm5, mm3); \
mm6 = mm4; \
mm4 = _mm_unpacklo_pi16(mm4, mm5); \
*(uint64_t *)p_buffer = (uint64_t)mm4; \
mm6 = _mm_unpackhi_pi16(mm6, mm5); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
mm1 = _mm_unpackhi_pi8(mm1, mm2); \
mm0 = _mm_unpackhi_pi8(mm0, mm3); \
mm2 = mm1; \
mm1 = _mm_unpacklo_pi16(mm1, mm0); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
mm2 = _mm_unpackhi_pi16(mm2, mm0); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
#endif
#endif
...
@@ -795,6 +850,46 @@ punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
...
@@ -795,6 +850,46 @@ punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
"
/*
 * SSE2_UNPACK_32_RGBA_ALIGNED (inline-assembly variant)
 *
 * String-literal fragment of AT&T-syntax SSE2 assembly. It interleaves the
 * packed colour bytes in xmm0, xmm1 and xmm2 with zero bytes into sixteen
 * 32-bit pixels and writes them with four movntdq stores at offsets 0, 16,
 * 32 and 48 from asm operand %3 (64 bytes in total).
 *
 * movntdq is a non-temporal store that requires a 16-byte aligned address,
 * so this variant must only be used when the destination is 16-byte
 * aligned. The _UNALIGNED sibling uses movdqu instead.
 *
 * Registers written: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6.
 *
 * NOTE(review): the in-string comments were copied from the 64-bit MMX
 * macro. They say "mm3"/"mm6" for xmm3/xmm6 and only describe the low half
 * of each register. The punpcklwd line reads "R0 B0 G0 00" where the
 * neighbouring lines use the "R G B 00" order, and the third store is
 * labelled "BGRA11 BGRA10 BGRA9" although this is the RGBA macro. The
 * instructions themselves match the _UNALIGNED sibling apart from the store
 * opcode.
 */
#define SSE2_UNPACK_32_RGBA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 RGBA8 \n\
punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
"
/*
 * SSE2_UNPACK_32_RGBA_UNALIGNED (inline-assembly variant)
 *
 * Same byte interleave as SSE2_UNPACK_32_RGBA_ALIGNED, but the four 16-byte
 * results are written with movdqu, which has no alignment requirement on
 * the destination. Stores go to offsets 0, 16, 32 and 48 from asm operand
 * %3 (64 bytes, sixteen 32-bit pixels).
 *
 * The movdqa instructions in this fragment are register-to-register copies
 * only, so they impose no memory alignment constraint.
 *
 * Registers written: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6.
 *
 * NOTE(review): the in-string comments were copied from the 64-bit MMX
 * macro. They say "mm3"/"mm6" for xmm3/xmm6, describe only the low half of
 * each register, and the punpcklwd line reads "R0 B0 G0 00" where the
 * neighbouring lines use the "R G B 00" order.
 */
#define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
"
#define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
#define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
...
@@ -881,11 +976,11 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
...
@@ -881,11 +976,11 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
#include <emmintrin.h>
#include <emmintrin.h>
#define SSE2_CALL(SSE2_INSTRUCTIONS)
\
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do {
\
do { \
__m128i xmm0, xmm1, xmm2, xmm3,
\
__m128i xmm0, xmm1, xmm2, xmm3, \
xmm4, xmm5, xmm6, xmm7;
\
xmm4, xmm5, xmm6, xmm7; \
SSE2_INSTRUCTIONS
\
SSE2_INSTRUCTIONS \
} while(0)
} while(0)
#define SSE2_END _mm_sfence()
#define SSE2_END _mm_sfence()
...
@@ -971,179 +1066,249 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
...
@@ -971,179 +1066,249 @@ movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
#define SSE2_UNPACK_15_ALIGNED \
#define SSE2_UNPACK_15_ALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm1 = _mm_srli_epi16(xmm1, 1);
\
xmm4 = _mm_setzero_si128(); \
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0; \
xmm5 = xmm0;
\
xmm7 = xmm2; \
xmm7 = xmm2;
\
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm2 = _mm_slli_epi16(xmm2, 2);
\
xmm0 = _mm_or_si128(xmm0, xmm2); \
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
_mm_stream_si128((__m128i*)p_buffer, xmm0);
\
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm7 = _mm_slli_epi16(xmm7, 2);
\
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_15_UNALIGNED \
#define SSE2_UNPACK_15_UNALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm1 = _mm_srli_epi16(xmm1, 1);
\
xmm4 = _mm_setzero_si128(); \
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0; \
xmm5 = xmm0;
\
xmm7 = xmm2; \
xmm7 = xmm2;
\
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm2 = _mm_slli_epi16(xmm2, 2);
\
xmm0 = _mm_or_si128(xmm0, xmm2); \
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0);
\
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm7 = _mm_slli_epi16(xmm7, 2);
\
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_storeu_si128((__m128i*)(p_buffer+16), xmm5);
_mm_storeu_si128((__m128i*)(p_buffer+16), xmm5);
#define SSE2_UNPACK_16_ALIGNED \
#define SSE2_UNPACK_16_ALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);
\
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm4 = _mm_setzero_si128(); \
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0; \
xmm5 = xmm0;
\
xmm7 = xmm2; \
xmm7 = xmm2;
\
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm2 = _mm_slli_epi16(xmm2, 3);
\
xmm0 = _mm_or_si128(xmm0, xmm2); \
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
_mm_stream_si128((__m128i*)p_buffer, xmm0);
\
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm7 = _mm_slli_epi16(xmm7, 3);
\
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_16_UNALIGNED \
#define SSE2_UNPACK_16_UNALIGNED
\
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);
\
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_and_si128(xmm0, xmm5);
\
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5);
\
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \
xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);
\
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm2 = _mm_and_si128(xmm2, xmm5);
\
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm0 = _mm_srli_epi16(xmm0, 3);
\
xmm4 = _mm_setzero_si128(); \
xmm4 = _mm_setzero_si128();
\
xmm5 = xmm0; \
xmm5 = xmm0;
\
xmm7 = xmm2; \
xmm7 = xmm2;
\
\
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
\
xmm2 = _mm_slli_epi16(xmm2, 3); \
xmm2 = _mm_slli_epi16(xmm2, 3);
\
xmm0 = _mm_or_si128(xmm0, xmm2); \
xmm0 = _mm_or_si128(xmm0, xmm2);
\
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0);
\
\
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);
\
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);
\
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm7 = _mm_slli_epi16(xmm7, 3);
\
xmm5 = _mm_or_si128(xmm5, xmm7); \
xmm5 = _mm_or_si128(xmm5, xmm7);
\
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_32_ARGB_ALIGNED \
#define SSE2_UNPACK_32_ARGB_ALIGNED
\
xmm3 = _mm_setzero_si128(); \
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm0; \
xmm4 = xmm0;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);
\
xmm5 = xmm1; \
xmm5 = xmm1;
\
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);
\
xmm6 = xmm4; \
xmm6 = xmm4;
\
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);
\
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4);
\
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);
\
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);
\
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);
\
xmm5 = xmm0; \
xmm5 = xmm0;
\
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);
\
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ARGB_UNALIGNED \
#define SSE2_UNPACK_32_ARGB_UNALIGNED
\
xmm3 = _mm_setzero_si128(); \
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm0; \
xmm4 = xmm0;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);
\
xmm5 = xmm1; \
xmm5 = xmm1;
\
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);
\
xmm6 = xmm4; \
xmm6 = xmm4;
\
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);
\
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4);
\
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);
\
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);
\
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);
\
xmm5 = xmm0; \
xmm5 = xmm0;
\
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \
xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);
\
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);
\
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_
BGRA_ALIGNED
\
#define SSE2_UNPACK_32_
RGBA_ALIGNED
\
xmm3 = _mm_setzero_si128(); \
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm2; \
xmm4 = xmm2;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
0);
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
1);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
1);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
0);
\
xmm5 = xmm3; \
xmm5 = xmm3;
\
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);
\
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3);
\
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);
\
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm6 = _mm_setzero_si128();
\
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
0);
\
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
1);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
1);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
0);
\
xmm0 = xmm6; \
xmm0 = xmm6;
\
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);
\
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);
\
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_
BGRA_UNALIGNED
\
#define SSE2_UNPACK_32_
RGBA_UNALIGNED
\
xmm3 = _mm_setzero_si128(); \
xmm3 = _mm_setzero_si128();
\
xmm4 = xmm2; \
xmm4 = xmm2;
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
0);
\
xmm4 = _mm_unpacklo_epi8(xmm4, xmm
1);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
1);
\
xmm3 = _mm_unpacklo_epi8(xmm3, xmm
0);
\
xmm5 = xmm3; \
xmm5 = xmm3;
\
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);
\
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3);
\
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);
\
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm6 = _mm_setzero_si128();
\
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
0);
\
xmm2 = _mm_unpackhi_epi8(xmm2, xmm
1);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
1);
\
xmm6 = _mm_unpackhi_epi8(xmm6, xmm
0);
\
xmm0 = xmm6; \
xmm0 = xmm6;
\
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);
\
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);
\
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_ALIGNED \
#define SSE2_UNPACK_32_BGRA_ALIGNED \
;
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
/*
 * SSE2_UNPACK_32_BGRA_UNALIGNED (compiler-intrinsics variant)
 *
 * Interleaves the packed colour bytes held in xmm0, xmm1 and xmm2 with a
 * zero byte to form sixteen 32-bit pixels, and writes them with four
 * _mm_storeu_si128 calls (no alignment requirement) to p_buffer,
 * p_buffer + 4, p_buffer + 8 and p_buffer + 12.
 *
 * Per pixel the bytes are emitted, lowest address first, as:
 *   00, byte from xmm1, byte from xmm2, byte from xmm0.
 * NOTE(review): if xmm0/xmm1/xmm2 carry B/R/G as the comments of the
 * inline-asm macros suggest, that is 00,R,G,B in memory - confirm against
 * SSE2_YUV_ADD, which is not visible here.
 *
 * The macro expands to bare statements and expects xmm0..xmm6 and p_buffer
 * to be declared at the point of use. It overwrites xmm0, xmm2, xmm3, xmm4,
 * xmm5 and xmm6. The "+ 4" element steps assume p_buffer points to 32-bit
 * elements, so each step is 16 bytes.
 */
#define SSE2_UNPACK_32_BGRA_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
#define SSE2_UNPACK_32_ABGR_ALIGNED \
;
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
xmm2 = xmm1; \
xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm1); \
xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
/*
 * SSE2_UNPACK_32_ABGR_UNALIGNED (compiler-intrinsics variant)
 *
 * Interleaves the packed colour bytes held in xmm0, xmm1 and xmm2 with a
 * zero byte to form sixteen 32-bit pixels, and writes them with four
 * _mm_storeu_si128 calls (no alignment requirement) to p_buffer,
 * p_buffer + 4, p_buffer + 8 and p_buffer + 12.
 *
 * Per pixel the bytes are emitted, lowest address first, as:
 *   byte from xmm1, byte from xmm2, byte from xmm0, 00.
 * NOTE(review): if xmm0/xmm1/xmm2 carry B/R/G as the comments of the
 * inline-asm macros suggest, that is R,G,B,00 in memory - confirm against
 * SSE2_YUV_ADD, which is not visible here.
 *
 * The macro expands to bare statements and expects xmm0..xmm6 and p_buffer
 * to be declared at the point of use. It overwrites xmm0, xmm1, xmm2, xmm3,
 * xmm4, xmm5 and xmm6. The "+ 4" element steps assume p_buffer points to
 * 32-bit elements, so each step is 16 bytes.
 */
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
xmm5 = xmm0; \
xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \
xmm6 = xmm4; \
xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm4); \
xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm3); \
xmm2 = xmm1; \
xmm1 = _mm_unpacklo_epi16(xmm1, xmm0); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); \
xmm2 = _mm_unpackhi_epi16(xmm2, xmm0); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
#endif
#endif
...
...
modules/video_chroma/i420_yuy2.h
View file @
5e4dc54c
...
@@ -138,56 +138,56 @@ movq %%mm1, (%1) # Store YUYV \n\
...
@@ -138,56 +138,56 @@ movq %%mm1, (%1) # Store YUYV \n\
#define MMX_END _mm_empty()
#define MMX_END _mm_empty()
#define MMX_YUV420_YUYV \
#define MMX_YUV420_YUYV \
mm1 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm2 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm2 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm0 = (__m64)*(uint64_t*)p_y1; \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin
64_t)p_line1 = (uint64)mm2;
\
*(uin
t64_t*)p_line1 = (uint64_t)mm2;
\
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin
64_t)(p_line1 + 4) = (uint64)mm0;
\
*(uin
t64_t*)(p_line1+8) = (uint64_t)mm0;
\
mm4 = mm3; \
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin
64_t)p_line2 = (uint64)mm4;
\
*(uin
t64_t*)p_line2 = (uint64_t)mm4;
\
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin
64_t)(p_line2 + 4) = (uint64)mm4
;
*(uin
t64_t*)(p_line2+8) = (uint64_t)mm3
;
#define MMX_YUV420_YVYU \
#define MMX_YUV420_YVYU \
mm2 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm2 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm1 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm0 = (__m64)*(uint64_t*)p_y1; \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uin
64_t)p_line1 = (uint64)mm2;
\
*(uin
t64_t*)p_line1 = (uint64_t)mm2;
\
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uin
64_t)(p_line1 + 4) = (uint64)mm0;
\
*(uin
t64_t*)(p_line1+8) = (uint64_t)mm0;
\
mm4 = mm3; \
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uin
64_t)p_line2 = (uint64)mm4;
\
*(uin
t64_t*)p_line2 = (uint64_t)mm4;
\
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uin
64_t)(p_line2 + 4) = (uint64)mm4
;
*(uin
t64_t*)(p_line2+8) = (uint64_t)mm3
;
#define MMX_YUV420_UYVY \
#define MMX_YUV420_UYVY \
mm1 = _mm_cvtsi32_si64(
(int)*p_u);
\
mm1 = _mm_cvtsi32_si64(
*(int*)p_u);
\
mm2 = _mm_cvtsi32_si64(
(int)*p_v);
\
mm2 = _mm_cvtsi32_si64(
*(int*)p_v);
\
mm0 = (__m64)*(uint64_t*)p_y1; \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm1; \
mm2 = mm1; \
mm2 = _mm_unpacklo_pi8(mm2, mm0); \
mm2 = _mm_unpacklo_pi8(mm2, mm0); \
*(uin
64_t)p_line1 = (uint64)mm2;
\
*(uin
t64_t*)p_line1 = (uint64_t)mm2;
\
mm2 = mm1; \
mm2 = mm1; \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
*(uin
64_t)(p_line1 + 4) = (uint64)mm2;
\
*(uin
t64_t*)(p_line1+8) = (uint64_t)mm2;
\
mm4 = mm1; \
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm3); \
mm4 = _mm_unpacklo_pi8(mm4, mm3); \
*(uin
64_t)p_line2 = (uint64)mm4;
\
*(uin
t64_t*)p_line2 = (uint64_t)mm4;
\
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
*(uin
64_t)(p_line2 + 4) = (uint64
)mm1;
*(uin
t64_t*)(p_line2+8) = (uint64_t
)mm1;
#endif
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment