videolan / vlc

Commit a3883709, authored Aug 01, 2007 by Damien Fouilleul
Parent: 7b64c064

video_chroma: added I420_ABGR32 support (mostly for opengl), some clean up as well

Showing 8 changed files with 1600 additions and 1248 deletions
    modules/video_chroma/i420_rgb.c       +18   -1
    modules/video_chroma/i420_rgb.h       +1    -0
    modules/video_chroma/i420_rgb16.c     +387  -455
    modules/video_chroma/i420_rgb_mmx.h   +791  -666
    modules/video_chroma/i420_yuy2.c      +12   -12
    modules/video_chroma/i420_yuy2.h      +307  -87
    modules/video_chroma/i422_yuy2.c      +35   -26
    modules/video_chroma/i422_yuy2.h      +49   -1
modules/video_chroma/i420_rgb.c
@@ -155,6 +155,15 @@ static int Activate( vlc_object_t *p_this )
             msg_Dbg( p_this, "RGB pixel format is A8R8G8B8" );
             p_vout->chroma.pf_convert = E_(I420_A8R8G8B8);
         }
+        else if( p_vout->output.i_rmask == 0xff000000
+              && p_vout->output.i_gmask == 0x00ff0000
+              && p_vout->output.i_bmask == 0x0000ff00 )
+        {
+            /* R8G8B8A8 pixel format */
+            msg_Dbg( p_this, "RGB pixel format is R8G8B8A8" );
+            //p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
+            return -1;
+        }
         else if( p_vout->output.i_rmask == 0x0000ff00
               && p_vout->output.i_gmask == 0x00ff0000
               && p_vout->output.i_bmask == 0xff000000 )
@@ -163,10 +172,18 @@ static int Activate( vlc_object_t *p_this )
             msg_Dbg( p_this, "RGB pixel format is B8G8R8A8" );
             p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
         }
+        else if( p_vout->output.i_rmask == 0x000000ff
+              && p_vout->output.i_gmask == 0x0000ff00
+              && p_vout->output.i_bmask == 0x00ff0000 )
+        {
+            /* A8B8G8R8 pixel format */
+            msg_Dbg( p_this, "RGB pixel format is A8B8G8R8" );
+            p_vout->chroma.pf_convert = E_(I420_A8B8G8R8);
+        }
         else
             return -1;
 #else
-        // generic C chroma converter */
+        /* generic C chroma converter */
         p_vout->chroma.pf_convert = E_(I420_RGB32);
 #endif
         break;
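For readers skimming the diff, the new branch is selected purely from the output channel masks. Below is a minimal, self-contained sketch of that mask test; the struct and function names are invented for illustration, and only the three mask triples visible in the hunks above are covered (the A8R8G8B8 case sits earlier in Activate(), outside this excerpt):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for p_vout->output.i_rmask/i_gmask/i_bmask. */
    struct rgb_masks { uint32_t r, g, b; };

    static const char *name_32bpp_format( struct rgb_masks m )
    {
        if( m.r == 0xff000000 && m.g == 0x00ff0000 && m.b == 0x0000ff00 )
            return "R8G8B8A8";   /* recognised, but Activate() still rejects it */
        if( m.r == 0x0000ff00 && m.g == 0x00ff0000 && m.b == 0xff000000 )
            return "B8G8R8A8";
        if( m.r == 0x000000ff && m.g == 0x0000ff00 && m.b == 0x00ff0000 )
            return "A8B8G8R8";   /* the layout this commit adds a converter for */
        return "unsupported";
    }

    int main( void )
    {
        struct rgb_masks gl_like = { 0x000000ff, 0x0000ff00, 0x00ff0000 };
        printf( "%s\n", name_32bpp_format( gl_like ) );   /* prints A8B8G8R8 */
        return 0;
    }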
modules/video_chroma/i420_rgb.h
@@ -65,6 +65,7 @@ void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_R5G6B5)   ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * );
 void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * );
+void E_(I420_A8B8G8R8) ( vout_thread_t *, picture_t *, picture_t * );
 #endif

 /*****************************************************************************
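The new declaration has the same (vout_thread_t *, picture_t *, picture_t *) signature as the existing converters, so Activate() can store it in the chroma.pf_convert function pointer exactly like the others. A minimal sketch of that dispatch idea, with stub types standing in for the real VLC structures:

    #include <stdio.h>

    /* Stand-ins only: the real vout_thread_t / picture_t are VLC types. */
    typedef struct vout_thread_t vout_thread_t;
    typedef struct picture_t     picture_t;
    typedef void (*chroma_convert_t)( vout_thread_t *, picture_t *, picture_t * );

    static void I420_A8B8G8R8_stub( vout_thread_t *p_vout, picture_t *p_src,
                                    picture_t *p_dest )
    {
        (void)p_vout; (void)p_src; (void)p_dest;
        puts( "convert I420 -> ABGR32" );
    }

    int main( void )
    {
        /* Activate() picks one converter per output format; callers only
         * ever go through the function pointer. */
        chroma_convert_t pf_convert = I420_A8B8G8R8_stub;
        pf_convert( NULL, NULL, NULL );
        return 0;
    }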
modules/video_chroma/i420_rgb16.c
@@ -35,14 +35,8 @@
 #if defined (MODULE_NAME_IS_i420_rgb)
 #   include "i420_rgb_c.h"
 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
-#   if defined(HAVE_MMX_INTRINSICS)
-#       include <mmintrin.h>
-#   endif
 #   include "i420_rgb_mmx.h"
 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
-#   if defined(HAVE_SSE2_INTRINSICS)
-#       include <emmintrin.h>
-#   endif
 #   include "i420_rgb_mmx.h"
 #endif
@@ -309,7 +303,7 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
         }
     }
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
+#else // ! defined (MODULE_NAME_IS_i420_rgb)
 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
                                              picture_t *p_dest )
@@ -388,20 +382,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width/16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_ALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_15_ALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_16_ALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_15_ALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_ALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_15_ALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
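Every conversion loop in this file is rewritten the same way: the open-coded "#if defined (CAN_COMPILE_SSE2) inline assembly #else intrinsics #endif" pair at each call site collapses into a single SSE2_CALL()/MMX_CALL() invocation, and the choice between assembly and intrinsics moves into i420_rgb_mmx.h. Here is a self-contained miniature of that wrapper-macro technique, with demo names that are not part of VLC:

    #include <stdio.h>

    /* Each fragment is a complete statement, so fragments can be strung
     * together inside one wrapper call, mirroring how SSE2_INIT_16_ALIGNED,
     * SSE2_YUV_MUL, ... are composed inside SSE2_CALL(). */
    #define DEMO_STEP_LOAD     puts( "load 16 pixels" );
    #define DEMO_STEP_CONVERT  puts( "convert to RGB" );
    #define DEMO_STEP_STORE    puts( "store 16 pixels" );

    /* One place decides how the composed steps are emitted. */
    #define DEMO_CALL(STEPS)   do { STEPS } while(0)

    int main( void )
    {
        DEMO_CALL(
            DEMO_STEP_LOAD
            DEMO_STEP_CONVERT
            DEMO_STEP_STORE
        );
        return 0;
    }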
@@ -416,23 +402,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_15_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_16_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_15_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_15_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -459,20 +434,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width/16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_15_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_16_UNALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_15_UNALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_15_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -487,23 +454,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_15_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_16_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_15_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_15_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -522,11 +478,7 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
         }
     }
     /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-    __asm__ __volatile__ ( "sfence" ::: "memory" );
-#else
-    _mm_sfence();
-#endif
+    SSE2_END;

 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
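SSE2_END is defined later in this commit (in i420_rgb_mmx.h) as an sfence with a memory clobber; the fence guarantees that weakly-ordered or non-temporal SSE2 stores issued by the loop are globally visible before the function returns. A small hedged sketch of the same "stream, then fence" pattern using SSE2 intrinsics; the helper name and buffer handling are invented and this is not the converter itself:

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stddef.h>
    #include <stdint.h>

    /* Fill a 16-byte-aligned buffer with non-temporal stores, then fence so
     * later readers are guaranteed to see the data (assumes p_dst alignment). */
    static void fill_row_nt( uint8_t *p_dst, size_t i_bytes, uint8_t value )
    {
        __m128i v = _mm_set1_epi8( (char)value );
        for( size_t i = 0; i + 16 <= i_bytes; i += 16 )
            _mm_stream_si128( (__m128i *)(p_dst + i), v );  /* bypasses cache */
        _mm_sfence();   /* same role as SSE2_END above */
    }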
@@ -546,22 +498,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 8; i_x--; )
         {
-#if defined (CAN_COMPILE_MMX)
-            __asm__( ".p2align 3"
-                     MMX_INIT_16
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_15
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-            uint64_t tmp64;
-            MMX_INTRINSICS_INIT_16
-            MMX_INTRINSICS_YUV_MUL
-            MMX_INTRINSICS_YUV_ADD
-            MMX_INTRINSICS_UNPACK_15
-#endif
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_15
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -577,24 +519,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
-            __asm__( ".p2align 3"
-                     MMX_INIT_16
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_15
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            {
-                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-                uint64_t tmp64;
-                MMX_INTRINSICS_INIT_16
-                MMX_INTRINSICS_YUV_MUL
-                MMX_INTRINSICS_YUV_ADD
-                MMX_INTRINSICS_UNPACK_15
-            }
-#endif
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_15
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -611,11 +541,7 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
         }
     }
     /* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
-    __asm__ __volatile__ ( "emms" );
-#else
-    _mm_empty();
-#endif
+    MMX_END;
 #endif
 }
@@ -697,20 +623,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width/16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_ALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_16_ALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_16_ALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_16_ALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_ALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_16_ALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -725,23 +643,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_16_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_16_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_16_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_16_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -768,20 +675,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width/16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_16_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_16_UNALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_16_UNALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_16_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -796,23 +695,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            __asm__( ".p2align 3"
-                     SSE2_INIT_16_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_16_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_16_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_16_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_16_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_16_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -831,11 +719,7 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
         }
     /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-    __asm__ __volatile__ ( "sfence" ::: "memory" );
-#else
-    _mm_sfence();
-#endif
+    SSE2_END;

 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
@@ -855,22 +739,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 8; i_x--; )
         {
-#if defined (CAN_COMPILE_MMX)
-            __asm__( ".p2align 3"
-                     MMX_INIT_16
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_16
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-            uint64_t tmp64;
-            MMX_INTRINSICS_INIT_16
-            MMX_INTRINSICS_YUV_MUL
-            MMX_INTRINSICS_YUV_ADD
-            MMX_INTRINSICS_UNPACK_16
-#endif
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_16
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -886,24 +760,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
-            __asm__( ".p2align 3"
-                     MMX_INIT_16
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_16
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            {
-                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-                uint64_t tmp64;
-                MMX_INTRINSICS_INIT_16
-                MMX_INTRINSICS_YUV_MUL
-                MMX_INTRINSICS_YUV_ADD
-                MMX_INTRINSICS_UNPACK_16
-            }
-#endif
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_16
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -920,11 +782,7 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
         }
     }
     /* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
-    __asm__ __volatile__ ( "emms" );
-#else
-    _mm_empty();
-#endif
+    MMX_END;
 #endif
 }
@@ -1118,23 +976,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_ALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_ARGB_ALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 C intrinsics wrappers */
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_32_ALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_ALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_ARGB_ALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -1149,25 +996,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_ARGB_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 intrinsics wrappers */
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_32_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_ARGB_UNALIGNED
+            );
             p_y += 16;
             p_u += 4;
             p_v += 4;
@@ -1194,23 +1028,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_ARGB_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 C intrinsics wrappers */
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_32_UNALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_ARGB_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -1225,25 +1048,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_ARGB_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 intrinsics wrappers */
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_32_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_ARGB_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -1262,11 +1072,7 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
         }
     /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-    __asm__ __volatile__ ( "sfence" ::: "memory" );
-#else
-    _mm_sfence();
-#endif
+    SSE2_END;

 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
@@ -1286,26 +1092,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 8; i_x--; )
         {
-#if defined (CAN_COMPILE_MMX)
-            /* use inline MMX assembly */
-            __asm__( MMX_INIT_32
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-            __asm__( ".p2align 3"
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_32_ARGB
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            /* otherwise use MMX C intrinsics wrappers */
-            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-            uint64_t tmp64;
-            MMX_INTRINSICS_INIT_32
-            MMX_INTRINSICS_YUV_MUL
-            MMX_INTRINSICS_YUV_ADD
-            MMX_INTRINSICS_UNPACK_32_ARGB
-#endif
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ARGB
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -1320,26 +1112,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
-            /* use inline MMX assembly */
-            __asm__( ".p2align 3"
-                     MMX_INIT_32
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_32_ARGB
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            /* otherwise use MMX intrinsics wrappers */
-            {
-                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-                uint64_t tmp64;
-                MMX_INTRINSICS_INIT_32
-                MMX_INTRINSICS_YUV_MUL
-                MMX_INTRINSICS_YUV_ADD
-                MMX_INTRINSICS_UNPACK_32_ARGB
-            }
-#endif
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ARGB
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -1355,12 +1133,9 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
             p_v += i_source_margin_c;
         }
     }
     /* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
-    __asm__ __volatile__ ( "emms" );
-#else
-    _mm_empty();
-#endif
+    MMX_END;
 #endif
 }
@@ -1440,23 +1215,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_ALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_BGRA_ALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 C intrinsics wrappers */
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_32_ALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_ALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_BGRA_ALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -1471,25 +1235,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_BGRA_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 intrinsics wrappers */
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_32_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_BGRA_UNALIGNED
+            );
             p_y += 16;
             p_u += 4;
             p_v += 4;
@@ -1516,23 +1267,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 16; i_x--; )
         {
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_BGRA_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 C intrinsics wrappers */
-            __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-            SSE2_INTRINSICS_INIT_32_UNALIGNED
-            SSE2_INTRINSICS_YUV_MUL
-            SSE2_INTRINSICS_YUV_ADD
-            SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_BGRA_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -1547,25 +1287,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_SSE2)
-            /* use inline SSE2 assembly */
-            __asm__( ".p2align 3"
-                     SSE2_INIT_32_UNALIGNED
-                     SSE2_YUV_MUL
-                     SSE2_YUV_ADD
-                     SSE2_UNPACK_32_BGRA_UNALIGNED
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
-#else
-            /* otherwise use SSE2 intrinsics wrappers */
-            {
-                __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-                SSE2_INTRINSICS_INIT_32_UNALIGNED
-                SSE2_INTRINSICS_YUV_MUL
-                SSE2_INTRINSICS_YUV_ADD
-                SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
-            }
-#endif
+            SSE2_CALL (
+                SSE2_INIT_32_UNALIGNED
+                SSE2_YUV_MUL
+                SSE2_YUV_ADD
+                SSE2_UNPACK_32_BGRA_UNALIGNED
+            );
             p_y += 16;
             p_u += 8;
             p_v += 8;
@@ -1601,26 +1328,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
         for ( i_x = p_vout->render.i_width / 8; i_x--; )
         {
-#if defined (CAN_COMPILE_MMX)
-            /* use inline MMX assembly */
-            __asm__( MMX_INIT_32
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-            __asm__( ".p2align 3"
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_32_ARGB
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            /* otherwise use MMX C intrinsics wrappers */
-            __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-            uint64_t tmp64;
-            MMX_INTRINSICS_INIT_32
-            MMX_INTRINSICS_YUV_MUL
-            MMX_INTRINSICS_YUV_ADD
-            MMX_INTRINSICS_UNPACK_32_BGRA
-#endif
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_BGRA
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
@@ -1635,26 +1348,248 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-#if defined (CAN_COMPILE_MMX)
-            /* use inline MMX assembly */
-            __asm__( ".p2align 3"
-                     MMX_INIT_32
-                     MMX_YUV_MUL
-                     MMX_YUV_ADD
-                     MMX_UNPACK_32_BGRA
-                     : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
-#else
-            /* otherwise use MMX intrinsics wrappers */
-            {
-                __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-                uint64_t tmp64;
-                MMX_INTRINSICS_INIT_32
-                MMX_INTRINSICS_YUV_MUL
-                MMX_INTRINSICS_YUV_ADD
-                MMX_INTRINSICS_UNPACK_32_BGRA
-            }
-#endif
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_BGRA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
+void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src,
+                                            picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    vlc_bool_t   b_hscale;                        /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
+    uint32_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
+    int *       p_offset;
+
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_vout->render.i_width, p_vout->render.i_height,
+               p_vout->output.i_width, p_vout->output.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_vout->output.i_height : p_vout->render.i_height;
+
+#if defined (MODULE_NAME_IS_i420_rgb_sse2)
+
+    if( p_vout->render.i_width & 15 )
+    {
+        i_rewind = 16 - ( p_vout->render.i_width & 15 );
+    }
+    else
+    {
+        i_rewind = 0;
+    }
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+                    ((int)p_y)|((int)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_vout->render.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 4;
+                p_v += 4;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_vout->render.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+#else
+
+    if( p_vout->render.i_width & 7 )
+    {
+        i_rewind = 8 - ( p_vout->render.i_width & 7 );
+    }
+    else
+    {
+        i_rewind = 0;
+    }
+
+    for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_vout->render.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ABGR
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ABGR
+            );
             p_y += 8;
             p_u += 4;
             p_v += 4;
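A note on the new function's fast path: it only takes the aligned (movdqa-based) SSE2 route when the source Y pitch, the destination pitch, and both working pointers are all 16-byte aligned. A small sketch of that same test; the helper name and the sample values are invented, and uintptr_t is used in place of the (int) casts, which is a safer choice on 64-bit targets:

    #include <stdint.h>
    #include <stdio.h>

    static int use_aligned_sse2_path( int i_src_pitch, int i_dst_pitch,
                                      const uint8_t *p_y, const uint8_t *p_buffer )
    {
        return 0 == ( 15 & ( (unsigned)i_src_pitch | (unsigned)i_dst_pitch |
                             (uintptr_t)p_y | (uintptr_t)p_buffer ) );
    }

    int main( void )
    {
        /* aligned(16) is a GCC attribute, matching the toolchain this code targets */
        static uint8_t aligned_buf[64] __attribute__((aligned(16)));
        printf( "%d\n", use_aligned_sse2_path( 1920, 7680,
                                               aligned_buf, aligned_buf + 16 ) );
        return 0;   /* prints 1: every term is a multiple of 16 */
    }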
@@ -1670,12 +1605,9 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
             p_v += i_source_margin_c;
         }
     }
     /* re-enable FPU registers */
-#if defined (CAN_COMPILE_MMX)
-    __asm__ __volatile__ ( "emms" );
-#else
-    _mm_empty();
-#endif
+    MMX_END;
 #endif
 }
modules/video_chroma/i420_rgb_mmx.h
@@ -24,6 +24,8 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef MODULE_NAME_IS_i420_rgb_mmx

/* hope these constant values are cache line aligned */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_U64(foo) \
@@ -46,6 +48,22 @@ USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL;
USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL;
#undef USED_U64
#if defined(CAN_COMPILE_MMX)
/* MMX assembly */
#define MMX_CALL(MMX_INSTRUCTIONS) \
do { \
__asm__ __volatile__( \
".p2align 3 \n\t" \
MMX_INSTRUCTIONS \
: \
: "r" (p_y), "r" (p_u), \
"r" (p_v), "r" (p_buffer) ); \
} while(0)
#define MMX_END __asm__ __volatile__ ( "emms" )
/* Use RIP-relative code in PIC mode on amd64 */
#if defined(__x86_64__) && defined(__PIC__)
# define G "(%%rip)"
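With MMX_CALL()/MMX_END defined here once, a conversion loop only has to string the fragment macros together, as the rewritten loops in i420_rgb16.c do. Below is a self-contained miniature of the same wrapping idea for x86 GCC/Clang; the fragment, macro names, and operands are demo-only inventions, and, like MMX_CALL, the wrapper refers to variables it expects to find in the enclosing scope:

    #include <stdio.h>

    /* A fragment is just a piece of the instruction string. */
    #define DEMO_ADD_FRAGMENT  "addl %1, %0 \n\t"

    /* The wrapper supplies the asm statement and the operand bindings. */
    #define DEMO_CALL(INSTRUCTIONS)              \
        do {                                     \
            __asm__ __volatile__(                \
                INSTRUCTIONS                     \
                : "+r" (i_acc)                   \
                : "r" (i_step) );                \
        } while(0)

    int main( void )
    {
        int i_acc = 40, i_step = 2;
        DEMO_CALL( DEMO_ADD_FRAGMENT );   /* i_acc += i_step */
        printf( "%d\n", i_acc );          /* prints 42 */
        return 0;
    }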
@@ -60,42 +78,6 @@ pxor %%mm4, %%mm4 # zero mm4 \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_16_ALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_16_UNALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
"
#define MMX_INTRINSICS_INIT_16 \
tmp64 = *(uint32_t *)p_u; \
mm0 = (__m64)tmp64; \
tmp64 = *(uint32_t *)p_v; \
mm1 = (__m64)tmp64; \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y; \
#define SSE2_INTRINSICS_INIT_16_ALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_load_si128((__m128i *)p_y); \
#define SSE2_INTRINSICS_INIT_16_UNALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_loadu_si128((__m128i *)p_y); \
_mm_prefetch(p_buffer, _MM_HINT_NTA); \
#define MMX_INIT_16_GRAY " \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
#movl $0, (%3) # cache preload for image \n\
@@ -109,43 +91,6 @@ pxor %%mm4, %%mm4 # zero mm4 \n\
movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_32_ALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_32_UNALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
"
#define MMX_INTRINSICS_INIT_32 \
tmp64 = *(uint32_t *)p_u; \
mm0 = (__m64)tmp64; \
*(uint16_t *)p_buffer = 0; \
tmp64 = *(uint32_t *)p_v; \
mm1 = (__m64)tmp64; \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y;
#define SSE2_INTRINSICS_INIT_32_ALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_load_si128((__m128i *)p_y); \
#define SSE2_INTRINSICS_INIT_32_UNALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_loadu_si128((__m128i *)p_y); \
_mm_prefetch(p_buffer, _MM_HINT_NTA); \
/*
 * Do the multiply part of the conversion for even and odd pixels,
 * register usage:
@@ -181,113 +126,6 @@ pmulhw mmx_Y_coeff"G", %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\
pmulhw mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
"
#define SSE2_YUV_MUL " \n\
# convert the chroma part \n\
punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
movl $0x00800080, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\
psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\
psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\
psllw $3, %%xmm0 # Promote precision \n\
psllw $3, %%xmm1 # Promote precision \n\
movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
movl $0xf37df37d, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\
pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\
movl $0xe5fce5fc, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\
pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\
movl $0x40934093, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\
pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
movl $0x33123312, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\
pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\
\n\
# convert the luma part \n\
movl $0x10101010, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\
psubusb %%xmm5, %%xmm6 # Y -= 16 \n\
movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movl $0x00ff00ff, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\
pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
psllw $3, %%xmm6 # Promote precision \n\
psllw $3, %%xmm7 # Promote precision \n\
movl $0x253f253f, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 253f 253f \n\
pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\
pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
"
#define MMX_INTRINSICS_YUV_MUL \
mm0 = _mm_unpacklo_pi8(mm0, mm4); \
mm1 = _mm_unpacklo_pi8(mm1, mm4); \
mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
mm0 = _mm_slli_pi16(mm0, 3); \
mm1 = _mm_slli_pi16(mm1, 3); \
mm2 = mm0; \
mm3 = mm1; \
mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
mm2 = _mm_adds_pi16(mm2, mm3); \
\
mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
mm7 = mm6; \
mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
mm7 = _mm_srli_pi16(mm7, 8); \
mm6 = _mm_slli_pi16(mm6, 3); \
mm7 = _mm_slli_pi16(mm7, 3); \
mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
#define SSE2_INTRINSICS_YUV_MUL \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm5 = _mm_set1_epi32(0x00800080UL); \
xmm0 = _mm_subs_epi16(xmm0, xmm5); \
xmm1 = _mm_subs_epi16(xmm1, xmm5); \
xmm0 = _mm_slli_epi16(xmm0, 3); \
xmm1 = _mm_slli_epi16(xmm1, 3); \
xmm2 = xmm0; \
xmm3 = xmm1; \
xmm5 = _mm_set1_epi32(0xf37df37dUL); \
xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \
xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \
xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \
xmm5 = _mm_set1_epi32(0x40934093UL); \
xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \
xmm5 = _mm_set1_epi32(0x33123312UL); \
xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \
xmm2 = _mm_adds_epi16(xmm2, xmm3); \
\
xmm5 = _mm_set1_epi32(0x10101010UL); \
xmm6 = _mm_subs_epu8(xmm6, xmm5); \
xmm7 = xmm6; \
xmm5 = _mm_set1_epi32(0x00ff00ffUL); \
xmm6 = _mm_and_si128(xmm6, xmm5); \
xmm7 = _mm_srli_epi16(xmm7, 8); \
xmm6 = _mm_slli_epi16(xmm6, 3); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_set1_epi32(0x253f253fUL); \
xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \
xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
/*
 * Do the addition part of the conversion for even and odd pixels,
 * register usage:
@@ -324,80 +162,6 @@ punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
#define SSE2_YUV_ADD " \n\
# Do horizontal and vertical scaling \n\
movdqa %%xmm0, %%xmm3 # Copy Cblue \n\
movdqa %%xmm1, %%xmm4 # Copy Cred \n\
movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\
paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
\n\
# Limit RGB even to 0..255 \n\
packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
\n\
# Limit RGB odd to 0..255 \n\
packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
\n\
# Interleave RGB even and odd \n\
punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
#define MMX_INTRINSICS_YUV_ADD \
mm3 = mm0; \
mm4 = mm1; \
mm5 = mm2; \
mm0 = _mm_adds_pi16(mm0, mm6); \
mm3 = _mm_adds_pi16(mm3, mm7); \
mm1 = _mm_adds_pi16(mm1, mm6); \
mm4 = _mm_adds_pi16(mm4, mm7); \
mm2 = _mm_adds_pi16(mm2, mm6); \
mm5 = _mm_adds_pi16(mm5, mm7); \
\
mm0 = _mm_packs_pu16(mm0, mm0); \
mm1 = _mm_packs_pu16(mm1, mm1); \
mm2 = _mm_packs_pu16(mm2, mm2); \
\
mm3 = _mm_packs_pu16(mm3, mm3); \
mm4 = _mm_packs_pu16(mm4, mm4); \
mm5 = _mm_packs_pu16(mm5, mm5); \
\
mm0 = _mm_unpacklo_pi8(mm0, mm3); \
mm1 = _mm_unpacklo_pi8(mm1, mm4); \
mm2 = _mm_unpacklo_pi8(mm2, mm5);
#define SSE2_INTRINSICS_YUV_ADD \
xmm3 = xmm0; \
xmm4 = xmm1; \
xmm5 = xmm2; \
xmm0 = _mm_adds_epi16(xmm0, xmm6); \
xmm3 = _mm_adds_epi16(xmm3, xmm7); \
xmm1 = _mm_adds_epi16(xmm1, xmm6); \
xmm4 = _mm_adds_epi16(xmm4, xmm7); \
xmm2 = _mm_adds_epi16(xmm2, xmm6); \
xmm5 = _mm_adds_epi16(xmm5, xmm7); \
\
xmm0 = _mm_packus_epi16(xmm0, xmm0); \
xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2); \
\
xmm3 = _mm_packus_epi16(xmm3, xmm3); \
xmm4 = _mm_packus_epi16(xmm4, xmm4); \
xmm5 = _mm_packus_epi16(xmm5, xmm5); \
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
/*
 * Grayscale case, only use Y
 */
@@ -476,19 +240,414 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
movq %%mm5, 8(%3) # store pixel 4-7 \n\
"
#define SSE2_UNPACK_15_ALIGNED "                                            \n\
# mask unneeded bits off                                                    \n\
movl      $0xf8f8f8f8, %%eax       #                                        \n\
movd      %%eax, %%xmm5            #                                        \n\
pshufd    $0, %%xmm5, %%xmm5       # set xmm5 to f8f8 f8f8 ... f8f8 f8f8    \n\
pand      %%xmm5, %%xmm0           # b7b6b5b4 b3______ b7b6b5b4 b3______    \n\
psrlw     $3,%%xmm0                # ______b7 b6b5b4b3 ______b7 b6b5b4b3    \n\
pand      %%xmm5, %%xmm2           # g7g6g5g4 g3______ g7g6g5g4 g3______    \n\
pand      %%xmm5, %%xmm1           # r7r6r5r4 r3______ r7r6r5r4 r3______    \n\
psrlw     $1,%%xmm1                # __r7r6r5 r4r3____ __r7r6r5 r4r3____    \n\
pxor      %%xmm4, %%xmm4           # zero mm4                               \n\
movdqa    %%xmm0, %%xmm5           # Copy B15-B0                            \n\
movdqa    %%xmm2, %%xmm7           # Copy G15-G0                            \n\

/*
 * convert RGB plane to RGB 16 bits,
 * mm0 -> B, mm1 -> R, mm2 -> G,
 * mm4 -> GB, mm5 -> AR pixel 4-7,
 * mm6 -> GB, mm7 -> AR pixel 0-3
 */
#define MMX_UNPACK_16 "                                                     \n\
# mask unneeded bits off                                                    \n\
pand      mmx_mask_f8"G", %%mm0    # b7b6b5b4 b3______ b7b6b5b4 b3______    \n\
pand      mmx_mask_fc"G", %%mm2    # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____    \n\
pand      mmx_mask_f8"G", %%mm1    # r7r6r5r4 r3______ r7r6r5r4 r3______    \n\
psrlw     $3,%%mm0                 # ______b7 b6b5b4b3 ______b7 b6b5b4b3    \n\
pxor      %%mm4, %%mm4             # zero mm4                               \n\
movq      %%mm0, %%mm5             # Copy B7-B0                             \n\
movq      %%mm2, %%mm7             # Copy G7-G0                             \n\
                                                                            \n\
# convert rgb24 plane to rgb16 pack for pixel 0-3                           \n\
punpcklbw %%mm4, %%mm2             # ________ ________ g7g6g5g4 g3g2____    \n\
punpcklbw %%mm1, %%mm0             # r7r6r5r4 r3______ ______b7 b6b5b4b3    \n\
psllw     $3,%%mm2                 # ________ __g7g6g5 g4g3g2__ ________    \n\
por       %%mm2, %%mm0             # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3    \n\
movq      8(%0), %%mm6             # Load 8 Y  Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0      \n\
movq      %%mm0, (%3)              # store pixel 0-3                        \n\
                                                                            \n\
# convert rgb24 plane to rgb16 pack for pixel 0-3                           \n\
punpckhbw %%mm4, %%mm7             # ________ ________ g7g6g5g4 g3g2____    \n\
punpckhbw %%mm1, %%mm5             # r7r6r5r4 r3______ ______b7 b6b5b4b3    \n\
psllw     $3,%%mm7                 # ________ __g7g6g5 g4g3g2__ ________    \n\
movd      4(%1), %%mm0             # Load 4 Cb __ __ __ __ u3 u2 u1 u0      \n\
por       %%mm7, %%mm5             # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3    \n\
movd      4(%2), %%mm1             # Load 4 Cr __ __ __ __ v3 v2 v1 v0      \n\
movq      %%mm5, 8(%3)             # store pixel 4-7                        \n\
"
/*
* convert RGB plane to RGB packed format,
* mm0 -> B, mm1 -> R, mm2 -> G
*/
#define MMX_UNPACK_32_ARGB " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\
movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\
punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\
punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\
movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\
punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
"
#define MMX_UNPACK_32_BGRA " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\
movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\
punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\
pxor %%mm6, %%mm6 # zero mm6 \n\
punpckhbw %%mm0, %%mm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
punpckhbw %%mm1, %%mm6 # R7 00 R6 00 R5 00 R4 00 \n\
movq %%mm6, %%mm0 # R7 00 R6 00 R5 00 R4 00 \n\
punpcklwd %%mm2, %%mm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
movq %%mm6, 16(%3) # Store BGRA5 BGRA4 \n\
punpckhwd %%mm2, %%mm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
movq %%mm0, 24(%3) # Store BGRA7 BGRA6 \n\
"
#define MMX_UNPACK_32_ABGR " \n\
pxor %%mm3, %%mm3 # zero mm3 \n\
movq %%mm1, %%mm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%mm2, %%mm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
movq %%mm0, %%mm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%mm3, %%mm5 # 00 B3 00 B2 00 B1 00 B0 \n\
movq %%mm4, %%mm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
punpcklwd %%mm5, %%mm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
movq %%mm4, (%3) # Store ABGR1 ABGR0 \n\
punpckhwd %%mm5, %%mm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
movq %%mm6, 8(%3) # Store ABGR3 ABGR2 \n\
punpckhbw %%mm2, %%mm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
punpckhbw %%mm3, %%mm0 # 00 B7 00 B6 00 B5 00 B4 \n\
movq %%mm1, %%mm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
punpcklwd %%mm0, %%mm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
movq %%mm1, 16(%3) # Store ABGR5 ABGR4 \n\
punpckhwd %%mm0, %%mm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
"
#elif defined(HAVE_MMX_INTRINSICS)
/* MMX intrinsics */
#include <mmintrin.h>
#define MMX_CALL(MMX_INSTRUCTIONS) \
do { \
__m64 mm0, mm1, mm2, mm3, \
mm4, mm5, mm6, mm7; \
MMX_INSTRUCTIONS \
} while(0)
#define MMX_END _mm_empty()
#define MMX_INIT_16 \
mm0 = _mm_cvtsi32_si64((int)*p_u); \
mm1 = _mm_cvtsi32_si64((int)*p_v); \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y
#define MMX_INIT_32 \
mm0 = _mm_cvtsi32_si64((int)*p_u); \
*(uint16_t *)p_buffer = 0; \
mm1 = _mm_cvtsi32_si64((int)*p_v); \
mm4 = _mm_setzero_si64(); \
mm6 = (__m64)*(uint64_t *)p_y;
#define MMX_YUV_MUL \
mm0 = _mm_unpacklo_pi8(mm0, mm4); \
mm1 = _mm_unpacklo_pi8(mm1, mm4); \
mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
mm0 = _mm_slli_pi16(mm0, 3); \
mm1 = _mm_slli_pi16(mm1, 3); \
mm2 = mm0; \
mm3 = mm1; \
mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
mm2 = _mm_adds_pi16(mm2, mm3); \
\
mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
mm7 = mm6; \
mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
mm7 = _mm_srli_pi16(mm7, 8); \
mm6 = _mm_slli_pi16(mm6, 3); \
mm7 = _mm_slli_pi16(mm7, 3); \
mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
#define MMX_YUV_ADD \
mm3 = mm0; \
mm4 = mm1; \
mm5 = mm2; \
mm0 = _mm_adds_pi16(mm0, mm6); \
mm3 = _mm_adds_pi16(mm3, mm7); \
mm1 = _mm_adds_pi16(mm1, mm6); \
mm4 = _mm_adds_pi16(mm4, mm7); \
mm2 = _mm_adds_pi16(mm2, mm6); \
mm5 = _mm_adds_pi16(mm5, mm7); \
\
mm0 = _mm_packs_pu16(mm0, mm0); \
mm1 = _mm_packs_pu16(mm1, mm1); \
mm2 = _mm_packs_pu16(mm2, mm2); \
\
mm3 = _mm_packs_pu16(mm3, mm3); \
mm4 = _mm_packs_pu16(mm4, mm4); \
mm5 = _mm_packs_pu16(mm5, mm5); \
\
mm0 = _mm_unpacklo_pi8(mm0, mm3); \
mm1 = _mm_unpacklo_pi8(mm1, mm4); \
mm2 = _mm_unpacklo_pi8(mm2, mm5);
#define MMX_UNPACK_15 \
mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
mm0 = _mm_srli_pi16(mm0, 3); \
mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \
mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
mm1 = _mm_srli_pi16(mm1, 1); \
mm4 = _mm_setzero_si64(); \
mm5 = mm0; \
mm7 = mm2; \
\
mm2 = _mm_unpacklo_pi8(mm2, mm4); \
mm0 = _mm_unpacklo_pi8(mm0, mm1); \
mm2 = _mm_slli_pi16(mm2, 2); \
mm0 = _mm_or_si64(mm0, mm2); \
mm6 = (__m64)*(uint64_t *)(p_y + 8); \
*(uint64_t *)p_buffer = (uint64_t)mm0; \
\
mm7 = _mm_unpackhi_pi8(mm7, mm4); \
mm5 = _mm_unpackhi_pi8(mm5, mm1); \
mm7 = _mm_slli_pi16(mm7, 2); \
mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
mm5 = _mm_or_si64(mm5, mm7); \
mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
#define MMX_UNPACK_16 \
mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \
mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
mm0 = _mm_srli_pi16(mm0, 3); \
mm4 = _mm_setzero_si64(); \
mm5 = mm0; \
mm7 = mm2; \
\
mm2 = _mm_unpacklo_pi8(mm2, mm4); \
mm0 = _mm_unpacklo_pi8(mm0, mm1); \
mm2 = _mm_slli_pi16(mm2, 3); \
mm0 = _mm_or_si64(mm0, mm2); \
mm6 = (__m64)*(uint64_t *)(p_y + 8); \
*(uint64_t *)p_buffer = (uint64_t)mm0; \
\
mm7 = _mm_unpackhi_pi8(mm7, mm4); \
mm5 = _mm_unpackhi_pi8(mm5, mm1); \
mm7 = _mm_slli_pi16(mm7, 3); \
mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
mm5 = _mm_or_si64(mm5, mm7); \
mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
#define MMX_UNPACK_32_ARGB \
mm3 = _mm_setzero_si64(); \
mm4 = mm0; \
mm4 = _mm_unpacklo_pi8(mm4, mm2); \
mm5 = mm1; \
mm5 = _mm_unpacklo_pi8(mm5, mm3); \
mm6 = mm4; \
mm4 = _mm_unpacklo_pi16(mm4, mm5); \
*(uint64_t *)p_buffer = (uint64_t)mm4; \
mm6 = _mm_unpackhi_pi16(mm6, mm5); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
mm0 = _mm_unpackhi_pi8(mm0, mm2); \
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
mm5 = mm0; \
mm5 = _mm_unpacklo_pi16(mm5, mm1); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;\
mm0 = _mm_unpackhi_pi16(mm0, mm1); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_BGRA \
mm3 = _mm_setzero_si64(); \
mm4 = mm2; \
mm4 = _mm_unpacklo_pi8(mm4, mm0); \
mm3 = _mm_unpacklo_pi8(mm3, mm1); \
mm5 = mm3; \
mm3 = _mm_unpacklo_pi16(mm3, mm4); \
*(uint64_t *)p_buffer = (uint64_t)mm3; \
mm5 = _mm_unpackhi_pi16(mm5, mm4); \
*(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
mm6 = _mm_setzero_si64(); \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
mm6 = _mm_unpackhi_pi8(mm6, mm1); \
mm0 = mm6; \
mm6 = _mm_unpacklo_pi16(mm6, mm2); \
*(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
mm0 = _mm_unpackhi_pi16(mm0, mm2); \
*(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
#define MMX_UNPACK_32_ABGR \
;
#endif
#elif defined( MODULE_NAME_IS_i420_rgb_sse2 )
#if defined(CAN_COMPILE_SSE2)
/* SSE2 assembly */
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
__asm__ __volatile__( \
".p2align 3 \n\t" \
SSE2_INSTRUCTIONS \
: \
: "r" (p_y), "r" (p_u), \
"r" (p_v), "r" (p_buffer) \
: "eax" ); \
} while(0)
#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
#define SSE2_INIT_16_ALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_16_UNALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
"
#define SSE2_INIT_32_ALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"
#define SSE2_INIT_32_UNALIGNED " \n\
movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
"
#define SSE2_YUV_MUL " \n\
# convert the chroma part \n\
punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
movl $0x00800080, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\
psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\
psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\
psllw $3, %%xmm0 # Promote precision \n\
psllw $3, %%xmm1 # Promote precision \n\
movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
movl $0xf37df37d, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\
pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\
movl $0xe5fce5fc, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\
pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\
movl $0x40934093, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\
pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
movl $0x33123312, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\
pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\
\n\
# convert the luma part \n\
movl $0x10101010, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\
psubusb %%xmm5, %%xmm6 # Y -= 16 \n\
movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movl $0x00ff00ff, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\
pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
psllw $3, %%xmm6 # Promote precision \n\
psllw $3, %%xmm7 # Promote precision \n\
movl $0x253f253f, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 253f 253f \n\
pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\
pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
"
#define SSE2_YUV_ADD " \n\
# Do horizontal and vertical scaling \n\
movdqa %%xmm0, %%xmm3 # Copy Cblue \n\
movdqa %%xmm1, %%xmm4 # Copy Cred \n\
movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\
paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
\n\
# Limit RGB even to 0..255 \n\
packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
\n\
# Limit RGB odd to 0..255 \n\
packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
\n\
# Interleave RGB even and odd \n\
punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
"
#define SSE2_UNPACK_15_ALIGNED " \n\
# mask unneeded bits off \n\
movl $0xf8f8f8f8, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
\n\
# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
...
@@ -534,116 +693,38 @@ por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
"
#define SSE2_UNPACK_16_ALIGNED " \n\
# mask unneeded bits off \n\
movl $0xf8f8f8f8, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
movl $0xfcfcfcfc, %%eax # \n\
movd %%eax, %%xmm5 # \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to fcfc fcfc ... fcfc fcfc \n\
pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movntdq %%xmm0, (%3) # store pixel 0-7 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movntdq %%xmm5, 16(%3) # store pixel 8-15 \n\
"
#define SSE2_UNPACK_16_UNALIGNED " \n\
# mask unneeded bits off \n\
movl $0xf8f8f8f8, %%eax # \n\
movd %%eax, %%xmm5 # \n\
...
@@ -664,75 +745,279 @@ punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movdqu %%xmm0, (%3) # store pixel 0-7 \n\
\n\
# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
movdqu %%xmm5, 16(%3) # store pixel 8-15 \n\
"
#define SSE2_UNPACK_32_ARGB_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero xmm3 \n\
movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
movntdq %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
movntdq %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
movntdq %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movntdq %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
#define SSE2_UNPACK_32_ARGB_UNALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero xmm3 \n\
movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
movdqu %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
movdqu %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
movdqu %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
"
#define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
movntdq %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
movntdq %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
movntdq %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
"
#define SSE2_UNPACK_32_BGRA_UNALIGNED " \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
movdqu %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
movdqu %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
pxor %%xmm6, %%xmm6 # zero mm6 \n\
punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
movdqu %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
movdqu %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
"
"
#define SSE2_UNPACK_16_UNALIGNED " \n\
#define SSE2_UNPACK_32_ABGR_ALIGNED " \n\
# mask unneeded bits off \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
movl $0xf8f8f8f8, %%eax # \n\
movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
movd %%eax, %%xmm5 # \n\
punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
movl $0xfcfcfcfc, %%eax # \n\
punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
movd %%eax, %%xmm5 # \n\
movntdq %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
movntdq %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
pxor %%xmm4, %%xmm4 # zero mm4 \n\
punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
\n\
movntdq %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
punpckhwd %%xmm0, %%xmm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
movntdq %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
"
psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
#define SSE2_UNPACK_32_ABGR_UNALIGNED " \n\
movdqu %%xmm0, (%3) # store pixel 0-7 \n\
pxor %%xmm3, %%xmm3 # zero mm3 \n\
\n\
movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
movdqu %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
"
punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
movdqu %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
movdqa %%xmm1, %%xmm2 # R7 00 R6 00 R5 00 R4 00 \n\
punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
movdqu %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
punpckhwd %%xmm0, %%xmm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
"
#elif defined(HAVE_SSE2_INTRINSICS)
/* SSE2 intrinsics */
#include <emmintrin.h>
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
__m128i xmm0, xmm1, xmm2, xmm3, \
xmm4, xmm5, xmm6, xmm7; \
SSE2_INSTRUCTIONS \
} while(0)
#define SSE2_END _mm_sfence()
#define SSE2_INIT_16_ALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_load_si128((__m128i *)p_y);
#define SSE2_INIT_16_UNALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_loadu_si128((__m128i *)p_y); \
_mm_prefetch(p_buffer, _MM_HINT_NTA);
#define SSE2_INIT_32_ALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_load_si128((__m128i *)p_y);
#define SSE2_INIT_32_UNALIGNED \
xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm4 = _mm_setzero_si128(); \
xmm6 = _mm_loadu_si128((__m128i *)p_y); \
_mm_prefetch(p_buffer, _MM_HINT_NTA);
#define SSE2_YUV_MUL \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm5 = _mm_set1_epi32(0x00800080UL); \
xmm0 = _mm_subs_epi16(xmm0, xmm5); \
xmm1 = _mm_subs_epi16(xmm1, xmm5); \
xmm0 = _mm_slli_epi16(xmm0, 3); \
xmm1 = _mm_slli_epi16(xmm1, 3); \
xmm2 = xmm0; \
xmm3 = xmm1; \
xmm5 = _mm_set1_epi32(0xf37df37dUL); \
xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \
xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \
xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \
xmm5 = _mm_set1_epi32(0x40934093UL); \
xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \
xmm5 = _mm_set1_epi32(0x33123312UL); \
xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \
xmm2 = _mm_adds_epi16(xmm2, xmm3); \
\
xmm5 = _mm_set1_epi32(0x10101010UL); \
xmm6 = _mm_subs_epu8(xmm6, xmm5); \
xmm7 = xmm6; \
xmm5 = _mm_set1_epi32(0x00ff00ffUL); \
xmm6 = _mm_and_si128(xmm6, xmm5); \
xmm7 = _mm_srli_epi16(xmm7, 8); \
xmm6 = _mm_slli_epi16(xmm6, 3); \
xmm7 = _mm_slli_epi16(xmm7, 3); \
xmm5 = _mm_set1_epi32(0x253f253fUL); \
xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \
xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
#define SSE2_YUV_ADD \
xmm3 = xmm0; \
xmm4 = xmm1; \
xmm5 = xmm2; \
xmm0 = _mm_adds_epi16(xmm0, xmm6); \
xmm3 = _mm_adds_epi16(xmm3, xmm7); \
xmm1 = _mm_adds_epi16(xmm1, xmm6); \
xmm4 = _mm_adds_epi16(xmm4, xmm7); \
xmm2 = _mm_adds_epi16(xmm2, xmm6); \
xmm5 = _mm_adds_epi16(xmm5, xmm7); \
\
xmm0 = _mm_packus_epi16(xmm0, xmm0); \
xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2); \
\
xmm3 = _mm_packus_epi16(xmm3, xmm3); \
xmm4 = _mm_packus_epi16(xmm4, xmm4); \
xmm5 = _mm_packus_epi16(xmm5, xmm5); \
\
xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
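A quick, self-contained check of what the multiply constants used by SSE2_YUV_MUL (in both the assembly and intrinsics forms above) correspond to: the inputs are promoted with a left shift by 3 and pmulhw keeps the high 16 bits, so each constant acts as a Q13 coefficient (value / 8192) of the usual BT.601 conversion. This is an editor's sketch, not part of the header.

#include <stdio.h>

int main(void)
{
    const short c_y  = 0x253f;        /* Y  scale,  ~ 1.164 */
    const short c_rv = 0x3312;        /* Cr -> R,   ~ 1.596 */
    const short c_bu = 0x4093;        /* Cb -> B,   ~ 2.018 */
    const short c_gu = (short)0xf37d; /* Cb -> G,   ~-0.391 */
    const short c_gv = (short)0xe5fc; /* Cr -> G,   ~-0.813 */

    printf("%.3f %.3f %.3f %.3f %.3f\n",
           c_y / 8192.0, c_rv / 8192.0, c_bu / 8192.0,
           c_gu / 8192.0, c_gv / 8192.0);
    return 0;
}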
#define SSE2_UNPACK_15_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_stream_si128((__m128i*)p_buffer, xmm0); \
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_15_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm0 = _mm_srli_epi16(xmm0, 3); \
xmm2 = _mm_and_si128(xmm2, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
xmm1 = _mm_srli_epi16(xmm1, 1); \
xmm4 = _mm_setzero_si128(); \
xmm5 = xmm0; \
xmm7 = xmm2; \
\
xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
xmm2 = _mm_slli_epi16(xmm2, 2); \
xmm0 = _mm_or_si128(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)p_buffer, xmm0); \
\
xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \
xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \
xmm7 = _mm_slli_epi16(xmm7, 2); \
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_16_ALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
...
@@ -755,7 +1040,7 @@ movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_16_UNALIGNED \
xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
xmm0 = _mm_and_si128(xmm0, xmm5); \
xmm1 = _mm_and_si128(xmm1, xmm5); \
...
@@ -778,91 +1063,7 @@ movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
xmm5 = _mm_or_si128(xmm5, xmm7); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
#define SSE2_UNPACK_32_ARGB_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
...
@@ -881,7 +1082,7 @@ movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ARGB_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm0; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \
...
@@ -900,126 +1101,50 @@ movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_BGRA_ALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_BGRA_UNALIGNED \
xmm3 = _mm_setzero_si128(); \
xmm4 = xmm2; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \
xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
xmm5 = xmm3; \
xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer), xmm3); \
xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
_mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \
xmm6 = _mm_setzero_si128(); \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \
xmm0 = xmm6; \
xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \
xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \
_mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
#define SSE2_UNPACK_32_ABGR_ALIGNED \
;
#define SSE2_UNPACK_32_ABGR_UNALIGNED \
;
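/* As in the MMX intrinsics branch, the ABGR unpack is a placeholder here:
 * only the SSE2 assembly path above provides a real SSE2_UNPACK_32_ABGR_*,
 * the intrinsics definitions are left empty so callers still compile. */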
#endif
#endif
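For context, a minimal sketch of how the macros in this header compose per 16-pixel group. This is an assumed caller shape; the real loop in i420_rgb16.c adds scaling, margin handling and the aligned/unaligned dispatch.

#include <stdint.h>

/* Assumed caller shape (sketch): convert 16 I420 pixels to 32-bit RGB.
 * Works with both the assembly and the intrinsics definitions of the
 * macros, since SSE2_CALL() wraps either form. */
static inline void yuv420_to_rgb32_16px( uint8_t *p_y, uint8_t *p_u,
                                         uint8_t *p_v, uint32_t *p_buffer )
{
    SSE2_CALL(
        SSE2_INIT_32_UNALIGNED
        SSE2_YUV_MUL
        SSE2_YUV_ADD
        SSE2_UNPACK_32_ARGB_UNALIGNED
    );
    /* once the whole picture is done, SSE2_END flushes the
     * non-temporal stores */
}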
modules/video_chroma/i420_yuy2.c
...
@@ -307,7 +307,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif
#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
...
@@ -348,8 +348,6 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
...
@@ -379,6 +377,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
...
@@ -518,7 +518,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif
#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
...
@@ -558,8 +558,6 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
...
@@ -589,6 +587,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
...
@@ -727,7 +727,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif
#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
...
@@ -767,8 +767,6 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
...
@@ -798,6 +796,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
...
@@ -871,7 +871,7 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif
#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
...
@@ -907,8 +907,6 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
...
@@ -938,6 +936,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
#endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
...
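The converters above choose between the *_ALIGNED (movntdq / _mm_stream) and *_UNALIGNED (movdqu / _mm_storeu) variants at run time. A hedged sketch of the kind of test this presumably relies on; the exact expression in i420_yuy2.c may differ:

#include <stdint.h>

/* Assumed alignment test (sketch): the fast path needs every pointer and
 * pitch involved to be a multiple of 16 bytes. */
static inline int can_use_aligned_stores( const uint8_t *p_y,
                                          const uint8_t *p_line,
                                          int i_source_pitch,
                                          int i_dest_pitch )
{
    return 0 == ( 15 & ( i_source_pitch | i_dest_pitch |
                         (int)((intptr_t)p_y) | (int)((intptr_t)p_line) ) );
}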
modules/video_chroma/i420_yuy2.h
...
@@ -24,17 +24,26 @@
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
#if defined(CAN_COMPILE_MMX)
/* MMX assembly */
#define MMX_CALL(MMX_INSTRUCTIONS) \
do { \
__asm__ __volatile__( \
".p2align 3 \n\t" \
MMX_INSTRUCTIONS \
: \
: "r" (p_line1), "r" (p_line2), \
"r" (p_y1), "r" (p_y2), \
"r" (p_u), "r" (p_v) ); \
p_line1 += 16; p_line2 += 16; \
p_y1 += 8; p_y2 += 8; \
p_u += 4; p_v += 4; \
} while(0)
#define MMX_END __asm__ __volatile__ ( "emms" )
#define MMX_YUV420_YUYV " \n\
movd (%4), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movd (%5), %%mm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
...
@@ -111,10 +120,82 @@ packuswb %%mm1, %%mm1 # pack Y Y6 Y4 Y2 Y0 Y6 Y4 Y2 Y0 \n\
punpcklbw %%mm2, %%mm1 # v2 Y6 u2 Y4 v0 Y2 u0 Y0 \n\
movq %%mm1, (%1) # Store YUYV \n\
"
#elif defined(HAVE_MMX_INTRINSICS)
/* MMX intrinsics */
#include <mmintrin.h>
#define MMX_CALL(MMX_INSTRUCTIONS) \
do { \
__m64 mm0, mm1, mm2, mm3, mm4; \
MMX_INSTRUCTIONS \
p_line1 += 16; p_line2 += 16; \
p_y1 += 8; p_y2 += 8; \
p_u += 4; p_v += 4; \
} while(0)
#define MMX_END _mm_empty()
#define MMX_YUV420_YUYV \
mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)p_u); \
mm2 = _mm_cvtsi32_si64((int)*(uint32_t *)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uint64_t *)p_line1 = (uint64_t)mm2; \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uint64_t *)(p_line1 + 8) = (uint64_t)mm0; \
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uint64_t *)p_line2 = (uint64_t)mm4; \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uint64_t *)(p_line2 + 8) = (uint64_t)mm3;
#define MMX_YUV420_YVYU \
mm2 = _mm_cvtsi32_si64((int)*(uint32_t *)p_u); \
mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm0; \
mm2 = _mm_unpacklo_pi8(mm2, mm1); \
*(uint64_t *)p_line1 = (uint64_t)mm2; \
mm0 = _mm_unpackhi_pi8(mm0, mm1); \
*(uint64_t *)(p_line1 + 8) = (uint64_t)mm0; \
mm4 = mm3; \
mm4 = _mm_unpacklo_pi8(mm4, mm1); \
*(uint64_t *)p_line2 = (uint64_t)mm4; \
mm3 = _mm_unpackhi_pi8(mm3, mm1); \
*(uint64_t *)(p_line2 + 8) = (uint64_t)mm3;
#define MMX_YUV420_UYVY \
mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)p_u); \
mm2 = _mm_cvtsi32_si64((int)*(uint32_t *)p_v); \
mm0 = (__m64)*(uint64_t*)p_y1; \
mm3 = (__m64)*(uint64_t*)p_y2; \
mm1 = _mm_unpacklo_pi8(mm1, mm2); \
mm2 = mm1; \
mm2 = _mm_unpacklo_pi8(mm2, mm0); \
*(uint64_t *)p_line1 = (uint64_t)mm2; \
mm2 = mm1; \
mm2 = _mm_unpackhi_pi8(mm2, mm0); \
*(uint64_t *)(p_line1 + 8) = (uint64_t)mm2; \
mm4 = mm1; \
mm4 = _mm_unpacklo_pi8(mm4, mm3); \
*(uint64_t *)p_line2 = (uint64_t)mm4; \
mm1 = _mm_unpackhi_pi8(mm1, mm3); \
*(uint64_t *)(p_line2 + 8) = (uint64_t)mm1;
#endif
#elif defined( MODULE_NAME_IS_i420_yuy2_sse2 )
#if defined(CAN_COMPILE_SSE2)
/* SSE2 assembly */
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
...
@@ -122,12 +203,16 @@ movq %%mm1, (%1) # Store YUYV \n\
".p2align 3 \n\t" \
SSE2_INSTRUCTIONS \
: \
: "r" (p_line1), "r" (p_line2), \
"r" (p_y1), "r" (p_y2), \
"r" (p_u), "r" (p_v) ); \
p_line1 += 32; p_line2 += 32; \
p_y1 += 16; p_y2 += 16; \
p_u += 8; p_v += 8; \
} while(0)
#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
#define SSE2_YUV420_YUYV_ALIGNED " \n\
movq (%4), %%xmm1 # Load 8 Cb u7 u6 u5 u4 u3 u2 u1 u0 \n\
movq (%5), %%xmm2 # Load 8 Cr v7 v6 v5 v4 v3 v2 v1 v0 \n\
...
@@ -151,6 +236,8 @@ movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%1) # Tell CPU not to cache output YUYV data \n\
punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\
...
@@ -187,6 +274,8 @@ movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%5), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%1) # Tell CPU not to cache output YVYU data \n\
punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
movdqu %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
...
@@ -224,6 +313,8 @@ movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
prefetchnta (%1) # Tell CPU not to cache output UYVY data \n\
punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
movdqu %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
...
@@ -238,6 +329,135 @@ punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\
movdqu %%xmm1, 16(%1) # Store high UYVY \n\
"
#elif defined(HAVE_SSE2_INTRINSICS)
/* SSE2 intrinsics */
#include <emmintrin.h>
#define SSE2_CALL(SSE2_INSTRUCTIONS) \
do { \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4; \
SSE2_INSTRUCTIONS \
p_line1 += 32; p_line2 += 32; \
p_y1 += 16; p_y2 += 16; \
p_u += 8; p_v += 8; \
} while(0)
#define SSE2_END _mm_sfence()
#define SSE2_YUV420_YUYV_ALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_load_si128((__m128i *)p_y1); \
xmm3 = _mm_load_si128((__m128i *)p_y2); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm2 = xmm0; \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
_mm_stream_si128((__m128i*)(p_line1), xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
_mm_stream_si128((__m128i*)(p_line1+16), xmm0); \
xmm4 = xmm3; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
_mm_stream_si128((__m128i*)(p_line2), xmm4); \
xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \
_mm_stream_si128((__m128i*)(p_line2+16), xmm3);
#define SSE2_YUV420_YUYV_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm2 = xmm0; \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
_mm_storeu_si128((__m128i*)(p_line1), xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
_mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \
xmm4 = xmm3; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
_mm_storeu_si128((__m128i*)(p_line2), xmm4); \
xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \
_mm_storeu_si128((__m128i*)(p_line2+16), xmm3);
#define SSE2_YUV420_YVYU_ALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
xmm0 = _mm_load_si128((__m128i *)p_y1); \
xmm3 = _mm_load_si128((__m128i *)p_y2); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm2 = xmm0; \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
_mm_stream_si128((__m128i*)(p_line1), xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
_mm_stream_si128((__m128i*)(p_line1+16), xmm0); \
xmm4 = xmm3; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
_mm_stream_si128((__m128i*)(p_line2), xmm4); \
xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \
_mm_stream_si128((__m128i*)(p_line2+16), xmm3);
#define SSE2_YUV420_YVYU_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm2 = xmm0; \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
_mm_storeu_si128((__m128i*)(p_line1), xmm2); \
xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
_mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \
xmm4 = xmm3; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \
_mm_storeu_si128((__m128i*)(p_line2), xmm4); \
xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \
_mm_storeu_si128((__m128i*)(p_line2+16), xmm3);
#define SSE2_YUV420_UYVY_ALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_load_si128((__m128i *)p_y1); \
xmm3 = _mm_load_si128((__m128i *)p_y2); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm2 = xmm1; \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
_mm_stream_si128((__m128i*)(p_line1), xmm2); \
xmm2 = xmm1; \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
_mm_stream_si128((__m128i*)(p_line1+16), xmm2); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm3); \
_mm_stream_si128((__m128i*)(p_line2), xmm4); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
_mm_stream_si128((__m128i*)(p_line2+16), xmm1);
#define SSE2_YUV420_UYVY_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm2 = xmm1; \
xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
_mm_storeu_si128((__m128i*)(p_line1), xmm2); \
xmm2 = xmm1; \
xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
_mm_storeu_si128((__m128i*)(p_line1+16), xmm2); \
xmm4 = xmm1; \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm3); \
_mm_storeu_si128((__m128i*)(p_line2), xmm4); \
xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \
_mm_storeu_si128((__m128i*)(p_line2+16), xmm1);
#endif
#endif
/* Used in both accelerated and C modules */
...
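A minimal sketch (assumed shape, not the literal i420_yuy2.c loop) of how the packing macros above are driven: two output lines are filled per pass, 16 luma samples each, sharing one row of chroma, and SSE2_END is issued once afterwards.

#include <stdint.h>

/* Sketch of one line pair of the YUY2 packing (assumed caller shape). */
static inline void pack_yuy2_line_pair( uint8_t *p_line1, uint8_t *p_line2,
                                        uint8_t *p_y1, uint8_t *p_y2,
                                        uint8_t *p_u, uint8_t *p_v,
                                        int i_width )
{
    int i_x;
    for( i_x = i_width / 16 ; i_x-- ; )
    {
        SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
    }
    /* the caller issues SSE2_END once after the whole frame */
}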
modules/video_chroma/i422_yuy2.c
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
* $Id$
* $Id$
*
*
* Authors: Samuel Hocevar <sam@zoy.org>
* Authors: Samuel Hocevar <sam@zoy.org>
* Damien Fouilleul <damienf@videolan.org>
*
*
* This program is free software; you can redistribute it and/or modify
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* it under the terms of the GNU General Public License as published by
...
@@ -160,15 +161,17 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
...
@@ -160,15 +161,17 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
C_YUV422_YUYV
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YUYV
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YUYV
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YUYV
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YUYV
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YUYV
(
p_line
,
p_y
,
p_u
,
p_v
);
#else
#elif defined (MODULE_NAME_IS_i422_yuy2_mmx)
__asm__
(
".p2align 3"
MMX_YUV422_YUYV
MMX_CALL
(
MMX_YUV422_YUYV
);
:
:
"r"
(
p_line
),
"r"
(
p_y
),
"r"
(
p_u
),
"r"
(
p_v
)
);
p_line
+=
16
;
p_y
+=
8
;
p_u
+=
4
;
p_v
+=
4
;
#endif
#endif
}
}
p_pixels
+=
i_pitch
;
p_pixels
+=
i_pitch
;
}
}
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
MMX_END
;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
SSE2_END
;
#endif
}
}
/*****************************************************************************
/*****************************************************************************
...
@@ -195,15 +198,17 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
...
@@ -195,15 +198,17 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
C_YUV422_YVYU
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YVYU
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YVYU
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YVYU
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YVYU
(
p_line
,
p_y
,
p_u
,
p_v
);
C_YUV422_YVYU
(
p_line
,
p_y
,
p_u
,
p_v
);
#else
#elif defined (MODULE_NAME_IS_i422_yuy2_mmx)
__asm__
(
".p2align 3"
MMX_YUV422_YVYU
MMX_CALL
(
MMX_YUV422_YVYU
);
:
:
"r"
(
p_line
),
"r"
(
p_y
),
"r"
(
p_u
),
"r"
(
p_v
)
);
p_line
+=
16
;
p_y
+=
8
;
p_u
+=
4
;
p_v
+=
4
;
#endif
#endif
}
}
p_pixels
+=
i_pitch
;
p_pixels
+=
i_pitch
;
}
}
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
MMX_END
;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
SSE2_END
;
#endif
}
}
/*****************************************************************************
/*****************************************************************************
...
@@ -230,15 +235,17 @@ static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source,
...
@@ -230,15 +235,17 @@ static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source,
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
#else
#elif defined (MODULE_NAME_IS_i422_yuy2_mmx)
            __asm__( ".p2align 3" MMX_YUV422_UYVY
                     : : "r" (p_line), "r" (p_y), "r" (p_u), "r" (p_v) );
            p_line += 16; p_y += 8; p_u += 4; p_v += 4;
            MMX_CALL( MMX_YUV422_UYVY );
#endif
        }
        p_pixels += i_pitch;
    }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
    MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
    SSE2_END;
#endif
}
/*****************************************************************************
...
@@ -275,14 +282,16 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
#else
#elif defined (MODULE_NAME_IS_i422_yuy2_mmx)
            __asm__( ".p2align 3" MMX_YUV422_UYVY
                     : : "r" (p_line), "r" (p_y), "r" (p_u), "r" (p_v) );
            p_line += 16; p_y += 8; p_u += 4; p_v += 4;
            MMX_CALL( MMX_YUV422_UYVY );
#endif
        }
    }
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
    MMX_END;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
    SSE2_END;
#endif
}
/*****************************************************************************
...
modules/video_chroma/i422_yuy2.h
View file @ a3883709
...
@@ -5,6 +5,7 @@
* $Id$
*
* Authors: Samuel Hocevar <sam@zoy.org>
*          Damien Fouilleul <damienf@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
...
@@ -23,6 +24,24 @@
#ifdef MODULE_NAME_IS_i422_yuy2_mmx
#if defined(CAN_COMPILE_MMX)
/* MMX assembly */
#define MMX_CALL(MMX_INSTRUCTIONS) \
do { \
__asm__ __volatile__( \
".p2align 3 \n\t" \
MMX_INSTRUCTIONS \
: \
: "r" (p_line), "r" (p_y), \
"r" (p_u), "r" (p_v) ); \
p_line += 16; p_y += 8; \
p_u += 4; p_v += 4; \
} while(0)
#define MMX_END __asm__ __volatile__ ( "emms" )
#define MMX_YUV422_YUYV " \n\
movq (%1), %%mm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
movd (%2), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
...
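The do { ... } while(0) body of the new MMX_CALL wrapper turns the inline assembly plus its pointer updates into a single statement, so the conversion loops in i422_yuy2.c can invoke it like a function call, as the .c hunks above show. A brief usage sketch; i_x and i_width are illustrative names, not taken from the hunk shown here:

    /* Sketch: convert one row, 8 pixels per MMX_CALL; the macro advances
     * p_line, p_y, p_u and p_v itself. */
    for( i_x = i_width / 8 ; i_x-- ; )
        MMX_CALL( MMX_YUV422_YUYV );

    /* once the whole picture has been converted: */
    MMX_END;    /* executes "emms" so that FPU code can run again */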
@@ -62,7 +81,36 @@ movq %%mm1, 8(%0) # Store high UYVY \n\
#define MMX_YUV422_Y211 " \n\
"
#else
#elif defined(HAVE_MMX_INTRINSICS)
/* MMX intrinsics */
#include <mmintrin.h>
#define MMX_END _mm_empty()
#endif
#elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )
#if defined(CAN_COMPILE_SSE2)
/* SSE2 assembly */
#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
#elif defined(HAVE_SSE2_INTRINSICS)
/* SSE2 intrinsics */
#include <emmintrin.h>
#define SSE2_END _mm_sfence()
#endif
#elif defined (MODULE_NAME_IS_i422_yuy2)
#define C_YUV422_YUYV( p_line, p_y, p_u, p_v ) \
    *(p_line)++ = *(p_y)++; \
...
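The header now chooses between inline assembly (CAN_COMPILE_MMX, CAN_COMPILE_SSE2) and compiler intrinsics (HAVE_MMX_INTRINSICS, HAVE_SSE2_INTRINSICS), but only the MMX_END and SSE2_END definitions of the intrinsics branches are visible in this view; the intrinsic packing macros themselves are elided. For orientation, an MMX-intrinsics counterpart of the YUYV pack would look roughly like the sketch below. The function name, the direct 8-byte pointer loads and stores, and the 8-pixels-per-call shape are assumptions for the example, not code from this commit:

#include <mmintrin.h>
#include <stdint.h>

/* Sketch only: pack 8 luma samples plus 4 Cb and 4 Cr samples into 16 bytes
 * of YUYV, the intrinsics analogue of the movq/movd loads and byte unpacks
 * in the assembly macro above. Assumes the buffers may be accessed 8 bytes
 * at a time, as the asm path already does. */
static void pack_yuyv_8px( uint8_t *p_line, const uint8_t *p_y,
                           const uint8_t *p_u, const uint8_t *p_v )
{
    __m64 mm_u  = _mm_cvtsi32_si64( *(const int32_t *)p_u );  /* u3 u2 u1 u0 */
    __m64 mm_v  = _mm_cvtsi32_si64( *(const int32_t *)p_v );  /* v3 v2 v1 v0 */
    __m64 mm_y  = *(const __m64 *)p_y;                        /* y7 ... y0   */

    __m64 mm_uv = _mm_unpacklo_pi8( mm_u, mm_v );  /* u0 v0 u1 v1 u2 v2 u3 v3 */

    /* luma byte first, then chroma: Y0 U0 Y1 V0 ... */
    *(__m64 *)(p_line)     = _mm_unpacklo_pi8( mm_y, mm_uv );  /* pixels 0-3 */
    *(__m64 *)(p_line + 8) = _mm_unpackhi_pi8( mm_y, mm_uv );  /* pixels 4-7 */

    _mm_empty();   /* same role as the new MMX_END define */
}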