Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc-1.1
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc-1.1
Commits
df3b5eec
Commit
df3b5eec
authored
Aug 15, 2007
by
Damien Fouilleul
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
video chromas: finalize SSE2 improvements
parent
30900cb3
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
145 additions
and
6 deletions
+145
-6
AUTHORS
AUTHORS
+3
-0
NEWS
NEWS
+2
-0
modules/video_chroma/i420_yuy2.h
modules/video_chroma/i420_yuy2.h
+6
-6
modules/video_chroma/i422_yuy2.c
modules/video_chroma/i422_yuy2.c
+61
-0
modules/video_chroma/i422_yuy2.h
modules/video_chroma/i422_yuy2.h
+73
-0
No files found.
AUTHORS
View file @
df3b5eec
...
@@ -190,6 +190,9 @@ E: Damien.Fouilleul@laposte.net
...
@@ -190,6 +190,9 @@ E: Damien.Fouilleul@laposte.net
C: Quovodis
C: Quovodis
D: ActiveX control
D: ActiveX control
D: Safari/Firefox plugin for MacOS X
D: Safari/Firefox plugin for MacOS X
D: Direct3D Video output
D: SSE2 chroma converters
D: improved MMX chroma converters
S: Ireland
S: Ireland
N: Alexis Guillard
N: Alexis Guillard
...
...
NEWS
View file @
df3b5eec
...
@@ -81,6 +81,8 @@ Video output and filters:
...
@@ -81,6 +81,8 @@ Video output and filters:
was previously part of the mosaic module.
was previously part of the mosaic module.
* Fix random characters problem in RSS filter.
* Fix random characters problem in RSS filter.
* Add rotate-deciangle for more precision on rotate filter
* Add rotate-deciangle for more precision on rotate filter
* Support for Intel SSE2 instruction set in chroma converters
* Improved use of Intel MMX instruction set in chroma converters
Audio output
Audio output
* Replay gain support.
* Replay gain support.
...
...
modules/video_chroma/i420_yuy2.h
View file @
df3b5eec
...
@@ -366,8 +366,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
...
@@ -366,8 +366,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
#define SSE2_YUV420_YUYV_UNALIGNED \
#define SSE2_YUV420_YUYV_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_load
_si128((__m128i *)p_y1);
\
xmm0 = _mm_load
u_si128((__m128i *)p_y1);
\
xmm3 = _mm_load
_si128((__m128i *)p_y2);
\
xmm3 = _mm_load
u_si128((__m128i *)p_y2);
\
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
...
@@ -402,8 +402,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
...
@@ -402,8 +402,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
#define SSE2_YUV420_YVYU_UNALIGNED \
#define SSE2_YUV420_YVYU_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
xmm0 = _mm_load
_si128((__m128i *)p_y1);
\
xmm0 = _mm_load
u_si128((__m128i *)p_y1);
\
xmm3 = _mm_load
_si128((__m128i *)p_y2);
\
xmm3 = _mm_load
u_si128((__m128i *)p_y2);
\
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
...
@@ -439,8 +439,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
...
@@ -439,8 +439,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
#define SSE2_YUV420_UYVY_UNALIGNED \
#define SSE2_YUV420_UYVY_UNALIGNED \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
xmm0 = _mm_load
_si128((__m128i *)p_y1);
\
xmm0 = _mm_load
u_si128((__m128i *)p_y1);
\
xmm3 = _mm_load
_si128((__m128i *)p_y2);
\
xmm3 = _mm_load
u_si128((__m128i *)p_y2);
\
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line1, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
_mm_prefetch(p_line2, _MM_HINT_NTA); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
...
...
modules/video_chroma/i422_yuy2.c
View file @
df3b5eec
...
@@ -442,6 +442,61 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
...
@@ -442,6 +442,61 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
int
i_x
,
i_y
;
int
i_x
,
i_y
;
const
int
i_source_margin
=
p_source
->
p
[
0
].
i_pitch
-
p_source
->
p
[
0
].
i_visible_pitch
;
const
int
i_source_margin_c
=
p_source
->
p
[
1
].
i_pitch
-
p_source
->
p
[
1
].
i_visible_pitch
;
const
int
i_dest_margin
=
p_dest
->
p
->
i_pitch
-
p_dest
->
p
->
i_visible_pitch
;
#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
if
(
0
==
(
15
&
(
p_source
->
p
[
Y_PLANE
].
i_pitch
|
p_dest
->
p
->
i_pitch
|
((
int
)
p_line
|
(
int
)
p_y
)))
)
{
/* use faster SSE2 aligned fetch and store */
for
(
i_y
=
p_vout
->
render
.
i_height
;
i_y
--
;
)
{
p_line
-=
2
*
p_dest
->
p
->
i_pitch
;
for
(
i_x
=
p_vout
->
render
.
i_width
/
16
;
i_x
--
;
)
{
SSE2_CALL
(
SSE2_YUV422_UYVY_ALIGNED
);
}
for
(
i_x
=
(
p_vout
->
render
.
i_width
%
16
)
/
2
;
i_x
--
;
)
{
C_YUV422_UYVY
(
p_line
,
p_y
,
p_u
,
p_v
);
}
p_y
+=
i_source_margin
;
p_u
+=
i_source_margin_c
;
p_v
+=
i_source_margin_c
;
p_line
+=
i_dest_margin
;
}
}
else
{
/* use slower SSE2 unaligned fetch and store */
for
(
i_y
=
p_vout
->
render
.
i_height
;
i_y
--
;
)
{
p_line
-=
2
*
p_dest
->
p
->
i_pitch
;
for
(
i_x
=
p_vout
->
render
.
i_width
/
16
;
i_x
--
;
)
{
SSE2_CALL
(
SSE2_YUV422_UYVY_UNALIGNED
);
}
for
(
i_x
=
(
p_vout
->
render
.
i_width
%
16
)
/
2
;
i_x
--
;
)
{
C_YUV422_UYVY
(
p_line
,
p_y
,
p_u
,
p_v
);
}
p_y
+=
i_source_margin
;
p_u
+=
i_source_margin_c
;
p_v
+=
i_source_margin_c
;
p_line
+=
i_dest_margin
;
}
}
SSE2_END
;
#else
for
(
i_y
=
p_vout
->
render
.
i_height
;
i_y
--
;
)
for
(
i_y
=
p_vout
->
render
.
i_height
;
i_y
--
;
)
{
{
for
(
i_x
=
p_vout
->
render
.
i_width
/
8
;
i_x
--
;
)
for
(
i_x
=
p_vout
->
render
.
i_width
/
8
;
i_x
--
;
)
...
@@ -457,12 +512,18 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
...
@@ -457,12 +512,18 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
MMX_CALL
(
MMX_YUV422_UYVY
);
MMX_CALL
(
MMX_YUV422_UYVY
);
#endif
#endif
}
}
p_y
+=
i_source_margin
;
p_u
+=
i_source_margin_c
;
p_v
+=
i_source_margin_c
;
p_line
+=
i_dest_margin
;
}
}
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
#if defined (MODULE_NAME_IS_i422_yuy2_mmx)
MMX_END
;
MMX_END
;
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
SSE2_END
;
SSE2_END
;
#endif
#endif
#endif
}
}
/*****************************************************************************
/*****************************************************************************
...
...
modules/video_chroma/i422_yuy2.h
View file @
df3b5eec
...
@@ -233,9 +233,82 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\
...
@@ -233,9 +233,82 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\
#include <emmintrin.h>
#include <emmintrin.h>
/*
 * Wrap one SSE2 conversion step.
 *
 * Declares the three __m128i temporaries (xmm0, xmm1, xmm2) that the
 * SSE2_YUV422_* instruction macros assign to, expands SSE2_INSTRUCTIONS,
 * then advances the working pointers: p_line by 32, p_y by 16 and
 * p_u / p_v by 8 each.
 *
 * The expansion site must have p_line, p_y, p_u and p_v in scope.
 * SSE2_INSTRUCTIONS is expanded as-is, so it has to supply its own
 * trailing semicolon.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)        \
    do {                                    \
        __m128i xmm0;                       \
        __m128i xmm1;                       \
        __m128i xmm2;                       \
        SSE2_INSTRUCTIONS                   \
        p_y    += 16;                       \
        p_u    += 8;                        \
        p_v    += 8;                        \
        p_line += 32;                       \
    } while(0)
#define SSE2_END _mm_sfence()
#define SSE2_END _mm_sfence()
/*
 * Pack 16 luma samples from p_y plus 8 samples each from p_u and p_v into
 * 32 output bytes at p_line, ordered Y U Y V.
 *
 * Aligned variant: uses _mm_load_si128 on p_y and non-temporal
 * _mm_stream_si128 stores on p_line, so both must be 16-byte aligned.
 * Expects xmm0, xmm1 and xmm2 (__m128i) to be declared at the expansion
 * site, e.g. by SSE2_CALL. Does not advance any pointer itself.
 */
#define SSE2_YUV422_YUYV_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_load_si128((__m128i *)p_y);              \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line), xmm2);         \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
/*
 * Pack 16 luma samples from p_y plus 8 samples each from p_u and p_v into
 * 32 output bytes at p_line, ordered Y U Y V.
 *
 * Unaligned variant: uses _mm_loadu_si128 / _mm_storeu_si128, so p_y and
 * p_line carry no alignment requirement.
 * Expects xmm0, xmm1 and xmm2 (__m128i) to be declared at the expansion
 * site, e.g. by SSE2_CALL. Does not advance any pointer itself.
 */
#define SSE2_YUV422_YUYV_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);         \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
/*
 * Pack 16 luma samples from p_y plus 8 samples each from p_u and p_v into
 * 32 output bytes at p_line, ordered Y V Y U (chroma swapped relative to
 * the YUYV macros).
 *
 * Aligned variant: uses _mm_load_si128 on p_y and non-temporal
 * _mm_stream_si128 stores on p_line, so both must be 16-byte aligned.
 * Expects xmm0, xmm1 and xmm2 (__m128i) to be declared at the expansion
 * site, e.g. by SSE2_CALL. Does not advance any pointer itself.
 */
#define SSE2_YUV422_YVYU_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm0 = _mm_load_si128((__m128i *)p_y);              \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line), xmm2);         \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
/*
 * Pack 16 luma samples from p_y plus 8 samples each from p_u and p_v into
 * 32 output bytes at p_line, ordered Y V Y U (chroma swapped relative to
 * the YUYV macros).
 *
 * Unaligned variant: uses _mm_loadu_si128 / _mm_storeu_si128, so p_y and
 * p_line carry no alignment requirement.
 * Expects xmm0, xmm1 and xmm2 (__m128i) to be declared at the expansion
 * site, e.g. by SSE2_CALL. Does not advance any pointer itself.
 */
#define SSE2_YUV422_YVYU_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);               \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);         \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
/*
 * Pack 16 luma samples from p_y plus 8 samples each from p_u and p_v into
 * 32 output bytes at p_line, ordered U Y V Y (chroma first).
 *
 * Aligned variant: uses _mm_load_si128 on p_y and non-temporal
 * _mm_stream_si128 stores on p_line, so both must be 16-byte aligned.
 * Expects xmm0, xmm1 and xmm2 (__m128i) to be declared at the expansion
 * site, e.g. by SSE2_CALL. Does not advance any pointer itself.
 */
#define SSE2_YUV422_UYVY_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_load_si128((__m128i *)p_y);              \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);               \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line), xmm2);         \
    _mm_stream_si128((__m128i*)(p_line+16), xmm1);
/*
 * Pack 16 luma samples from p_y plus 8 samples each from p_u and p_v into
 * 32 output bytes at p_line, ordered U Y V Y (chroma first).
 *
 * Unaligned variant: uses _mm_loadu_si128 / _mm_storeu_si128, so p_y and
 * p_line carry no alignment requirement.
 * Expects xmm0, xmm1 and xmm2 (__m128i) to be declared at the expansion
 * site, e.g. by SSE2_CALL. Does not advance any pointer itself.
 */
#define SSE2_YUV422_UYVY_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);               \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);         \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);
#endif
#endif
#endif
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment