videolan/vlc-gpu commit f4f90e67, authored Jun 16, 2007 by Damien Fouilleul
video_chroma: a few SSE2 fixes
parent a3eb2a70

Showing 2 changed files with 30 additions and 32 deletions
modules/video_chroma/i420_rgb16.c    +23 -24
modules/video_chroma/i420_rgb_mmx.h  +7 -8
modules/video_chroma/i420_rgb16.c
...
@@ -448,12 +448,6 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
             }
             p_buffer = b_hscale ? p_buffer_start : p_pic;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-        __asm__ __volatile__ ( "sfence" );
-#else
-        _mm_sfence();
-#endif
     }
     else
     {
...
@@ -526,6 +520,14 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_buffer = b_hscale ? p_buffer_start : p_pic;
         }
     }
+    /* make sure all SSE2 stores are visible thereafter */
+#if defined (CAN_COMPILE_SSE2)
+    __asm__ __volatile__ ( "sfence" ::: "memory" );
+#else
+    _mm_sfence();
+#endif
 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
     if( p_vout->render.i_width & 7 )
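
Note on the hunk pair above (the same change repeats in each conversion function in this file): the sfence moves from inside the aligned branch to after both branches, so the unaligned path is fenced too, and the inline asm gains a "memory" clobber so the compiler cannot reorder buffered stores across it. A minimal standalone sketch of the idiom, assuming a hypothetical copy_stream() helper; only the CAN_COMPILE_SSE2 macro comes from this codebase:

#include <stddef.h>
#include <emmintrin.h>

/* Hypothetical illustration, not part of this commit: non-temporal
 * stores (_mm_stream_si128) bypass the cache, so the written data is
 * only guaranteed visible to other observers after a store fence. */
static void copy_stream( __m128i *p_dst, const __m128i *p_src, size_t n )
{
    for( size_t i = 0; i < n; i++ )
        _mm_stream_si128( p_dst + i, _mm_load_si128( p_src + i ) );

    /* Fence once after the whole buffer instead of once per row; the
     * "memory" clobber doubles as a compiler barrier, keeping stores
     * from being hoisted or sunk across the asm statement. */
#if defined (CAN_COMPILE_SSE2)
    __asm__ __volatile__ ( "sfence" ::: "memory" );
#else
    _mm_sfence();
#endif
}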
...
@@ -755,12 +757,6 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
             }
             p_buffer = b_hscale ? p_buffer_start : p_pic;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-        __asm__ __volatile__ ( "sfence" );
-#else
-        _mm_sfence();
-#endif
     }
     else
     {
...
@@ -833,6 +829,14 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
             p_buffer = b_hscale ? p_buffer_start : p_pic;
         }
     }
+    /* make sure all SSE2 stores are visible thereafter */
+#if defined (CAN_COMPILE_SSE2)
+    __asm__ __volatile__ ( "sfence" ::: "memory" );
+#else
+    _mm_sfence();
+#endif
 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
     if( p_vout->render.i_width & 7 )
...
@@ -1179,12 +1183,6 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
             }
             p_buffer = b_hscale ? p_buffer_start : p_pic;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-        __asm__ __volatile__ ( "sfence" );
-#else
-        _mm_sfence();
-#endif
     }
     else
    {
...
@@ -1263,7 +1261,14 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
             }
         }
+    /* make sure all SSE2 stores are visible thereafter */
+#if defined (CAN_COMPILE_SSE2)
+    __asm__ __volatile__ ( "sfence" ::: "memory" );
+#else
+    _mm_sfence();
+#endif
 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
     if( p_vout->render.i_width & 7 )
     {
...
@@ -1500,12 +1505,6 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
             }
             p_buffer = b_hscale ? p_buffer_start : p_pic;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-#if defined (CAN_COMPILE_SSE2)
-        __asm__ __volatile__ ( "sfence" );
-#else
-        _mm_sfence();
-#endif
     }
     else
     {
...
modules/video_chroma/i420_rgb_mmx.h
...
@@ -61,7 +61,6 @@ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 "

 #define SSE2_INIT_16_ALIGNED " \n\
-prefetcht1 (%3) # cache preload for image \n\
 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
 pxor %%xmm4, %%xmm4 # zero mm4 \n\
...
@@ -69,11 +68,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 "

 #define SSE2_INIT_16_UNALIGNED " \n\
-prefetcht1 (%3) # cache preload for image \n\
 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
 pxor %%xmm4, %%xmm4 # zero mm4 \n\
 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
 "

 #define MMX_INTRINSICS_INIT_16 \
...
@@ -91,11 +90,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
     xmm6 = _mm_load_si128((__m128i *)p_y); \

 #define SSE2_INTRINSICS_INIT_16_UNALIGNED \
-    _mm_prefetch(p_buffer, _MM_HINT_T1); \
     xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
     xmm4 = _mm_setzero_si128(); \
     xmm6 = _mm_loadu_si128((__m128i *)p_y); \
+    _mm_prefetch(p_buffer, _MM_HINT_NTA); \

 #define MMX_INIT_16_GRAY " \n\
 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
...
@@ -118,11 +117,11 @@ movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
 "

 #define SSE2_INIT_32_UNALIGNED " \n\
-prefetcht1 (%3) # cache preload for image \n\
 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
 pxor %%xmm4, %%xmm4 # zero mm4 \n\
 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
+prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
 "

 #define MMX_INTRINSICS_INIT_32 \
...
@@ -141,11 +140,11 @@ movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
     xmm6 = _mm_load_si128((__m128i *)p_y); \

 #define SSE2_INTRINSICS_INIT_32_UNALIGNED \
-    _mm_prefetch(p_buffer, _MM_HINT_T1); \
     xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
     xmm4 = _mm_setzero_si128(); \
     xmm6 = _mm_loadu_si128((__m128i *)p_y); \
+    _mm_prefetch(p_buffer, _MM_HINT_NTA); \

 /*
  * Do the multiply part of the conversion for even and odd pixels,
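
Note on the INIT hunks above: each macro drops the up-front prefetcht1 (%3) / _MM_HINT_T1 preload of the output buffer and instead issues prefetchnta / _MM_HINT_NTA after the source loads. The RGB output is written once and never read back, so the non-temporal hint keeps it from evicting useful data from the cache hierarchy. A rough sketch of the choice, with a hypothetical p_out pointer:

#include <xmmintrin.h>

/* Hypothetical illustration: cache hint for a write-once output buffer.
 * _MM_HINT_T1 would keep the line resident in L2 and above, while
 * _MM_HINT_NTA fetches it with minimal cache pollution, which suits
 * pixel data that is written once and handed off to the display. */
static void hint_output( const char *p_out )
{
    _mm_prefetch( p_out, _MM_HINT_NTA );
}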
...
@@ -260,7 +259,7 @@ pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
 #define SSE2_INTRINSICS_YUV_MUL \
     xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
-    xmm5 = _mm_set1_epi32(0x80808080UL); \
+    xmm5 = _mm_set1_epi32(0x00800080UL); \
     xmm0 = _mm_subs_epi16(xmm0, xmm5); \
     xmm1 = _mm_subs_epi16(xmm1, xmm5); \
     xmm0 = _mm_slli_epi16(xmm0, 3); \
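
Note on the constant fixed above: after _mm_unpacklo_epi8(x, zero), each U/V byte sits in the low half of a 16-bit lane, so subtracting the 128 chroma bias with _mm_subs_epi16 needs 0x0080 in every 16-bit lane, i.e. the 32-bit splat 0x00800080. The previous 0x80808080 subtracted 0x8080 from each lane, corrupting the chroma values. A small self-contained check (sample value 0x90 chosen for illustration):

#include <stdio.h>
#include <emmintrin.h>

int main( void )
{
    __m128i zero  = _mm_setzero_si128();
    __m128i u     = _mm_set1_epi8( (char)0x90 );    /* U sample = 144 */
    __m128i lanes = _mm_unpacklo_epi8( u, zero );   /* 0x0090 per 16-bit lane */
    __m128i bias  = _mm_set1_epi32( 0x00800080 );   /* 0x0080 per 16-bit lane */
    __m128i c     = _mm_subs_epi16( lanes, bias );  /* 144 - 128 = 16 */
    short res[8];
    _mm_storeu_si128( (__m128i *)res, c );
    printf( "%d\n", res[0] );                       /* prints 16 */
    return 0;
}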
...
@@ -1001,7 +1000,7 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
     xmm5 = xmm3; \
     xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \
     _mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \
-    xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \
+    xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
     _mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \

 #define SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED \
...
@@ -1021,6 +1020,6 @@ movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
     xmm5 = xmm3; \
     xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \
     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \
-    xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \
+    xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \
     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \
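
Note on the last two hunks: _xmm_unpackhi_pi16 is not a valid intrinsic (it mangles the MMX _mm_unpackhi_pi16 name), so these macros would not compile wherever they were instantiated; the SSE2 form _mm_unpackhi_epi16 also restores the lo/hi pairing with the _mm_unpacklo_epi16 call above it. A minimal sketch of that pairing (helper name hypothetical):

#include <emmintrin.h>

/* Hypothetical illustration: interleaving one vector of 16-bit lanes
 * with another spreads its 8 lanes across two 128-bit stores, the low
 * half first and the high half second. */
static void spread_lo_hi( __m128i v, __m128i fill, int *p_out )
{
    _mm_storeu_si128( (__m128i *)p_out,       _mm_unpacklo_epi16( v, fill ) );
    _mm_storeu_si128( (__m128i *)(p_out + 4), _mm_unpackhi_epi16( v, fill ) );
}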