Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
V
vlc
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Redmine
Redmine
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
videolan
vlc
Commits
90c07084
Commit
90c07084
authored
Jan 05, 2016
by
Francois Cartegnie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
packetizer: add SSE2 based AnnexB startcode helper
Improves even more from previous commit (by ~2x on 4K)
parent
953dd004
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
96 additions
and
15 deletions
+96
-15
modules/packetizer/startcode_helper.h
modules/packetizer/startcode_helper.h
+96
-15
No files found.
modules/packetizer/startcode_helper.h
View file @
90c07084
...
@@ -20,15 +20,105 @@
...
@@ -20,15 +20,105 @@
#ifndef _STARTCODE_HELPER_H
#ifndef _STARTCODE_HELPER_H
#define _STARTCODE_HELPER_H 1
#define _STARTCODE_HELPER_H 1
#include <vlc_cpu.h>
#if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
#include <emmintrin.h>
#endif
/* Efficiently looks up an AnnexB startcode 0x00 0x00 0x01
 * by using a trick that is 4 times faster than single byte lookup. */
/* TRY_MATCH(p, a): checks the four candidate positions p+a .. p+a+3 for the
 * byte sequence 0x00 0x00 0x01 and makes the ENCLOSING function return a
 * pointer to the first byte of the earliest one found. Falls through when
 * none matches. Reads bytes p[a] .. p[a+5], so the caller must guarantee
 * that p[a+5] is readable. */
#define TRY_MATCH(p,a) do {\
        if ((p)[(a)+1] == 0) {\
            if ((p)[(a)+0] == 0 && (p)[(a)+2] == 1)\
                return (p)+(a);\
            if ((p)[(a)+2] == 0 && (p)[(a)+3] == 1)\
                return (p)+(a)+1;\
        }\
        if ((p)[(a)+3] == 0) {\
            if ((p)[(a)+2] == 0 && (p)[(a)+4] == 1)\
                return (p)+(a)+2;\
            if ((p)[(a)+4] == 0 && (p)[(a)+5] == 1)\
                return (p)+(a)+3;\
        }\
    } while (0)

#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)

/**
 * Looks for the AnnexB startcode 0x00 0x00 0x01 in [p, end), testing 16 bytes
 * per iteration for zero bytes with SSE2.
 *
 * Candidate positions are p .. end - 4: like the byte-wise loops it replaces,
 * the scan stops at (end - 3) exclusive, so a startcode occupying exactly the
 * last three bytes of the buffer is not reported.
 *
 * \param p   first byte to examine
 * \param end one past the last byte of the buffer
 * \return pointer to the first byte of the earliest startcode found,
 *         or NULL if there is none (or the buffer is shorter than 3 bytes)
 */
__attribute__ ((__target__ ("sse2")))
static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
{
    /* A startcode needs 3 bytes. Bail out before computing end - 3, which
     * would otherwise point before the start of the buffer. */
    if( end - p < 3 )
        return NULL;

    /* Exclusive upper bound for candidate positions */
    const uint8_t *limit = end - 3;

    /* First align to 16 (nothing to do when p is already aligned) */
    /* Skipping this step and doing unaligned loads isn't faster */
    const uint8_t *alignedend = p + ( -(uintptr_t)p & 15 );
    for( ; p < alignedend && p < limit; p++ )
    {
        if( p[0] == 0 && p[1] == 0 && p[2] == 1 )
            return p;
    }

    if( p == limit )
        return NULL;

    /* Last 16 bytes boundary not above limit. Every full block below it has
     * at least 3 readable bytes after it, which TRY_MATCH(p, 12) relies on
     * when it reads up to p[17]. */
    alignedend = limit - ( (uintptr_t)limit & 15 );

#ifndef CAN_COMPILE_SSE2
    const __m128i zeros = _mm_set1_epi8( 0x00 );
#endif

    for( ; p < alignedend; p += 16 )
    {
        uint32_t match;
#ifdef CAN_COMPILE_SSE2
        /* Single statement: the compiler gives no guarantee that xmm1 keeps
         * its value between two separate asm statements, so the zero
         * register is (cheaply) rebuilt here. The "memory" clobber tells the
         * compiler that the buffer is read through %[v]. */
        asm volatile(
            "pxor      %%xmm1,  %%xmm1\n"
            "movdqa    0(%[v]), %%xmm0\n"
            "pcmpeqb   %%xmm1,  %%xmm0\n"
            "pmovmskb  %%xmm0,  %[match]\n"
            : [match]"=r"(match)
            : [v]"r"(p)
            : "xmm0", "xmm1", "memory"
        );
#else
        const __m128i v = _mm_load_si128( (const __m128i *)p );
        const __m128i res = _mm_cmpeq_epi8( zeros, v );
        match = (uint32_t)_mm_movemask_epi8( res );
#endif
        /* Bit n of match is set when p[n] == 0: each nibble covers 4 bytes,
         * and a startcode can only begin on a zero byte. */
        if( match & 0x000F )
            TRY_MATCH(p, 0);
        if( match & 0x00F0 )
            TRY_MATCH(p, 4);
        if( match & 0x0F00 )
            TRY_MATCH(p, 8);
        if( match & 0xF000 )
            TRY_MATCH(p, 12);
    }

    /* Remaining bytes that do not fill a whole 16 bytes block */
    for( ; p < limit; p++ )
    {
        if( p[0] == 0 && p[1] == 0 && p[2] == 1 )
            return p;
    }

    return NULL;
}

#endif
/* That code is adapted from libav's ff_avc_find_startcode_internal
 * and I believe the trick originated from
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static
inline
const
uint8_t
*
startcode_FindAnnexB
(
const
uint8_t
*
p
,
const
uint8_t
*
end
)
static
inline
const
uint8_t
*
startcode_FindAnnexB
(
const
uint8_t
*
p
,
const
uint8_t
*
end
)
{
{
#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
if
(
vlc_CPU_SSE2
())
return
startcode_FindAnnexB_SSE2
(
p
,
end
);
#endif
const
uint8_t
*
a
=
p
+
4
-
((
intptr_t
)
p
&
3
);
const
uint8_t
*
a
=
p
+
4
-
((
intptr_t
)
p
&
3
);
for
(
end
-=
3
;
p
<
a
&&
p
<
end
;
p
++
)
{
for
(
end
-=
3
;
p
<
a
&&
p
<
end
;
p
++
)
{
...
@@ -41,18 +131,7 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
...
@@ -41,18 +131,7 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
if
((
x
-
0x01010101
)
&
(
~
x
)
&
0x80808080
)
if
((
x
-
0x01010101
)
&
(
~
x
)
&
0x80808080
)
{
{
/* matching DW isn't faster */
/* matching DW isn't faster */
if
(
p
[
1
]
==
0
)
{
TRY_MATCH
(
p
,
0
);
if
(
p
[
0
]
==
0
&&
p
[
2
]
==
1
)
return
p
;
if
(
p
[
2
]
==
0
&&
p
[
3
]
==
1
)
return
p
+
1
;
}
if
(
p
[
3
]
==
0
)
{
if
(
p
[
2
]
==
0
&&
p
[
4
]
==
1
)
return
p
+
2
;
if
(
p
[
4
]
==
0
&&
p
[
5
]
==
1
)
return
p
+
3
;
}
}
}
}
}
...
@@ -64,4 +143,6 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
...
@@ -64,4 +143,6 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
return
NULL
;
return
NULL
;
}
}
#undef TRY_MATCH
#endif
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment