packetizer: add SSE2 based AnnexB startcode helper

Improves performance further over the previous commit (by ~2x on 4K)

packetizer: add SSE2 based AnnexB startcode helper
Improves performance further over the previous commit (by ~2x on 4K)
90c07084 · Francois Cartegnie · 953dd004 · 90c07084
Commit 90c07084 authored Jan 05, 2016 by Francois Cartegnie
Hide whitespace changes
Inline Side-by-side

Showing with 96 additions and 15 deletions

modules/packetizer/startcode_helper.h modules/packetizer/startcode_helper.h +96 -15

No files found.
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -20,15 +20,105 @@
 #ifndef _STARTCODE_HELPER_H
 #define _STARTCODE_HELPER_H 1
+#include <vlc_cpu.h>
+#if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
+   #include <emmintrin.h>
+#endif
 /* Efficiently looks up an AnnexB startcode 0x00 0x00 0x01
- * by using a 4 times faster trick than single byte lookup.
+ * by using a trick that is 4 times faster than a single byte lookup. */
- *
- * That code is adapted from libav's ff_avc_find_startcode_internal
+#define TRY_MATCH(p,a) {\
+     if (p[a+1] == 0) {\
+            if (p[a+0] == 0 && p[a+2] == 1)\
+                return a+p;\
+            if (p[a+2] == 0 && p[a+3] == 1)\
+                return a+p+1;\
+        }\
+        if (p[a+3] == 0) {\
+            if (p[a+2] == 0 && p[a+4] == 1)\
+                return a+p+2;\
+            if (p[a+4] == 0 && p[a+5] == 1)\
+                return a+p+3;\
+        }\
+    }
+#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+/* Finds the first AnnexB start code (0x00 0x00 0x01) in [p, end) using SSE2.
+ *
+ * The buffer is scanned in three phases: a byte-wise scan up to the next
+ * 16-byte boundary, a 16-bytes-at-a-time scan of the aligned middle part, and
+ * a byte-wise scan of whatever remains. In the aligned phase each block is
+ * compared against zero and only the 4-byte groups that contain at least one
+ * zero byte are inspected with TRY_MATCH.
+ *
+ * p:   first byte of the buffer to search
+ * end: one past the last byte of the buffer
+ *
+ * Returns a pointer to the first byte of the first start code, or NULL when
+ * the buffer holds none (including when it is shorter than 3 bytes).
+ */
+__attribute__ ((__target__ ("sse2")))
+static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
+{
+    /* A start code needs 3 bytes; also avoids forming a pointer before p */
+    if( end - p < 3 )
+        return NULL;
+
+    /* Last position at which a 3 byte start code can still begin */
+    const uint8_t *last = end - 3;
+
+    /* First align to 16 */
+    /* Skipping this step and doing unaligned loads isn't faster */
+    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
+    for( ; p < alignedend && p <= last; p++ ) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    /* Every candidate position, including the final one, has been tested */
+    if( p > last )
+        return NULL;
+
+    /* p is now 16-byte aligned. Stop the vector loop at the last aligned
+     * address not beyond 'last': TRY_MATCH(p, 12) reads up to p[17], which
+     * then is at most last + 1 and therefore still inside the buffer. */
+    alignedend = last - ((intptr_t) last & 15);
+    if( alignedend > p )
+    {
+#ifndef CAN_COMPILE_SSE2
+        __m128i zeros = _mm_set1_epi8( 0x00 );
+#endif
+        for( ; p < alignedend; p += 16)
+        {
+            uint32_t match;
+#ifdef CAN_COMPILE_SSE2
+            /* xmm1 is zeroed inside this statement: a register set by an
+             * earlier asm statement is not guaranteed to keep its value.
+             * The "memory" clobber tells the compiler the buffer is read. */
+            asm volatile(
+                "pxor      %%xmm1,   %%xmm1\n"
+                "movdqa   0(%[v]),   %%xmm0\n"
+                "pcmpeqb   %%xmm1,   %%xmm0\n"
+                "pmovmskb  %%xmm0,   %[match]\n"
+                : [match]"=r"(match)
+                : [v]"r"(p)
+                : "xmm0", "xmm1", "memory"
+            );
+#else
+            __m128i v = _mm_load_si128((const __m128i *)p);
+            __m128i res = _mm_cmpeq_epi8( zeros, v );
+            /* bit i of the mask is set when p[i] == 0 */
+            match = _mm_movemask_epi8( res );
+#endif
+            /* Only look closer at 4-byte groups holding a zero byte;
+             * TRY_MATCH returns from this function on a hit. */
+            if( match & 0x000F )
+                TRY_MATCH(p, 0);
+            if( match & 0x00F0 )
+                TRY_MATCH(p, 4);
+            if( match & 0x0F00 )
+                TRY_MATCH(p, 8);
+            if( match & 0xF000 )
+                TRY_MATCH(p, 12);
+        }
+    }
+
+    /* Unaligned tail, up to and including the last candidate position */
+    for( ; p <= last; p++ ) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    return NULL;
+}
+#endif
+/* This code is adapted from libav's ff_avc_find_startcode_internal
 * and I believe the trick originated from
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
 static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
 {
+#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+    if (vlc_CPU_SSE2())
+        return startcode_FindAnnexB_SSE2(p, end);
+#endif
    const uint8_t *a = p + 4 - ((intptr_t)p & 3);
    for (end -= 3; p < a && p < end; p++) {
@@ -41,18 +131,7 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
        if ((x - 0x01010101) & (~x) & 0x80808080)
        {
            /* matching DW isn't faster */
-            if (p[1] == 0) {
+            TRY_MATCH(p, 0);
-                if (p[0] == 0 && p[2] == 1)
-                    return p;
-                if (p[2] == 0 && p[3] == 1)
-                    return p+1;
-            }
-            if (p[3] == 0) {
-                if (p[2] == 0 && p[4] == 1)
-                    return p+2;
-                if (p[4] == 0 && p[5] == 1)
-                    return p+3;
-            }
        }
    }
@@ -64,4 +143,6 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
    return NULL;
 }
+#undef TRY_MATCH
 #endif