avcodec: split generic YV12/NV12 copy out of SSE2 functions

This is required by the next commit. This also avoids making two copies on non-x86 platforms, where USWC optimizations are irrelevant. (cherry picked from commit 07d8e576c7a0dcc7863d6853f620d5a99ebba55f) Conflicts: modules/codec/avcodec/copy.c

avcodec: split generic YV12/NV12 copy out of SSE2 functions
This is required by the next commit. This also avoids making two copies on non-x86 platforms, where USWC optimizations are irrelevant. (cherry picked from commit 07d8e576c7a0dcc7863d6853f620d5a99ebba55f) Conflicts: modules/codec/avcodec/copy.c
066dc71d · Rémi Denis-Courmont · 69eb62b3 · 066dc71d · 066dc71d · 066dc71d
Commit 066dc71d authored Oct 13, 2012 by Rémi Denis-Courmont
Showing with 147 additions and 92 deletions

modules/codec/avcodec/copy.c modules/codec/avcodec/copy.c +143 -88

modules/codec/avcodec/copy.h modules/codec/avcodec/copy.h +2 -0

modules/codec/avcodec/vaapi.c modules/codec/avcodec/vaapi.c +2 -4

No files found.
--- a/modules/codec/avcodec/copy.c
+++ b/modules/codec/avcodec/copy.c
@@ -32,6 +32,31 @@

 #include "copy.h"

+int CopyInitCache(copy_cache_t *cache, unsigned width) /* Prepare the scratch buffer handed to the SSE2 copy routines; returns VLC_SUCCESS or VLC_EGENERIC. */
+{
+#ifdef CAN_COMPILE_SSE2
+    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096); /* width rounded up to a multiple of 16, never less than 4096 bytes */
+    cache->buffer = vlc_memalign(16, cache->size); /* 16-byte aligned allocation */
+    if (!cache->buffer)
+        return VLC_EGENERIC; /* allocation failed */
+#else
+    (void) cache; (void) width; /* nothing to set up in this build: mark both parameters as used */
+#endif
+    return VLC_SUCCESS;
+}
+
+void CopyCleanCache(copy_cache_t *cache) /* Release the buffer set up by CopyInitCache and reset the cache fields. */
+{
+#ifdef CAN_COMPILE_SSE2
+    vlc_free(cache->buffer);
+    cache->buffer = NULL; /* leave the cache in a recognisable empty state */
+    cache->size   = 0;
+#else
+    (void) cache; /* nothing was allocated in this build */
+#endif
+}
+
+#ifdef CAN_COMPILE_SSE2
 /* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */
@@ -47,16 +72,6 @@
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")

-/* Execute the instruction op only if SSE2 is supported. */
-#ifdef CAN_COMPILE_SSE2
-#   define ASM_SSE2(cpu, op) do {          \
-        if (cpu & CPU_CAPABILITY_SSE2)  \
-            asm volatile (op);    \
-    } while (0)
-#else
-#   define ASM_SSE2(cpu, op)
-#endif
-
 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
@@ -68,7 +83,8 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
 {
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");
+
    for (unsigned y = 0; y < height; y++) {
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = 0;
@@ -87,8 +103,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
            }
        } else
 #endif
-#ifdef CAN_COMPILE_SSE2
-        if (cpu & CPU_CAPABILITY_SSE2) {
+        {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
@@ -97,7 +112,6 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
-#endif

        for (; x < width; x++)
            dst[x] = src[x];
@@ -109,28 +123,23 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,

 static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
-                   unsigned width, unsigned height,
-                   unsigned cpu)
+                   unsigned width, unsigned height)
 {
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
-        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

-#ifdef CAN_COMPILE_SSE2
-        if (cpu & CPU_CAPABILITY_SSE2) {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
-            }
+        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
+        if (!unaligned) {
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
+        } else {
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }
-#endif

        for (; x < width; x++)
            dst[x] = src[x];
@@ -140,10 +149,10 @@ static void Copy2d(uint8_t *dst, size_t dst_pitch,
    }
 }

-static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
-                    uint8_t *dstv, size_t dstv_pitch,
-                    const uint8_t *src, size_t src_pitch,
-                    unsigned width, unsigned height, unsigned cpu)
+static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
+                        uint8_t *dstv, size_t dstv_pitch,
+                        const uint8_t *src, size_t src_pitch,
+                        unsigned width, unsigned height, unsigned cpu)
 {
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
@@ -152,7 +161,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
@@ -188,8 +197,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
            }
        } else
 #endif
-#ifdef CAN_COMPILE_SSE2
-        if (cpu & CPU_CAPABILITY_SSE2) {
+        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
@@ -213,7 +221,6 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
            }
        }
-#endif
 #undef STORE2X32
 #undef LOAD64

@@ -227,10 +234,10 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
    }
 }

-static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
-                      uint8_t *cache, size_t cache_size,
-                      unsigned width, unsigned height,
-                      unsigned cpu)
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+                          const uint8_t *src, size_t src_pitch,
+                          uint8_t *cache, size_t cache_size,
+                          unsigned width, unsigned height, unsigned cpu)
 {
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
@@ -247,21 +254,20 @@ static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t
        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
-               width, hblock, cpu);
+               width, hblock);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
-
-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");
 }
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
-                        uint8_t *dstv, size_t dstv_pitch,
-                        const uint8_t *src, size_t src_pitch,
-                        uint8_t *cache, size_t cache_size,
-                        unsigned width, unsigned height,
-                        unsigned cpu)
+
+static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+                            uint8_t *dstv, size_t dstv_pitch,
+                            const uint8_t *src, size_t src_pitch,
+                            uint8_t *cache, size_t cache_size,
+                            unsigned width, unsigned height, unsigned cpu)
 {
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
@@ -271,76 +277,125 @@ static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w2_16,
-                     src, src_pitch,
+        CopyFromUswc(cache, w2_16, src, src_pitch,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
-        SplitUV(dstu, dstu_pitch,
-                dstv, dstv_pitch,
-                cache, w2_16,
-                width, hblock, cpu);
+        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
+                    cache, w2_16, width, hblock, cpu);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
+    asm volatile ("mfence");
+}

-    ASM_SSE2(cpu, "mfence");
+static void SSE_CopyFromNv12(picture_t *dst, /* SSE2 path: copy a two-plane source into the three planes of dst */
+                             uint8_t *src[2], size_t src_pitch[2],
+                             unsigned width, unsigned height,
+                             copy_cache_t *cache)
+{
+    const unsigned cpu = vlc_CPU(); /* capability flags forwarded to the plane helpers */
+
+    /* Plane 0 is copied as is; source plane 1 is de-interleaved into destination planes 2 and 1 */
+    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+                  src[0], src_pitch[0],
+                  cache->buffer, cache->size,
+                  width, height, cpu);
+    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+                    dst->p[1].p_pixels, dst->p[1].i_pitch,
+                    src[1], src_pitch[1],
+                    cache->buffer, cache->size,
+                    width/2, height/2, cpu); /* the split planes are half the size in both dimensions */
+    asm volatile ("emms");
 }

-int CopyInitCache(copy_cache_t *cache, unsigned width)
+static void SSE_CopyFromYv12(picture_t *dst, /* SSE2 path: copy three source planes into the matching planes of dst */
+                             uint8_t *src[3], size_t src_pitch[3],
+                             unsigned width, unsigned height,
+                             copy_cache_t *cache)
 {
-    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
-    cache->buffer = vlc_memalign(16, cache->size);
-    if (!cache->buffer)
-        return VLC_EGENERIC;
-    return VLC_SUCCESS;
+    const unsigned cpu = vlc_CPU(); /* capability flags forwarded to SSE_CopyPlane */
+
+    /* Plane n of the source goes to plane n of the destination */
+    for (unsigned n = 0; n < 3; n++) {
+        const unsigned d = n > 0 ? 2 : 1; /* planes 1 and 2 are half size in both dimensions */
+        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
+                      src[n], src_pitch[n],
+                      cache->buffer, cache->size,
+                      width/d, height/d, cpu);
+    }
+    asm volatile ("emms");
 }
-void CopyCleanCache(copy_cache_t *cache)
+#undef COPY64
+#endif /* CAN_COMPILE_SSE2 */
+
+static void CopyPlane(uint8_t *dst, size_t dst_pitch, /* plain C fallback: copy a width x height byte rectangle row by row */
+                      const uint8_t *src, size_t src_pitch,
+                      unsigned width, unsigned height)
 {
-    vlc_free(cache->buffer);
-    cache->buffer = NULL;
-    cache->size   = 0;
+    for (unsigned y = 0; y < height; y++) {
+        memcpy(dst, src, width); /* only the visible width is copied, not the whole pitch */
+        src += src_pitch; /* advance each pointer by its own row stride */
+        dst += dst_pitch;
+    }
+}
+
+static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch, /* plain C fallback: de-interleave one packed plane into two planes */
+                        uint8_t *dstv, size_t dstv_pitch,
+                        const uint8_t *src, size_t src_pitch,
+                        unsigned width, unsigned height) /* width counts byte pairs: each row reads 2*width bytes of src */
+{
+    for (unsigned y = 0; y < height; y++) {
+        for (unsigned x = 0; x < width; x++) {
+            dstu[x] = src[2*x+0]; /* even source bytes */
+            dstv[x] = src[2*x+1]; /* odd source bytes */
+        }
+        src  += src_pitch; /* each buffer advances by its own row stride */
+        dstu += dstu_pitch;
+        dstv += dstv_pitch;
+    }
 }

 void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                   unsigned width, unsigned height,
                   copy_cache_t *cache)
 {
-    const unsigned cpu = vlc_CPU();
+#ifdef CAN_COMPILE_SSE2
+    unsigned cpu = vlc_CPU();
+    if (cpu & CPU_CAPABILITY_SSE2) /* use the SSE2 routines only when the running CPU reports SSE2 */
+        return SSE_CopyFromNv12(dst, src, src_pitch, width, height, cache);
+#else
+    (void) cache; /* the cache is only used by the SSE2 path */
+#endif

-    /* */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
-              cache->buffer, cache->size,
-              width, height, cpu);
+              width, height); /* generic fallback: plane 0 copied as is */
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
-                cache->buffer, cache->size,
-                width/2, height/2, cpu);
-
-    ASM_SSE2(cpu, "emms");
+                width/2, height/2); /* source plane 1 split into destination planes 2 and 1 at half size */
 }
+
 void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                   unsigned width, unsigned height,
                   copy_cache_t *cache)
 {
-    const unsigned cpu = vlc_CPU();
+#ifdef CAN_COMPILE_SSE2
+    unsigned cpu = vlc_CPU();
+    if (cpu & CPU_CAPABILITY_SSE2) /* use the SSE2 routines only when the running CPU reports SSE2 */
+        return SSE_CopyFromYv12(dst, src, src_pitch, width, height, cache);
+#else
+    (void) cache; /* the cache is only used by the SSE2 path */
+#endif

-    /* */
-    for (unsigned n = 0; n < 3; n++) {
-        const unsigned d = n > 0 ? 2 : 1;
-        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
-                  src[n], src_pitch[n],
-                  cache->buffer, cache->size,
-                  width/d, height/d, cpu);
-    }
-    ASM_SSE2(cpu, "emms");
+     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+               src[0], src_pitch[0], width, height); /* plane 0: full size */
+     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
+               src[1], src_pitch[1], width / 2, height / 2); /* plane 1: half size */
+     CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
+               src[2], src_pitch[2], width / 2, height / 2); /* plane 2: half size; reads src[2], matching the per-plane loop this replaces */
 }
-
-#undef ASM_SSE2
-#undef COPY64
-
--- a/modules/codec/avcodec/copy.h
+++ b/modules/codec/avcodec/copy.h
@@ -25,8 +25,10 @@
 #define _VLC_AVCODEC_COPY_H 1

 typedef struct {
+# ifdef CAN_COMPILE_SSE2 /* the members exist only in SSE2-capable builds */
    uint8_t *buffer;
    size_t  size;
+# endif /* NOTE(review): without CAN_COMPILE_SSE2 this struct has no members, which ISO C does not allow - confirm the supported compilers accept it */
 } copy_cache_t;

 int  CopyInitCache(copy_cache_t *cache, unsigned width);

--- a/modules/codec/avcodec/vaapi.c
+++ b/modules/codec/avcodec/vaapi.c
@@ -310,7 +310,8 @@ static int CreateSurfaces( vlc_va_vaapi_t *p_va, void **pp_hw_ctx, vlc_fourcc_t
        goto error;
    *pi_chroma = i_chroma;

-    CopyInitCache( &p_va->image_cache, i_width );
+    if( unlikely(CopyInitCache( &p_va->image_cache, i_width )) )
+        goto error;

    /* Setup the ffmpeg hardware context */
    *pp_hw_ctx = &p_va->hw_ctx;
@@ -358,9 +359,6 @@ static int Extract( vlc_va_t *p_external, picture_t *p_picture, AVFrame *p_ff )
 {
    vlc_va_vaapi_t *p_va = vlc_va_vaapi_Get(p_external);

-    if( !p_va->image_cache.buffer )
-        return VLC_EGENERIC;
-
    VASurfaceID i_surface_id = (VASurfaceID)(uintptr_t)p_ff->data[3];

 #if VA_CHECK_VERSION(0,31,0)