Commit 066dc71d authored by Rémi Denis-Courmont's avatar Rémi Denis-Courmont

avcodec: split generic YV12/NV12 copy out of SSE2 functions

This is required by the next commit. This also avoids making two copies
on non-x86 platforms, where USWC optimizations are irrelevant.
(cherry picked from commit 07d8e576c7a0dcc7863d6853f620d5a99ebba55f)

Conflicts:
	modules/codec/avcodec/copy.c
parent 69eb62b3
...@@ -32,6 +32,31 @@ ...@@ -32,6 +32,31 @@
#include "copy.h" #include "copy.h"
/* Allocate the bounce buffer used by the SSE2 USWC copy paths.
 * The buffer is 16-byte aligned so the aligned SSE loads/stores are legal.
 * Returns VLC_SUCCESS, or VLC_EGENERIC if the allocation fails.
 * On builds without SSE2 support nothing is allocated. */
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    /* Round the line width up to a 16-byte multiple, with a 4 KiB floor. */
    const size_t aligned = (width + 0x0f) & ~ 0x0f;

    cache->size = __MAX(aligned, 4096);
    cache->buffer = vlc_memalign(16, cache->size);
    return cache->buffer ? VLC_SUCCESS : VLC_EGENERIC;
#else
    (void) cache; (void) width;
    return VLC_SUCCESS;
#endif
}
/* Release the bounce buffer set up by CopyInitCache() and reset the
 * cache fields so a stale pointer cannot be reused.  Safe to call even
 * when the allocation failed (vlc_free(NULL) is a no-op). */
void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = 0;
    vlc_free(cache->buffer);
    cache->buffer = NULL;
#else
    (void) cache;
#endif
}
#ifdef CAN_COMPILE_SSE2
/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction /* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
* load and storing data with the SSE>=2 instruction store. * load and storing data with the SSE>=2 instruction store.
*/ */
...@@ -47,16 +72,6 @@ ...@@ -47,16 +72,6 @@
store " %%xmm4, 48(%[dst])\n" \ store " %%xmm4, 48(%[dst])\n" \
: : [dst]"r"(dstp), [src]"r"(srcp) : "memory") : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
/* Execute the instruction op only if SSE2 is supported.
 * Emits the single asm statement `op` (used below for "mfence"/"emms")
 * guarded by a runtime CPU-capability test; on builds where SSE2 cannot
 * be compiled at all the macro expands to nothing. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do { \
if (cpu & CPU_CAPABILITY_SSE2) \
asm volatile (op); \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif
/* Optimized copy from "Uncacheable Speculative Write Combining" memory /* Optimized copy from "Uncacheable Speculative Write Combining" memory
* as used by some video surface. * as used by some video surface.
* XXX It is really efficient only when SSE4.1 is available. * XXX It is really efficient only when SSE4.1 is available.
...@@ -68,7 +83,8 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch, ...@@ -68,7 +83,8 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
{ {
assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0); assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
ASM_SSE2(cpu, "mfence"); asm volatile ("mfence");
for (unsigned y = 0; y < height; y++) { for (unsigned y = 0; y < height; y++) {
const unsigned unaligned = (-(uintptr_t)src) & 0x0f; const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
unsigned x = 0; unsigned x = 0;
...@@ -87,8 +103,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch, ...@@ -87,8 +103,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
} }
} else } else
#endif #endif
#ifdef CAN_COMPILE_SSE2 {
if (cpu & CPU_CAPABILITY_SSE2) {
if (!unaligned) { if (!unaligned) {
for (; x+63 < width; x += 64) for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movdqa", "movdqa"); COPY64(&dst[x], &src[x], "movdqa", "movdqa");
...@@ -97,7 +112,6 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch, ...@@ -97,7 +112,6 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
COPY64(&dst[x], &src[x], "movdqa", "movdqu"); COPY64(&dst[x], &src[x], "movdqa", "movdqu");
} }
} }
#endif
for (; x < width; x++) for (; x < width; x++)
dst[x] = src[x]; dst[x] = src[x];
...@@ -109,28 +123,23 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch, ...@@ -109,28 +123,23 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
static void Copy2d(uint8_t *dst, size_t dst_pitch, static void Copy2d(uint8_t *dst, size_t dst_pitch,
const uint8_t *src, size_t src_pitch, const uint8_t *src, size_t src_pitch,
unsigned width, unsigned height, unsigned width, unsigned height)
unsigned cpu)
{ {
assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0); assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
ASM_SSE2(cpu, "mfence"); asm volatile ("mfence");
for (unsigned y = 0; y < height; y++) { for (unsigned y = 0; y < height; y++) {
unsigned x = 0; unsigned x = 0;
bool unaligned = ((intptr_t)dst & 0x0f) != 0;
#ifdef CAN_COMPILE_SSE2 bool unaligned = ((intptr_t)dst & 0x0f) != 0;
if (cpu & CPU_CAPABILITY_SSE2) { if (!unaligned) {
if (!unaligned) { for (; x+63 < width; x += 64)
for (; x+63 < width; x += 64) COPY64(&dst[x], &src[x], "movdqa", "movntdq");
COPY64(&dst[x], &src[x], "movdqa", "movntdq"); } else {
} else { for (; x+63 < width; x += 64)
for (; x+63 < width; x += 64) COPY64(&dst[x], &src[x], "movdqa", "movdqu");
COPY64(&dst[x], &src[x], "movdqa", "movdqu");
}
} }
#endif
for (; x < width; x++) for (; x < width; x++)
dst[x] = src[x]; dst[x] = src[x];
...@@ -140,10 +149,10 @@ static void Copy2d(uint8_t *dst, size_t dst_pitch, ...@@ -140,10 +149,10 @@ static void Copy2d(uint8_t *dst, size_t dst_pitch,
} }
} }
static void SplitUV(uint8_t *dstu, size_t dstu_pitch, static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch, uint8_t *dstv, size_t dstv_pitch,
const uint8_t *src, size_t src_pitch, const uint8_t *src, size_t src_pitch,
unsigned width, unsigned height, unsigned cpu) unsigned width, unsigned height, unsigned cpu)
{ {
const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14, const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
1, 3, 5, 7, 9, 11, 13, 15 }; 1, 3, 5, 7, 9, 11, 13, 15 };
...@@ -152,7 +161,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch, ...@@ -152,7 +161,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0); assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
ASM_SSE2(cpu, "mfence"); asm volatile ("mfence");
for (unsigned y = 0; y < height; y++) { for (unsigned y = 0; y < height; y++) {
unsigned x = 0; unsigned x = 0;
...@@ -188,8 +197,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch, ...@@ -188,8 +197,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
} }
} else } else
#endif #endif
#ifdef CAN_COMPILE_SSE2 {
if (cpu & CPU_CAPABILITY_SSE2) {
for (x = 0; x < (width & ~31); x += 32) { for (x = 0; x < (width & ~31); x += 32) {
asm volatile ( asm volatile (
"movdqu (%[mask]), %%xmm7\n" "movdqu (%[mask]), %%xmm7\n"
...@@ -213,7 +221,6 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch, ...@@ -213,7 +221,6 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
: : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory"); : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
} }
} }
#endif
#undef STORE2X32 #undef STORE2X32
#undef LOAD64 #undef LOAD64
...@@ -227,10 +234,10 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch, ...@@ -227,10 +234,10 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
} }
} }
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch, static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
uint8_t *cache, size_t cache_size, const uint8_t *src, size_t src_pitch,
unsigned width, unsigned height, uint8_t *cache, size_t cache_size,
unsigned cpu) unsigned width, unsigned height, unsigned cpu)
{ {
const unsigned w16 = (width+15) & ~15; const unsigned w16 = (width+15) & ~15;
const unsigned hstep = cache_size / w16; const unsigned hstep = cache_size / w16;
...@@ -247,21 +254,20 @@ static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t ...@@ -247,21 +254,20 @@ static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t
/* Copy from our cache to the destination */ /* Copy from our cache to the destination */
Copy2d(dst, dst_pitch, Copy2d(dst, dst_pitch,
cache, w16, cache, w16,
width, hblock, cpu); width, hblock);
/* */ /* */
src += src_pitch * hblock; src += src_pitch * hblock;
dst += dst_pitch * hblock; dst += dst_pitch * hblock;
} }
asm volatile ("mfence");
ASM_SSE2(cpu, "mfence");
} }
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch, static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
const uint8_t *src, size_t src_pitch, uint8_t *dstv, size_t dstv_pitch,
uint8_t *cache, size_t cache_size, const uint8_t *src, size_t src_pitch,
unsigned width, unsigned height, uint8_t *cache, size_t cache_size,
unsigned cpu) unsigned width, unsigned height, unsigned cpu)
{ {
const unsigned w2_16 = (2*width+15) & ~15; const unsigned w2_16 = (2*width+15) & ~15;
const unsigned hstep = cache_size / w2_16; const unsigned hstep = cache_size / w2_16;
...@@ -271,76 +277,125 @@ static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch, ...@@ -271,76 +277,125 @@ static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
const unsigned hblock = __MIN(hstep, height - y); const unsigned hblock = __MIN(hstep, height - y);
/* Copy a bunch of line into our cache */ /* Copy a bunch of line into our cache */
CopyFromUswc(cache, w2_16, CopyFromUswc(cache, w2_16, src, src_pitch,
src, src_pitch,
2*width, hblock, cpu); 2*width, hblock, cpu);
/* Copy from our cache to the destination */ /* Copy from our cache to the destination */
SplitUV(dstu, dstu_pitch, SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
dstv, dstv_pitch, cache, w2_16, width, hblock, cpu);
cache, w2_16,
width, hblock, cpu);
/* */ /* */
src += src_pitch * hblock; src += src_pitch * hblock;
dstu += dstu_pitch * hblock; dstu += dstu_pitch * hblock;
dstv += dstv_pitch * hblock; dstv += dstv_pitch * hblock;
} }
asm volatile ("mfence");
}
ASM_SSE2(cpu, "mfence"); static void SSE_CopyFromNv12(picture_t *dst,
uint8_t *src[2], size_t src_pitch[2],
unsigned width, unsigned height,
copy_cache_t *cache)
{
const unsigned cpu = vlc_CPU();
/* */
SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
src[0], src_pitch[0],
cache->buffer, cache->size,
width, height, cpu);
SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
dst->p[1].p_pixels, dst->p[1].i_pitch,
src[1], src_pitch[1],
cache->buffer, cache->size,
width/2, height/2, cpu);
asm volatile ("emms");
} }
int CopyInitCache(copy_cache_t *cache, unsigned width) static void SSE_CopyFromYv12(picture_t *dst,
uint8_t *src[3], size_t src_pitch[3],
unsigned width, unsigned height,
copy_cache_t *cache)
{ {
cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096); const unsigned cpu = vlc_CPU();
cache->buffer = vlc_memalign(16, cache->size);
if (!cache->buffer) /* */
return VLC_EGENERIC; for (unsigned n = 0; n < 3; n++) {
return VLC_SUCCESS; const unsigned d = n > 0 ? 2 : 1;
SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
src[n], src_pitch[n],
cache->buffer, cache->size,
width/d, height/d, cpu);
}
asm volatile ("emms");
} }
void CopyCleanCache(copy_cache_t *cache) #undef COPY64
#endif /* CAN_COMPILE_SSE2 */
/* Generic (non-SSE) plane copy: one memcpy per line, honouring the
 * source and destination pitches, either of which may exceed the visible
 * width (the padding bytes of dst are left untouched).
 *
 * dst/dst_pitch : destination plane and its line stride in bytes
 * src/src_pitch : source plane and its line stride in bytes
 * width         : number of bytes to copy per line
 * height        : number of lines */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, width);
        src += src_pitch;
        dst += dst_pitch;
    }
}
/* Generic (non-SSE) chroma de-interleave: src holds interleaved byte
 * pairs (as in the NV12 chroma plane); even bytes go to dstu, odd bytes
 * to dstv, line by line.
 *
 * dstu/dstu_pitch : first output plane and its line stride
 * dstv/dstv_pitch : second output plane and its line stride
 * src/src_pitch   : interleaved source plane (2*width bytes used per line)
 * width           : number of byte pairs per line
 * height          : number of lines */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
/* Copy an NV12 surface (planar Y + interleaved UV) into a planar VLC
 * picture.  Dispatches to the SSE2 USWC-optimized path when the CPU
 * supports it; otherwise falls back to the plain C copy (the cache is
 * only needed by the SSE path).
 * NOTE(review): the chroma planes are passed as p[2] then p[1] —
 * presumably because the destination chroma is YV12 (V plane before U);
 * confirm against the callers' output chroma. */
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();

    if (cpu & CPU_CAPABILITY_SSE2)
        return SSE_CopyFromNv12(dst, src, src_pitch, width, height, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], width, height);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1], width/2, height/2);
}
/* Copy a three-plane YV12 surface into a planar VLC picture, plane by
 * plane (chroma planes are half width/height).  Dispatches to the SSE2
 * USWC-optimized path when the CPU supports it.
 * BUGFIX: the third plane previously read src[1] while using
 * src_pitch[2]; the mismatched index pair shows plane 2 was intended. */
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();

    if (cpu & CPU_CAPABILITY_SSE2)
        return SSE_CopyFromYv12(dst, src, src_pitch, width, height, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], width, height);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], width / 2, height / 2);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], width / 2, height / 2);
}
#undef ASM_SSE2
#undef COPY64
...@@ -25,8 +25,10 @@ ...@@ -25,8 +25,10 @@
#define _VLC_AVCODEC_COPY_H 1 #define _VLC_AVCODEC_COPY_H 1
typedef struct {
# ifdef CAN_COMPILE_SSE2
    uint8_t *buffer; /* 16-byte aligned bounce buffer (vlc_memalign) */
    size_t size;     /* usable size of buffer, in bytes */
# endif
    /* NOTE(review): with CAN_COMPILE_SSE2 undefined this struct has no
     * members, which is a compiler extension rather than standard C —
     * confirm all supported compilers accept an empty struct. */
} copy_cache_t;
int CopyInitCache(copy_cache_t *cache, unsigned width); int CopyInitCache(copy_cache_t *cache, unsigned width);
......
...@@ -310,7 +310,8 @@ static int CreateSurfaces( vlc_va_vaapi_t *p_va, void **pp_hw_ctx, vlc_fourcc_t ...@@ -310,7 +310,8 @@ static int CreateSurfaces( vlc_va_vaapi_t *p_va, void **pp_hw_ctx, vlc_fourcc_t
goto error; goto error;
*pi_chroma = i_chroma; *pi_chroma = i_chroma;
CopyInitCache( &p_va->image_cache, i_width ); if( unlikely(CopyInitCache( &p_va->image_cache, i_width )) )
goto error;
/* Setup the ffmpeg hardware context */ /* Setup the ffmpeg hardware context */
*pp_hw_ctx = &p_va->hw_ctx; *pp_hw_ctx = &p_va->hw_ctx;
...@@ -358,9 +359,6 @@ static int Extract( vlc_va_t *p_external, picture_t *p_picture, AVFrame *p_ff ) ...@@ -358,9 +359,6 @@ static int Extract( vlc_va_t *p_external, picture_t *p_picture, AVFrame *p_ff )
{ {
vlc_va_vaapi_t *p_va = vlc_va_vaapi_Get(p_external); vlc_va_vaapi_t *p_va = vlc_va_vaapi_Get(p_external);
if( !p_va->image_cache.buffer )
return VLC_EGENERIC;
VASurfaceID i_surface_id = (VASurfaceID)(uintptr_t)p_ff->data[3]; VASurfaceID i_surface_id = (VASurfaceID)(uintptr_t)p_ff->data[3];
#if VA_CHECK_VERSION(0,31,0) #if VA_CHECK_VERSION(0,31,0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment