Commit b50a07a3 authored by Jean-Yves Avenard, committed by Jean-Baptiste Kempf

copy: minor speed enhancement to USWC copy

Adopt the suggestions from https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers:
- 64-byte aligned memory buffer
- call mfence only around the copy-to-cache routine

Use a single instruction to copy the non-aligned memory at the start of each line.
Remove a compilation warning on Mac.

This results in a small 1.3% speed increase on an i7-4650U. (A rough sketch of the resulting copy pattern follows the commit metadata below.)
Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>
parent 24c25e99
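
For readers who want the shape of the whole change in one place, below is a minimal, self-contained sketch of the copy pattern this patch converges on: a 64-byte aligned cacheable bounce buffer, streaming (movntdqa) loads from the uncached write-combined (USWC) source, a single unaligned 16-byte load for a misaligned line start, and mfence only around the USWC-to-cache step. It uses SSE intrinsics rather than the inline assembly in copy.c, handles one line rather than a blocked plane, and the names copy_line_from_uswc and round_up_64 are illustrative, not VLC APIs. It assumes SSE4.1 (build with -msse4.1) and C11 aligned_alloc.

    /* uswc_copy_sketch.c — illustrative only; build e.g. with: cc -O2 -msse4.1 uswc_copy_sketch.c */
    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <smmintrin.h>          /* SSE4.1: _mm_stream_load_si128 */

    /* Round a byte count up to a multiple of 64 (one cache line), the same
     * arithmetic CopyInitCache now uses: (width + 0x3f) & ~0x3f. */
    static size_t round_up_64(size_t n)
    {
        return (n + 0x3f) & ~(size_t)0x3f;
    }

    /* Copy one line of `width` bytes from a (possibly unaligned) USWC source
     * into `dst`, staging through a 64-byte aligned cacheable bounce buffer. */
    static void copy_line_from_uswc(uint8_t *dst, const uint8_t *src,
                                    uint8_t *cache, unsigned width)
    {
        /* Fence before the streaming reads from the write-combined surface. */
        _mm_mfence();

        unsigned head = (unsigned)(-(uintptr_t)src & 0x0f);
        unsigned x = head;

        if (head)
            /* One unaligned 16-byte load / one store for the misaligned start,
             * instead of a byte-by-byte loop (the COPY16 idea). */
            _mm_storeu_si128((__m128i *)cache,
                             _mm_loadu_si128((const __m128i *)src));

        /* Bulk of the line: non-temporal loads from USWC memory (movntdqa),
         * ordinary stores into the cacheable buffer. */
        for (; x + 15 < width; x += 16) {
            __m128i v = _mm_stream_load_si128((__m128i *)(uintptr_t)(src + x));
            _mm_storeu_si128((__m128i *)(cache + x), v);
        }
        for (; x < width; x++)      /* scalar tail */
            cache[x] = src[x];

        /* Fence once more after the USWC -> cache step; the cache -> dst copy
         * below touches only ordinary cacheable memory and needs no fence. */
        _mm_mfence();

        memcpy(dst, cache, width);
    }

    int main(void)
    {
        const unsigned width = 1000;
        uint8_t *cache = aligned_alloc(64, round_up_64(width));      /* bounce buffer */
        uint8_t *surf  = aligned_alloc(64, round_up_64(width + 16)); /* stand-in for a decoder surface */
        uint8_t *dst   = aligned_alloc(64, round_up_64(width));
        if (!cache || !surf || !dst)
            return 1;

        for (unsigned i = 0; i < width + 16; i++)
            surf[i] = (uint8_t)(i * 7);

        copy_line_from_uswc(dst, surf + 3, cache, width);   /* deliberately unaligned source */
        assert(memcmp(dst, surf + 3, width) == 0);

        free(cache); free(surf); free(dst);
        return 0;
    }

The staging copy only pays off when the source really is USWC memory (e.g. a GPU-mapped decode surface); the real CopyFromUswc/SSE_SplitPlanes path additionally processes the picture in blocks of lines so the bounce buffer is reused while still cache-hot.
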
@@ -35,8 +35,8 @@
 int CopyInitCache(copy_cache_t *cache, unsigned width)
 {
 #ifdef CAN_COMPILE_SSE2
-    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
-    cache->buffer = vlc_memalign(16, cache->size);
+    cache->size = __MAX((width + 0x3f) & ~ 0x3f, 4096);
+    cache->buffer = vlc_memalign(64, cache->size);
     if (!cache->buffer)
         return VLC_EGENERIC;
 #else
@@ -57,9 +57,15 @@ void CopyCleanCache(copy_cache_t *cache)
 }
 
 #ifdef CAN_COMPILE_SSE2
-/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
+/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
  * load and storing data with the SSE>=2 instruction store.
  */
+#define COPY16(dstp, srcp, load, store) \
+    asm volatile (                      \
+        load " 0(%[src]), %%xmm1\n"     \
+        store " %%xmm1, 0(%[dst])\n"    \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
+
 #define COPY64(dstp, srcp, load, store) \
     asm volatile (                      \
         load " 0(%[src]), %%xmm1\n"     \
@@ -97,16 +103,16 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                          unsigned width, unsigned height,
                          unsigned cpu)
 {
+#ifndef CAN_COMPILE_SSSE3
+    VLC_UNUSED(cpu);
+#endif
     assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
 
     asm volatile ("mfence");
 
     for (unsigned y = 0; y < height; y++) {
         const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
-        unsigned x = 0;
-
-        for (; x < unaligned; x++)
-            dst[x] = src[x];
+        unsigned x = unaligned;
 
 #ifdef CAN_COMPILE_SSE4_1
         if (vlc_CPU_SSE4_1()) {
@@ -114,6 +120,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                 for (; x+63 < width; x += 64)
                     COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
             } else {
+                COPY16(dst, src, "movdqu", "movdqa");
                 for (; x+63 < width; x += 64)
                     COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
             }
@@ -124,6 +131,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                 for (; x+63 < width; x += 64)
                     COPY64(&dst[x], &src[x], "movdqa", "movdqa");
             } else {
+                COPY16(dst, src, "movdqu", "movdqa");
                 for (; x+63 < width; x += 64)
                     COPY64(&dst[x], &src[x], "movdqa", "movdqu");
             }
@@ -135,6 +143,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
         src += src_pitch;
         dst += dst_pitch;
     }
+    asm volatile ("mfence");
 }
 
 VLC_SSE
@@ -144,8 +153,6 @@ static void Copy2d(uint8_t *dst, size_t dst_pitch,
 {
     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 
-    asm volatile ("mfence");
-
     for (unsigned y = 0; y < height; y++) {
         unsigned x = 0;
@@ -172,15 +179,15 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height, unsigned cpu)
 {
+#ifndef CAN_COMPILE_SSSE3
     VLC_UNUSED(cpu);
+#endif
     const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                 1, 3, 5, 7, 9, 11, 13, 15 };
     const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                              0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
 
-    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
-    asm volatile ("mfence");
+    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
 
     for (unsigned y = 0; y < height; y++) {
         unsigned x = 0;
@@ -280,7 +287,6 @@ static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
         src += src_pitch * hblock;
         dst += dst_pitch * hblock;
     }
-    asm volatile ("mfence");
 }
 
 static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
@@ -289,27 +295,26 @@ static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                             uint8_t *cache, size_t cache_size,
                             unsigned width, unsigned height, unsigned cpu)
 {
-    const unsigned w2_16 = (2*width+15) & ~15;
-    const unsigned hstep = cache_size / w2_16;
+    const unsigned w16 = (2*width+15) & ~15;
+    const unsigned hstep = cache_size / w16;
     assert(hstep > 0);
 
     for (unsigned y = 0; y < height; y += hstep) {
         const unsigned hblock = __MIN(hstep, height - y);
 
         /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w2_16, src, src_pitch,
+        CopyFromUswc(cache, w16, src, src_pitch,
                      2*width, hblock, cpu);
 
         /* Copy from our cache to the destination */
         SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
-                    cache, w2_16, width, hblock, cpu);
+                    cache, w16, width, hblock, cpu);
 
         /* */
         src += src_pitch * hblock;
         dstu += dstu_pitch * hblock;
         dstv += dstv_pitch * hblock;
     }
-    asm volatile ("mfence");
 }
 
 static void SSE_CopyFromNv12(picture_t *dst,
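
As a side note on the new macro: with the arguments used in the two hunks above, COPY16(dst, src, "movdqu", "movdqa") simply pastes the two mnemonics into the template, so it expands to roughly the following:

    /* Approximate expansion of COPY16(dst, src, "movdqu", "movdqa") */
    asm volatile (
        "movdqu 0(%[src]), %%xmm1\n"
        "movdqa %%xmm1, 0(%[dst])\n"
        : : [dst]"r"(dst), [src]"r"(src) : "memory", "xmm1");

That is, one unaligned 16-byte load from the write-combined source into %xmm1 and one aligned 16-byte store into the cache buffer, replacing the former byte-by-byte loop over the misaligned start of each line.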