Commit a2a1a8ef authored by ivo

New implementation of rgb32tobgr32

The previous implementation segfaulted with MMX enabled when fed an image
smaller than the size of the units the MMX code processed. The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is LGPL



git-svn-id: file:///var/local/repositories/mplayer/trunk/libswscale@23009 b3059339-0415-0410-9bf9-f77b7e298cf2
parent b8b3ece9
......@@ -1364,49 +1364,66 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
/*
 * rgb32tobgr32: copy src_size bytes of 4-byte pixels from src to dst while
 * exchanging bits 0-7 with bits 16-23 of every pixel (bits 8-15 stay put).
 *
 * src      - input pixels, read only
 * dst      - output pixels
 * src_size - size of the input in bytes (end is computed as s + src_size)
 *
 * NOTE(review): this listing looks like a diff hunk rendered without its +/-
 * markers, so lines of the previous and of the new implementation appear
 * interleaved. As shown, the #ifdef/#endif directives and the braces are not
 * balanced; do not treat the text below as one compilable function.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
/* Running write/read cursors and the one-past-the-end input pointer; they are
 * used by the second asm statement and by the scalar tail loop further down. */
uint8_t *d = dst, *s = (uint8_t *) src;
const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
/* TODO: unroll this loop */
/* MMX loop handling 8 bytes (two pixels) per iteration, indexed by REG_a.
 * The body comes before the cmp/jb test, so one 8-byte group is always read
 * and written before the bound (src_size-7, operand %2) is examined. */
asm volatile (
"xor %%"REG_a", %%"REG_a" \n\t"
ASMALIGN(4)
"1: \n\t"
PREFETCH" 32(%0, %%"REG_a") \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
/* mm0 = each dword shifted left by 16, mm1 = each dword shifted right by 16,
 * mm2 = unshifted copy; each is masked and the three are OR-ed together.
 * NOTE(review): the values of mask32r/mask32g/mask32b are not visible here. */
"pslld $16, %%mm0 \n\t"
"psrld $16, %%mm1 \n\t"
"pand "MANGLE(mask32r)", %%mm0 \n\t"
"pand "MANGLE(mask32g)", %%mm2 \n\t"
"pand "MANGLE(mask32b)", %%mm1 \n\t"
"por %%mm0, %%mm2 \n\t"
"por %%mm1, %%mm2 \n\t"
MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
"cmp %2, %%"REG_a" \n\t"
" jb 1b \n\t"
:: "r" (src), "r"(dst), "r" (src_size-7)
: "%"REG_a
);
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#else
/* Plain C loop over src_size/4 pixels, one byte assignment at a time. */
unsigned i;
unsigned num_pixels = src_size >> 2;
for(i=0; i<num_pixels; i++)
{
#ifdef WORDS_BIGENDIAN
/* Big-endian layout: bytes 1 and 3 trade places, byte 2 is copied, and
 * dst byte 0 is not written by this loop. */
dst[4*i + 1] = src[4*i + 3];
dst[4*i + 2] = src[4*i + 2];
dst[4*i + 3] = src[4*i + 1];
#else
/* Little-endian layout: bytes 0 and 2 trade places, byte 1 is copied, and
 * dst byte 3 is not written by this loop. */
dst[4*i + 0] = src[4*i + 2];
dst[4*i + 1] = src[4*i + 1];
dst[4*i + 2] = src[4*i + 0];
#endif
/* MMX loop handling 16 bytes (four pixels) per iteration through the s/d
 * cursors (operands %1 and %0, both read-write). */
__asm __volatile(
" "PREFETCH" (%1) \n"
/* mm7 = mask32b ^ mask32r, mm6 = copy of that, then mm7 ^= mmx_one.
 * NOTE(review): presumably mm6 selects the two bytes to exchange and mm7 the
 * bytes to keep; confirm against the definitions of the three constants. */
" movq %3, %%mm7 \n"
" pxor %4, %%mm7 \n"
" movq %%mm7, %%mm6 \n"
" pxor %5, %%mm7 \n"
/* Enter at the test (label 2) so the body is skipped when fewer than
 * 16 bytes are available. */
" jmp 2f \n"
ASMALIGN(4)
"1: \n"
" "PREFETCH" 32(%1) \n"
" movq (%1), %%mm0 \n"
" movq 8(%1), %%mm1 \n"
# ifdef HAVE_MMX2
/* pshufw $177 (0xB1) exchanges the two 16-bit words inside each 32-bit half;
 * the shuffled copy is masked with mm6, the original with mm7, then OR-ed. */
" pshufw $177, %%mm0, %%mm3 \n"
" pshufw $177, %%mm1, %%mm5 \n"
" pand %%mm7, %%mm0 \n"
" pand %%mm6, %%mm3 \n"
" pand %%mm7, %%mm1 \n"
" pand %%mm6, %%mm5 \n"
" por %%mm3, %%mm0 \n"
" por %%mm5, %%mm1 \n"
# else
/* Without pshufw: mask first (mm7 part stays in mm0/mm1, mm6 part goes to
 * mm2/mm4), then OR in the mm6 part shifted left by 16 and right by 16. */
" movq %%mm0, %%mm2 \n"
" movq %%mm1, %%mm4 \n"
" pand %%mm7, %%mm0 \n"
" pand %%mm6, %%mm2 \n"
" pand %%mm7, %%mm1 \n"
" pand %%mm6, %%mm4 \n"
" movq %%mm2, %%mm3 \n"
" movq %%mm4, %%mm5 \n"
" pslld $16, %%mm2 \n"
" psrld $16, %%mm3 \n"
" pslld $16, %%mm4 \n"
" psrld $16, %%mm5 \n"
" por %%mm2, %%mm0 \n"
" por %%mm4, %%mm1 \n"
" por %%mm3, %%mm0 \n"
" por %%mm5, %%mm1 \n"
# endif
" "MOVNTQ" %%mm0, (%0) \n"
" "MOVNTQ" %%mm1, 8(%0) \n"
" add $16, %0 \n"
" add $16, %1 \n"
"2: \n"
/* Loop while end-15 (operand %2) is above s, i.e. while at least 16 input
 * bytes remain; the comparison is unsigned (ja). */
" cmp %1, %2 \n"
" ja 1b \n"
" "SFENCE" \n"
" "EMMS" \n"
: "+r"(d), "+r"(s)
: "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
: "memory");
#endif
/* Scalar loop for whatever the asm above left between s and end, one 32-bit
 * pixel per iteration: g keeps bits 8-15, v keeps bits 0-7 and 16-23, and the
 * two shifts move those bytes into each other's position. Bits 24-31 of the
 * input are masked away, so the stored value has them cleared.
 * NOTE(review): v is a signed int, so v<<16 can exceed INT_MAX; the code
 * relies on the result being truncated to 32 bits on store. The uint32_t
 * casts also assume s and d may be accessed as 32-bit words - confirm. */
for (; s<end; s+=4, d+=4) {
int v = *(uint32_t *)s, g = v & 0xff00;
v &= 0xff00ff;
*(uint32_t *)d = (v>>16) + g + (v<<16);
}
#endif
}
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment