Commit e46c1625 authored by michael

faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference), but the c routine is faster


git-svn-id: file:///var/local/repositories/mplayer/trunk/postproc@2699 b3059339-0415-0410-9bf9-f77b7e298cf2
parent a354926f
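The speedup comes from the "and&add trick" named in the source comment: for a 15-bit pixel 0rrrrrgggggbbbbb, shifting the R/G field left by one is the same as adding that field to the pixel, so ( x&0x001F )|( ( x&0x7FE0 )<<1 ) can be replaced by x + ( x&0x7FE0 ). The new 32-bit C path additionally masks bit 15 of each halfword ((x&0x7FFF7FFF)) so a stray top bit cannot carry into the neighbouring pixel when two pixels are added at once; the commented-out faster line omits that mask and therefore needs the top bit to be zero. Below is a minimal standalone sanity check, not part of the commit, verifying that the two per-pixel forms agree for every 15-bit value:

/* Standalone sanity check -- not part of the commit.  Verifies that the
 * and&add form used by the new code matches the old shift/or form for
 * every possible 15-bit pixel value. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t x;
    for (x = 0; x < 0x8000; x++) {                          /* all 0rrrrrgggggbbbbb inputs */
        uint16_t old_form = (uint16_t)((x & 0x001F) | ((x & 0x7FE0) << 1));
        uint16_t new_form = (uint16_t)(x + (x & 0x7FE0));   /* the and&add trick */
        if (old_form != new_form) {
            printf("mismatch at 0x%04x\n", (unsigned)x);
            return 1;
        }
    }
    printf("ok\n");
    return 0;
}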
@@ -14,6 +14,7 @@ static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFU
 static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
 static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
+static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
 #endif
 void rgb24to32(const uint8_t *src,uint8_t *dst,uint32_t src_size)
@@ -119,6 +120,7 @@ void rgb32to24(const uint8_t *src,uint8_t *dst,uint32_t src_size)
  Original by Strepto/Astral
  ported to gcc & bugfixed : A'rpi
  MMX2, 3DNOW optimization by Nick Kurshev
+ 32bit c version, and and&add trick by Michael Niedermayer
 */
 void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size)
 {
@@ -126,11 +128,10 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size)
     register const char* s=src+src_size;
     register char* d=dst+src_size;
     register int offs=-src_size;
-    __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)):"memory");
+    __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
     __asm __volatile(
         "movq %0, %%mm4\n\t"
-        "movq %1, %%mm5"
-        ::"m"(mask15b), "m"(mask15rg):"memory");
+        ::"m"(mask15s));
     while(offs<0)
     {
         __asm __volatile(
@@ -140,23 +141,20 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size)
             "movq %%mm0, %%mm1\n\t"
             "movq %%mm2, %%mm3\n\t"
             "pand %%mm4, %%mm0\n\t"
-            "pand %%mm5, %%mm1\n\t"
             "pand %%mm4, %%mm2\n\t"
-            "pand %%mm5, %%mm3\n\t"
-            "psllq $1, %%mm1\n\t"
-            "psllq $1, %%mm3\n\t"
-            "por %%mm1, %%mm0\n\t"
-            "por %%mm3, %%mm2\n\t"
+            "paddw %%mm1, %%mm0\n\t"
+            "paddw %%mm3, %%mm2\n\t"
             MOVNTQ" %%mm0, %0\n\t"
             MOVNTQ" %%mm2, 8%0"
             :"=m"(*(d+offs))
             :"m"(*(s+offs))
-            :"memory");
+            );
         offs+=16;
     }
     __asm __volatile(SFENCE:::"memory");
     __asm __volatile(EMMS:::"memory");
 #else
+#if 0
     const uint16_t *s1=( uint16_t * )src;
     uint16_t *d1=( uint16_t * )dst;
     uint16_t *e=((uint8_t *)s1)+src_size;
@@ -168,6 +166,19 @@ void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size)
            00000000000001 1111=0x001F */
         *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
     }
+#else
+    const uint32_t *s1=( uint32_t * )src;
+    uint32_t *d1=( uint32_t * )dst;
+    int i;
+    int size= src_size>>2;
+    for(i=0; i<size; i++)
+    {
+        register int x= s1[i];
+//      d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
+        d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
+    }
+#endif
+#endif
 }
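For readers less familiar with MMX: the rewritten inner loop applies the same trick per 16-bit lane. pand with mask15s keeps the R/G field (bits 5-15), paddw adds it back onto the original pixel, replacing the previous pand/pand/psllq/por sequence, and the per-iteration "memory" clobber is dropped. The sketch below is an illustration only, not the committed code: it expresses the same per-lane computation with SSE2 intrinsics, and the function name and tail handling are invented for this sketch.

/* Illustration only -- NOT the committed code.  The and&add trick written
 * with SSE2 intrinsics so the data flow of the new pand/paddw inner loop
 * is easier to follow. */
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>

static void rgb15to16_sse2_sketch(const uint8_t *src, uint8_t *dst, uint32_t src_size)
{
    /* per-lane 0xFFE0, the SSE2 counterpart of mask15s */
    const __m128i mask15s = _mm_set1_epi16((short)0xFFE0);
    uint32_t i = 0;

    for (; i + 16 <= src_size; i += 16) {
        __m128i x  = _mm_loadu_si128((const __m128i *)(src + i));
        __m128i rg = _mm_and_si128(x, mask15s);     /* pand: keep the R/G field        */
        __m128i y  = _mm_add_epi16(x, rg);          /* paddw: x + (x & 0xFFE0)         */
        _mm_storeu_si128((__m128i *)(dst + i), y);  /* like the MMX code, assumes the
                                                       15-bit input has bit 15 clear   */
    }
    for (; i + 2 <= src_size; i += 2) {             /* scalar tail, one pixel at a time */
        uint16_t x;
        memcpy(&x, src + i, 2);
        x = (uint16_t)(x + (x & 0x7FE0));
        memcpy(dst + i, &x, 2);
    }
}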