Commit 83f2312b authored by Laurent Aimar

Added support for SSE2 to 16 bit merge (deinterlace).

parent 8962e714
...@@ -636,9 +636,9 @@ int Open( vlc_object_t *p_this ) ...@@ -636,9 +636,9 @@ int Open( vlc_object_t *p_this )
else else
#endif #endif
#if defined(CAN_COMPILE_SSE) #if defined(CAN_COMPILE_SSE)
if( chroma->pixel_size == 1 && (vlc_CPU() & CPU_CAPABILITY_SSE2) ) if( (vlc_CPU() & CPU_CAPABILITY_SSE2) )
{ {
p_sys->pf_merge = MergeSSE2; p_sys->pf_merge = chroma->pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
p_sys->pf_end_merge = EndMMX; p_sys->pf_end_merge = EndMMX;
} }
else else
......
...@@ -118,8 +118,8 @@ void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2, ...@@ -118,8 +118,8 @@ void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
#endif #endif
#if defined(CAN_COMPILE_SSE) #if defined(CAN_COMPILE_SSE)
void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2, void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
size_t i_bytes ) size_t i_bytes )
{ {
uint8_t *p_dest = _p_dest; uint8_t *p_dest = _p_dest;
const uint8_t *p_s1 = _p_s1; const uint8_t *p_s1 = _p_s1;
...@@ -143,6 +143,34 @@ void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2, ...@@ -143,6 +143,34 @@ void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
for( ; i_bytes > 0; i_bytes-- ) for( ; i_bytes > 0; i_bytes-- )
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1; *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
} }
/**
 * SSE2 routine to blend 16-bit samples from two picture lines.
 *
 * Each output sample is the average of the corresponding samples of the
 * two source lines. i_bytes is converted to a count of 16-bit samples
 * (an odd trailing byte is ignored).
 *
 * Note: the vectorised part uses PAVGW, which rounds up
 * ((a + b + 1) >> 1), while the scalar head/tail loops truncate
 * ((a + b) >> 1).
 *
 * @param _p_dest Target line
 * @param _p_s1 Source line A
 * @param _p_s2 Source line B
 * @param i_bytes Number of bytes to merge
 */
void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                     size_t i_bytes )
{
    uint16_t *p_dest = _p_dest;
    const uint16_t *p_s1 = _p_s1;
    const uint16_t *p_s2 = _p_s2;
    size_t i_words = i_bytes / 2;

    /* Scalar head: advance until source A is 16-byte aligned, because
     * PAVGW with a memory operand requires an aligned address. */
    for( ; i_words > 0 && ((uintptr_t)p_s1 & 15); i_words-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;

    /* Vector body: 8 samples (16 bytes) per iteration. */
    for( ; i_words >= 8; i_words -= 8 )
    {
        /* The "m" operands only describe the first sample of each line,
         * whereas the instructions touch 16 bytes; the "memory" clobber
         * tells the compiler about the remaining accesses, and "xmm1"
         * declares the scratch register that is overwritten. */
        __asm__ __volatile__( "movdqu %2,%%xmm1;"
                              "pavgw %1, %%xmm1;"
                              "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                "m" (*p_s1),
                                                "m" (*p_s2)
                                              : "xmm1", "memory" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail: fewer than 8 samples left. */
    for( ; i_words > 0; i_words-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#endif #endif
#ifdef CAN_COMPILE_C_ALTIVEC #ifdef CAN_COMPILE_C_ALTIVEC
......
...@@ -141,7 +141,16 @@ void Merge3DNow ( void *, const void *, const void *, size_t ); ...@@ -141,7 +141,16 @@ void Merge3DNow ( void *, const void *, const void *, size_t );
* @param _p_s2 Source line B * @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge * @param i_bytes Number of bytes to merge
*/ */
void MergeSSE2 ( void *, const void *, const void *, size_t ); void Merge8BitSSE2( void *, const void *, const void *, size_t );
/**
 * SSE2 routine to blend 16-bit pixels from two picture lines.
*
* @param _p_dest Target
* @param _p_s1 Source line A
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
void Merge16BitSSE2( void *, const void *, const void *, size_t );
#endif #endif
#if defined __ARM_NEON__ #if defined __ARM_NEON__
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment