Commit 9acaa4b2 authored by Damien Fouilleul's avatar Damien Fouilleul

- video_chromas: more SSE2 and MMX support and optimization, added SSE2 i420 -> RGB acceleration

parent 7b8ea9c3
...@@ -1274,7 +1274,7 @@ MMXEXT_MODULES="memcpymmxext" ...@@ -1274,7 +1274,7 @@ MMXEXT_MODULES="memcpymmxext"
#MMXEXT_MODULES="${MMXEXT_MODULES} idctmmxext motionmmxext" #MMXEXT_MODULES="${MMXEXT_MODULES} idctmmxext motionmmxext"
THREEDNOW_MODULES="memcpy3dn" THREEDNOW_MODULES="memcpy3dn"
SSE_MODULES="" SSE_MODULES=""
SSE2_MODULES="i420_yuy2_sse2" SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2"
ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec" ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec"
#ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec" #ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec"
...@@ -1325,7 +1325,7 @@ AC_CACHE_CHECK([if \$CC groks SSE2 intrinsics], ...@@ -1325,7 +1325,7 @@ AC_CACHE_CHECK([if \$CC groks SSE2 intrinsics],
[ac_cv_c_sse2_intrinsics=no])]) [ac_cv_c_sse2_intrinsics=no])])
if test "${ac_cv_c_sse2_intrinsics}" != "no"; then if test "${ac_cv_c_sse2_intrinsics}" != "no"; then
AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.) AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.)
dnl VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2]) VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2])
fi fi
AC_CACHE_CHECK([if \$CC groks MMX inline assembly], AC_CACHE_CHECK([if \$CC groks MMX inline assembly],
......
...@@ -13,6 +13,13 @@ SOURCES_i420_rgb_mmx = \ ...@@ -13,6 +13,13 @@ SOURCES_i420_rgb_mmx = \
i420_rgb_mmx.h \ i420_rgb_mmx.h \
$(NULL) $(NULL)
SOURCES_i420_rgb_sse2 = \
i420_rgb.c \
i420_rgb.h \
i420_rgb16.c \
i420_rgb_mmx.h \
$(NULL)
SOURCES_i420_yuy2 = \ SOURCES_i420_yuy2 = \
i420_yuy2.c \ i420_yuy2.c \
i420_yuy2.h \ i420_yuy2.h \
......
...@@ -4,7 +4,8 @@ ...@@ -4,7 +4,8 @@
* Copyright (C) 2000, 2001, 2004 the VideoLAN team * Copyright (C) 2000, 2001, 2004 the VideoLAN team
* $Id$ * $Id$
* *
* Author: Sam Hocevar <sam@zoy.org> * Authors: Sam Hocevar <sam@zoy.org>
* Damien Fouilleul <damienf@videolan.org>
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
...@@ -72,6 +73,11 @@ vlc_module_begin(); ...@@ -72,6 +73,11 @@ vlc_module_begin();
"RV15,RV16,RV24,RV32 conversions") ); "RV15,RV16,RV24,RV32 conversions") );
set_capability( "chroma", 100 ); set_capability( "chroma", 100 );
add_requirement( MMX ); add_requirement( MMX );
#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
set_description( _( "SSE2 I420,IYUV,YV12 to "
"RV15,RV16,RV24,RV32 conversions") );
set_capability( "chroma", 120 );
add_requirement( SSE2 );
#endif #endif
set_callbacks( Activate, Deactivate ); set_callbacks( Activate, Deactivate );
vlc_module_end(); vlc_module_end();
...@@ -107,19 +113,30 @@ static int Activate( vlc_object_t *p_this ) ...@@ -107,19 +113,30 @@ static int Activate( vlc_object_t *p_this )
#endif #endif
case VLC_FOURCC('R','V','1','5'): case VLC_FOURCC('R','V','1','5'):
case VLC_FOURCC('R','V','1','6'): case VLC_FOURCC('R','V','1','6'):
#if defined (MODULE_NAME_IS_i420_rgb_mmx) #if ! defined (MODULE_NAME_IS_i420_rgb)
/* If we don't have support for the bitmasks, bail out */ /* If we don't have support for the bitmasks, bail out */
if( ( p_vout->output.i_rmask != 0x7c00 if( ( p_vout->output.i_rmask == 0x7c00
|| p_vout->output.i_gmask != 0x03e0 && p_vout->output.i_gmask == 0x03e0
|| p_vout->output.i_bmask != 0x001f ) && p_vout->output.i_bmask == 0x001f ) )
&& ( p_vout->output.i_rmask != 0xf800
|| p_vout->output.i_gmask != 0x07e0
|| p_vout->output.i_bmask != 0x001f ) )
{ {
return -1; /* R5G5B5 pixel format */
msg_Dbg(p_this, "RGB pixel format is R5G5B5");
p_vout->chroma.pf_convert = E_(I420_R5G5B5);
} }
#endif else if( ( p_vout->output.i_rmask == 0xf800
&& p_vout->output.i_gmask == 0x07e0
&& p_vout->output.i_bmask == 0x001f ) )
{
/* R5G6B5 pixel format */
msg_Dbg(p_this, "RGB pixel format is R5G6B5");
p_vout->chroma.pf_convert = E_(I420_R5G6B5);
}
else
return -1;
#else
/* generic C chroma converter */
p_vout->chroma.pf_convert = E_(I420_RGB16); p_vout->chroma.pf_convert = E_(I420_RGB16);
#endif
break; break;
#if 0 #if 0
...@@ -128,16 +145,30 @@ static int Activate( vlc_object_t *p_this ) ...@@ -128,16 +145,30 @@ static int Activate( vlc_object_t *p_this )
#endif #endif
case VLC_FOURCC('R','V','3','2'): case VLC_FOURCC('R','V','3','2'):
#if defined (MODULE_NAME_IS_i420_rgb_mmx) #if ! defined (MODULE_NAME_IS_i420_rgb)
/* If we don't have support for the bitmasks, bail out */ /* If we don't have support for the bitmasks, bail out */
if( p_vout->output.i_rmask != 0x00ff0000 if( p_vout->output.i_rmask == 0x00ff0000
|| p_vout->output.i_gmask != 0x0000ff00 && p_vout->output.i_gmask == 0x0000ff00
|| p_vout->output.i_bmask != 0x000000ff ) && p_vout->output.i_bmask == 0x000000ff )
{ {
return -1; /* A8R8G8B8 pixel format */
msg_Dbg(p_this, "RGB pixel format is A8R8G8B8");
p_vout->chroma.pf_convert = E_(I420_A8R8G8B8);
} }
#endif else if( p_vout->output.i_rmask == 0x0000ff00
&& p_vout->output.i_gmask == 0x00ff0000
&& p_vout->output.i_bmask == 0xff000000 )
{
/* B8G8R8A8 pixel format */
msg_Dbg(p_this, "RGB pixel format is B8G8R8A8");
p_vout->chroma.pf_convert = E_(I420_B8G8R8A8);
}
else
return -1;
#else
/* generic C chroma converter */
p_vout->chroma.pf_convert = E_(I420_RGB32); p_vout->chroma.pf_convert = E_(I420_RGB32);
#endif
break; break;
default: default:
......
...@@ -58,9 +58,14 @@ struct chroma_sys_t ...@@ -58,9 +58,14 @@ struct chroma_sys_t
#ifdef MODULE_NAME_IS_i420_rgb #ifdef MODULE_NAME_IS_i420_rgb
void E_(I420_RGB8) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_RGB8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_RGB16_dither) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_RGB16_dither) ( vout_thread_t *, picture_t *, picture_t * );
#endif
void E_(I420_RGB16) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_RGB16) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * );
#else // if defined(MODULE_NAME_IS_i420_rgb_mmx)
void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * );
void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * );
#endif
/***************************************************************************** /*****************************************************************************
* CONVERT_*_PIXEL: pixel conversion macros * CONVERT_*_PIXEL: pixel conversion macros
......
This diff is collapsed.
This diff is collapsed.
...@@ -306,7 +306,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, ...@@ -306,7 +306,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
} }
#if defined (MODULE_NAME_IS_i420_yuy2_mmx) #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
__asm__ __volatile__("emms" :: ); /* re-enable FPU registers */
__asm__ __volatile__ ( "emms" );
#endif #endif
#if defined (MODULE_NAME_IS_i420_yuy2_altivec) #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
...@@ -347,6 +348,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, ...@@ -347,6 +348,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
p_line1 += i_dest_margin; p_line1 += i_dest_margin;
p_line2 += i_dest_margin; p_line2 += i_dest_margin;
} }
/* make sure all SSE2 stores are visible thereafter */
__asm__ __volatile__ ( "sfence" );
} }
else else
{ {
...@@ -514,7 +517,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, ...@@ -514,7 +517,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
} }
#if defined (MODULE_NAME_IS_i420_yuy2_mmx) #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
__asm__ __volatile__("emms" :: ); /* re-enable FPU registers */
__asm__ __volatile__ ( "emms" );
#endif #endif
#if defined (MODULE_NAME_IS_i420_yuy2_altivec) #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
...@@ -554,6 +558,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, ...@@ -554,6 +558,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
p_line1 += i_dest_margin; p_line1 += i_dest_margin;
p_line2 += i_dest_margin; p_line2 += i_dest_margin;
} }
/* make sure all SSE2 stores are visible thereafter */
__asm__ __volatile__ ( "sfence" );
} }
else else
{ {
...@@ -720,7 +726,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, ...@@ -720,7 +726,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
} }
#if defined (MODULE_NAME_IS_i420_yuy2_mmx) #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
__asm__ __volatile__("emms" :: ); /* re-enable FPU registers */
__asm__ __volatile__ ( "emms" );
#endif #endif
#if defined (MODULE_NAME_IS_i420_yuy2_altivec) #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
...@@ -760,6 +767,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, ...@@ -760,6 +767,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
p_line1 += i_dest_margin; p_line1 += i_dest_margin;
p_line2 += i_dest_margin; p_line2 += i_dest_margin;
} }
/* make sure all SSE2 stores are visible thereafter */
__asm__ __volatile__ ( "sfence" );
} }
else else
{ {
...@@ -861,7 +870,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, ...@@ -861,7 +870,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
} }
#if defined (MODULE_NAME_IS_i420_yuy2_mmx) #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
__asm__ __volatile__("emms" :: ); /* re-enable FPU registers */
__asm__ __volatile__ ( "emms" );
#endif #endif
#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
...@@ -897,6 +907,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, ...@@ -897,6 +907,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
p_line1 += i_dest_margin; p_line1 += i_dest_margin;
p_line2 += i_dest_margin; p_line2 += i_dest_margin;
} }
/* make sure all SSE2 stores are visible thereafter */
__asm__ __volatile__ ( "sfence" );
} }
else else
{ {
......
...@@ -136,14 +136,14 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ ...@@ -136,14 +136,14 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm1 # v7 u7 v6 u6 .. u1 v0 u0 \n\ punpcklbw %%xmm2, %%xmm1 # v7 u7 v6 u6 .. u1 v0 u0 \n\
movdqa %%xmm0, %%xmm2 # y15 y14 y13 .. y2 y1 y0 \n\ movdqa %%xmm0, %%xmm2 # y15 y14 y13 .. y2 y1 y0 \n\
punpcklbw %%xmm1, %%xmm2 # v3 y7 u3 .. v0 y1 u0 y0 \n\ punpcklbw %%xmm1, %%xmm2 # v3 y7 u3 .. v0 y1 u0 y0 \n\
movdqa %%xmm2, (%0) # Store low YUYV \n\ movntdq %%xmm2, (%0) # Store low YUYV \n\
punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\ punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
movdqa %%xmm0, 16(%0) # Store high YUYV \n\ movntdq %%xmm0, 16(%0) # Store high YUYV \n\
movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%xmm1, %%xmm4 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\ punpcklbw %%xmm1, %%xmm4 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\
movdqa %%xmm4, (%1) # Store low YUYV \n\ movntdq %%xmm4, (%1) # Store low YUYV \n\
punpckhbw %%xmm1, %%xmm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\ punpckhbw %%xmm1, %%xmm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\
movdqa %%xmm3, 16(%1) # Store high YUYV \n\ movntdq %%xmm3, 16(%1) # Store high YUYV \n\
" "
#define SSE2_YUV420_YUYV_UNALIGNED " \n\ #define SSE2_YUV420_YUYV_UNALIGNED " \n\
...@@ -172,14 +172,14 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ ...@@ -172,14 +172,14 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
movdqa %%xmm2, (%0) # Store low YUYV \n\ movntdq %%xmm2, (%0) # Store low YUYV \n\
punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
movdqa %%xmm0, 16(%0) # Store high YUYV \n\ movntdq %%xmm0, 16(%0) # Store high YUYV \n\
movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\
movdqa %%xmm4, (%1) # Store low YUYV \n\ movntdq %%xmm4, (%1) # Store low YUYV \n\
punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\
movdqa %%xmm3, 16(%1) # Store high YUYV \n\ movntdq %%xmm3, 16(%1) # Store high YUYV \n\
" "
#define SSE2_YUV420_YVYU_UNALIGNED " \n\ #define SSE2_YUV420_YVYU_UNALIGNED " \n\
...@@ -208,15 +208,15 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ ...@@ -208,15 +208,15 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
movdqa %%xmm2, (%0) # Store low UYVY \n\ movntdq %%xmm2, (%0) # Store low UYVY \n\
movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
movdqa %%xmm2, 16(%0) # Store high UYVY \n\ movntdq %%xmm2, 16(%0) # Store high UYVY \n\
movdqa %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ movdqa %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\
movdqa %%xmm4, (%1) # Store low UYVY \n\ movntdq %%xmm4, (%1) # Store low UYVY \n\
punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\
movdqa %%xmm1, 16(%1) # Store high UYVY \n\ movntdq %%xmm1, 16(%1) # Store high UYVY \n\
" "
#define SSE2_YUV420_UYVY_UNALIGNED " \n\ #define SSE2_YUV420_UYVY_UNALIGNED " \n\
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment