Commit 15b4ed29 authored by Gildas Bazin's avatar Gildas Bazin

* modules/video_filter/swscale/*: new resizing/chroma conversion video filter...

* modules/video_filter/swscale/*: new resizing/chroma conversion video filter based on MPlayer's swscale module.
  (not enabled in configure.ac yet as it may have problems building on some platforms)
parent c1032025
......@@ -4105,6 +4105,7 @@ AC_CONFIG_FILES([
modules/stream_out/transrate/Makefile
modules/video_chroma/Makefile
modules/video_filter/Makefile
modules/video_filter/swscale/Makefile
modules/video_output/Makefile
modules/video_output/directx/Makefile
modules/video_output/qte/Makefile
......
SOURCES_swscale = \
swscale.c \
swscale.h \
swscale_internal.h \
yuv2rgb.c \
rgb2rgb.c \
rgb2rgb.h \
common.h \
filter.c \
$(NULL)
EXTRA_DIST += \
swscale_template.c \
swscale_altivec_template.c \
yuv2rgb_template.c \
yuv2rgb_altivec.c \
yuv2rgb_mlib.c \
rgb2rgb_template.c
\ No newline at end of file
/*****************************************************************************
* common.h: a few defines and wrappers for swscale
*****************************************************************************
* Copyright (C) 1999-2004 VideoLAN
* $Id$
*
* Authors: Gildas Bazin <gbazin@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#define mp_msg(a,b,args... ) //fprintf(stderr, ##args)
#define vo_format_name(a) ""
#ifndef _VLC_FILTER_H
extern void *( *swscale_fast_memcpy )( void *, const void *, int );
#endif
#define memcpy(a,b,c) swscale_fast_memcpy(a,b,c)
#if defined(__CYGWIN__) || defined(__MINGW32__) || defined(__OS2__) || \
(defined(__OpenBSD__) && !defined(__ELF__))
#define MANGLE(a) "_" #a
#else
#define MANGLE(a) #a
#endif
#ifdef ARCH_X86
static inline unsigned short ByteSwap16(unsigned short x)
{
__asm("xchgb %b0,%h0" :
"=q" (x) :
"0" (x));
return x;
}
#define bswap_16(x) ByteSwap16(x)
#else
#define bswap_16(x) (((x) & 0x00ff) << 8 | ((x) & 0xff00) >> 8)
#endif /* !ARCH_X86 */
/* SWSCALE image formats */
#define IMGFMT_RGB_MASK 0xFFFFFF00
#define IMGFMT_RGB (('R'<<24)|('G'<<16)|('B'<<8))
#define IMGFMT_RGB1 (IMGFMT_RGB|1)
#define IMGFMT_RGB4 (IMGFMT_RGB|4)
#define IMGFMT_RGB4_CHAR (IMGFMT_RGB|4|128) // RGB4 with 1 pixel per byte
#define IMGFMT_RGB8 (IMGFMT_RGB|8)
#define IMGFMT_RGB15 (IMGFMT_RGB|15)
#define IMGFMT_RGB16 (IMGFMT_RGB|16)
#define IMGFMT_RGB24 (IMGFMT_RGB|24)
#define IMGFMT_RGB32 (IMGFMT_RGB|32)
#define IMGFMT_BGR_MASK 0xFFFFFF00
#define IMGFMT_BGR (('B'<<24)|('G'<<16)|('R'<<8))
#define IMGFMT_BGR1 (IMGFMT_BGR|1)
#define IMGFMT_BGR4 (IMGFMT_BGR|4)
#define IMGFMT_BGR4_CHAR (IMGFMT_BGR|4|128) // BGR4 with 1 pixel per byte
#define IMGFMT_BGR8 (IMGFMT_BGR|8)
#define IMGFMT_BGR15 (IMGFMT_BGR|15)
#define IMGFMT_BGR16 (IMGFMT_BGR|16)
#define IMGFMT_BGR24 (IMGFMT_BGR|24)
#define IMGFMT_BGR32 (IMGFMT_BGR|32)
#ifdef WORDS_BIGENDIAN
#define IMGFMT_ABGR IMGFMT_RGB32
#define IMGFMT_BGRA (IMGFMT_RGB32|64)
#define IMGFMT_ARGB IMGFMT_BGR32
#define IMGFMT_RGBA (IMGFMT_BGR32|64)
#else
#define IMGFMT_ABGR (IMGFMT_BGR32|64)
#define IMGFMT_BGRA IMGFMT_BGR32
#define IMGFMT_ARGB (IMGFMT_RGB32|64)
#define IMGFMT_RGBA IMGFMT_RGB32
#endif
/* old names for compatibility */
#define IMGFMT_RG4B IMGFMT_RGB4_CHAR
#define IMGFMT_BG4B IMGFMT_BGR4_CHAR
#define IMGFMT_IS_RGB(fmt) (((fmt)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define IMGFMT_IS_BGR(fmt) (((fmt)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
#define IMGFMT_RGB_DEPTH(fmt) ((fmt)&0x3F)
#define IMGFMT_BGR_DEPTH(fmt) ((fmt)&0x3F)
/* Planar YUV Formats */
#define IMGFMT_YVU9 0x39555659
#define IMGFMT_IF09 0x39304649
#define IMGFMT_YV12 0x32315659
#define IMGFMT_I420 0x30323449
#define IMGFMT_IYUV 0x56555949
#define IMGFMT_CLPL 0x4C504C43
#define IMGFMT_Y800 0x30303859
#define IMGFMT_Y8 0x20203859
#define IMGFMT_NV12 0x3231564E
#define IMGFMT_NV21 0x3132564E
/* unofficial Planar Formats, FIXME if official 4CC exists */
#define IMGFMT_444P 0x50343434
#define IMGFMT_422P 0x50323234
#define IMGFMT_411P 0x50313134
#define IMGFMT_HM12 0x32314D48
/* Packed YUV Formats */
#define IMGFMT_IUYV 0x56595549
#define IMGFMT_IY41 0x31435949
#define IMGFMT_IYU1 0x31555949
#define IMGFMT_IYU2 0x32555949
#define IMGFMT_UYVY 0x59565955
#define IMGFMT_UYNV 0x564E5955
#define IMGFMT_cyuv 0x76757963
#define IMGFMT_Y422 0x32323459
#define IMGFMT_YUY2 0x32595559
#define IMGFMT_YUNV 0x564E5559
#define IMGFMT_YVYU 0x55595659
#define IMGFMT_Y41P 0x50313459
#define IMGFMT_Y211 0x31313259
#define IMGFMT_Y41T 0x54313459
#define IMGFMT_Y42T 0x54323459
#define IMGFMT_V422 0x32323456
#define IMGFMT_V655 0x35353656
#define IMGFMT_CLJR 0x524A4C43
#define IMGFMT_YUVP 0x50565559
#define IMGFMT_UYVP 0x50565955
/*****************************************************************************
* filter.c: video scaling module using the swscale library
*****************************************************************************
* Copyright (C) 2003 VideoLAN
* $Id$
*
* Author: Gildas Bazin <gbazin@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
#include <vlc/vlc.h>
#include <vlc/decoder.h>
#include "vlc_filter.h"
#include "common.h"
#include "swscale.h"
void *( *swscale_fast_memcpy )( void *, const void *, size_t );
/*****************************************************************************
* filter_sys_t : filter descriptor
*****************************************************************************/
struct filter_sys_t
{
struct SwsContext *ctx;
SwsFilter *p_src_filter;
SwsFilter *p_dst_filter;
int i_cpu_mask, i_sws_flags;
es_format_t fmt_in;
es_format_t fmt_out;
};
/****************************************************************************
* Local prototypes
****************************************************************************/
static int OpenFilter ( vlc_object_t * );
static void CloseFilter( vlc_object_t * );
static picture_t *Filter( filter_t *, picture_t * );
static int CheckInit( filter_t * );
static int GetSwscaleChroma( vlc_fourcc_t );
/*****************************************************************************
* Module descriptor
*****************************************************************************/
#define MODE_TEXT N_("Scaling mode")
#define MODE_LONGTEXT N_("You can choose the default scaling mode")
static int pi_mode_values[] = { 0, 1, 2, 4, 8, 5, 6, 9, 10 };
static char *ppsz_mode_descriptions[] =
{ N_("Fast bilinear"), N_("Bilinear"), N_("Bicubic (good quality)"),
N_("Experimental"), N_("Nearest neighbour (bad quality)"),
N_("Area"), N_("Luma bicubic / chroma bilinear"), N_("Gauss"),
N_("SincR"), N_("Lanczos"), N_("Bicubic spline") };
vlc_module_begin();
set_description( _("Video scaling filter") );
set_capability( "video filter2", 1000 );
set_callbacks( OpenFilter, CloseFilter );
add_integer( "swscale-mode", 0, NULL, MODE_TEXT, MODE_LONGTEXT, VLC_TRUE );
change_integer_list( pi_mode_values, ppsz_mode_descriptions, 0 );
vlc_module_end();
/*****************************************************************************
* OpenFilter: probe the filter and return score
*****************************************************************************/
static int OpenFilter( vlc_object_t *p_this )
{
filter_t *p_filter = (filter_t*)p_this;
filter_sys_t *p_sys;
vlc_value_t val;
unsigned int i_fmt_in, i_fmt_out;
int i_sws_mode;
float sws_lum_gblur = 0.0, sws_chr_gblur = 0.0;
int sws_chr_vshift = 0, sws_chr_hshift = 0;
float sws_chr_sharpen = 0.0, sws_lum_sharpen = 0.0;
/* Supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24,
* BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09 */
if( !(i_fmt_in = GetSwscaleChroma(p_filter->fmt_in.video.i_chroma)) )
{
return VLC_EGENERIC;
}
/* Supported output formats: YV12, I420/IYUV, YUY2, UYVY,
* {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09 */
if( !(i_fmt_out = GetSwscaleChroma(p_filter->fmt_out.video.i_chroma)) )
{
return VLC_EGENERIC;
}
/* Allocate the memory needed to store the decoder's structure */
if( ( p_filter->p_sys = p_sys =
(filter_sys_t *)malloc(sizeof(filter_sys_t)) ) == NULL )
{
msg_Err( p_filter, "out of memory" );
return VLC_EGENERIC;
}
swscale_fast_memcpy = p_filter->p_vlc->pf_memcpy;
/* Set CPU capabilities */
p_sys->i_cpu_mask = 0;
if( p_filter->p_libvlc->i_cpu & CPU_CAPABILITY_MMX )
{
p_sys->i_cpu_mask |= SWS_CPU_CAPS_MMX;
}
if( p_filter->p_libvlc->i_cpu & CPU_CAPABILITY_MMXEXT )
{
p_sys->i_cpu_mask |= SWS_CPU_CAPS_MMX2;
}
if( p_filter->p_libvlc->i_cpu & CPU_CAPABILITY_3DNOW )
{
p_sys->i_cpu_mask |= SWS_CPU_CAPS_3DNOW;
}
if( p_filter->p_libvlc->i_cpu & CPU_CAPABILITY_ALTIVEC )
{
p_sys->i_cpu_mask |= SWS_CPU_CAPS_ALTIVEC;
}
var_Create( p_filter, "swscale-mode", VLC_VAR_INTEGER|VLC_VAR_DOINHERIT );
var_Get( p_filter, "swscale-mode", &val );
i_sws_mode = val.i_int;
switch( i_sws_mode )
{
case 0: p_sys->i_sws_flags |= SWS_FAST_BILINEAR; break;
case 1: p_sys->i_sws_flags |= SWS_BILINEAR; break;
case 2: p_sys->i_sws_flags |= SWS_BICUBIC; break;
case 3: p_sys->i_sws_flags |= SWS_X; break;
case 4: p_sys->i_sws_flags |= SWS_POINT; break;
case 5: p_sys->i_sws_flags |= SWS_AREA; break;
case 6: p_sys->i_sws_flags |= SWS_BICUBLIN; break;
case 7: p_sys->i_sws_flags |= SWS_GAUSS; break;
case 8: p_sys->i_sws_flags |= SWS_SINC; break;
case 9: p_sys->i_sws_flags |= SWS_LANCZOS; break;
case 10: p_sys->i_sws_flags |= SWS_SPLINE; break;
default: p_sys->i_sws_flags |= SWS_FAST_BILINEAR; i_sws_mode = 0; break;
}
p_sys->p_src_filter = 0; p_sys->p_dst_filter = 0;
p_sys->p_src_filter =
sws_getDefaultFilter( sws_lum_gblur, sws_chr_gblur,
sws_lum_sharpen, sws_chr_sharpen,
sws_chr_hshift, sws_chr_vshift, 0 );
/* Misc init */
p_sys->ctx = NULL;
p_filter->pf_video_filter = Filter;
es_format_Init( &p_sys->fmt_in, 0, 0 );
es_format_Init( &p_sys->fmt_out, 0, 0 );
if( CheckInit( p_filter ) != VLC_SUCCESS )
{
if( p_sys->p_src_filter ) sws_freeFilter( p_sys->p_src_filter );
free( p_sys );
return VLC_EGENERIC;
}
msg_Dbg( p_filter, "%ix%i chroma: %4.4s -> %ix%i chroma: %4.4s",
p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
(char *)&p_filter->fmt_in.video.i_chroma,
p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
(char *)&p_filter->fmt_out.video.i_chroma );
if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width ||
p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
{
msg_Dbg( p_filter, "scaling mode: %s",
ppsz_mode_descriptions[i_sws_mode] );
}
return VLC_SUCCESS;
}
/*****************************************************************************
* CloseFilter: clean up the filter
*****************************************************************************/
static void CloseFilter( vlc_object_t *p_this )
{
filter_t *p_filter = (filter_t*)p_this;
filter_sys_t *p_sys = p_filter->p_sys;
if( p_sys->ctx ) sws_freeContext( p_sys->ctx );
if( p_sys->p_src_filter ) sws_freeFilter( p_sys->p_src_filter );
free( p_sys );
}
/*****************************************************************************
* CheckInit: Initialise filter when necessary
*****************************************************************************/
static int CheckInit( filter_t *p_filter )
{
filter_sys_t *p_sys = p_filter->p_sys;
if( p_filter->fmt_in.video.i_width != p_sys->fmt_in.video.i_width ||
p_filter->fmt_in.video.i_height != p_sys->fmt_in.video.i_height ||
p_filter->fmt_out.video.i_width != p_sys->fmt_out.video.i_width ||
p_filter->fmt_out.video.i_height != p_sys->fmt_out.video.i_height )
{
unsigned int i_fmt_in, i_fmt_out;
if( !(i_fmt_in = GetSwscaleChroma(p_filter->fmt_in.video.i_chroma)) ||
!(i_fmt_out = GetSwscaleChroma(p_filter->fmt_out.video.i_chroma)) )
{
msg_Err( p_filter, "format not supported" );
return VLC_EGENERIC;
}
if( p_sys->ctx ) sws_freeContext( p_sys->ctx );
p_sys->ctx =
sws_getContext( p_filter->fmt_in.video.i_width,
p_filter->fmt_in.video.i_height, i_fmt_in,
p_filter->fmt_out.video.i_width,
p_filter->fmt_out.video.i_height, i_fmt_out,
p_sys->i_sws_flags | p_sys->i_cpu_mask,
p_sys->p_src_filter, p_sys->p_dst_filter );
if( !p_sys->ctx )
{
msg_Err( p_filter, "could not init SwScaler" );
return VLC_EGENERIC;
}
p_sys->fmt_in = p_filter->fmt_in;
p_sys->fmt_out = p_filter->fmt_out;
}
return VLC_SUCCESS;
}
/****************************************************************************
* Filter: the whole thing
****************************************************************************
* This function is called just after the thread is launched.
****************************************************************************/
static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
{
filter_sys_t *p_sys = p_filter->p_sys;
uint8_t *src[3]; int src_stride[3];
uint8_t *dst[3]; int dst_stride[3];
picture_t *p_pic_dst;
int i_plane;
/* Check if format properties changed */
if( CheckInit( p_filter ) != VLC_SUCCESS ) return 0;
/* Request output picture */
p_pic_dst = p_filter->pf_vout_buffer_new( p_filter );
if( !p_pic_dst )
{
msg_Warn( p_filter, "can't get output picture" );
return NULL;
}
for( i_plane = 0; i_plane < p_pic->i_planes; i_plane++ )
{
src[i_plane] = p_pic->p[i_plane].p_pixels;
src_stride[i_plane] = p_pic->p[i_plane].i_pitch;
}
for( i_plane = 0; i_plane < p_pic_dst->i_planes; i_plane++ )
{
dst[i_plane] = p_pic_dst->p[i_plane].p_pixels;
dst_stride[i_plane] = p_pic_dst->p[i_plane].i_pitch;
}
sws_scale_ordered( p_sys->ctx, src, src_stride,
0, p_filter->fmt_in.video.i_height,
dst, dst_stride );
p_pic_dst->date = p_pic->date;
p_pic_dst->b_force = p_pic->b_force;
p_pic_dst->i_nb_fields = p_pic->i_nb_fields;
p_pic_dst->b_progressive = p_pic->b_progressive;
p_pic_dst->b_top_field_first = p_pic->b_top_field_first;
p_pic->pf_release( p_pic );
return p_pic_dst;
}
/*****************************************************************************
* Chroma fourcc -> ffmpeg_id mapping
*****************************************************************************/
static struct
{
vlc_fourcc_t i_chroma;
unsigned int i_swscale_chroma;
} chroma_table[] =
{
/* Planar YUV formats */
{ VLC_FOURCC('Y','V','1','2'), IMGFMT_YV12 },
{ VLC_FOURCC('I','4','2','0'), IMGFMT_I420 },
{ VLC_FOURCC('I','Y','U','V'), IMGFMT_IYUV },
{ VLC_FOURCC('I','4','4','4'), IMGFMT_444P },
{ VLC_FOURCC('I','4','2','2'), IMGFMT_422P },
{ VLC_FOURCC('I','4','1','1'), IMGFMT_411P },
/* Packed YUV formats */
{ VLC_FOURCC('U','Y','V','Y'), IMGFMT_UYVY },
{ VLC_FOURCC('Y','U','Y','2'), IMGFMT_YUY2 },
/* Packed RGB formats */
{ VLC_FOURCC('R','V','1','5'), IMGFMT_RGB15 },
{ VLC_FOURCC('R','V','1','6'), IMGFMT_RGB16 },
{ VLC_FOURCC('R','V','2','4'), IMGFMT_RGB24 },
{ VLC_FOURCC('R','V','3','2'), IMGFMT_RGB32 },
{ VLC_FOURCC('G','R','E','Y'), IMGFMT_RGB8 },
{0}
};
static int GetSwscaleChroma( vlc_fourcc_t i_chroma )
{
int i;
for( i = 0; chroma_table[i].i_chroma != 0; i++ )
{
if( chroma_table[i].i_chroma == i_chroma )
return chroma_table[i].i_swscale_chroma;
}
return 0;
}
/*
*
* rgb2rgb.c, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* Written by Nick Kurshev.
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
*/
#include <inttypes.h>
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "common.h"
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
void (*rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb24to16)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb24to15)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb32to16)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb32to15)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb15to24)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb15to32)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb16to24)(const uint8_t *src,uint8_t *dst,unsigned src_size);
void (*rgb16to32)(const uint8_t *src,uint8_t *dst,unsigned src_size);
//void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
//void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride);
void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride);
void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
unsigned width, unsigned height, int src1Stride,
int src2Stride, int dstStride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
unsigned width, unsigned height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
unsigned width, unsigned height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
#ifdef ARCH_X86
static const uint64_t mmx_null __attribute__((aligned(8))) = 0x0000000000000000ULL;
static const uint64_t mmx_one __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
static const uint64_t mask32b attribute_used __attribute__((aligned(8))) = 0x000000FF000000FFULL;
static const uint64_t mask32g attribute_used __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
static const uint64_t mask32r attribute_used __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
static const uint64_t mask3216br __attribute__((aligned(8)))=0x00F800F800F800F8ULL;
static const uint64_t mask3216g __attribute__((aligned(8)))=0x0000FC000000FC00ULL;
static const uint64_t mask3215g __attribute__((aligned(8)))=0x0000F8000000F800ULL;
static const uint64_t mul3216 __attribute__((aligned(8))) = 0x2000000420000004ULL;
static const uint64_t mul3215 __attribute__((aligned(8))) = 0x2000000820000008ULL;
static const uint64_t mask24b attribute_used __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL;
static const uint64_t mask24g attribute_used __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL;
static const uint64_t mask24r attribute_used __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL;
static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
static const uint64_t mask15g __attribute__((aligned(8))) = 0x03E003E003E003E0ULL;
static const uint64_t mask15r __attribute__((aligned(8))) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
static const uint64_t mask16g __attribute__((aligned(8))) = 0x07E007E007E007E0ULL;
static const uint64_t mask16r __attribute__((aligned(8))) = 0xF800F800F800F800ULL;
static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c000000f800ULL;
static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000007e0ULL;
static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
#ifdef FAST_BGR2YV12
static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL;
static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
#else
static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL;
static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
#endif
static const uint64_t bgr2YOffset attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL;
static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8)))= 0x8080808080808080ULL;
static const uint64_t w1111 attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL;
#if 0
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;
static uint64_t __attribute__((aligned(8))) dither4[2]={
0x0103010301030103LL,
0x0200020002000200LL,};
static uint64_t __attribute__((aligned(8))) dither8[2]={
0x0602060206020602LL,
0x0004000400040004LL,};
#endif
#endif
#define RGB2YUV_SHIFT 8
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
//Plain C versions
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _C
#include "rgb2rgb_template.c"
#ifdef ARCH_X86
//MMX versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"
//MMX2 versions
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _MMX2
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _3DNOW
#include "rgb2rgb_template.c"
#endif //ARCH_X86
/*
rgb15->rgb16 Original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMX2, 3DNOW optimization by Nick Kurshev
32bit c version, and and&add trick by Michael Niedermayer
*/
void sws_rgb2rgb_init(int flags){
#ifdef ARCH_X86
if(flags & SWS_CPU_CAPS_MMX2){
rgb15to16= rgb15to16_MMX2;
rgb15to24= rgb15to24_MMX2;
rgb15to32= rgb15to32_MMX2;
rgb16to24= rgb16to24_MMX2;
rgb16to32= rgb16to32_MMX2;
rgb16to15= rgb16to15_MMX2;
rgb24to16= rgb24to16_MMX2;
rgb24to15= rgb24to15_MMX2;
rgb24to32= rgb24to32_MMX2;
rgb32to16= rgb32to16_MMX2;
rgb32to15= rgb32to15_MMX2;
rgb32to24= rgb32to24_MMX2;
rgb24tobgr15= rgb24tobgr15_MMX2;
rgb24tobgr16= rgb24tobgr16_MMX2;
rgb24tobgr24= rgb24tobgr24_MMX2;
rgb32tobgr32= rgb32tobgr32_MMX2;
rgb32tobgr16= rgb32tobgr16_MMX2;
rgb32tobgr15= rgb32tobgr15_MMX2;
yv12toyuy2= yv12toyuy2_MMX2;
yv12touyvy= yv12touyvy_MMX2;
yuv422ptoyuy2= yuv422ptoyuy2_MMX2;
yuy2toyv12= yuy2toyv12_MMX2;
// uyvytoyv12= uyvytoyv12_MMX2;
// yvu9toyv12= yvu9toyv12_MMX2;
planar2x= planar2x_MMX2;
rgb24toyv12= rgb24toyv12_MMX2;
interleaveBytes= interleaveBytes_MMX2;
vu9_to_vu12= vu9_to_vu12_MMX2;
yvu9_to_yuy2= yvu9_to_yuy2_MMX2;
}else if(flags & SWS_CPU_CAPS_3DNOW){
rgb15to16= rgb15to16_3DNOW;
rgb15to24= rgb15to24_3DNOW;
rgb15to32= rgb15to32_3DNOW;
rgb16to24= rgb16to24_3DNOW;
rgb16to32= rgb16to32_3DNOW;
rgb16to15= rgb16to15_3DNOW;
rgb24to16= rgb24to16_3DNOW;
rgb24to15= rgb24to15_3DNOW;
rgb24to32= rgb24to32_3DNOW;
rgb32to16= rgb32to16_3DNOW;
rgb32to15= rgb32to15_3DNOW;
rgb32to24= rgb32to24_3DNOW;
rgb24tobgr15= rgb24tobgr15_3DNOW;
rgb24tobgr16= rgb24tobgr16_3DNOW;
rgb24tobgr24= rgb24tobgr24_3DNOW;
rgb32tobgr32= rgb32tobgr32_3DNOW;
rgb32tobgr16= rgb32tobgr16_3DNOW;
rgb32tobgr15= rgb32tobgr15_3DNOW;
yv12toyuy2= yv12toyuy2_3DNOW;
yv12touyvy= yv12touyvy_3DNOW;
yuv422ptoyuy2= yuv422ptoyuy2_3DNOW;
yuy2toyv12= yuy2toyv12_3DNOW;
// uyvytoyv12= uyvytoyv12_3DNOW;
// yvu9toyv12= yvu9toyv12_3DNOW;
planar2x= planar2x_3DNOW;
rgb24toyv12= rgb24toyv12_3DNOW;
interleaveBytes= interleaveBytes_3DNOW;
vu9_to_vu12= vu9_to_vu12_3DNOW;
yvu9_to_yuy2= yvu9_to_yuy2_3DNOW;
}else if(flags & SWS_CPU_CAPS_MMX){
rgb15to16= rgb15to16_MMX;
rgb15to24= rgb15to24_MMX;
rgb15to32= rgb15to32_MMX;
rgb16to24= rgb16to24_MMX;
rgb16to32= rgb16to32_MMX;
rgb16to15= rgb16to15_MMX;
rgb24to16= rgb24to16_MMX;
rgb24to15= rgb24to15_MMX;
rgb24to32= rgb24to32_MMX;
rgb32to16= rgb32to16_MMX;
rgb32to15= rgb32to15_MMX;
rgb32to24= rgb32to24_MMX;
rgb24tobgr15= rgb24tobgr15_MMX;
rgb24tobgr16= rgb24tobgr16_MMX;
rgb24tobgr24= rgb24tobgr24_MMX;
rgb32tobgr32= rgb32tobgr32_MMX;
rgb32tobgr16= rgb32tobgr16_MMX;
rgb32tobgr15= rgb32tobgr15_MMX;
yv12toyuy2= yv12toyuy2_MMX;
yv12touyvy= yv12touyvy_MMX;
yuv422ptoyuy2= yuv422ptoyuy2_MMX;
yuy2toyv12= yuy2toyv12_MMX;
// uyvytoyv12= uyvytoyv12_MMX;
// yvu9toyv12= yvu9toyv12_MMX;
planar2x= planar2x_MMX;
rgb24toyv12= rgb24toyv12_MMX;
interleaveBytes= interleaveBytes_MMX;
vu9_to_vu12= vu9_to_vu12_MMX;
yvu9_to_yuy2= yvu9_to_yuy2_MMX;
}else
#endif
{
rgb15to16= rgb15to16_C;
rgb15to24= rgb15to24_C;
rgb15to32= rgb15to32_C;
rgb16to24= rgb16to24_C;
rgb16to32= rgb16to32_C;
rgb16to15= rgb16to15_C;
rgb24to16= rgb24to16_C;
rgb24to15= rgb24to15_C;
rgb24to32= rgb24to32_C;
rgb32to16= rgb32to16_C;
rgb32to15= rgb32to15_C;
rgb32to24= rgb32to24_C;
rgb24tobgr15= rgb24tobgr15_C;
rgb24tobgr16= rgb24tobgr16_C;
rgb24tobgr24= rgb24tobgr24_C;
rgb32tobgr32= rgb32tobgr32_C;
rgb32tobgr16= rgb32tobgr16_C;
rgb32tobgr15= rgb32tobgr15_C;
yv12toyuy2= yv12toyuy2_C;
yv12touyvy= yv12touyvy_C;
yuv422ptoyuy2= yuv422ptoyuy2_C;
yuy2toyv12= yuy2toyv12_C;
// uyvytoyv12= uyvytoyv12_C;
// yvu9toyv12= yvu9toyv12_C;
planar2x= planar2x_C;
rgb24toyv12= rgb24toyv12_C;
interleaveBytes= interleaveBytes_C;
vu9_to_vu12= vu9_to_vu12_C;
yvu9_to_yuy2= yvu9_to_yuy2_C;
}
}
/**
* Pallete is assumed to contain bgr32
*/
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
/*
for(i=0; i<num_pixels; i++)
((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
*/
for(i=0; i<num_pixels; i++)
{
//FIXME slow?
dst[0]= palette[ src[i]*4+2 ];
dst[1]= palette[ src[i]*4+1 ];
dst[2]= palette[ src[i]*4+0 ];
// dst[3]= 0; /* do we need this cleansing? */
dst+= 4;
}
}
void palette8tobgr32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
for(i=0; i<num_pixels; i++)
{
//FIXME slow?
dst[0]= palette[ src[i]*4+0 ];
dst[1]= palette[ src[i]*4+1 ];
dst[2]= palette[ src[i]*4+2 ];
// dst[3]= 0; /* do we need this cleansing? */
dst+= 4;
}
}
/**
* Pallete is assumed to contain bgr32
*/
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
/*
writes 1 byte o much and might cause alignment issues on some architectures?
for(i=0; i<num_pixels; i++)
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
*/
for(i=0; i<num_pixels; i++)
{
//FIXME slow?
dst[0]= palette[ src[i]*4+2 ];
dst[1]= palette[ src[i]*4+1 ];
dst[2]= palette[ src[i]*4+0 ];
dst+= 3;
}
}
void palette8tobgr24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
/*
writes 1 byte o much and might cause alignment issues on some architectures?
for(i=0; i<num_pixels; i++)
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
*/
for(i=0; i<num_pixels; i++)
{
//FIXME slow?
dst[0]= palette[ src[i]*4+0 ];
dst[1]= palette[ src[i]*4+1 ];
dst[2]= palette[ src[i]*4+2 ];
dst+= 3;
}
}
/**
* Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
*/
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
for(i=0; i<num_pixels; i++)
((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
void palette8tobgr16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
for(i=0; i<num_pixels; i++)
((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]);
}
/**
* Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
*/
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
for(i=0; i<num_pixels; i++)
((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
void palette8tobgr15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
unsigned i;
for(i=0; i<num_pixels; i++)
((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]);
}
void rgb32tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
unsigned num_pixels = src_size >> 2;
for(i=0; i<num_pixels; i++)
{
dst[3*i + 0] = src[4*i + 2];
dst[3*i + 1] = src[4*i + 1];
dst[3*i + 2] = src[4*i + 0];
}
}
void rgb24tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
for(i=0; 3*i<src_size; i++)
{
dst[4*i + 0] = src[3*i + 2];
dst[4*i + 1] = src[3*i + 1];
dst[4*i + 2] = src[3*i + 0];
dst[4*i + 3] = 0;
}
}
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
const uint16_t *end;
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (uint16_t *)src;
end = s + src_size/2;
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0xF800)>>8;
*d++ = (bgr&0x7E0)>>3;
*d++ = (bgr&0x1F)<<3;
*d++ = 0;
}
}
void rgb16tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
const uint16_t *end;
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
end = s + src_size/2;
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0xF800)>>8;
*d++ = (bgr&0x7E0)>>3;
*d++ = (bgr&0x1F)<<3;
}
}
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
unsigned num_pixels = src_size >> 1;
for(i=0; i<num_pixels; i++)
{
unsigned b,g,r;
register uint16_t rgb;
rgb = src[2*i];
r = rgb&0x1F;
g = (rgb&0x7E0)>>5;
b = (rgb&0xF800)>>11;
dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
}
}
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
unsigned num_pixels = src_size >> 1;
for(i=0; i<num_pixels; i++)
{
unsigned b,g,r;
register uint16_t rgb;
rgb = src[2*i];
r = rgb&0x1F;
g = (rgb&0x7E0)>>5;
b = (rgb&0xF800)>>11;
dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
}
}
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
const uint16_t *end;
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
end = s + src_size/2;
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0x7C00)>>7;
*d++ = (bgr&0x3E0)>>2;
*d++ = (bgr&0x1F)<<3;
*d++ = 0;
}
}
void rgb15tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
const uint16_t *end;
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (uint16_t *)src;
end = s + src_size/2;
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0x7C00)>>7;
*d++ = (bgr&0x3E0)>>2;
*d++ = (bgr&0x1F)<<3;
}
}
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
unsigned num_pixels = src_size >> 1;
for(i=0; i<num_pixels; i++)
{
unsigned b,g,r;
register uint16_t rgb;
rgb = src[2*i];
r = rgb&0x1F;
g = (rgb&0x3E0)>>5;
b = (rgb&0x7C00)>>10;
dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
}
}
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
unsigned num_pixels = src_size >> 1;
for(i=0; i<num_pixels; i++)
{
unsigned b,g,r;
register uint16_t rgb;
rgb = src[2*i];
r = rgb&0x1F;
g = (rgb&0x3E0)>>5;
b = (rgb&0x7C00)>>10;
dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
}
}
void rgb8tobgr8(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
unsigned num_pixels = src_size;
for(i=0; i<num_pixels; i++)
{
unsigned b,g,r;
register uint8_t rgb;
rgb = src[i];
r = (rgb&0x07);
g = (rgb&0x38)>>3;
b = (rgb&0xC0)>>6;
dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | ((r&0x03)<<6);
}
}
/*
*
* rgb2rgb.h, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
*/
#ifndef RGB2RGB_INCLUDED
#define RGB2RGB_INCLUDED
// Note: do not fix the dependence on stdio.h
/* A full collection of rgb to rgb(bgr) convertors */
extern void (*rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb24to16)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb24to15)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb32to16)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb32to15)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb15to24)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb15to32)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb16to24)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb16to32)(const uint8_t *src,uint8_t *dst,unsigned src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb24tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb32tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr16(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr15(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb8tobgr8(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
*/
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride);
/**
*
* width should be a multiple of 16
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride);
/**
*
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
*/
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride);
extern void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
unsigned width, unsigned height, int src1Stride,
int src2Stride, int dstStride);
extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
unsigned width, unsigned height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
unsigned width, unsigned height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
#define MODE_RGB 0x1
#define MODE_BGR 0x2
static void yuv2rgb(uint8_t * image, uint8_t * py,
uint8_t * pu, uint8_t * pv,
unsigned h_size, unsigned v_size,
int rgb_stride, int y_stride, int uv_stride){
printf("broken, this should use the swscaler\n");
}
static void yuv2rgb_init (unsigned bpp, int mode){
printf("broken, this should use the swscaler\n");
}
void sws_rgb2rgb_init(int flags);
#endif
/*
*
* rgb2rgb.c, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* Written by Nick Kurshev.
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
*/
#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */
#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 23;
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"punpckldq 3%1, %%mm0\n\t"
"movd 6%1, %%mm1\n\t"
"punpckldq 9%1, %%mm1\n\t"
"movd 12%1, %%mm2\n\t"
"punpckldq 15%1, %%mm2\n\t"
"movd 18%1, %%mm3\n\t"
"punpckldq 21%1, %%mm3\n\t"
"pand %%mm7, %%mm0\n\t"
"pand %%mm7, %%mm1\n\t"
"pand %%mm7, %%mm2\n\t"
"pand %%mm7, %%mm3\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm2, 16%0\n\t"
MOVNTQ" %%mm3, 24%0"
:"=m"(*dest)
:"m"(*s)
:"memory");
dest += 32;
s += 24;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 0;
}
}
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 31;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm1\n\t"
"movq 16%1, %%mm4\n\t"
"movq 24%1, %%mm5\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm1, %%mm3\n\t"
"movq %%mm4, %%mm6\n\t"
"movq %%mm5, %%mm7\n\t"
"psrlq $8, %%mm2\n\t"
"psrlq $8, %%mm3\n\t"
"psrlq $8, %%mm6\n\t"
"psrlq $8, %%mm7\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm1\n\t"
"pand %2, %%mm4\n\t"
"pand %2, %%mm5\n\t"
"pand %3, %%mm2\n\t"
"pand %3, %%mm3\n\t"
"pand %3, %%mm6\n\t"
"pand %3, %%mm7\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm3, %%mm1\n\t"
"por %%mm6, %%mm4\n\t"
"por %%mm7, %%mm5\n\t"
"movq %%mm1, %%mm2\n\t"
"movq %%mm4, %%mm3\n\t"
"psllq $48, %%mm2\n\t"
"psllq $32, %%mm3\n\t"
"pand %4, %%mm2\n\t"
"pand %5, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"psrlq $16, %%mm1\n\t"
"psrlq $32, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm3, %%mm1\n\t"
"pand %6, %%mm5\n\t"
"por %%mm5, %%mm4\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm4, 16%0"
:"=m"(*dest)
:"m"(*s),"m"(mask24l),
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
:"memory");
dest += 24;
s += 32;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
}
}
/*
Original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMX2, 3DNOW optimization by Nick Kurshev
32bit c version, and and&add trick by Michael Niedermayer
*/
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
register const uint8_t* s=src;
register uint8_t* d=dst;
register const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s));
__asm __volatile("movq %0, %%mm4"::"m"(mask15s));
mm_end = end - 15;
while(s<mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm2\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm2, %%mm3\n\t"
"pand %%mm4, %%mm0\n\t"
"pand %%mm4, %%mm2\n\t"
"paddw %%mm1, %%mm0\n\t"
"paddw %%mm3, %%mm2\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm2, 8%0"
:"=m"(*d)
:"m"(*s)
);
d+=16;
s+=16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
mm_end = end - 3;
while(s < mm_end)
{
register unsigned x= *((uint32_t *)s);
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
d+=4;
s+=4;
}
if(s < end)
{
register unsigned short x= *((uint16_t *)s);
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
}
}
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
register const uint8_t* s=src;
register uint8_t* d=dst;
register const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s));
__asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
__asm __volatile("movq %0, %%mm6"::"m"(mask15b));
mm_end = end - 15;
while(s<mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm2\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm2, %%mm3\n\t"
"psrlq $1, %%mm0\n\t"
"psrlq $1, %%mm2\n\t"
"pand %%mm7, %%mm0\n\t"
"pand %%mm7, %%mm2\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm3\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm3, %%mm2\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm2, 8%0"
:"=m"(*d)
:"m"(*s)
);
d+=16;
s+=16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
mm_end = end - 3;
while(s < mm_end)
{
register uint32_t x= *((uint32_t *)s);
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
s+=4;
d+=4;
}
if(s < end)
{
register uint16_t x= *((uint16_t *)s);
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
s+=2;
d+=2;
}
}
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
asm volatile(
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pand %%mm6, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pmaddwd %%mm7, %%mm0 \n\t"
"pmaddwd %%mm7, %%mm3 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"psrld $5, %%mm0 \n\t"
"pslld $11, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
"addl $16, %1 \n\t"
"addl $8, %0 \n\t"
"cmpl %2, %1 \n\t"
" jb 1b \n\t"
: "+r" (d), "+r"(s)
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
);
#else
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 4%1, %%mm3\n\t"
"punpckldq 8%1, %%mm0\n\t"
"punpckldq 12%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psrlq $3, %%mm0\n\t"
"psrlq $3, %%mm3\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm3\n\t"
"psrlq $5, %%mm1\n\t"
"psrlq $5, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $8, %%mm2\n\t"
"psrlq $8, %%mm5\n\t"
"pand %%mm7, %%mm2\n\t"
"pand %%mm7, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 16;
}
#endif
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int src= *s; s += 4;
*d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
// *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
}
}
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 15;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 4%1, %%mm3\n\t"
"punpckldq 8%1, %%mm0\n\t"
"punpckldq 12%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psllq $8, %%mm0\n\t"
"psllq $8, %%mm3\n\t"
"pand %%mm7, %%mm0\n\t"
"pand %%mm7, %%mm3\n\t"
"psrlq $5, %%mm1\n\t"
"psrlq $5, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $19, %%mm2\n\t"
"psrlq $19, %%mm5\n\t"
"pand %2, %%mm2\n\t"
"pand %2, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int src= *s; s += 4;
*d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
}
}
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
asm volatile(
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pand %%mm6, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pmaddwd %%mm7, %%mm0 \n\t"
"pmaddwd %%mm7, %%mm3 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"psrld $6, %%mm0 \n\t"
"pslld $10, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
"addl $16, %1 \n\t"
"addl $8, %0 \n\t"
"cmpl %2, %1 \n\t"
" jb 1b \n\t"
: "+r" (d), "+r"(s)
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
);
#else
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 4%1, %%mm3\n\t"
"punpckldq 8%1, %%mm0\n\t"
"punpckldq 12%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psrlq $3, %%mm0\n\t"
"psrlq $3, %%mm3\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm3\n\t"
"psrlq $6, %%mm1\n\t"
"psrlq $6, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $9, %%mm2\n\t"
"psrlq $9, %%mm5\n\t"
"pand %%mm7, %%mm2\n\t"
"pand %%mm7, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
d += 4;
s += 16;
}
#endif
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int src= *s; s += 4;
*d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
}
}
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 15;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 4%1, %%mm3\n\t"
"punpckldq 8%1, %%mm0\n\t"
"punpckldq 12%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psllq $7, %%mm0\n\t"
"psllq $7, %%mm3\n\t"
"pand %%mm7, %%mm0\n\t"
"pand %%mm7, %%mm3\n\t"
"psrlq $6, %%mm1\n\t"
"psrlq $6, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $19, %%mm2\n\t"
"psrlq $19, %%mm5\n\t"
"pand %2, %%mm2\n\t"
"pand %2, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
d += 4;
s += 16;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int src= *s; s += 4;
*d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
}
}
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 11;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 3%1, %%mm3\n\t"
"punpckldq 6%1, %%mm0\n\t"
"punpckldq 9%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psrlq $3, %%mm0\n\t"
"psrlq $3, %%mm3\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm3\n\t"
"psrlq $5, %%mm1\n\t"
"psrlq $5, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $8, %%mm2\n\t"
"psrlq $8, %%mm5\n\t"
"pand %%mm7, %%mm2\n\t"
"pand %%mm7, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 12;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int b= *s++;
const int g= *s++;
const int r= *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
}
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 15;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 3%1, %%mm3\n\t"
"punpckldq 6%1, %%mm0\n\t"
"punpckldq 9%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psllq $8, %%mm0\n\t"
"psllq $8, %%mm3\n\t"
"pand %%mm7, %%mm0\n\t"
"pand %%mm7, %%mm3\n\t"
"psrlq $5, %%mm1\n\t"
"psrlq $5, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $19, %%mm2\n\t"
"psrlq $19, %%mm5\n\t"
"pand %2, %%mm2\n\t"
"pand %2, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
d += 4;
s += 12;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int r= *s++;
const int g= *s++;
const int b= *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
}
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 11;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 3%1, %%mm3\n\t"
"punpckldq 6%1, %%mm0\n\t"
"punpckldq 9%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psrlq $3, %%mm0\n\t"
"psrlq $3, %%mm3\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm3\n\t"
"psrlq $6, %%mm1\n\t"
"psrlq $6, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $9, %%mm2\n\t"
"psrlq $9, %%mm5\n\t"
"pand %%mm7, %%mm2\n\t"
"pand %%mm7, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
d += 4;
s += 12;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int b= *s++;
const int g= *s++;
const int r= *s++;
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
}
}
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint8_t *s = src;
const uint8_t *end;
#ifdef HAVE_MMX
const uint8_t *mm_end;
#endif
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm __volatile(
"movq %0, %%mm7\n\t"
"movq %1, %%mm6\n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 15;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movd %1, %%mm0\n\t"
"movd 3%1, %%mm3\n\t"
"punpckldq 6%1, %%mm0\n\t"
"punpckldq 9%1, %%mm3\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm3, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"psllq $7, %%mm0\n\t"
"psllq $7, %%mm3\n\t"
"pand %%mm7, %%mm0\n\t"
"pand %%mm7, %%mm3\n\t"
"psrlq $6, %%mm1\n\t"
"psrlq $6, %%mm4\n\t"
"pand %%mm6, %%mm1\n\t"
"pand %%mm6, %%mm4\n\t"
"psrlq $19, %%mm2\n\t"
"psrlq $19, %%mm5\n\t"
"pand %2, %%mm2\n\t"
"pand %2, %%mm5\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm5, %%mm3\n\t"
"psllq $16, %%mm3\n\t"
"por %%mm3, %%mm0\n\t"
MOVNTQ" %%mm0, %0\n\t"
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
d += 4;
s += 12;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
const int r= *s++;
const int g= *s++;
const int b= *s++;
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
}
}
/*
I use here less accurate approximation by simply
left-shifting the input
value and filling the low order bits with
zeroes. This method improves png's
compression but this scheme cannot reproduce white exactly, since it does not
generate an all-ones maximum value; the net effect is to darken the
image slightly.
The better method should be "left bit replication":
4 3 2 1 0
---------
1 1 0 1 1
7 6 5 4 3 2 1 0
----------------
1 1 0 1 1 1 1 0
|=======| |===|
| Leftmost Bits Repeated to Fill Open Bits
|
Original Bits
*/
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint16_t *end;
#ifdef HAVE_MMX
const uint16_t *mm_end;
#endif
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (uint16_t *)src;
end = s + src_size/2;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 7;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq %1, %%mm1\n\t"
"movq %1, %%mm2\n\t"
"pand %2, %%mm0\n\t"
"pand %3, %%mm1\n\t"
"pand %4, %%mm2\n\t"
"psllq $3, %%mm0\n\t"
"psrlq $2, %%mm1\n\t"
"psrlq $7, %%mm2\n\t"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"movq %%mm2, %%mm5\n\t"
"punpcklwd %5, %%mm0\n\t"
"punpcklwd %5, %%mm1\n\t"
"punpcklwd %5, %%mm2\n\t"
"punpckhwd %5, %%mm3\n\t"
"punpckhwd %5, %%mm4\n\t"
"punpckhwd %5, %%mm5\n\t"
"psllq $8, %%mm1\n\t"
"psllq $16, %%mm2\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm2, %%mm0\n\t"
"psllq $8, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm5, %%mm3\n\t"
"movq %%mm0, %%mm6\n\t"
"movq %%mm3, %%mm7\n\t"
"movq 8%1, %%mm0\n\t"
"movq 8%1, %%mm1\n\t"
"movq 8%1, %%mm2\n\t"
"pand %2, %%mm0\n\t"
"pand %3, %%mm1\n\t"
"pand %4, %%mm2\n\t"
"psllq $3, %%mm0\n\t"
"psrlq $2, %%mm1\n\t"
"psrlq $7, %%mm2\n\t"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"movq %%mm2, %%mm5\n\t"
"punpcklwd %5, %%mm0\n\t"
"punpcklwd %5, %%mm1\n\t"
"punpcklwd %5, %%mm2\n\t"
"punpckhwd %5, %%mm3\n\t"
"punpckhwd %5, %%mm4\n\t"
"punpckhwd %5, %%mm5\n\t"
"psllq $8, %%mm1\n\t"
"psllq $16, %%mm2\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm2, %%mm0\n\t"
"psllq $8, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm5, %%mm3\n\t"
:"=m"(*d)
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
:"memory");
/* Borrowed 32 to 24 */
__asm __volatile(
"movq %%mm0, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"movq %%mm6, %%mm0\n\t"
"movq %%mm7, %%mm1\n\t"
"movq %%mm4, %%mm6\n\t"
"movq %%mm5, %%mm7\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm1, %%mm3\n\t"
"psrlq $8, %%mm2\n\t"
"psrlq $8, %%mm3\n\t"
"psrlq $8, %%mm6\n\t"
"psrlq $8, %%mm7\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm1\n\t"
"pand %2, %%mm4\n\t"
"pand %2, %%mm5\n\t"
"pand %3, %%mm2\n\t"
"pand %3, %%mm3\n\t"
"pand %3, %%mm6\n\t"
"pand %3, %%mm7\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm3, %%mm1\n\t"
"por %%mm6, %%mm4\n\t"
"por %%mm7, %%mm5\n\t"
"movq %%mm1, %%mm2\n\t"
"movq %%mm4, %%mm3\n\t"
"psllq $48, %%mm2\n\t"
"psllq $32, %%mm3\n\t"
"pand %4, %%mm2\n\t"
"pand %5, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"psrlq $16, %%mm1\n\t"
"psrlq $32, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm3, %%mm1\n\t"
"pand %6, %%mm5\n\t"
"por %%mm5, %%mm4\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm4, 16%0"
:"=m"(*d)
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
:"memory");
d += 24;
s += 8;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0x1F)<<3;
*d++ = (bgr&0x3E0)>>2;
*d++ = (bgr&0x7C00)>>7;
}
}
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint16_t *end;
#ifdef HAVE_MMX
const uint16_t *mm_end;
#endif
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
end = s + src_size/2;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 7;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq %1, %%mm1\n\t"
"movq %1, %%mm2\n\t"
"pand %2, %%mm0\n\t"
"pand %3, %%mm1\n\t"
"pand %4, %%mm2\n\t"
"psllq $3, %%mm0\n\t"
"psrlq $3, %%mm1\n\t"
"psrlq $8, %%mm2\n\t"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"movq %%mm2, %%mm5\n\t"
"punpcklwd %5, %%mm0\n\t"
"punpcklwd %5, %%mm1\n\t"
"punpcklwd %5, %%mm2\n\t"
"punpckhwd %5, %%mm3\n\t"
"punpckhwd %5, %%mm4\n\t"
"punpckhwd %5, %%mm5\n\t"
"psllq $8, %%mm1\n\t"
"psllq $16, %%mm2\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm2, %%mm0\n\t"
"psllq $8, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm5, %%mm3\n\t"
"movq %%mm0, %%mm6\n\t"
"movq %%mm3, %%mm7\n\t"
"movq 8%1, %%mm0\n\t"
"movq 8%1, %%mm1\n\t"
"movq 8%1, %%mm2\n\t"
"pand %2, %%mm0\n\t"
"pand %3, %%mm1\n\t"
"pand %4, %%mm2\n\t"
"psllq $3, %%mm0\n\t"
"psrlq $3, %%mm1\n\t"
"psrlq $8, %%mm2\n\t"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"movq %%mm2, %%mm5\n\t"
"punpcklwd %5, %%mm0\n\t"
"punpcklwd %5, %%mm1\n\t"
"punpcklwd %5, %%mm2\n\t"
"punpckhwd %5, %%mm3\n\t"
"punpckhwd %5, %%mm4\n\t"
"punpckhwd %5, %%mm5\n\t"
"psllq $8, %%mm1\n\t"
"psllq $16, %%mm2\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm2, %%mm0\n\t"
"psllq $8, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm5, %%mm3\n\t"
:"=m"(*d)
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
:"memory");
/* Borrowed 32 to 24 */
__asm __volatile(
"movq %%mm0, %%mm4\n\t"
"movq %%mm3, %%mm5\n\t"
"movq %%mm6, %%mm0\n\t"
"movq %%mm7, %%mm1\n\t"
"movq %%mm4, %%mm6\n\t"
"movq %%mm5, %%mm7\n\t"
"movq %%mm0, %%mm2\n\t"
"movq %%mm1, %%mm3\n\t"
"psrlq $8, %%mm2\n\t"
"psrlq $8, %%mm3\n\t"
"psrlq $8, %%mm6\n\t"
"psrlq $8, %%mm7\n\t"
"pand %2, %%mm0\n\t"
"pand %2, %%mm1\n\t"
"pand %2, %%mm4\n\t"
"pand %2, %%mm5\n\t"
"pand %3, %%mm2\n\t"
"pand %3, %%mm3\n\t"
"pand %3, %%mm6\n\t"
"pand %3, %%mm7\n\t"
"por %%mm2, %%mm0\n\t"
"por %%mm3, %%mm1\n\t"
"por %%mm6, %%mm4\n\t"
"por %%mm7, %%mm5\n\t"
"movq %%mm1, %%mm2\n\t"
"movq %%mm4, %%mm3\n\t"
"psllq $48, %%mm2\n\t"
"psllq $32, %%mm3\n\t"
"pand %4, %%mm2\n\t"
"pand %5, %%mm3\n\t"
"por %%mm2, %%mm0\n\t"
"psrlq $16, %%mm1\n\t"
"psrlq $32, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm3, %%mm1\n\t"
"pand %6, %%mm5\n\t"
"por %%mm5, %%mm4\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm4, 16%0"
:"=m"(*d)
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
:"memory");
d += 24;
s += 8;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0x1F)<<3;
*d++ = (bgr&0x7E0)>>3;
*d++ = (bgr&0xF800)>>8;
}
}
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint16_t *end;
#ifdef HAVE_MMX
const uint16_t *mm_end;
#endif
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
end = s + src_size/2;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
mm_end = end - 3;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq %1, %%mm1\n\t"
"movq %1, %%mm2\n\t"
"pand %2, %%mm0\n\t"
"pand %3, %%mm1\n\t"
"pand %4, %%mm2\n\t"
"psllq $3, %%mm0\n\t"
"psrlq $2, %%mm1\n\t"
"psrlq $7, %%mm2\n\t"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"movq %%mm2, %%mm5\n\t"
"punpcklwd %%mm7, %%mm0\n\t"
"punpcklwd %%mm7, %%mm1\n\t"
"punpcklwd %%mm7, %%mm2\n\t"
"punpckhwd %%mm7, %%mm3\n\t"
"punpckhwd %%mm7, %%mm4\n\t"
"punpckhwd %%mm7, %%mm5\n\t"
"psllq $8, %%mm1\n\t"
"psllq $16, %%mm2\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm2, %%mm0\n\t"
"psllq $8, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm5, %%mm3\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm3, 8%0\n\t"
:"=m"(*d)
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
:"memory");
d += 16;
s += 4;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
#if 0 //slightly slower on athlon
int bgr= *s++;
*((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
//FIXME this is very likely wrong for bigendian (and the following converters too)
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0x1F)<<3;
*d++ = (bgr&0x3E0)>>2;
*d++ = (bgr&0x7C00)>>7;
*d++ = 0;
#endif
}
}
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
const uint16_t *end;
#ifdef HAVE_MMX
const uint16_t *mm_end;
#endif
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (uint16_t *)src;
end = s + src_size/2;
#ifdef HAVE_MMX
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
mm_end = end - 3;
while(s < mm_end)
{
__asm __volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq %1, %%mm1\n\t"
"movq %1, %%mm2\n\t"
"pand %2, %%mm0\n\t"
"pand %3, %%mm1\n\t"
"pand %4, %%mm2\n\t"
"psllq $3, %%mm0\n\t"
"psrlq $3, %%mm1\n\t"
"psrlq $8, %%mm2\n\t"
"movq %%mm0, %%mm3\n\t"
"movq %%mm1, %%mm4\n\t"
"movq %%mm2, %%mm5\n\t"
"punpcklwd %%mm7, %%mm0\n\t"
"punpcklwd %%mm7, %%mm1\n\t"
"punpcklwd %%mm7, %%mm2\n\t"
"punpckhwd %%mm7, %%mm3\n\t"
"punpckhwd %%mm7, %%mm4\n\t"
"punpckhwd %%mm7, %%mm5\n\t"
"psllq $8, %%mm1\n\t"
"psllq $16, %%mm2\n\t"
"por %%mm1, %%mm0\n\t"
"por %%mm2, %%mm0\n\t"
"psllq $8, %%mm4\n\t"
"psllq $16, %%mm5\n\t"
"por %%mm4, %%mm3\n\t"
"por %%mm5, %%mm3\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm3, 8%0\n\t"
:"=m"(*d)
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
:"memory");
d += 16;
s += 4;
}
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
while(s < end)
{
register uint16_t bgr;
bgr = *s++;
*d++ = (bgr&0x1F)<<3;
*d++ = (bgr&0x7E0)>>3;
*d++ = (bgr&0xF800)>>8;
*d++ = 0;
}
}
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
asm volatile (
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 32(%0, %%eax) \n\t"
"movq (%0, %%eax), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"pslld $16, %%mm0 \n\t"
"psrld $16, %%mm1 \n\t"
"pand "MANGLE(mask32r)", %%mm0 \n\t"
"pand "MANGLE(mask32g)", %%mm2 \n\t"
"pand "MANGLE(mask32b)", %%mm1 \n\t"
"por %%mm0, %%mm2 \n\t"
"por %%mm1, %%mm2 \n\t"
MOVNTQ" %%mm2, (%1, %%eax) \n\t"
"addl $8, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "r" (src), "r"(dst), "r" (src_size-7)
: "%eax"
);
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#else
unsigned i;
unsigned num_pixels = src_size >> 2;
for(i=0; i<num_pixels; i++)
{
#ifdef WORDS_BIGENDIAN
dst[4*i + 1] = src[4*i + 3];
dst[4*i + 2] = src[4*i + 2];
dst[4*i + 3] = src[4*i + 1];
#else
dst[4*i + 0] = src[4*i + 2];
dst[4*i + 1] = src[4*i + 1];
dst[4*i + 2] = src[4*i + 0];
#endif
}
#endif
}
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
unsigned i;
#ifdef HAVE_MMX
int mmx_size= 23 - src_size;
asm volatile (
"movq "MANGLE(mask24r)", %%mm5 \n\t"
"movq "MANGLE(mask24g)", %%mm6 \n\t"
"movq "MANGLE(mask24b)", %%mm7 \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %%eax) \n\t"
"movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
"movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
"movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
"psllq $16, %%mm0 \n\t" // 00 BGR BGR
"pand %%mm5, %%mm0 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"por %%mm0, %%mm1 \n\t"
"por %%mm2, %%mm1 \n\t"
"movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
"movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
"movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
"pand %%mm7, %%mm0 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm6, %%mm2 \n\t"
"por %%mm0, %%mm1 \n\t"
"por %%mm2, %%mm1 \n\t"
"movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
"movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
"movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
"pand %%mm6, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm5, %%mm2 \n\t"
"por %%mm0, %%mm1 \n\t"
"por %%mm2, %%mm1 \n\t"
MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
"addl $24, %%eax \n\t"
" js 1b \n\t"
: "+a" (mmx_size)
: "r" (src-mmx_size), "r"(dst-mmx_size)
);
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
if(mmx_size==23) return; //finihsed, was multiple of 8
src+= src_size;
dst+= src_size;
src_size= 23-mmx_size;
src-= src_size;
dst-= src_size;
#endif
for(i=0; i<src_size; i+=3)
{
register uint8_t x;
x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
unsigned y;
const unsigned chromWidth= width>>1;
for(y=0; y<height; y++)
{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
asm volatile(
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %%eax, 2) \n\t"
PREFETCH" 32(%2, %%eax) \n\t"
PREFETCH" 32(%3, %%eax) \n\t"
"movq (%2, %%eax), %%mm0 \n\t" // U(0)
"movq %%mm0, %%mm2 \n\t" // U(0)
"movq (%3, %%eax), %%mm1 \n\t" // V(0)
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
"movq %%mm3, %%mm4 \n\t" // Y(0)
"movq %%mm5, %%mm6 \n\t" // Y(8)
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
"addl $8, %%eax \n\t"
"cmpl %4, %%eax \n\t"
" jb 1b \n\t"
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
: "%eax"
);
#else
#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n) \
y1 = yc[n]; \
y2 = yc2[n]; \
u = uc[n]; \
v = vc[n]; \
asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
yuv1 = (u << 8) + (v << 24); \
yuv2 = yuv1 + y2; \
yuv1 += y1; \
qdst[n] = yuv1; \
qdst2[n] = yuv2;
int i;
uint64_t *qdst = (uint64_t *) dst;
uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
const uint32_t *yc = (uint32_t *) ysrc;
const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
for(i = 0; i < chromWidth; i += 8){
uint64_t y1, y2, yuv1, yuv2;
uint64_t u, v;
/* Prefetch */
asm("ldq $31,64(%0)" :: "r"(yc));
asm("ldq $31,64(%0)" :: "r"(yc2));
asm("ldq $31,64(%0)" :: "r"(uc));
asm("ldq $31,64(%0)" :: "r"(vc));
pl2yuy2(0);
pl2yuy2(1);
pl2yuy2(2);
pl2yuy2(3);
yc += 4;
yc2 += 4;
uc += 4;
vc += 4;
qdst += 4;
qdst2 += 4;
}
y++;
ysrc += lumStride;
dst += dstStride;
#elif __WORDSIZE >= 64
int i;
uint64_t *ldst = (uint64_t *) dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for(i = 0; i < chromWidth; i += 2){
uint64_t k, l;
k = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
l = yc[2] + (uc[1] << 8) +
(yc[3] << 16) + (vc[1] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int i, *idst = (int32_t *) dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
*idst++ = (yc[0] << 24)+ (uc[0] << 16) +
(yc[1] << 8) + (vc[0] << 0);
#else
*idst++ = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
#endif
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
{
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
#ifdef HAVE_MMX
asm( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#endif
}
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride)
{
//FIXME interpolate chroma
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
unsigned y;
const unsigned chromWidth= width>>1;
for(y=0; y<height; y++)
{
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
asm volatile(
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %%eax, 2) \n\t"
PREFETCH" 32(%2, %%eax) \n\t"
PREFETCH" 32(%3, %%eax) \n\t"
"movq (%2, %%eax), %%mm0 \n\t" // U(0)
"movq %%mm0, %%mm2 \n\t" // U(0)
"movq (%3, %%eax), %%mm1 \n\t" // V(0)
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
"movq %%mm0, %%mm4 \n\t" // Y(0)
"movq %%mm2, %%mm6 \n\t" // Y(8)
"punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
"punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
"punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
"punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
"addl $8, %%eax \n\t"
"cmpl %4, %%eax \n\t"
" jb 1b \n\t"
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
: "%eax"
);
#else
//FIXME adapt the alpha asm code from yv12->yuy2
#if __WORDSIZE >= 64
int i;
uint64_t *ldst = (uint64_t *) dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for(i = 0; i < chromWidth; i += 2){
uint64_t k, l;
k = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
l = uc[1] + (yc[2] << 8) +
(vc[1] << 16) + (yc[3] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int i, *idst = (int32_t *) dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
*idst++ = (uc[0] << 24)+ (yc[0] << 16) +
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
#endif
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
{
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
#ifdef HAVE_MMX
asm( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#endif
}
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride)
{
//FIXME interpolate chroma
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
/**
*
* width should be a multiple of 16
*/
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int dstStride)
{
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride)
{
unsigned y;
const unsigned chromWidth= width>>1;
for(y=0; y<height; y+=2)
{
#ifdef HAVE_MMX
asm volatile(
"xorl %%eax, %%eax \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%eax, 4) \n\t"
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
"movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
"movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
MOVNTQ" %%mm0, (%3, %%eax) \n\t"
MOVNTQ" %%mm2, (%2, %%eax) \n\t"
"addl $8, %%eax \n\t"
"cmpl %4, %%eax \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%eax"
);
ydst += lumStride;
src += srcStride;
asm volatile(
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%eax, 4) \n\t"
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
"addl $8, %%eax \n\t"
"cmpl %4, %%eax \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%eax"
);
#else
unsigned i;
for(i=0; i<chromWidth; i++)
{
ydst[2*i+0] = src[4*i+0];
udst[i] = src[4*i+1];
ydst[2*i+1] = src[4*i+2];
vdst[i] = src[4*i+3];
}
ydst += lumStride;
src += srcStride;
for(i=0; i<chromWidth; i++)
{
ydst[2*i+0] = src[4*i+0];
ydst[2*i+1] = src[4*i+2];
}
#endif
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#endif
}
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height, int lumStride, int chromStride)
{
/* Y Plane */
memcpy(ydst, ysrc, width*height);
/* XXX: implement upscaling for U,V */
}
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
int x,y;
dst[0]= src[0];
// first line
for(x=0; x<srcWidth-1; x++){
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
}
dst[2*srcWidth-1]= src[srcWidth-1];
dst+= dstStride;
for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
const int mmxSize= srcWidth&~15;
asm volatile(
"movl %4, %%eax \n\t"
"1: \n\t"
"movq (%0, %%eax), %%mm0 \n\t"
"movq (%1, %%eax), %%mm1 \n\t"
"movq 1(%0, %%eax), %%mm2 \n\t"
"movq 1(%1, %%eax), %%mm3 \n\t"
"movq -1(%0, %%eax), %%mm4 \n\t"
"movq -1(%1, %%eax), %%mm5 \n\t"
PAVGB" %%mm0, %%mm5 \n\t"
PAVGB" %%mm0, %%mm3 \n\t"
PAVGB" %%mm0, %%mm5 \n\t"
PAVGB" %%mm0, %%mm3 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm1, %%mm2 \n\t"
"movq %%mm5, %%mm7 \n\t"
"movq %%mm4, %%mm6 \n\t"
"punpcklbw %%mm3, %%mm5 \n\t"
"punpckhbw %%mm3, %%mm7 \n\t"
"punpcklbw %%mm2, %%mm4 \n\t"
"punpckhbw %%mm2, %%mm6 \n\t"
#if 1
MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
#else
"movq %%mm5, (%2, %%eax, 2) \n\t"
"movq %%mm7, 8(%2, %%eax, 2) \n\t"
"movq %%mm4, (%3, %%eax, 2) \n\t"
"movq %%mm6, 8(%3, %%eax, 2) \n\t"
#endif
"addl $8, %%eax \n\t"
" js 1b \n\t"
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
"g" (-mmxSize)
: "%eax"
);
#else
const int mmxSize=1;
#endif
dst[0 ]= (3*src[0] + src[srcStride])>>2;
dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
for(x=mmxSize-1; x<srcWidth-1; x++){
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
}
dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
dst+=dstStride*2;
src+=srcStride;
}
// last line
#if 1
dst[0]= src[0];
for(x=0; x<srcWidth-1; x++){
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
}
dst[2*srcWidth-1]= src[srcWidth-1];
#else
for(x=0; x<srcWidth; x++){
dst[2*x+0]=
dst[2*x+1]= src[x];
}
#endif
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#endif
}
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
*/
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride)
{
unsigned y;
const unsigned chromWidth= width>>1;
for(y=0; y<height; y+=2)
{
#ifdef HAVE_MMX
asm volatile(
"xorl %%eax, %%eax \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%eax, 4) \n\t"
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
"movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
"movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
MOVNTQ" %%mm0, (%3, %%eax) \n\t"
MOVNTQ" %%mm2, (%2, %%eax) \n\t"
"addl $8, %%eax \n\t"
"cmpl %4, %%eax \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%eax"
);
ydst += lumStride;
src += srcStride;
asm volatile(
"xorl %%eax, %%eax \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%eax, 4) \n\t"
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
"movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
"movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
"addl $8, %%eax \n\t"
"cmpl %4, %%eax \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%eax"
);
#else
unsigned i;
for(i=0; i<chromWidth; i++)
{
udst[i] = src[4*i+0];
ydst[2*i+0] = src[4*i+1];
vdst[i] = src[4*i+2];
ydst[2*i+1] = src[4*i+3];
}
ydst += lumStride;
src += srcStride;
for(i=0; i<chromWidth; i++)
{
ydst[2*i+0] = src[4*i+1];
ydst[2*i+1] = src[4*i+3];
}
#endif
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
#ifdef HAVE_MMX
asm volatile( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#endif
}
/**
*
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
*/
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
unsigned int width, unsigned int height,
int lumStride, int chromStride, int srcStride)
{
unsigned y;
const unsigned chromWidth= width>>1;
#ifdef HAVE_MMX
for(y=0; y<height-2; y+=2)
{
unsigned i;
for(i=0; i<2; i++)
{
asm volatile(
"movl %2, %%eax \n\t"
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
"movd (%0, %%ebx), %%mm0 \n\t"
"movd 3(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 6(%0, %%ebx), %%mm2 \n\t"
"movd 9(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"psraw $7, %%mm0 \n\t"
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 15(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 18(%0, %%ebx), %%mm2 \n\t"
"movd 21(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"psraw $7, %%mm4 \n\t"
"packuswb %%mm4, %%mm0 \n\t"
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
MOVNTQ" %%mm0, (%1, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
: "%eax", "%ebx"
);
ydst += lumStride;
src += srcStride;
}
src -= srcStride*2;
asm volatile(
"movl %4, %%eax \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
"addl %%ebx, %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq (%0, %%ebx), %%mm0 \n\t"
"movq (%1, %%ebx), %%mm1 \n\t"
"movq 6(%0, %%ebx), %%mm2 \n\t"
"movq 6(%1, %%ebx), %%mm3 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm0 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd (%0, %%ebx), %%mm0 \n\t"
"movd (%1, %%ebx), %%mm1 \n\t"
"movd 3(%0, %%ebx), %%mm2 \n\t"
"movd 3(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"movd 6(%0, %%ebx), %%mm4 \n\t"
"movd 6(%1, %%ebx), %%mm1 \n\t"
"movd 9(%0, %%ebx), %%mm2 \n\t"
"movd 9(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
"psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq 12(%0, %%ebx), %%mm4 \n\t"
"movq 12(%1, %%ebx), %%mm1 \n\t"
"movq 18(%0, %%ebx), %%mm2 \n\t"
"movq 18(%1, %%ebx), %%mm3 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm4 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 12(%1, %%ebx), %%mm1 \n\t"
"movd 15(%0, %%ebx), %%mm2 \n\t"
"movd 15(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movd 18(%0, %%ebx), %%mm5 \n\t"
"movd 18(%1, %%ebx), %%mm1 \n\t"
"movd 21(%0, %%ebx), %%mm2 \n\t"
"movd 21(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm5 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
"psraw $7, %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpckldq %%mm4, %%mm0 \n\t"
"punpckhdq %%mm4, %%mm1 \n\t"
"packsswb %%mm1, %%mm0 \n\t"
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
"movd %%mm0, (%2, %%eax) \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
"movd %%mm0, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
: "%eax", "%ebx"
);
udst += chromStride;
vdst += chromStride;
src += srcStride*2;
}
asm volatile( EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
#else
y=0;
#endif
for(; y<height; y+=2)
{
unsigned i;
for(i=0; i<chromWidth; i++)
{
unsigned int b= src[6*i+0];
unsigned int g= src[6*i+1];
unsigned int r= src[6*i+2];
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
udst[i] = U;
vdst[i] = V;
ydst[2*i] = Y;
b= src[6*i+3];
g= src[6*i+4];
r= src[6*i+5];
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
ydst[2*i+1] = Y;
}
ydst += lumStride;
src += srcStride;
for(i=0; i<chromWidth; i++)
{
unsigned int b= src[6*i+0];
unsigned int g= src[6*i+1];
unsigned int r= src[6*i+2];
unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
ydst[2*i] = Y;
b= src[6*i+3];
g= src[6*i+4];
r= src[6*i+5];
Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
ydst[2*i+1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
unsigned width, unsigned height, int src1Stride,
int src2Stride, int dstStride){
unsigned h;
for(h=0; h < height; h++)
{
unsigned w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
asm(
"xorl %%eax, %%eax \n\t"
"1: \n\t"
PREFETCH" 64(%1, %%eax) \n\t"
PREFETCH" 64(%2, %%eax) \n\t"
"movdqa (%1, %%eax), %%xmm0 \n\t"
"movdqa (%1, %%eax), %%xmm1 \n\t"
"movdqa (%2, %%eax), %%xmm2 \n\t"
"punpcklbw %%xmm2, %%xmm0 \n\t"
"punpckhbw %%xmm2, %%xmm1 \n\t"
"movntdq %%xmm0, (%0, %%eax, 2) \n\t"
"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
"addl $16, %%eax \n\t"
"cmpl %3, %%eax \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
: "memory", "%eax"
);
#else
asm(
"xorl %%eax, %%eax \n\t"
"1: \n\t"
PREFETCH" 64(%1, %%eax) \n\t"
PREFETCH" 64(%2, %%eax) \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq 8(%1, %%eax), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq (%2, %%eax), %%mm4 \n\t"
"movq 8(%2, %%eax), %%mm5 \n\t"
"punpcklbw %%mm4, %%mm0 \n\t"
"punpckhbw %%mm4, %%mm1 \n\t"
"punpcklbw %%mm5, %%mm2 \n\t"
"punpckhbw %%mm5, %%mm3 \n\t"
MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
"addl $16, %%eax \n\t"
"cmpl %3, %%eax \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
: "memory", "%eax"
);
#endif
for(w= (width&(~15)); w < width; w++)
{
dest[2*w+0] = src1[w];
dest[2*w+1] = src2[w];
}
#else
for(w=0; w < width; w++)
{
dest[2*w+0] = src1[w];
dest[2*w+1] = src2[w];
}
#endif
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
#ifdef HAVE_MMX
asm(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
#endif
}
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
unsigned width, unsigned height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
unsigned int y,x,h;
int w;
w=width/2; h=height/2;
#ifdef HAVE_MMX
asm volatile(
PREFETCH" %0\n\t"
PREFETCH" %1\n\t"
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
for(y=0;y<h;y++){
const uint8_t* s1=src1+srcStride1*(y>>1);
uint8_t* d=dst1+dstStride1*y;
x=0;
#ifdef HAVE_MMX
for(;x<w-31;x+=32)
{
asm volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm2\n\t"
"movq 16%1, %%mm4\n\t"
"movq 24%1, %%mm6\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm2, %%mm3\n\t"
"movq %%mm4, %%mm5\n\t"
"movq %%mm6, %%mm7\n\t"
"punpcklbw %%mm0, %%mm0\n\t"
"punpckhbw %%mm1, %%mm1\n\t"
"punpcklbw %%mm2, %%mm2\n\t"
"punpckhbw %%mm3, %%mm3\n\t"
"punpcklbw %%mm4, %%mm4\n\t"
"punpckhbw %%mm5, %%mm5\n\t"
"punpcklbw %%mm6, %%mm6\n\t"
"punpckhbw %%mm7, %%mm7\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm2, 16%0\n\t"
MOVNTQ" %%mm3, 24%0\n\t"
MOVNTQ" %%mm4, 32%0\n\t"
MOVNTQ" %%mm5, 40%0\n\t"
MOVNTQ" %%mm6, 48%0\n\t"
MOVNTQ" %%mm7, 56%0"
:"=m"(d[2*x])
:"m"(s1[x])
:"memory");
}
#endif
for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
}
for(y=0;y<h;y++){
const uint8_t* s2=src2+srcStride2*(y>>1);
uint8_t* d=dst2+dstStride2*y;
x=0;
#ifdef HAVE_MMX
for(;x<w-31;x+=32)
{
asm volatile(
PREFETCH" 32%1\n\t"
"movq %1, %%mm0\n\t"
"movq 8%1, %%mm2\n\t"
"movq 16%1, %%mm4\n\t"
"movq 24%1, %%mm6\n\t"
"movq %%mm0, %%mm1\n\t"
"movq %%mm2, %%mm3\n\t"
"movq %%mm4, %%mm5\n\t"
"movq %%mm6, %%mm7\n\t"
"punpcklbw %%mm0, %%mm0\n\t"
"punpckhbw %%mm1, %%mm1\n\t"
"punpcklbw %%mm2, %%mm2\n\t"
"punpckhbw %%mm3, %%mm3\n\t"
"punpcklbw %%mm4, %%mm4\n\t"
"punpckhbw %%mm5, %%mm5\n\t"
"punpcklbw %%mm6, %%mm6\n\t"
"punpckhbw %%mm7, %%mm7\n\t"
MOVNTQ" %%mm0, %0\n\t"
MOVNTQ" %%mm1, 8%0\n\t"
MOVNTQ" %%mm2, 16%0\n\t"
MOVNTQ" %%mm3, 24%0\n\t"
MOVNTQ" %%mm4, 32%0\n\t"
MOVNTQ" %%mm5, 40%0\n\t"
MOVNTQ" %%mm6, 48%0\n\t"
MOVNTQ" %%mm7, 56%0"
:"=m"(d[2*x])
:"m"(s2[x])
:"memory");
}
#endif
for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
}
#ifdef HAVE_MMX
asm(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
#endif
}
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
unsigned width, unsigned height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
unsigned y,x,w,h;
w=width/2; h=height;
for(y=0;y<h;y++){
const uint8_t* yp=src1+srcStride1*y;
const uint8_t* up=src2+srcStride2*(y>>2);
const uint8_t* vp=src3+srcStride3*(y>>2);
uint8_t* d=dst+dstStride*y;
x=0;
#ifdef HAVE_MMX
for(;x<w-7;x+=8)
{
asm volatile(
PREFETCH" 32(%1, %0)\n\t"
PREFETCH" 32(%2, %0)\n\t"
PREFETCH" 32(%3, %0)\n\t"
"movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
"movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
"movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
"movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
"movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
"movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
"punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
"punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
"punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
"punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
"movq %%mm1, %%mm6\n\t"
"punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
"punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
"punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
"punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
"movq 8(%1, %0, 4), %%mm0\n\t"
"movq %%mm0, %%mm3\n\t"
"punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
"punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
"movq %%mm4, %%mm6\n\t"
"movq 16(%1, %0, 4), %%mm0\n\t"
"movq %%mm0, %%mm3\n\t"
"punpcklbw %%mm5, %%mm4\n\t"
"punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
"punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
"punpckhbw %%mm5, %%mm6\n\t"
"movq 24(%1, %0, 4), %%mm0\n\t"
"movq %%mm0, %%mm3\n\t"
"punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
"punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
: "+r" (x)
: "r"(yp), "r" (up), "r"(vp), "r"(d)
:"memory");
}
#endif
for(; x<w; x++)
{
const int x2= x<<2;
d[8*x+0]=yp[x2];
d[8*x+1]=up[x];
d[8*x+2]=yp[x2+1];
d[8*x+3]=vp[x];
d[8*x+4]=yp[x2+2];
d[8*x+5]=up[x];
d[8*x+6]=yp[x2+3];
d[8*x+7]=vp[x];
}
}
#ifdef HAVE_MMX
asm(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
#endif
}
/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09
supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
{BGR,RGB}{1,4,8,15,16} support dithering
unscaled special converters (YV12=I420=IYUV, Y800=Y8)
YV12 -> {BGR,RGB}{1,4,8,15,16,24,32}
x -> x
YUV9 -> YV12
YUV9/YV12 -> Y800
Y800 -> YUV9/YV12
BGR24 -> BGR32 & RGB24 -> RGB32
BGR32 -> BGR24 & RGB32 -> RGB24
BGR15 -> BGR16
*/
/*
tested special converters (most are tested actually but i didnt write it down ...)
YV12 -> BGR16
YV12 -> YV12
BGR15 -> BGR16
BGR16 -> BGR16
YVU9 -> YV12
untested special converters
YV12/I420 -> BGR15/BGR24/BGR32 (its the yuv2rgb stuff, so it should be ok)
YV12/I420 -> YV12/I420
YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
BGR24 -> BGR32 & RGB24 -> RGB32
BGR32 -> BGR24 & RGB32 -> RGB24
BGR24 -> YV12
*/
#include <inttypes.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include "config.h"
#include <assert.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include "swscale.h"
#include "swscale_internal.h"
#include "common.h"
#include "rgb2rgb.h"
#define RUNTIME_CPUDETECT 1
#undef MOVNTQ
#undef PAVGB
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86
//#define WORDS_BIGENDIAN
#define DITHER1XBPP
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
#define RET 0xC3 //near return opcode for X86
#ifdef MP_DEBUG
#define ASSERT(x) assert(x);
#else
#define ASSERT(x) ;
#endif
#ifdef M_PI
#define PI M_PI
#else
#define PI 3.14159265358979323846
#endif
//FIXME replace this with something faster
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YVU9 \
|| (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
#define isYUV(x) ((x)==IMGFMT_UYVY || (x)==IMGFMT_YUY2 || isPlanarYUV(x))
#define isGray(x) ((x)==IMGFMT_Y800)
#define isRGB(x) (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define isBGR(x) (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
#define isSupportedIn(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY\
|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
|| (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
|| (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9\
|| (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY\
|| (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P\
|| isRGB(x) || isBGR(x)\
|| (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
#define isPacked(x) ((x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY ||isRGB(x) || isBGR(x))
#define RGB2YUV_SHIFT 16
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
extern const int32_t Inverse_Table_6_9[8][4];
/*
NOTES
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
TODO
more intelligent missalignment avoidance for the horizontal scaler
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
add support for packed pixel yuv input & output
add support for Y8 output
optimize bgr24 & bgr32
add BGR4 output support
write special BGR->BGR scaler
*/
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#ifdef ARCH_X86
static uint64_t attribute_used __attribute__((aligned(8))) bF8= 0xF8F8F8F8F8F8F8F8LL;
static uint64_t attribute_used __attribute__((aligned(8))) bFC= 0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL;
static uint64_t attribute_used __attribute__((aligned(8))) w02= 0x0002000200020002LL;
static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t attribute_used __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t attribute_used __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
static uint64_t attribute_used __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
static uint64_t __attribute__((aligned(8))) dither4[2]={
0x0103010301030103LL,
0x0200020002000200LL,};
static uint64_t __attribute__((aligned(8))) dither8[2]={
0x0602060206020602LL,
0x0004000400040004LL,};
static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL;
static uint64_t attribute_used __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL;
static uint64_t attribute_used __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL;
static uint64_t attribute_used __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL;
static uint64_t attribute_used __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL;
static uint64_t attribute_used __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFLL;
static uint64_t attribute_used __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00LL;
static uint64_t attribute_used __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000LL;
#ifdef FAST_BGR2YV12
static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL;
static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
#else
static const uint64_t bgr2YCoeff attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL;
static const uint64_t bgr2UCoeff attribute_used __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
static const uint64_t bgr2VCoeff attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
#endif
static const uint64_t bgr2YOffset attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL;
static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8)))= 0x8080808080808080ULL;
static const uint64_t w1111 attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL;
#endif
// clipping helper table for C implementations:
static unsigned char clip_table[768];
static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b);
extern const uint8_t dither_2x2_4[2][8];
extern const uint8_t dither_2x2_8[2][8];
extern const uint8_t dither_8x8_32[8][8];
extern const uint8_t dither_8x8_73[8][8];
extern const uint8_t dither_8x8_220[8][8];
#ifdef ARCH_X86
void in_asm_used_var_warning_killer()
{
volatile int i= bF8+bFC+w10+
bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+
M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
if(i) i=0;
}
#endif
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
//FIXME Optimize (just quickly writen not opti..)
int i;
for(i=0; i<dstW; i++)
{
int val=1<<18;
int j;
for(j=0; j<lumFilterSize; j++)
val += lumSrc[j][i] * lumFilter[j];
dest[i]= MIN(MAX(val>>19, 0), 255);
}
if(uDest != NULL)
for(i=0; i<chrDstW; i++)
{
int u=1<<18;
int v=1<<18;
int j;
for(j=0; j<chrFilterSize; j++)
{
u += chrSrc[j][i] * chrFilter[j];
v += chrSrc[j][i + 2048] * chrFilter[j];
}
uDest[i]= MIN(MAX(u>>19, 0), 255);
vDest[i]= MIN(MAX(v>>19, 0), 255);
}
}
#define YSCALE_YUV_2_PACKEDX_C(type) \
for(i=0; i<(dstW>>1); i++){\
int j;\
int Y1=1<<18;\
int Y2=1<<18;\
int U=1<<18;\
int V=1<<18;\
type *r, *b, *g;\
const int i2= 2*i;\
\
for(j=0; j<lumFilterSize; j++)\
{\
Y1 += lumSrc[j][i2] * lumFilter[j];\
Y2 += lumSrc[j][i2+1] * lumFilter[j];\
}\
for(j=0; j<chrFilterSize; j++)\
{\
U += chrSrc[j][i] * chrFilter[j];\
V += chrSrc[j][i+2048] * chrFilter[j];\
}\
Y1>>=19;\
Y2>>=19;\
U >>=19;\
V >>=19;\
if((Y1|Y2|U|V)&256)\
{\
if(Y1>255) Y1=255;\
else if(Y1<0)Y1=0;\
if(Y2>255) Y2=255;\
else if(Y2<0)Y2=0;\
if(U>255) U=255;\
else if(U<0) U=0;\
if(V>255) V=255;\
else if(V<0) V=0;\
}
#define YSCALE_YUV_2_RGBX_C(type) \
YSCALE_YUV_2_PACKEDX_C(type)\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_PACKED2_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19;\
int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;\
int U= (uvbuf0[i ]*uvalpha1+uvbuf1[i ]*uvalpha)>>19;\
int V= (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;\
#define YSCALE_YUV_2_RGB2_C(type) \
YSCALE_YUV_2_PACKED2_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_PACKED1_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\
int Y2= buf0[i2+1]>>7;\
int U= (uvbuf1[i ])>>7;\
int V= (uvbuf1[i+2048])>>7;\
#define YSCALE_YUV_2_RGB1_C(type) \
YSCALE_YUV_2_PACKED1_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_PACKED1B_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\
int Y2= buf0[i2+1]>>7;\
int U= (uvbuf0[i ] + uvbuf1[i ])>>8;\
int V= (uvbuf0[i+2048] + uvbuf1[i+2048])>>8;\
#define YSCALE_YUV_2_RGB1B_C(type) \
YSCALE_YUV_2_PACKED1B_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_ANYRGB_C(func, func2)\
switch(c->dstFormat)\
{\
case IMGFMT_BGR32:\
case IMGFMT_RGB32:\
func(uint32_t)\
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
} \
break;\
case IMGFMT_RGB24:\
func(uint8_t)\
((uint8_t*)dest)[0]= r[Y1];\
((uint8_t*)dest)[1]= g[Y1];\
((uint8_t*)dest)[2]= b[Y1];\
((uint8_t*)dest)[3]= r[Y2];\
((uint8_t*)dest)[4]= g[Y2];\
((uint8_t*)dest)[5]= b[Y2];\
dest+=6;\
}\
break;\
case IMGFMT_BGR24:\
func(uint8_t)\
((uint8_t*)dest)[0]= b[Y1];\
((uint8_t*)dest)[1]= g[Y1];\
((uint8_t*)dest)[2]= r[Y1];\
((uint8_t*)dest)[3]= b[Y2];\
((uint8_t*)dest)[4]= g[Y2];\
((uint8_t*)dest)[5]= r[Y2];\
dest+=6;\
}\
break;\
case IMGFMT_RGB16:\
case IMGFMT_BGR16:\
{\
const int dr1= dither_2x2_8[y&1 ][0];\
const int dg1= dither_2x2_4[y&1 ][0];\
const int db1= dither_2x2_8[(y&1)^1][0];\
const int dr2= dither_2x2_8[y&1 ][1];\
const int dg2= dither_2x2_4[y&1 ][1];\
const int db2= dither_2x2_8[(y&1)^1][1];\
func(uint16_t)\
((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
}\
}\
break;\
case IMGFMT_RGB15:\
case IMGFMT_BGR15:\
{\
const int dr1= dither_2x2_8[y&1 ][0];\
const int dg1= dither_2x2_8[y&1 ][1];\
const int db1= dither_2x2_8[(y&1)^1][0];\
const int dr2= dither_2x2_8[y&1 ][1];\
const int dg2= dither_2x2_8[y&1 ][0];\
const int db2= dither_2x2_8[(y&1)^1][1];\
func(uint16_t)\
((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
}\
}\
break;\
case IMGFMT_RGB8:\
case IMGFMT_BGR8:\
{\
const uint8_t * const d64= dither_8x8_73[y&7];\
const uint8_t * const d32= dither_8x8_32[y&7];\
func(uint8_t)\
((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
}\
}\
break;\
case IMGFMT_RGB4:\
case IMGFMT_BGR4:\
{\
const uint8_t * const d64= dither_8x8_73 [y&7];\
const uint8_t * const d128=dither_8x8_220[y&7];\
func(uint8_t)\
((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
+ ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
}\
}\
break;\
case IMGFMT_RG4B:\
case IMGFMT_BG4B:\
{\
const uint8_t * const d64= dither_8x8_73 [y&7];\
const uint8_t * const d128=dither_8x8_220[y&7];\
func(uint8_t)\
((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
}\
}\
break;\
case IMGFMT_RGB1:\
case IMGFMT_BGR1:\
{\
const uint8_t * const d128=dither_8x8_220[y&7];\
uint8_t *g= c->table_gU[128] + c->table_gV[128];\
for(i=0; i<dstW-7; i+=8){\
int acc;\
acc = g[((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19) + d128[0]];\
acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
((uint8_t*)dest)[0]= acc;\
dest++;\
}\
\
/*\
((uint8_t*)dest)-= dstW>>4;\
{\
int acc=0;\
int left=0;\
static int top[1024];\
static int last_new[1024][1024];\
static int last_in3[1024][1024];\
static int drift[1024][1024];\
int topLeft=0;\
int shift=0;\
int count=0;\
const uint8_t * const d128=dither_8x8_220[y&7];\
int error_new=0;\
int error_in3=0;\
int f=0;\
\
for(i=dstW>>1; i<dstW; i++){\
int in= ((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19);\
int in2 = (76309 * (in - 16) + 32768) >> 16;\
int in3 = (in2 < 0) ? 0 : ((in2 > 255) ? 255 : in2);\
int old= (left*7 + topLeft + top[i]*5 + top[i+1]*3)/20 + in3\
+ (last_new[y][i] - in3)*f/256;\
int new= old> 128 ? 255 : 0;\
\
error_new+= ABS(last_new[y][i] - new);\
error_in3+= ABS(last_in3[y][i] - in3);\
f= error_new - error_in3*4;\
if(f<0) f=0;\
if(f>256) f=256;\
\
topLeft= top[i];\
left= top[i]= old - new;\
last_new[y][i]= new;\
last_in3[y][i]= in3;\
\
acc+= acc + (new&1);\
if((i&7)==6){\
((uint8_t*)dest)[0]= acc;\
((uint8_t*)dest)++;\
}\
}\
}\
*/\
}\
break;\
case IMGFMT_YUY2:\
func2\
((uint8_t*)dest)[2*i2+0]= Y1;\
((uint8_t*)dest)[2*i2+1]= U;\
((uint8_t*)dest)[2*i2+2]= Y2;\
((uint8_t*)dest)[2*i2+3]= V;\
} \
break;\
case IMGFMT_UYVY:\
func2\
((uint8_t*)dest)[2*i2+0]= U;\
((uint8_t*)dest)[2*i2+1]= Y1;\
((uint8_t*)dest)[2*i2+2]= V;\
((uint8_t*)dest)[2*i2+3]= Y2;\
} \
break;\
}\
static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int y)
{
int i;
switch(c->dstFormat)
{
case IMGFMT_RGB32:
case IMGFMT_BGR32:
YSCALE_YUV_2_RGBX_C(uint32_t)
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];
}
break;
case IMGFMT_RGB24:
YSCALE_YUV_2_RGBX_C(uint8_t)
((uint8_t*)dest)[0]= r[Y1];
((uint8_t*)dest)[1]= g[Y1];
((uint8_t*)dest)[2]= b[Y1];
((uint8_t*)dest)[3]= r[Y2];
((uint8_t*)dest)[4]= g[Y2];
((uint8_t*)dest)[5]= b[Y2];
dest+=6;
}
break;
case IMGFMT_BGR24:
YSCALE_YUV_2_RGBX_C(uint8_t)
((uint8_t*)dest)[0]= b[Y1];
((uint8_t*)dest)[1]= g[Y1];
((uint8_t*)dest)[2]= r[Y1];
((uint8_t*)dest)[3]= b[Y2];
((uint8_t*)dest)[4]= g[Y2];
((uint8_t*)dest)[5]= r[Y2];
dest+=6;
}
break;
case IMGFMT_RGB16:
case IMGFMT_BGR16:
{
const int dr1= dither_2x2_8[y&1 ][0];
const int dg1= dither_2x2_4[y&1 ][0];
const int db1= dither_2x2_8[(y&1)^1][0];
const int dr2= dither_2x2_8[y&1 ][1];
const int dg2= dither_2x2_4[y&1 ][1];
const int db2= dither_2x2_8[(y&1)^1][1];
YSCALE_YUV_2_RGBX_C(uint16_t)
((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
}
}
break;
case IMGFMT_RGB15:
case IMGFMT_BGR15:
{
const int dr1= dither_2x2_8[y&1 ][0];
const int dg1= dither_2x2_8[y&1 ][1];
const int db1= dither_2x2_8[(y&1)^1][0];
const int dr2= dither_2x2_8[y&1 ][1];
const int dg2= dither_2x2_8[y&1 ][0];
const int db2= dither_2x2_8[(y&1)^1][1];
YSCALE_YUV_2_RGBX_C(uint16_t)
((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
}
}
break;
case IMGFMT_RGB8:
case IMGFMT_BGR8:
{
const uint8_t * const d64= dither_8x8_73[y&7];
const uint8_t * const d32= dither_8x8_32[y&7];
YSCALE_YUV_2_RGBX_C(uint8_t)
((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];
((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];
}
}
break;
case IMGFMT_RGB4:
case IMGFMT_BGR4:
{
const uint8_t * const d64= dither_8x8_73 [y&7];
const uint8_t * const d128=dither_8x8_220[y&7];
YSCALE_YUV_2_RGBX_C(uint8_t)
((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]
+((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);
}
}
break;
case IMGFMT_RG4B:
case IMGFMT_BG4B:
{
const uint8_t * const d64= dither_8x8_73 [y&7];
const uint8_t * const d128=dither_8x8_220[y&7];
YSCALE_YUV_2_RGBX_C(uint8_t)
((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];
((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];
}
}
break;
case IMGFMT_RGB1:
case IMGFMT_BGR1:
{
const uint8_t * const d128=dither_8x8_220[y&7];
uint8_t *g= c->table_gU[128] + c->table_gV[128];
int acc=0;
for(i=0; i<dstW-1; i+=2){
int j;
int Y1=1<<18;
int Y2=1<<18;
for(j=0; j<lumFilterSize; j++)
{
Y1 += lumSrc[j][i] * lumFilter[j];
Y2 += lumSrc[j][i+1] * lumFilter[j];
}
Y1>>=19;
Y2>>=19;
if((Y1|Y2)&256)
{
if(Y1>255) Y1=255;
else if(Y1<0)Y1=0;
if(Y2>255) Y2=255;
else if(Y2<0)Y2=0;
}
acc+= acc + g[Y1+d128[(i+0)&7]];
acc+= acc + g[Y2+d128[(i+1)&7]];
if((i&7)==6){
((uint8_t*)dest)[0]= acc;
dest++;
}
}
}
break;
case IMGFMT_YUY2:
YSCALE_YUV_2_PACKEDX_C(void)
((uint8_t*)dest)[2*i2+0]= Y1;
((uint8_t*)dest)[2*i2+1]= U;
((uint8_t*)dest)[2*i2+2]= Y2;
((uint8_t*)dest)[2*i2+3]= V;
}
break;
case IMGFMT_UYVY:
YSCALE_YUV_2_PACKEDX_C(void)
((uint8_t*)dest)[2*i2+0]= U;
((uint8_t*)dest)[2*i2+1]= Y1;
((uint8_t*)dest)[2*i2+2]= V;
((uint8_t*)dest)[2*i2+3]= Y2;
}
break;
}
}
//Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif
#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC
#ifdef ARCH_X86
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif
#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif
#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif //ARCH_X86
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif
#ifdef ARCH_POWERPC
#ifdef COMPILE_ALTIVEC
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "swscale_template.c"
#endif
#endif //ARCH_POWERPC
#ifdef ARCH_X86
//X86 versions
/*
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _X86
#include "swscale_template.c"
*/
//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif
//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif
//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif
#endif //ARCH_X86
// minor note: the HAVE_xyz is messed up after that line so don't use it
static double getSplineCoeff(double a, double b, double c, double d, double dist)
{
// printf("%f %f %f %f %f\n", a,b,c,d,dist);
if(dist<=1.0) return ((d*dist + c)*dist + b)*dist +a;
else return getSplineCoeff( 0.0,
b+ 2.0*c + 3.0*d,
c + 3.0*d,
-b- 3.0*c - 6.0*d,
dist-1.0);
}
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
int srcW, int dstW, int filterAlign, int one, int flags,
SwsVector *srcFilter, SwsVector *dstFilter)
{
int i;
int filterSize;
int filter2Size;
int minFilterSize;
double *filter=NULL;
double *filter2=NULL;
#ifdef ARCH_X86
if(flags & SWS_CPU_CAPS_MMX)
asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
#endif
// Note the +1 is for the MMXscaler which reads over the end
*filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
if(ABS(xInc - 0x10000) <10) // unscaled
{
int i;
filterSize= 1;
filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
for(i=0; i<dstW*filterSize; i++) filter[i]=0;
for(i=0; i<dstW; i++)
{
filter[i*filterSize]=1;
(*filterPos)[i]=i;
}
}
else if(flags&SWS_POINT) // lame looking point sampling mode
{
int i;
int xDstInSrc;
filterSize= 1;
filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
xDstInSrc= xInc/2 - 0x8000;
for(i=0; i<dstW; i++)
{
int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
(*filterPos)[i]= xx;
filter[i]= 1.0;
xDstInSrc+= xInc;
}
}
else if((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) // bilinear upscale
{
int i;
int xDstInSrc;
if (flags&SWS_BICUBIC) filterSize= 4;
else if(flags&SWS_X ) filterSize= 4;
else filterSize= 2; // SWS_BILINEAR / SWS_AREA
filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
xDstInSrc= xInc/2 - 0x8000;
for(i=0; i<dstW; i++)
{
int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
int j;
(*filterPos)[i]= xx;
//Bilinear upscale / linear interpolate / Area averaging
for(j=0; j<filterSize; j++)
{
double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
double coeff= 1.0 - d;
if(coeff<0) coeff=0;
filter[i*filterSize + j]= coeff;
xx++;
}
xDstInSrc+= xInc;
}
}
else
{
double xDstInSrc;
double sizeFactor, filterSizeInSrc;
const double xInc1= (double)xInc / (double)(1<<16);
int param= (flags&SWS_PARAM_MASK)>>SWS_PARAM_SHIFT;
if (flags&SWS_BICUBIC) sizeFactor= 4.0;
else if(flags&SWS_X) sizeFactor= 8.0;
else if(flags&SWS_AREA) sizeFactor= 1.0; //downscale only, for upscale it is bilinear
else if(flags&SWS_GAUSS) sizeFactor= 8.0; // infinite ;)
else if(flags&SWS_LANCZOS) sizeFactor= param ? 2.0*param : 6.0;
else if(flags&SWS_SINC) sizeFactor= 20.0; // infinite ;)
else if(flags&SWS_SPLINE) sizeFactor= 20.0; // infinite ;)
else if(flags&SWS_BILINEAR) sizeFactor= 2.0;
else {
sizeFactor= 0.0; //GCC warning killer
ASSERT(0)
}
if(xInc1 <= 1.0) filterSizeInSrc= sizeFactor; // upscale
else filterSizeInSrc= sizeFactor*srcW / (double)dstW;
filterSize= (int)ceil(1 + filterSizeInSrc); // will be reduced later if possible
if(filterSize > srcW-2) filterSize=srcW-2;
filter= (double*)memalign(16, dstW*sizeof(double)*filterSize);
xDstInSrc= xInc1 / 2.0 - 0.5;
for(i=0; i<dstW; i++)
{
int xx= (int)(xDstInSrc - (filterSize-1)*0.5 + 0.5);
int j;
(*filterPos)[i]= xx;
for(j=0; j<filterSize; j++)
{
double d= ABS(xx - xDstInSrc)/filterSizeInSrc*sizeFactor;
double coeff;
if(flags & SWS_BICUBIC)
{
double A= param ? -param*0.01 : -0.60;
// Equation is from VirtualDub
if(d<1.0)
coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
else if(d<2.0)
coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
else
coeff=0.0;
}
/* else if(flags & SWS_X)
{
double p= param ? param*0.01 : 0.3;
coeff = d ? sin(d*PI)/(d*PI) : 1.0;
coeff*= pow(2.0, - p*d*d);
}*/
else if(flags & SWS_X)
{
double A= param ? param*0.1 : 1.0;
if(d<1.0)
coeff = cos(d*PI);
else
coeff=-1.0;
if(coeff<0.0) coeff= -pow(-coeff, A);
else coeff= pow( coeff, A);
coeff= coeff*0.5 + 0.5;
}
else if(flags & SWS_AREA)
{
double srcPixelSize= 1.0/xInc1;
if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
else coeff=0.0;
}
else if(flags & SWS_GAUSS)
{
double p= param ? param*0.1 : 3.0;
coeff = pow(2.0, - p*d*d);
}
else if(flags & SWS_SINC)
{
coeff = d ? sin(d*PI)/(d*PI) : 1.0;
}
else if(flags & SWS_LANCZOS)
{
double p= param ? param : 3.0;
coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
if(d>p) coeff=0;
}
else if(flags & SWS_BILINEAR)
{
coeff= 1.0 - d;
if(coeff<0) coeff=0;
}
else if(flags & SWS_SPLINE)
{
double p=-2.196152422706632;
coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, d);
}
else {
coeff= 0.0; //GCC warning killer
ASSERT(0)
}
filter[i*filterSize + j]= coeff;
xx++;
}
xDstInSrc+= xInc1;
}
}
/* apply src & dst Filter to filter -> filter2
free(filter);
*/
ASSERT(filterSize>0)
filter2Size= filterSize;
if(srcFilter) filter2Size+= srcFilter->length - 1;
if(dstFilter) filter2Size+= dstFilter->length - 1;
ASSERT(filter2Size>0)
filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
for(i=0; i<dstW; i++)
{
int j;
SwsVector scaleFilter;
SwsVector *outVec;
scaleFilter.coeff= filter + i*filterSize;
scaleFilter.length= filterSize;
if(srcFilter) outVec= sws_getConvVec(srcFilter, &scaleFilter);
else outVec= &scaleFilter;
ASSERT(outVec->length == filter2Size)
//FIXME dstFilter
for(j=0; j<outVec->length; j++)
{
filter2[i*filter2Size + j]= outVec->coeff[j];
}
(*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
if(outVec != &scaleFilter) sws_freeVec(outVec);
}
free(filter); filter=NULL;
/* try to reduce the filter-size (step1 find size and shift left) */
// Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
minFilterSize= 0;
for(i=dstW-1; i>=0; i--)
{
int min= filter2Size;
int j;
double cutOff=0.0;
/* get rid off near zero elements on the left by shifting left */
for(j=0; j<filter2Size; j++)
{
int k;
cutOff += ABS(filter2[i*filter2Size]);
if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
/* preserve Monotonicity because the core can't handle the filter otherwise */
if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
// Move filter coeffs left
for(k=1; k<filter2Size; k++)
filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
filter2[i*filter2Size + k - 1]= 0.0;
(*filterPos)[i]++;
}
cutOff=0.0;
/* count near zeros on the right */
for(j=filter2Size-1; j>0; j--)
{
cutOff += ABS(filter2[i*filter2Size + j]);
if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
min--;
}
if(min>minFilterSize) minFilterSize= min;
}
if (flags & SWS_CPU_CAPS_ALTIVEC) {
// we can handle the special case 4,
// so we don't want to go to the full 8
if (minFilterSize < 5)
filterAlign = 4;
// we really don't want to waste our time
// doing useless computation, so fall-back on
// the scalar C code for very small filter.
// vectorizing is worth it only if you have
// decent-sized vector.
if (minFilterSize < 3)
filterAlign = 1;
}
ASSERT(minFilterSize > 0)
filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
ASSERT(filterSize > 0)
filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
*outFilterSize= filterSize;
if(flags&SWS_PRINT_INFO)
MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
/* try to reduce the filter-size (step2 reduce it) */
for(i=0; i<dstW; i++)
{
int j;
for(j=0; j<filterSize; j++)
{
if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
else filter[i*filterSize + j]= filter2[i*filter2Size + j];
}
}
free(filter2); filter2=NULL;
//FIXME try to align filterpos if possible
//fix borders
for(i=0; i<dstW; i++)
{
int j;
if((*filterPos)[i] < 0)
{
// Move filter coeffs left to compensate for filterPos
for(j=1; j<filterSize; j++)
{
int left= MAX(j + (*filterPos)[i], 0);
filter[i*filterSize + left] += filter[i*filterSize + j];
filter[i*filterSize + j]=0;
}
(*filterPos)[i]= 0;
}
if((*filterPos)[i] + filterSize > srcW)
{
int shift= (*filterPos)[i] + filterSize - srcW;
// Move filter coeffs right to compensate for filterPos
for(j=filterSize-2; j>=0; j--)
{
int right= MIN(j + shift, filterSize-1);
filter[i*filterSize +right] += filter[i*filterSize +j];
filter[i*filterSize +j]=0;
}
(*filterPos)[i]= srcW - filterSize;
}
}
// Note the +1 is for the MMXscaler which reads over the end
*outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
/* Normalize & Store in outFilter */
for(i=0; i<dstW; i++)
{
int j;
double error=0;
double sum=0;
double scale= one;
for(j=0; j<filterSize; j++)
{
sum+= filter[i*filterSize + j];
}
scale/= sum;
for(j=0; j<*outFilterSize; j++)
{
double v= filter[i*filterSize + j]*scale + error;
int intV= floor(v + 0.5);
(*outFilter)[i*(*outFilterSize) + j]= intV;
error = v - intV;
}
}
(*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
for(i=0; i<*outFilterSize; i++)
{
int j= dstW*(*outFilterSize);
(*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
}
free(filter);
}
#ifdef ARCH_X86
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
uint8_t *fragmentA;
int imm8OfPShufW1A;
int imm8OfPShufW2A;
int fragmentLengthA;
uint8_t *fragmentB;
int imm8OfPShufW1B;
int imm8OfPShufW2B;
int fragmentLengthB;
int fragmentPos;
int xpos, i;
// create an optimized horizontal scaling routine
//code fragment
asm volatile(
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%edx, %%eax), %%mm3 \n\t"
"movd (%%ecx, %%esi), %%mm0 \n\t"
"movd 1(%%ecx, %%esi), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%ebx, %%eax), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%edi, %%eax) \n\t"
"addl $8, %%eax \n\t"
// End
"9: \n\t"
// "int $3\n\t"
"leal 0b, %0 \n\t"
"leal 1b, %1 \n\t"
"leal 2b, %2 \n\t"
"decl %1 \n\t"
"decl %2 \n\t"
"subl %0, %1 \n\t"
"subl %0, %2 \n\t"
"leal 9b, %3 \n\t"
"subl %0, %3 \n\t"
:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
"=r" (fragmentLengthA)
);
asm volatile(
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%edx, %%eax), %%mm3 \n\t"
"movd (%%ecx, %%esi), %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm0, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%ebx, %%eax), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%edi, %%eax) \n\t"
"addl $8, %%eax \n\t"
// End
"9: \n\t"
// "int $3\n\t"
"leal 0b, %0 \n\t"
"leal 1b, %1 \n\t"
"leal 2b, %2 \n\t"
"decl %1 \n\t"
"decl %2 \n\t"
"subl %0, %1 \n\t"
"subl %0, %2 \n\t"
"leal 9b, %3 \n\t"
"subl %0, %3 \n\t"
:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
"=r" (fragmentLengthB)
);
xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
fragmentPos=0;
for(i=0; i<dstW/numSplits; i++)
{
int xx=xpos>>16;
if((i&3) == 0)
{
int a=0;
int b=((xpos+xInc)>>16) - xx;
int c=((xpos+xInc*2)>>16) - xx;
int d=((xpos+xInc*3)>>16) - xx;
filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
filterPos[i/2]= xx;
if(d+1<4)
{
int maxShift= 3-(d+1);
int shift=0;
memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
funnyCode[fragmentPos + imm8OfPShufW1B]=
(a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
funnyCode[fragmentPos + imm8OfPShufW2B]=
a | (b<<2) | (c<<4) | (d<<6);
if(i+3>=dstW) shift=maxShift; //avoid overread
else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
if(shift && i>=shift)
{
funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
filterPos[i/2]-=shift;
}
fragmentPos+= fragmentLengthB;
}
else
{
int maxShift= 3-d;
int shift=0;
memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
funnyCode[fragmentPos + imm8OfPShufW1A]=
funnyCode[fragmentPos + imm8OfPShufW2A]=
a | (b<<2) | (c<<4) | (d<<6);
if(i+4>=dstW) shift=maxShift; //avoid overread
else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
if(shift && i>=shift)
{
funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
filterPos[i/2]-=shift;
}
fragmentPos+= fragmentLengthA;
}
funnyCode[fragmentPos]= RET;
}
xpos+=xInc;
}
filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
#endif // ARCH_X86
static void globalInit(){
// generating tables:
int i;
for(i=0; i<768; i++){
int c= MIN(MAX(i-256, 0), 255);
clip_table[i]=c;
}
}
static SwsFunc getSwsFunc(int flags){
#ifdef RUNTIME_CPUDETECT
#ifdef ARCH_X86
// ordered per speed fasterst first
if(flags & SWS_CPU_CAPS_MMX2)
return swScale_MMX2;
else if(flags & SWS_CPU_CAPS_3DNOW)
return swScale_3DNow;
else if(flags & SWS_CPU_CAPS_MMX)
return swScale_MMX;
else
return swScale_C;
#else
#ifdef ARCH_POWERPC
if(flags & SWS_CPU_CAPS_ALTIVEC)
return swScale_altivec;
else
return swScale_C;
#endif
return swScale_C;
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
return swScale_MMX2;
#elif defined (HAVE_3DNOW)
return swScale_3DNow;
#elif defined (HAVE_MMX)
return swScale_MMX;
#elif defined (HAVE_ALTIVEC)
return swScale_altivec;
#else
return swScale_C;
#endif
#endif //!RUNTIME_CPUDETECT
}
static int PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dstParam[], int dstStride[]){
uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
/* Copy Y plane */
if(dstStride[0]==srcStride[0])
memcpy(dst, src[0], srcSliceH*dstStride[0]);
else
{
int i;
uint8_t *srcPtr= src[0];
uint8_t *dstPtr= dst;
for(i=0; i<srcSliceH; i++)
{
memcpy(dstPtr, srcPtr, srcStride[0]);
srcPtr+= srcStride[0];
dstPtr+= dstStride[0];
}
}
dst = dstParam[1] + dstStride[1]*srcSliceY;
interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
return srcSliceH;
}
static int PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dstParam[], int dstStride[]){
uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
return srcSliceH;
}
static int PlanarToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dstParam[], int dstStride[]){
uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
yv12touyvy( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
return srcSliceH;
}
/* {RGB,BGR}{15,16,24,32} -> {RGB,BGR}{15,16,24,32} */
static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
const int srcFormat= c->srcFormat;
const int dstFormat= c->dstFormat;
const int srcBpp= ((srcFormat&0xFF) + 7)>>3;
const int dstBpp= ((dstFormat&0xFF) + 7)>>3;
const int srcId= (srcFormat&0xFF)>>2; // 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8
const int dstId= (dstFormat&0xFF)>>2;
void (*conv)(const uint8_t *src, uint8_t *dst, unsigned src_size)=NULL;
/* BGR -> BGR */
if( (isBGR(srcFormat) && isBGR(dstFormat))
|| (isRGB(srcFormat) && isRGB(dstFormat))){
switch(srcId | (dstId<<4)){
case 0x34: conv= rgb16to15; break;
case 0x36: conv= rgb24to15; break;
case 0x38: conv= rgb32to15; break;
case 0x43: conv= rgb15to16; break;
case 0x46: conv= rgb24to16; break;
case 0x48: conv= rgb32to16; break;
case 0x63: conv= rgb15to24; break;
case 0x64: conv= rgb16to24; break;
case 0x68: conv= rgb32to24; break;
case 0x83: conv= rgb15to32; break;
case 0x84: conv= rgb16to32; break;
case 0x86: conv= rgb24to32; break;
default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
}
}else if( (isBGR(srcFormat) && isRGB(dstFormat))
|| (isRGB(srcFormat) && isBGR(dstFormat))){
switch(srcId | (dstId<<4)){
case 0x33: conv= rgb15tobgr15; break;
case 0x34: conv= rgb16tobgr15; break;
case 0x36: conv= rgb24tobgr15; break;
case 0x38: conv= rgb32tobgr15; break;
case 0x43: conv= rgb15tobgr16; break;
case 0x44: conv= rgb16tobgr16; break;
case 0x46: conv= rgb24tobgr16; break;
case 0x48: conv= rgb32tobgr16; break;
case 0x63: conv= rgb15tobgr24; break;
case 0x64: conv= rgb16tobgr24; break;
case 0x66: conv= rgb24tobgr24; break;
case 0x68: conv= rgb32tobgr24; break;
case 0x83: conv= rgb15tobgr32; break;
case 0x84: conv= rgb16tobgr32; break;
case 0x86: conv= rgb24tobgr32; break;
case 0x88: conv= rgb32tobgr32; break;
default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
}
}else{
MSG_ERR("swScaler: internal error %s -> %s converter\n",
vo_format_name(srcFormat), vo_format_name(dstFormat));
}
if(dstStride[0]*srcBpp == srcStride[0]*dstBpp)
conv(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
else
{
int i;
uint8_t *srcPtr= src[0];
uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
for(i=0; i<srcSliceH; i++)
{
conv(srcPtr, dstPtr, c->srcW*srcBpp);
srcPtr+= srcStride[0];
dstPtr+= dstStride[0];
}
}
return srcSliceH;
}
static int bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
rgb24toyv12(
src[0],
dst[0]+ srcSliceY *dstStride[0],
dst[1]+(srcSliceY>>1)*dstStride[1],
dst[2]+(srcSliceY>>1)*dstStride[2],
c->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
static int yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
int i;
/* copy Y */
if(srcStride[0]==dstStride[0])
memcpy(dst[0]+ srcSliceY*dstStride[0], src[0], srcStride[0]*srcSliceH);
else{
uint8_t *srcPtr= src[0];
uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
for(i=0; i<srcSliceH; i++)
{
memcpy(dstPtr, srcPtr, c->srcW);
srcPtr+= srcStride[0];
dstPtr+= dstStride[0];
}
}
if(c->dstFormat==IMGFMT_YV12){
planar2x(src[1], dst[1], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[1]);
planar2x(src[2], dst[2], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[2]);
}else{
planar2x(src[1], dst[2], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[2]);
planar2x(src[2], dst[1], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[1]);
}
return srcSliceH;
}
/**
* bring pointers in YUV order instead of YVU
*/
static inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
if(format == IMGFMT_YV12 || format == IMGFMT_YVU9
|| format == IMGFMT_444P || format == IMGFMT_422P || format == IMGFMT_411P){
sortedP[0]= p[0];
sortedP[1]= p[2];
sortedP[2]= p[1];
sortedStride[0]= stride[0];
sortedStride[1]= stride[2];
sortedStride[2]= stride[1];
}
else if(isPacked(format) || isGray(format) || format == IMGFMT_Y8)
{
sortedP[0]= p[0];
sortedP[1]=
sortedP[2]= NULL;
sortedStride[0]= stride[0];
sortedStride[1]=
sortedStride[2]= 0;
}
else if(format == IMGFMT_I420 || format == IMGFMT_IYUV)
{
sortedP[0]= p[0];
sortedP[1]= p[1];
sortedP[2]= p[2];
sortedStride[0]= stride[0];
sortedStride[1]= stride[1];
sortedStride[2]= stride[2];
}else{
MSG_ERR("internal error in orderYUV\n");
}
}
/* unscaled copy like stuff (assumes nearly identical formats) */
static int simpleCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
if(isPacked(c->srcFormat))
{
if(dstStride[0]==srcStride[0])
memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
else
{
int i;
uint8_t *srcPtr= src[0];
uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
int length=0;
/* universal length finder */
while(length+c->srcW <= ABS(dstStride[0])
&& length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
ASSERT(length!=0);
for(i=0; i<srcSliceH; i++)
{
memcpy(dstPtr, srcPtr, length);
srcPtr+= srcStride[0];
dstPtr+= dstStride[0];
}
}
}
else
{ /* Planar YUV or gray */
int plane;
for(plane=0; plane<3; plane++)
{
int length= plane==0 ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample);
int y= plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
{
if(!isGray(c->dstFormat))
memset(dst[plane], 128, dstStride[plane]*height);
}
else
{
if(dstStride[plane]==srcStride[plane])
memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
else
{
int i;
uint8_t *srcPtr= src[plane];
uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
for(i=0; i<height; i++)
{
memcpy(dstPtr, srcPtr, length);
srcPtr+= srcStride[plane];
dstPtr+= dstStride[plane];
}
}
}
}
}
return srcSliceH;
}
static int remove_dup_fourcc(int fourcc)
{
switch(fourcc)
{
case IMGFMT_I420:
case IMGFMT_IYUV: return IMGFMT_YV12;
case IMGFMT_Y8 : return IMGFMT_Y800;
case IMGFMT_IF09: return IMGFMT_YVU9;
default: return fourcc;
}
}
static void getSubSampleFactors(int *h, int *v, int format){
switch(format){
case IMGFMT_UYVY:
case IMGFMT_YUY2:
*h=1;
*v=0;
break;
case IMGFMT_YV12:
case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
*h=1;
*v=1;
break;
case IMGFMT_YVU9:
*h=2;
*v=2;
break;
case IMGFMT_444P:
*h=0;
*v=0;
break;
case IMGFMT_422P:
*h=1;
*v=0;
break;
case IMGFMT_411P:
*h=2;
*v=0;
break;
default:
*h=0;
*v=0;
break;
}
}
static uint16_t roundToInt16(int64_t f){
int r= (f + (1<<15))>>16;
if(r<-0x7FFF) return 0x8000;
else if(r> 0x7FFF) return 0x7FFF;
else return r;
}
/**
* @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
* @param fullRange if 1 then the luma range is 0..255 if 0 its 16..235
* @return -1 if not supported
*/
int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){
int64_t crv = inv_table[0];
int64_t cbu = inv_table[1];
int64_t cgu = -inv_table[2];
int64_t cgv = -inv_table[3];
int64_t cy = 1<<16;
int64_t oy = 0;
if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
memcpy(c->dstColorspaceTable, table, sizeof(int)*4);
c->brightness= brightness;
c->contrast = contrast;
c->saturation= saturation;
c->srcRange = srcRange;
c->dstRange = dstRange;
c->uOffset= 0x0400040004000400LL;
c->vOffset= 0x0400040004000400LL;
if(!srcRange){
cy= (cy*255) / 219;
oy= 16<<16;
}
cy = (cy *contrast )>>16;
crv= (crv*contrast * saturation)>>32;
cbu= (cbu*contrast * saturation)>>32;
cgu= (cgu*contrast * saturation)>>32;
cgv= (cgv*contrast * saturation)>>32;
oy -= 256*brightness;
c->yCoeff= roundToInt16(cy *8192) * 0x0001000100010001ULL;
c->vrCoeff= roundToInt16(crv*8192) * 0x0001000100010001ULL;
c->ubCoeff= roundToInt16(cbu*8192) * 0x0001000100010001ULL;
c->vgCoeff= roundToInt16(cgv*8192) * 0x0001000100010001ULL;
c->ugCoeff= roundToInt16(cgu*8192) * 0x0001000100010001ULL;
c->yOffset= roundToInt16(oy * 8) * 0x0001000100010001ULL;
yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
//FIXME factorize
#ifdef HAVE_ALTIVEC
yuv2rgb_altivec_init_tables (c, inv_table);
#endif
return 0;
}
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){
if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
*inv_table = c->srcColorspaceTable;
*table = c->dstColorspaceTable;
*srcRange = c->srcRange;
*dstRange = c->dstRange;
*brightness= c->brightness;
*contrast = c->contrast;
*saturation= c->saturation;
return 0;
}
SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int dstH, int origDstFormat, int flags,
SwsFilter *srcFilter, SwsFilter *dstFilter){
SwsContext *c;
int i;
int usesVFilter, usesHFilter;
int unscaled, needsDither;
int srcFormat, dstFormat;
SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
#ifdef ARCH_X86
if(flags & SWS_CPU_CAPS_MMX)
asm volatile("emms\n\t"::: "memory");
#endif
#ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC);
#ifdef HAVE_MMX2
flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
#elif defined (HAVE_3DNOW)
flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
#elif defined (HAVE_MMX)
flags |= SWS_CPU_CAPS_MMX;
#elif defined (HAVE_ALTIVEC)
flags |= SWS_CPU_CAPS_ALTIVEC;
#endif
#endif
if(clip_table[512] != 255) globalInit();
if(rgb15to16 == NULL) sws_rgb2rgb_init(flags);
/* avoid duplicate Formats, so we don't need to check to much */
srcFormat = remove_dup_fourcc(origSrcFormat);
dstFormat = remove_dup_fourcc(origDstFormat);
unscaled = (srcW == dstW && srcH == dstH);
needsDither= (isBGR(dstFormat) || isRGB(dstFormat))
&& (dstFormat&0xFF)<24
&& ((dstFormat&0xFF)<(srcFormat&0xFF) || (!(isRGB(srcFormat) || isBGR(srcFormat))));
if(!isSupportedIn(srcFormat))
{
MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
return NULL;
}
if(!isSupportedOut(dstFormat))
{
MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
return NULL;
}
/* sanity check */
if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
{
MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
srcW, srcH, dstW, dstH);
return NULL;
}
if(!dstFilter) dstFilter= &dummyFilter;
if(!srcFilter) srcFilter= &dummyFilter;
c= memalign(64, sizeof(SwsContext));
memset(c, 0, sizeof(SwsContext));
c->srcW= srcW;
c->srcH= srcH;
c->dstW= dstW;
c->dstH= dstH;
c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
c->flags= flags;
c->dstFormat= dstFormat;
c->srcFormat= srcFormat;
c->origDstFormat= origDstFormat;
c->origSrcFormat= origSrcFormat;
c->vRounder= 4* 0x0001000100010001ULL;
usesHFilter= usesVFilter= 0;
if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesVFilter=1;
if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesHFilter=1;
if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesVFilter=1;
if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesHFilter=1;
if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesVFilter=1;
if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesHFilter=1;
if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesVFilter=1;
if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesHFilter=1;
getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
// reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation
if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
// drop some chroma lines if the user wants it
c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
c->chrSrcVSubSample+= c->vChrDrop;
// drop every 2. pixel for chroma calculation unless user wants full chroma
if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP))
c->chrSrcHSubSample=1;
c->chrIntHSubSample= c->chrDstHSubSample;
c->chrIntVSubSample= c->chrSrcVSubSample;
// note the -((-x)>>y) is so that we allways round toward +inf
c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, 0, 0, 1<<16, 1<<16);
/* unscaled special Cases */
if(unscaled && !usesHFilter && !usesVFilter)
{
/* yv12_to_nv12 */
if(srcFormat == IMGFMT_YV12 && dstFormat == IMGFMT_NV12)
{
c->swScale= PlanarToNV12Wrapper;
}
/* yuv2bgr */
if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
{
c->swScale= yuv2rgb_get_func_ptr(c);
}
if( srcFormat==IMGFMT_YVU9 && dstFormat==IMGFMT_YV12 )
{
c->swScale= yvu9toyv12Wrapper;
}
/* bgr24toYV12 */
if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
c->swScale= bgr24toyv12Wrapper;
/* rgb/bgr -> rgb/bgr (no dither needed forms) */
if( (isBGR(srcFormat) || isRGB(srcFormat))
&& (isBGR(dstFormat) || isRGB(dstFormat))
&& !needsDither)
c->swScale= rgb2rgbWrapper;
/* LQ converters if -sws 0 or -sws 4*/
if(c->flags&(SWS_FAST_BILINEAR|SWS_POINT)){
/* rgb/bgr -> rgb/bgr (dither needed forms) */
if( (isBGR(srcFormat) || isRGB(srcFormat))
&& (isBGR(dstFormat) || isRGB(dstFormat))
&& needsDither)
c->swScale= rgb2rgbWrapper;
/* yv12_to_yuy2 */
if(srcFormat == IMGFMT_YV12 &&
(dstFormat == IMGFMT_YUY2 || dstFormat == IMGFMT_UYVY))
{
if (dstFormat == IMGFMT_YUY2)
c->swScale= PlanarToYuy2Wrapper;
else
c->swScale= PlanarToUyvyWrapper;
}
}
#ifdef HAVE_ALTIVEC
if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
((srcFormat == IMGFMT_YV12 &&
(dstFormat == IMGFMT_YUY2 || dstFormat == IMGFMT_UYVY)))) {
// unscaled YV12 -> packed YUV, we want speed
if (dstFormat == IMGFMT_YUY2)
c->swScale= yv12toyuy2_unscaled_altivec;
else
c->swScale= yv12touyvy_unscaled_altivec;
}
#endif
/* simple copy */
if( srcFormat == dstFormat
|| (isPlanarYUV(srcFormat) && isGray(dstFormat))
|| (isPlanarYUV(dstFormat) && isGray(srcFormat))
)
{
c->swScale= simpleCopy;
}
if(c->swScale){
if(flags&SWS_PRINT_INFO)
MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
vo_format_name(srcFormat), vo_format_name(dstFormat));
return c;
}
}
if(flags & SWS_CPU_CAPS_MMX2)
{
c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
{
if(flags&SWS_PRINT_INFO)
MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
}
if(usesHFilter) c->canMMX2BeUsed=0;
}
else
c->canMMX2BeUsed=0;
c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// but only for the FAST_BILINEAR mode otherwise do correct scaling
// n-2 is the last chrominance sample available
// this is not perfect, but noone shuld notice the difference, the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(flags&SWS_FAST_BILINEAR)
{
if(c->canMMX2BeUsed)
{
c->lumXInc+= 20;
c->chrXInc+= 20;
}
//we don't use the x86asm scaler if mmx is available
else if(flags & SWS_CPU_CAPS_MMX)
{
c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
}
}
/* precalculate horizontal scaler filter coefficients */
{
const int filterAlign=
(flags & SWS_CPU_CAPS_MMX) ? 4 :
(flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
1;
initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
srcW , dstW, filterAlign, 1<<14,
(flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags,
srcFilter->lumH, dstFilter->lumH);
initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
(flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
srcFilter->chrH, dstFilter->chrH);
#ifdef ARCH_X86
// can't downscale !!!
if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
{
c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
}
#endif
} // Init Horizontal stuff
/* precalculate vertical scaler filter coefficients */
{
const int filterAlign=
(flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
1;
initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
srcH , dstH, filterAlign, (1<<12)-4,
(flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags,
srcFilter->lumV, dstFilter->lumV);
initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
c->chrSrcH, c->chrDstH, filterAlign, (1<<12)-4,
(flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
srcFilter->chrV, dstFilter->chrV);
}
// Calculate Buffer Sizes so that they won't run out while handling these damn slices
c->vLumBufSize= c->vLumFilterSize;
c->vChrBufSize= c->vChrFilterSize;
for(i=0; i<dstH; i++)
{
int chrI= i*c->chrDstH / dstH;
int nextSlice= MAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
nextSlice>>= c->chrSrcVSubSample;
nextSlice<<= c->chrSrcVSubSample;
if(c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
c->vLumBufSize= nextSlice - c->vLumFilterPos[i ];
if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
}
// allocate pixbufs (we use dynamic allocation because otherwise we would need to
c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
//Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
for(i=0; i<c->vLumBufSize; i++)
c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
for(i=0; i<c->vChrBufSize; i++)
c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
//try to avoid drawing green stuff between the right end and the stride end
for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
ASSERT(c->chrDstH <= dstH)
if(flags&SWS_PRINT_INFO)
{
#ifdef DITHER1XBPP
char *dither= " dithered";
#else
char *dither= "";
#endif
if(flags&SWS_FAST_BILINEAR)
MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
else if(flags&SWS_BILINEAR)
MSG_INFO("\nSwScaler: BILINEAR scaler, ");
else if(flags&SWS_BICUBIC)
MSG_INFO("\nSwScaler: BICUBIC scaler, ");
else if(flags&SWS_X)
MSG_INFO("\nSwScaler: Experimental scaler, ");
else if(flags&SWS_POINT)
MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
else if(flags&SWS_AREA)
MSG_INFO("\nSwScaler: Area Averageing scaler, ");
else if(flags&SWS_BICUBLIN)
MSG_INFO("\nSwScaler: luma BICUBIC / chroma BILINEAR scaler, ");
else if(flags&SWS_GAUSS)
MSG_INFO("\nSwScaler: Gaussian scaler, ");
else if(flags&SWS_SINC)
MSG_INFO("\nSwScaler: Sinc scaler, ");
else if(flags&SWS_LANCZOS)
MSG_INFO("\nSwScaler: Lanczos scaler, ");
else if(flags&SWS_SPLINE)
MSG_INFO("\nSwScaler: Bicubic spline scaler, ");
else
MSG_INFO("\nSwScaler: ehh flags invalid?! ");
if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
MSG_INFO("from %s to%s %s ",
vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
else
MSG_INFO("from %s to %s ",
vo_format_name(srcFormat), vo_format_name(dstFormat));
if(flags & SWS_CPU_CAPS_MMX2)
MSG_INFO("using MMX2\n");
else if(flags & SWS_CPU_CAPS_3DNOW)
MSG_INFO("using 3DNOW\n");
else if(flags & SWS_CPU_CAPS_MMX)
MSG_INFO("using MMX\n");
else if(flags & SWS_CPU_CAPS_ALTIVEC)
MSG_INFO("using AltiVec\n");
else
MSG_INFO("using C\n");
}
if(flags & SWS_PRINT_INFO)
{
if(flags & SWS_CPU_CAPS_MMX)
{
if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
else
{
if(c->hLumFilterSize==4)
MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
else if(c->hLumFilterSize==8)
MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
else
MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
if(c->hChrFilterSize==4)
MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
else if(c->hChrFilterSize==8)
MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
else
MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
}
}
else
{
#ifdef ARCH_X86
MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
#else
if(flags & SWS_FAST_BILINEAR)
MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
else
MSG_V("SwScaler: using C scaler for horizontal scaling\n");
#endif
}
if(isPlanarYUV(dstFormat))
{
if(c->vLumFilterSize==1)
MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
else
MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
}
else
{
if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
"SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)\n",(flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
else
MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
}
if(dstFormat==IMGFMT_BGR24)
MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
(flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
else if(dstFormat==IMGFMT_BGR32)
MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
else if(dstFormat==IMGFMT_BGR16)
MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
else if(dstFormat==IMGFMT_BGR15)
MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
}
if(flags & SWS_PRINT_INFO)
{
MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
}
c->swScale= getSwsFunc(flags);
return c;
}
/**
* swscale warper, so we don't need to export the SwsContext.
* assumes planar YUV to be in YUV order instead of YVU
*/
int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
//copy strides, so they can safely be modified
int srcStride2[3]= {srcStride[0], srcStride[1], srcStride[2]};
int dstStride2[3]= {dstStride[0], dstStride[1], dstStride[2]};
return c->swScale(c, src, srcStride2, srcSliceY, srcSliceH, dst, dstStride2);
}
/**
* swscale warper, so we don't need to export the SwsContext
*/
int sws_scale(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
int srcStride[3];
int dstStride[3];
uint8_t *src[3];
uint8_t *dst[3];
sws_orderYUV(c->origSrcFormat, src, srcStride, srcParam, srcStrideParam);
sws_orderYUV(c->origDstFormat, dst, dstStride, dstParam, dstStrideParam);
//printf("sws: slice %d %d\n", srcSliceY, srcSliceH);
return c->swScale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
}
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose)
{
SwsFilter *filter= malloc(sizeof(SwsFilter));
if(lumaGBlur!=0.0){
filter->lumH= sws_getGaussianVec(lumaGBlur, 3.0);
filter->lumV= sws_getGaussianVec(lumaGBlur, 3.0);
}else{
filter->lumH= sws_getIdentityVec();
filter->lumV= sws_getIdentityVec();
}
if(chromaGBlur!=0.0){
filter->chrH= sws_getGaussianVec(chromaGBlur, 3.0);
filter->chrV= sws_getGaussianVec(chromaGBlur, 3.0);
}else{
filter->chrH= sws_getIdentityVec();
filter->chrV= sws_getIdentityVec();
}
if(chromaSharpen!=0.0){
SwsVector *g= sws_getConstVec(-1.0, 3);
SwsVector *id= sws_getConstVec(10.0/chromaSharpen, 1);
g->coeff[1]=2.0;
sws_addVec(id, g);
sws_convVec(filter->chrH, id);
sws_convVec(filter->chrV, id);
sws_freeVec(g);
sws_freeVec(id);
}
if(lumaSharpen!=0.0){
SwsVector *g= sws_getConstVec(-1.0, 3);
SwsVector *id= sws_getConstVec(10.0/lumaSharpen, 1);
g->coeff[1]=2.0;
sws_addVec(id, g);
sws_convVec(filter->lumH, id);
sws_convVec(filter->lumV, id);
sws_freeVec(g);
sws_freeVec(id);
}
if(chromaHShift != 0.0)
sws_shiftVec(filter->chrH, (int)(chromaHShift+0.5));
if(chromaVShift != 0.0)
sws_shiftVec(filter->chrV, (int)(chromaVShift+0.5));
sws_normalizeVec(filter->chrH, 1.0);
sws_normalizeVec(filter->chrV, 1.0);
sws_normalizeVec(filter->lumH, 1.0);
sws_normalizeVec(filter->lumV, 1.0);
if(verbose) sws_printVec(filter->chrH);
if(verbose) sws_printVec(filter->lumH);
return filter;
}
/**
* returns a normalized gaussian curve used to filter stuff
* quality=3 is high quality, lowwer is lowwer quality
*/
SwsVector *sws_getGaussianVec(double variance, double quality){
const int length= (int)(variance*quality + 0.5) | 1;
int i;
double *coeff= memalign(sizeof(double), length*sizeof(double));
double middle= (length-1)*0.5;
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= length;
for(i=0; i<length; i++)
{
double dist= i-middle;
coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
}
sws_normalizeVec(vec, 1.0);
return vec;
}
SwsVector *sws_getConstVec(double c, int length){
int i;
double *coeff= memalign(sizeof(double), length*sizeof(double));
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= length;
for(i=0; i<length; i++)
coeff[i]= c;
return vec;
}
SwsVector *sws_getIdentityVec(void){
double *coeff= memalign(sizeof(double), sizeof(double));
SwsVector *vec= malloc(sizeof(SwsVector));
coeff[0]= 1.0;
vec->coeff= coeff;
vec->length= 1;
return vec;
}
void sws_normalizeVec(SwsVector *a, double height){
int i;
double sum=0;
double inv;
for(i=0; i<a->length; i++)
sum+= a->coeff[i];
inv= height/sum;
for(i=0; i<a->length; i++)
a->coeff[i]*= inv;
}
void sws_scaleVec(SwsVector *a, double scalar){
int i;
for(i=0; i<a->length; i++)
a->coeff[i]*= scalar;
}
static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b){
int length= a->length + b->length - 1;
double *coeff= memalign(sizeof(double), length*sizeof(double));
int i, j;
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= length;
for(i=0; i<length; i++) coeff[i]= 0.0;
for(i=0; i<a->length; i++)
{
for(j=0; j<b->length; j++)
{
coeff[i+j]+= a->coeff[i]*b->coeff[j];
}
}
return vec;
}
static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b){
int length= MAX(a->length, b->length);
double *coeff= memalign(sizeof(double), length*sizeof(double));
int i;
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= length;
for(i=0; i<length; i++) coeff[i]= 0.0;
for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
return vec;
}
static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b){
int length= MAX(a->length, b->length);
double *coeff= memalign(sizeof(double), length*sizeof(double));
int i;
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= length;
for(i=0; i<length; i++) coeff[i]= 0.0;
for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
return vec;
}
/* shift left / or right if "shift" is negative */
static SwsVector *sws_getShiftedVec(SwsVector *a, int shift){
int length= a->length + ABS(shift)*2;
double *coeff= memalign(sizeof(double), length*sizeof(double));
int i;
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= length;
for(i=0; i<length; i++) coeff[i]= 0.0;
for(i=0; i<a->length; i++)
{
coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
}
return vec;
}
void sws_shiftVec(SwsVector *a, int shift){
SwsVector *shifted= sws_getShiftedVec(a, shift);
free(a->coeff);
a->coeff= shifted->coeff;
a->length= shifted->length;
free(shifted);
}
void sws_addVec(SwsVector *a, SwsVector *b){
SwsVector *sum= sws_sumVec(a, b);
free(a->coeff);
a->coeff= sum->coeff;
a->length= sum->length;
free(sum);
}
void sws_subVec(SwsVector *a, SwsVector *b){
SwsVector *diff= sws_diffVec(a, b);
free(a->coeff);
a->coeff= diff->coeff;
a->length= diff->length;
free(diff);
}
void sws_convVec(SwsVector *a, SwsVector *b){
SwsVector *conv= sws_getConvVec(a, b);
free(a->coeff);
a->coeff= conv->coeff;
a->length= conv->length;
free(conv);
}
SwsVector *sws_cloneVec(SwsVector *a){
double *coeff= memalign(sizeof(double), a->length*sizeof(double));
int i;
SwsVector *vec= malloc(sizeof(SwsVector));
vec->coeff= coeff;
vec->length= a->length;
for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
return vec;
}
void sws_printVec(SwsVector *a){
int i;
double max=0;
double min=0;
double range;
for(i=0; i<a->length; i++)
if(a->coeff[i]>max) max= a->coeff[i];
for(i=0; i<a->length; i++)
if(a->coeff[i]<min) min= a->coeff[i];
range= max - min;
for(i=0; i<a->length; i++)
{
int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
MSG_DBG2("%1.3f ", a->coeff[i]);
for(;x>0; x--) MSG_DBG2(" ");
MSG_DBG2("|\n");
}
}
void sws_freeVec(SwsVector *a){
if(!a) return;
if(a->coeff) free(a->coeff);
a->coeff=NULL;
a->length=0;
free(a);
}
void sws_freeFilter(SwsFilter *filter){
if(!filter) return;
if(filter->lumH) sws_freeVec(filter->lumH);
if(filter->lumV) sws_freeVec(filter->lumV);
if(filter->chrH) sws_freeVec(filter->chrH);
if(filter->chrV) sws_freeVec(filter->chrV);
free(filter);
}
void sws_freeContext(SwsContext *c){
int i;
if(!c) return;
if(c->lumPixBuf)
{
for(i=0; i<c->vLumBufSize; i++)
{
if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
c->lumPixBuf[i]=NULL;
}
free(c->lumPixBuf);
c->lumPixBuf=NULL;
}
if(c->chrPixBuf)
{
for(i=0; i<c->vChrBufSize; i++)
{
if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
c->chrPixBuf[i]=NULL;
}
free(c->chrPixBuf);
c->chrPixBuf=NULL;
}
if(c->vLumFilter) free(c->vLumFilter);
c->vLumFilter = NULL;
if(c->vChrFilter) free(c->vChrFilter);
c->vChrFilter = NULL;
if(c->hLumFilter) free(c->hLumFilter);
c->hLumFilter = NULL;
if(c->hChrFilter) free(c->hChrFilter);
c->hChrFilter = NULL;
if(c->vLumFilterPos) free(c->vLumFilterPos);
c->vLumFilterPos = NULL;
if(c->vChrFilterPos) free(c->vChrFilterPos);
c->vChrFilterPos = NULL;
if(c->hLumFilterPos) free(c->hLumFilterPos);
c->hLumFilterPos = NULL;
if(c->hChrFilterPos) free(c->hChrFilterPos);
c->hChrFilterPos = NULL;
if(c->lumMmx2Filter) free(c->lumMmx2Filter);
c->lumMmx2Filter=NULL;
if(c->chrMmx2Filter) free(c->chrMmx2Filter);
c->chrMmx2Filter=NULL;
if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
c->lumMmx2FilterPos=NULL;
if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
c->chrMmx2FilterPos=NULL;
if(c->yuvTable) free(c->yuvTable);
c->yuvTable=NULL;
free(c);
}
/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef SWSCALE_H
#define SWSCALE_H
/**
* @file swscale.h
* @brief
* external api for the swscale stuff
*/
#ifdef __cplusplus
extern "C" {
#endif
/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC 4
#define SWS_X 8
#define SWS_POINT 0x10
#define SWS_AREA 0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS 0x80
#define SWS_SINC 0x100
#define SWS_LANCZOS 0x200
#define SWS_SPLINE 0x400
#define SWS_SRC_V_CHR_DROP_MASK 0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16
#define SWS_PARAM_MASK 0x3FC0000
#define SWS_PARAM_SHIFT 18
#define SWS_PRINT_INFO 0x1000
//the following 3 flags are not completly implemented
//internal chrominace subsamling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000
#define SWS_CPU_CAPS_MMX 0x80000000
#define SWS_CPU_CAPS_MMX2 0x20000000
#define SWS_CPU_CAPS_3DNOW 0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
#define SWS_MAX_REDUCE_CUTOFF 0.002
#define SWS_CS_ITU709 1
#define SWS_CS_FCC 4
#define SWS_CS_ITU601 5
#define SWS_CS_ITU624 5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT 5
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct {
double *coeff;
int length;
} SwsVector;
// vectors can be shared
typedef struct {
SwsVector *lumH;
SwsVector *lumV;
SwsVector *chrH;
SwsVector *chrV;
} SwsFilter;
struct SwsContext;
void sws_freeContext(struct SwsContext *swsContext);
struct SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
SwsFilter *srcFilter, SwsFilter *dstFilter);
int sws_scale(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]);
int sws_scale_ordered(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]);
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);
SwsVector *sws_getGaussianVec(double variance, double quality);
SwsVector *sws_getConstVec(double c, int length);
SwsVector *sws_getIdentityVec(void);
void sws_scaleVec(SwsVector *a, double scalar);
void sws_normalizeVec(SwsVector *a, double height);
void sws_convVec(SwsVector *a, SwsVector *b);
void sws_addVec(SwsVector *a, SwsVector *b);
void sws_subVec(SwsVector *a, SwsVector *b);
void sws_shiftVec(SwsVector *a, int shift);
SwsVector *sws_cloneVec(SwsVector *a);
void sws_printVec(SwsVector *a);
void sws_freeVec(SwsVector *a);
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSarpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose);
void sws_freeFilter(SwsFilter *filter);
#ifdef __cplusplus
}
#endif
#endif
/*
AltiVec-enhanced yuv2yuvX
Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
based on the equivalent C code in "postproc/swscale.c"
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif
static const vector signed int vzero =
(const vector signed int)AVV(0, 0, 0, 0);
static const vector unsigned int altivec_vectorShiftInt19 =
(const vector unsigned int)AVV(19, 19, 19, 19);
static inline void
altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) {
register int i;
if ((unsigned long)dest % 16) {
/* badly aligned store, we force store alignement */
/* and will handle load misalignement on val w/ vec_perm */
for (i = 0 ; (i < dstW) &&
(((unsigned long)dest + i) % 16) ; i++) {
int t = val[i] >> 19;
dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
}
vector unsigned char perm1 = vec_lvsl(i << 2, val);
vector signed int v1 = vec_ld(i << 2, val);
for ( ; i < (dstW - 15); i+=16) {
int offset = i << 2;
vector signed int v2 = vec_ld(offset + 16, val);
vector signed int v3 = vec_ld(offset + 32, val);
vector signed int v4 = vec_ld(offset + 48, val);
vector signed int v5 = vec_ld(offset + 64, val);
vector signed int v12 = vec_perm(v1,v2,perm1);
vector signed int v23 = vec_perm(v2,v3,perm1);
vector signed int v34 = vec_perm(v3,v4,perm1);
vector signed int v45 = vec_perm(v4,v5,perm1);
vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
vector unsigned short vs1 = vec_packsu(vA, vB);
vector unsigned short vs2 = vec_packsu(vC, vD);
vector unsigned char vf = vec_packsu(vs1, vs2);
vec_st(vf, i, dest);
v1 = v5;
}
} else { // dest is properly aligned, great
for (i = 0; i < (dstW - 15); i+=16) {
int offset = i << 2;
vector signed int v1 = vec_ld(offset, val);
vector signed int v2 = vec_ld(offset + 16, val);
vector signed int v3 = vec_ld(offset + 32, val);
vector signed int v4 = vec_ld(offset + 48, val);
vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
vector unsigned short vs1 = vec_packsu(v5, v6);
vector unsigned short vs2 = vec_packsu(v7, v8);
vector unsigned char vf = vec_packsu(vs1, vs2);
vec_st(vf, i, dest);
}
}
for ( ; i < dstW ; i++) {
int t = val[i] >> 19;
dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
}
}
static inline void
yuv2yuvX_altivec_real(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
register int i, j;
{
int __attribute__ ((aligned (16))) val[dstW];
for (i = 0; i < (dstW -7); i+=4) {
vec_st(vini, i << 2, val);
}
for (; i < dstW; i++) {
val[i] = (1 << 18);
}
for (j = 0; j < lumFilterSize; j++) {
vector signed short vLumFilter = vec_ld(j << 1, lumFilter);
vector unsigned char perm0 = vec_lvsl(j << 1, lumFilter);
vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
vector unsigned char perm = vec_lvsl(0, lumSrc[j]);
vector signed short l1 = vec_ld(0, lumSrc[j]);
for (i = 0; i < (dstW - 7); i+=8) {
int offset = i << 2;
vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]);
vector signed int v1 = vec_ld(offset, val);
vector signed int v2 = vec_ld(offset + 16, val);
vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]
vector signed int i1 = vec_mule(vLumFilter, ls);
vector signed int i2 = vec_mulo(vLumFilter, ls);
vector signed int vf1 = vec_mergeh(i1, i2);
vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]
vector signed int vo1 = vec_add(v1, vf1);
vector signed int vo2 = vec_add(v2, vf2);
vec_st(vo1, offset, val);
vec_st(vo2, offset + 16, val);
l1 = l2;
}
for ( ; i < dstW; i++) {
val[i] += lumSrc[j][i] * lumFilter[j];
}
}
altivec_packIntArrayToCharArray(val,dest,dstW);
}
if (uDest != 0) {
int __attribute__ ((aligned (16))) u[chrDstW];
int __attribute__ ((aligned (16))) v[chrDstW];
for (i = 0; i < (chrDstW -7); i+=4) {
vec_st(vini, i << 2, u);
vec_st(vini, i << 2, v);
}
for (; i < chrDstW; i++) {
u[i] = (1 << 18);
v[i] = (1 << 18);
}
for (j = 0; j < chrFilterSize; j++) {
vector signed short vChrFilter = vec_ld(j << 1, chrFilter);
vector unsigned char perm0 = vec_lvsl(j << 1, chrFilter);
vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0);
vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter
vector unsigned char perm = vec_lvsl(0, chrSrc[j]);
vector signed short l1 = vec_ld(0, chrSrc[j]);
vector signed short l1_V = vec_ld(2048 << 1, chrSrc[j]);
for (i = 0; i < (chrDstW - 7); i+=8) {
int offset = i << 2;
vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]);
vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]);
vector signed int v1 = vec_ld(offset, u);
vector signed int v2 = vec_ld(offset + 16, u);
vector signed int v1_V = vec_ld(offset, v);
vector signed int v2_V = vec_ld(offset + 16, v);
vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7]
vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055]
vector signed int i1 = vec_mule(vChrFilter, ls);
vector signed int i2 = vec_mulo(vChrFilter, ls);
vector signed int i1_V = vec_mule(vChrFilter, ls_V);
vector signed int i2_V = vec_mulo(vChrFilter, ls_V);
vector signed int vf1 = vec_mergeh(i1, i2);
vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]
vector signed int vf1_V = vec_mergeh(i1_V, i2_V);
vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]
vector signed int vo1 = vec_add(v1, vf1);
vector signed int vo2 = vec_add(v2, vf2);
vector signed int vo1_V = vec_add(v1_V, vf1_V);
vector signed int vo2_V = vec_add(v2_V, vf2_V);
vec_st(vo1, offset, u);
vec_st(vo2, offset + 16, u);
vec_st(vo1_V, offset, v);
vec_st(vo2_V, offset + 16, v);
l1 = l2;
l1_V = l2_V;
}
for ( ; i < chrDstW; i++) {
u[i] += chrSrc[j][i] * chrFilter[j];
v[i] += chrSrc[j][i + 2048] * chrFilter[j];
}
}
altivec_packIntArrayToCharArray(u,uDest,chrDstW);
altivec_packIntArrayToCharArray(v,vDest,chrDstW);
}
}
static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) {
register int i;
int __attribute__ ((aligned (16))) tempo[4];
if (filterSize % 4) {
for(i=0; i<dstW; i++) {
register int j;
register int srcPos = filterPos[i];
register int val = 0;
for(j=0; j<filterSize; j++) {
val += ((int)src[srcPos + j])*filter[filterSize*i + j];
}
dst[i] = MIN(MAX(0, val>>7), (1<<15)-1);
}
}
else
switch (filterSize) {
case 4:
{
for(i=0; i<dstW; i++) {
register int j;
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1;
if ((((int)src + srcPos)% 16) > 12) {
src_v1 = vec_ld(srcPos + 16, src);
}
vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
vector signed short src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
// now put our elements in the even slots
src_v = vec_mergeh(src_v, (vector signed short)vzero);
vector signed short filter_v = vec_ld(i << 3, filter);
// the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)
// the neat trick : we only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we chose
// carefully how to "unpack" the elements into the even slots
if ((i << 3) % 16)
filter_v = vec_mergel(filter_v,(vector signed short)vzero);
else
filter_v = vec_mergeh(filter_v,(vector signed short)vzero);
vector signed int val_vEven = vec_mule(src_v, filter_v);
vector signed int val_s = vec_sums(val_vEven, vzero);
vec_st(val_s, 0, tempo);
dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
}
}
break;
case 8:
{
for(i=0; i<dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1;
if ((((int)src + srcPos)% 16) > 8) {
src_v1 = vec_ld(srcPos + 16, src);
}
vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
vector signed short src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short filter_v = vec_ld(i << 4, filter);
// the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
vector signed int val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
vector signed int val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
}
}
break;
case 16:
{
for(i=0; i<dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
vector signed short filter_v0 = vec_ld(i << 5, filter);
vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
// the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)
vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
vector signed int val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
}
}
break;
default:
{
for(i=0; i<dstW; i++) {
register int j;
register int srcPos = filterPos[i];
vector signed int val_v = (vector signed int)vzero;
vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char permS = vec_lvsl(srcPos, src);
for (j = 0 ; j < filterSize - 15; j += 16) {
vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);
vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
val_v = vec_msums(src_vB, filter_v1, val_acc);
filter_v0R = filter_v2R;
src_v0 = src_v1;
}
if (j < (filterSize-7)) {
// loading src_v0 is useless, it's already done above
//vector unsigned char src_v0 = vec_ld(srcPos + j, src);
vector unsigned char src_v1;
if ((((int)src + srcPos)% 16) > 8) {
src_v1 = vec_ld(srcPos + j + 16, src);
}
vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
vector signed short src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
// loading filter_v0R is useless, it's already done above
//vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
vector signed short filter_v = vec_perm(filter_v0R, filter_v1R, permF);
val_v = vec_msums(src_v, filter_v, val_v);
}
vector signed int val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = MIN(MAX(0, tempo[3]>>7), (1<<15)-1);
}
}
}
}
static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
// yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
uint8_t *ysrc = src[0];
uint8_t *usrc = src[1];
uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const vector unsigned char yperm = vec_lvsl(0, ysrc);
const int vertLumPerChroma = 2;
register unsigned int y;
/* this code assume:
1) dst is 16 bytes-aligned
2) dstStride is a multiple of 16
3) width is a multiple of 16
4) lum&chrom stride are multiple of 8
*/
for(y=0; y<height; y++)
{
int i;
for (i = 0; i < width - 31; i+= 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
vec_st(v_yuy2_2, (i << 1) + 32, dst);
vec_st(v_yuy2_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
}
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
{
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
// yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
uint8_t *ysrc = src[0];
uint8_t *usrc = src[1];
uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const int vertLumPerChroma = 2;
const vector unsigned char yperm = vec_lvsl(0, ysrc);
register unsigned int y;
/* this code assume:
1) dst is 16 bytes-aligned
2) dstStride is a multiple of 16
3) width is a multiple of 16
4) lum&chrom stride are multiple of 8
*/
for(y=0; y<height; y++)
{
int i;
for (i = 0; i < width - 31; i+= 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
vec_st(v_uyvy_2, (i << 1) + 32, dst);
vec_st(v_uyvy_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
}
if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
{
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef SWSCALE_INTERNAL_H
#define SWSCALE_INTERNAL_H
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
#define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
#define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
#define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
#define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
#define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
#define MAX_FILTER_SIZE 256
typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]);
/* this struct should be aligned on at least 32-byte boundary */
typedef struct SwsContext{
/**
*
* Note the src,dst,srcStride,dstStride will be copied, in the sws_scale() warper so they can freely be modified here
*/
SwsFunc swScale;
int srcW, srcH, dstH;
int chrSrcW, chrSrcH, chrDstW, chrDstH;
int lumXInc, chrXInc;
int lumYInc, chrYInc;
int dstFormat, srcFormat; ///< format 4:2:0 type is allways YV12
int origDstFormat, origSrcFormat; ///< format
int chrSrcHSubSample, chrSrcVSubSample;
int chrIntHSubSample, chrIntVSubSample;
int chrDstHSubSample, chrDstVSubSample;
int vChrDrop;
int16_t **lumPixBuf;
int16_t **chrPixBuf;
int16_t *hLumFilter;
int16_t *hLumFilterPos;
int16_t *hChrFilter;
int16_t *hChrFilterPos;
int16_t *vLumFilter;
int16_t *vLumFilterPos;
int16_t *vChrFilter;
int16_t *vChrFilterPos;
uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change alot of code for this to be usefull
int hLumFilterSize;
int hChrFilterSize;
int vLumFilterSize;
int vChrFilterSize;
int vLumBufSize;
int vChrBufSize;
uint8_t __attribute__((aligned(32))) funnyYCode[10000];
uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
int32_t *lumMmx2FilterPos;
int32_t *chrMmx2FilterPos;
int16_t *lumMmx2Filter;
int16_t *chrMmx2Filter;
int canMMX2BeUsed;
int lastInLumBuf;
int lastInChrBuf;
int lumBufIndex;
int chrBufIndex;
int dstY;
int flags;
void * yuvTable; // pointer to the yuv->rgb table start so it can be freed()
void * table_rV[256];
void * table_gU[256];
int table_gV[256];
void * table_bU[256];
//Colorspace stuff
int contrast, brightness, saturation; // for sws_getColorspaceDetails
int srcColorspaceTable[4];
int dstColorspaceTable[4];
int srcRange, dstRange;
#define RED_DITHER "0*8"
#define GREEN_DITHER "1*8"
#define BLUE_DITHER "2*8"
#define Y_COEFF "3*8"
#define VR_COEFF "4*8"
#define UB_COEFF "5*8"
#define VG_COEFF "6*8"
#define UG_COEFF "7*8"
#define Y_OFFSET "8*8"
#define U_OFFSET "9*8"
#define V_OFFSET "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, its hardcoded in the asm
#define ESP_OFFSET "11*8+4*4*256*2+4"
#define VROUNDER_OFFSET "11*8+4*4*256*2+8"
uint64_t redDither __attribute__((aligned(8)));
uint64_t greenDither __attribute__((aligned(8)));
uint64_t blueDither __attribute__((aligned(8)));
uint64_t yCoeff __attribute__((aligned(8)));
uint64_t vrCoeff __attribute__((aligned(8)));
uint64_t ubCoeff __attribute__((aligned(8)));
uint64_t vgCoeff __attribute__((aligned(8)));
uint64_t ugCoeff __attribute__((aligned(8)));
uint64_t yOffset __attribute__((aligned(8)));
uint64_t uOffset __attribute__((aligned(8)));
uint64_t vOffset __attribute__((aligned(8)));
int32_t lumMmxFilter[4*MAX_FILTER_SIZE];
int32_t chrMmxFilter[4*MAX_FILTER_SIZE];
int dstW;
int esp;
uint64_t vRounder __attribute__((aligned(8)));
#ifdef HAVE_ALTIVEC
vector signed short CY;
vector signed short CRV;
vector signed short CBU;
vector signed short CGU;
vector signed short CGV;
vector signed short OY;
vector unsigned short CSHIFT;
#endif
} SwsContext;
//FIXME check init (where 0)
SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
#endif
/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
#define YSCALEYUV2YV12X(x, offset) \
"xorl %%eax, %%eax \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"leal " offset "(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
"movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
"addl $16, %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"testl %%esi, %%esi \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%eax))\
"addl $8, %%eax \n\t"\
"cmpl %2, %%eax \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"leal " offset "(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"jb 1b \n\t"
#define YSCALEYUV2YV121 \
"movl %2, %%eax \n\t"\
".balign 16 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t"\
"movq 8(%0, %%eax, 2), %%mm1 \n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%eax))\
"addl $8, %%eax \n\t"\
"jnc 1b \n\t"
/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"nop \n\t"\
"1: \n\t"\
"leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
"movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
"movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
"addl $16, %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"testl %%esi, %%esi \n\t"\
" jnz 2b \n\t"\
\
"leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%edx), %%mm0 \n\t" /* filterCoeff */\
"movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
"movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
"addl $16, %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"testl %%esi, %%esi \n\t"\
" jnz 2b \n\t"\
#define YSCALEYUV2RGBX \
YSCALEYUV2PACKEDX\
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#endif
#define YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define YSCALEYUV2RGB(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
#define YSCALEYUV2RGB1(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define WRITEBGR32(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (dst, index, 4))\
MOVNTQ(%%mm2, 8(dst, index, 4))\
MOVNTQ(%%mm1, 16(dst, index, 4))\
MOVNTQ(%%mm3, 24(dst, index, 4))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR15(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24OLD(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
MOVNTQ(%%mm0, (dst))\
MOVNTQ(%%mm2, 8(dst))\
MOVNTQ(%%mm3, 16(dst))\
"addl $24, "#dst" \n\t"\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
MOVNTQ(%%mm0, (dst))\
\
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
MOVNTQ(%%mm6, 8(dst))\
\
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(dst))\
\
"addl $24, "#dst" \n\t"\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq "MANGLE(M24A)", %%mm0 \n\t"\
"movq "MANGLE(M24C)", %%mm7 \n\t"\
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
"por %%mm1, %%mm6 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, (dst))\
\
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 8(dst))\
\
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(dst))\
\
"addl $24, "#dst" \n\t"\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
#define WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
if(uDest != NULL)
{
asm volatile(
YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
:: "r" (&c->redDither),
"r" (uDest), "m" (chrDstW)
: "%eax", "%edx", "%esi"
);
asm volatile(
YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
:: "r" (&c->redDither),
"r" (vDest), "m" (chrDstW)
: "%eax", "%edx", "%esi"
);
}
asm volatile(
YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
:: "r" (&c->redDither),
"r" (dest), "m" (dstW)
: "%eax", "%edx", "%esi"
);
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
if(uDest != NULL)
{
asm volatile(
YSCALEYUV2YV121
:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
"g" (-chrDstW)
: "%eax"
);
asm volatile(
YSCALEYUV2YV121
:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
"g" (-chrDstW)
: "%eax"
);
}
asm volatile(
YSCALEYUV2YV121
:: "r" (lumSrc + dstW), "r" (dest + dstW),
"g" (-dstW)
: "%eax"
);
#else
int i;
for(i=0; i<dstW; i++)
{
int val= lumSrc[i]>>7;
if(val&256){
if(val<0) val=0;
else val=255;
}
dest[i]= val;
}
if(uDest != NULL)
for(i=0; i<chrDstW; i++)
{
int u=chrSrc[i]>>7;
int v=chrSrc[i + 2048]>>7;
if((u|v)&256){
if(u<0) u=0;
else if (u>255) u=255;
if(v<0) v=0;
else if (v>255) v=255;
}
uDest[i]= u;
vDest[i]= v;
}
#endif
}
/**
* vertical scale YV12 to RGB
*/
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int dstY)
{
int dummy=0;
switch(c->dstFormat)
{
#ifdef HAVE_MMX
case IMGFMT_BGR32:
{
asm volatile(
YSCALEYUV2RGBX
WRITEBGR32(%4, %5, %%eax)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%eax", "%edx", "%esi"
);
}
break;
case IMGFMT_BGR24:
{
asm volatile(
YSCALEYUV2RGBX
"leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
"addl %4, %%ebx \n\t"
WRITEBGR24(%%ebx, %5, %%eax)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
);
}
break;
case IMGFMT_BGR15:
{
asm volatile(
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%4, %5, %%eax)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%eax", "%edx", "%esi"
);
}
break;
case IMGFMT_BGR16:
{
asm volatile(
YSCALEYUV2RGBX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%4, %5, %%eax)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%eax", "%edx", "%esi"
);
}
break;
case IMGFMT_YUY2:
{
asm volatile(
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2(%4, %5, %%eax)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW)
: "%eax", "%edx", "%esi"
);
}
break;
#endif
default:
#ifdef HAVE_ALTIVEC
altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, dstW, dstY);
#else
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize,
dest, dstW, dstY);
#endif
break;
}
}
/**
* vertical bilinear scale YV12 to RGB
*/
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
int yalpha1=yalpha^4095;
int uvalpha1=uvalpha^4095;
int i;
#if 0 //isn't used
if(flags&SWS_FULL_CHR_H_INT)
{
switch(dstFormat)
{
#ifdef HAVE_MMX
case IMGFMT_BGR32:
asm volatile(
FULL_YSCALEYUV2RGB
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
MOVNTQ(%%mm3, (%4, %%eax, 4))
MOVNTQ(%%mm1, 8(%4, %%eax, 4))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
case IMGFMT_BGR24:
asm volatile(
FULL_YSCALEYUV2RGB
// lsb ... msb
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
"psrlq $8, %%mm3 \n\t" // GR0BGR00
"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
"por %%mm2, %%mm3 \n\t" // BGRBGR00
"movq %%mm1, %%mm2 \n\t"
"psllq $48, %%mm1 \n\t" // 000000BG
"por %%mm1, %%mm3 \n\t" // BGRBGRBG
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
"psrld $16, %%mm2 \n\t" // R000R000
"psrlq $24, %%mm1 \n\t" // 0BGR0000
"por %%mm2, %%mm1 \n\t" // RBGRR000
"movl %4, %%ebx \n\t"
"addl %%eax, %%ebx \n\t"
#ifdef HAVE_MMX2
//FIXME Alignment
"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
"movd %%mm3, (%%ebx, %%eax, 2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
break;
case IMGFMT_BGR15:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $2, %%mm1 \n\t"
"psllw $7, %%mm0 \n\t"
"pand "MANGLE(g15Mask)", %%mm1 \n\t"
"pand "MANGLE(r15Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%eax, 2))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
case IMGFMT_BGR16:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $3, %%mm1 \n\t"
"psllw $8, %%mm0 \n\t"
"pand "MANGLE(g16Mask)", %%mm1 \n\t"
"pand "MANGLE(r16Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%eax, 2))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
#endif
case IMGFMT_RGB32:
#ifndef HAVE_MMX
case IMGFMT_BGR32:
#endif
if(dstFormat==IMGFMT_BGR32)
{
int i;
#ifdef WORDS_BIGENDIAN
dest++;
#endif
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
dest+= 4;
}
}
else if(dstFormat==IMGFMT_BGR24)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
dest+= 3;
}
}
else if(dstFormat==IMGFMT_BGR16)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
((uint16_t*)dest)[i] =
clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
clip_table16r[(Y + yuvtab_3343[V]) >>13];
}
}
else if(dstFormat==IMGFMT_BGR15)
{
int i;
for(i=0;i<dstW;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
((uint16_t*)dest)[i] =
clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
clip_table15r[(Y + yuvtab_3343[V]) >>13];
}
}
}//FULL_UV_IPOL
else
{
#endif // if 0
#ifdef HAVE_MMX
switch(c->dstFormat)
{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
case IMGFMT_BGR32:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB(%%eax, %5)
WRITEBGR32(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR24:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB(%%eax, %5)
WRITEBGR24(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR15:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB(%%eax, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR16:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB(%%eax, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_YUY2:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2PACKED(%%eax, %5)
WRITEYUY2(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
default: break;
}
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
/**
* YV12 to RGB without scaling or interpolating
*/
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
const int yalpha1=0;
int i;
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
const int yalpha= 4096; //FIXME ...
if(flags&SWS_FULL_CHR_H_INT)
{
RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
return;
}
#ifdef HAVE_MMX
if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
{
switch(dstFormat)
{
case IMGFMT_BGR32:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1(%%eax, %5)
WRITEBGR32(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR24:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1(%%eax, %5)
WRITEBGR24(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR15:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1(%%eax, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR16:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1(%%eax, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_YUY2:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2PACKED1(%%eax, %5)
WRITEYUY2(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
}
}
else
{
switch(dstFormat)
{
case IMGFMT_BGR32:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1b(%%eax, %5)
WRITEBGR32(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR24:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1b(%%eax, %5)
WRITEBGR24(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR15:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1b(%%eax, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR15(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_BGR16:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2RGB1b(%%eax, %5)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
WRITEBGR16(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
case IMGFMT_YUY2:
asm volatile(
"movl %%esp, "ESP_OFFSET"(%5) \n\t"
"movl %4, %%esp \n\t"
YSCALEYUV2PACKED1b(%%eax, %5)
WRITEYUY2(%%esp, 8280(%5), %%eax)
"movl "ESP_OFFSET"(%5), %%esp \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
"r" (&c->redDither)
: "%eax"
);
return;
}
}
#endif
if( uvalpha < 2048 )
{
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
}else{
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
}
}
//FIXME yuy2* can read upto 7 samples to much
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movq "MANGLE(bm01010101)", %%mm2\n\t"
"movl %0, %%eax \n\t"
"1: \n\t"
"movq (%1, %%eax,2), %%mm0 \n\t"
"movq 8(%1, %%eax,2), %%mm1 \n\t"
"pand %%mm2, %%mm0 \n\t"
"pand %%mm2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "g" (-width), "r" (src+width*2), "r" (dst+width)
: "%eax"
);
#else
int i;
for(i=0; i<width; i++)
dst[i]= src[2*i];
#endif
}
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"movq "MANGLE(bm01010101)", %%mm4\n\t"
"movl %0, %%eax \n\t"
"1: \n\t"
"movq (%1, %%eax,4), %%mm0 \n\t"
"movq 8(%1, %%eax,4), %%mm1 \n\t"
"movq (%2, %%eax,4), %%mm2 \n\t"
"movq 8(%2, %%eax,4), %%mm3 \n\t"
PAVGB(%%mm2, %%mm0)
PAVGB(%%mm3, %%mm1)
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"psrlw $8, %%mm0 \n\t"
"pand %%mm4, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
"movd %%mm0, (%4, %%eax) \n\t"
"movd %%mm1, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
: "%eax"
);
#else
int i;
for(i=0; i<width; i++)
{
dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
}
#endif
}
//this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movl %0, %%eax \n\t"
"1: \n\t"
"movq (%1, %%eax,2), %%mm0 \n\t"
"movq 8(%1, %%eax,2), %%mm1 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "g" (-width), "r" (src+width*2), "r" (dst+width)
: "%eax"
);
#else
int i;
for(i=0; i<width; i++)
dst[i]= src[2*i+1];
#endif
}
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"movq "MANGLE(bm01010101)", %%mm4\n\t"
"movl %0, %%eax \n\t"
"1: \n\t"
"movq (%1, %%eax,4), %%mm0 \n\t"
"movq 8(%1, %%eax,4), %%mm1 \n\t"
"movq (%2, %%eax,4), %%mm2 \n\t"
"movq 8(%2, %%eax,4), %%mm3 \n\t"
PAVGB(%%mm2, %%mm0)
PAVGB(%%mm3, %%mm1)
"pand %%mm4, %%mm0 \n\t"
"pand %%mm4, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"psrlw $8, %%mm0 \n\t"
"pand %%mm4, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
"movd %%mm0, (%4, %%eax) \n\t"
"movd %%mm1, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
: "%eax"
);
#else
int i;
for(i=0; i<width; i++)
{
dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
}
#endif
}
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMXFIXME
#else
int i;
for(i=0; i<width; i++)
{
int b= ((uint32_t*)src)[i]&0xFF;
int g= (((uint32_t*)src)[i]>>8)&0xFF;
int r= (((uint32_t*)src)[i]>>16)&0xFF;
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
}
#endif
}
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
int i;
for(i=0; i<width; i++)
{
const int a= ((uint32_t*)src1)[2*i+0];
const int e= ((uint32_t*)src1)[2*i+1];
const int c= ((uint32_t*)src2)[2*i+0];
const int d= ((uint32_t*)src2)[2*i+1];
const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
const int b= l&0x3FF;
const int g= h>>8;
const int r= l>>16;
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
#endif
}
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movl %2, %%eax \n\t"
"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
"movd (%0, %%ebx), %%mm0 \n\t"
"movd 3(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 6(%0, %%ebx), %%mm2 \n\t"
"movd 9(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"psraw $7, %%mm0 \n\t"
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 15(%0, %%ebx), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 18(%0, %%ebx), %%mm2 \n\t"
"movd 21(%0, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"psraw $7, %%mm4 \n\t"
"packuswb %%mm4, %%mm0 \n\t"
"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
"movq %%mm0, (%1, %%eax) \n\t"
"addl $8, %%eax \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (dst+width), "g" (-width)
: "%eax", "%ebx"
);
#else
int i;
for(i=0; i<width; i++)
{
int b= src[i*3+0];
int g= src[i*3+1];
int r= src[i*3+2];
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
}
#endif
}
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
asm volatile(
"movl %4, %%eax \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"leal (%%eax, %%eax, 2), %%ebx \n\t"
"addl %%ebx, %%ebx \n\t"
".balign 16 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%ebx) \n\t"
PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq (%0, %%ebx), %%mm0 \n\t"
"movq (%1, %%ebx), %%mm1 \n\t"
"movq 6(%0, %%ebx), %%mm2 \n\t"
"movq 6(%1, %%ebx), %%mm3 \n\t"
PAVGB(%%mm1, %%mm0)
PAVGB(%%mm3, %%mm2)
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm0 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB(%%mm1, %%mm0)
PAVGB(%%mm3, %%mm2)
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd (%0, %%ebx), %%mm0 \n\t"
"movd (%1, %%ebx), %%mm1 \n\t"
"movd 3(%0, %%ebx), %%mm2 \n\t"
"movd 3(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"movd 6(%0, %%ebx), %%mm4 \n\t"
"movd 6(%1, %%ebx), %%mm1 \n\t"
"movd 9(%0, %%ebx), %%mm2 \n\t"
"movd 9(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
"psraw $7, %%mm0 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
"movq 12(%0, %%ebx), %%mm4 \n\t"
"movq 12(%1, %%ebx), %%mm1 \n\t"
"movq 18(%0, %%ebx), %%mm2 \n\t"
"movq 18(%1, %%ebx), %%mm3 \n\t"
PAVGB(%%mm1, %%mm4)
PAVGB(%%mm3, %%mm2)
"movq %%mm4, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm4 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB(%%mm1, %%mm4)
PAVGB(%%mm3, %%mm2)
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd 12(%0, %%ebx), %%mm4 \n\t"
"movd 12(%1, %%ebx), %%mm1 \n\t"
"movd 15(%0, %%ebx), %%mm2 \n\t"
"movd 15(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movd 18(%0, %%ebx), %%mm5 \n\t"
"movd 18(%1, %%ebx), %%mm1 \n\t"
"movd 21(%0, %%ebx), %%mm2 \n\t"
"movd 21(%1, %%ebx), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm5 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"movq "MANGLE(w1111)", %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"addl $24, %%ebx \n\t"
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
"psraw $7, %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpckldq %%mm4, %%mm0 \n\t"
"punpckhdq %%mm4, %%mm1 \n\t"
"packsswb %%mm1, %%mm0 \n\t"
"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
"movd %%mm0, (%2, %%eax) \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
"movd %%mm0, (%3, %%eax) \n\t"
"addl $4, %%eax \n\t"
" js 1b \n\t"
: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
: "%eax", "%ebx"
);
#else
int i;
for(i=0; i<width; i++)
{
int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
#endif
}
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
int i;
for(i=0; i<width; i++)
{
int d= ((uint16_t*)src)[i];
int b= d&0x1F;
int g= (d>>5)&0x3F;
int r= (d>>11)&0x1F;
dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
}
}
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
int i;
for(i=0; i<width; i++)
{
int d0= ((uint32_t*)src1)[i];
int d1= ((uint32_t*)src2)[i];
int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
int dh2= (dh>>11) + (dh<<21);
int d= dh2 + dl;
int b= d&0x7F;
int r= (d>>11)&0x7F;
int g= d>>21;
dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
}
}
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
{
int i;
for(i=0; i<width; i++)
{
int d= ((uint16_t*)src)[i];
int b= d&0x1F;
int g= (d>>5)&0x1F;
int r= (d>>10)&0x1F;
dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
}
}
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
int i;
for(i=0; i<width; i++)
{
int d0= ((uint32_t*)src1)[i];
int d1= ((uint32_t*)src2)[i];
int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
int dh2= (dh>>11) + (dh<<21);
int d= dh2 + dl;
int b= d&0x7F;
int r= (d>>10)&0x7F;
int g= d>>21;
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
}
}
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
int i;
for(i=0; i<width; i++)
{
int r= ((uint32_t*)src)[i]&0xFF;
int g= (((uint32_t*)src)[i]>>8)&0xFF;
int b= (((uint32_t*)src)[i]>>16)&0xFF;
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
}
}
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
int i;
for(i=0; i<width; i++)
{
const int a= ((uint32_t*)src1)[2*i+0];
const int e= ((uint32_t*)src1)[2*i+1];
const int c= ((uint32_t*)src2)[2*i+0];
const int d= ((uint32_t*)src2)[2*i+1];
const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
const int r= l&0x3FF;
const int g= h>>8;
const int b= l>>16;
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
}
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
int i;
for(i=0; i<width; i++)
{
int r= src[i*3+0];
int g= src[i*3+1];
int b= src[i*3+2];
dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
}
}
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
int i;
for(i=0; i<width; i++)
{
int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
}
}
// Bilinear / Bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
assert(filterSize % 4 == 0 && filterSize>0);
if(filterSize==4) // allways true for upscaling, sometimes for down too
{
int counter= -2*dstW;
filter-= counter*2;
filterPos-= counter/2;
dst-= counter/2;
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movq "MANGLE(w02)", %%mm6 \n\t"
"pushl %%ebp \n\t" // we use 7 regs here ...
"movl %%eax, %%ebp \n\t"
".balign 16 \n\t"
"1: \n\t"
"movzwl (%2, %%ebp), %%eax \n\t"
"movzwl 2(%2, %%ebp), %%ebx \n\t"
"movq (%1, %%ebp, 4), %%mm1 \n\t"
"movq 8(%1, %%ebp, 4), %%mm3 \n\t"
"movd (%3, %%eax), %%mm0 \n\t"
"movd (%3, %%ebx), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"pmaddwd %%mm1, %%mm0 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm3 \n\t"
"packssdw %%mm3, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"packssdw %%mm0, %%mm0 \n\t"
"movd %%mm0, (%4, %%ebp) \n\t"
"addl $4, %%ebp \n\t"
" jnc 1b \n\t"
"popl %%ebp \n\t"
: "+a" (counter)
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
: "%ebx"
);
}
else if(filterSize==8)
{
int counter= -2*dstW;
filter-= counter*4;
filterPos-= counter/2;
dst-= counter/2;
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movq "MANGLE(w02)", %%mm6 \n\t"
"pushl %%ebp \n\t" // we use 7 regs here ...
"movl %%eax, %%ebp \n\t"
".balign 16 \n\t"
"1: \n\t"
"movzwl (%2, %%ebp), %%eax \n\t"
"movzwl 2(%2, %%ebp), %%ebx \n\t"
"movq (%1, %%ebp, 8), %%mm1 \n\t"
"movq 16(%1, %%ebp, 8), %%mm3 \n\t"
"movd (%3, %%eax), %%mm0 \n\t"
"movd (%3, %%ebx), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"pmaddwd %%mm1, %%mm0 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"movq 8(%1, %%ebp, 8), %%mm1 \n\t"
"movq 24(%1, %%ebp, 8), %%mm5 \n\t"
"movd 4(%3, %%eax), %%mm4 \n\t"
"movd 4(%3, %%ebx), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"pmaddwd %%mm1, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"paddd %%mm4, %%mm0 \n\t"
"paddd %%mm5, %%mm3 \n\t"
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm3 \n\t"
"packssdw %%mm3, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"packssdw %%mm0, %%mm0 \n\t"
"movd %%mm0, (%4, %%ebp) \n\t"
"addl $4, %%ebp \n\t"
" jnc 1b \n\t"
"popl %%ebp \n\t"
: "+a" (counter)
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
: "%ebx"
);
}
else
{
int counter= -2*dstW;
// filter-= counter*filterSize/2;
filterPos-= counter/2;
dst-= counter/2;
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movq "MANGLE(w02)", %%mm6 \n\t"
".balign 16 \n\t"
"1: \n\t"
"movl %2, %%ecx \n\t"
"movzwl (%%ecx, %0), %%eax \n\t"
"movzwl 2(%%ecx, %0), %%ebx \n\t"
"movl %5, %%ecx \n\t"
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t"
"2: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1, %6), %%mm3 \n\t"
"movd (%%ecx, %%eax), %%mm0 \n\t"
"movd (%%ecx, %%ebx), %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"pmaddwd %%mm1, %%mm0 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"paddd %%mm3, %%mm5 \n\t"
"paddd %%mm0, %%mm4 \n\t"
"addl $8, %1 \n\t"
"addl $4, %%ecx \n\t"
"cmpl %4, %%ecx \n\t"
" jb 2b \n\t"
"addl %6, %1 \n\t"
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm5 \n\t"
"packssdw %%mm5, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"packssdw %%mm4, %%mm4 \n\t"
"movl %3, %%eax \n\t"
"movd %%mm4, (%%eax, %0) \n\t"
"addl $4, %0 \n\t"
" jnc 1b \n\t"
: "+r" (counter), "+r" (filter)
: "m" (filterPos), "m" (dst), "m"(src+filterSize),
"m" (src), "r" (filterSize*2)
: "%ebx", "%eax", "%ecx"
);
}
#else
#ifdef HAVE_ALTIVEC
hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
int i;
for(i=0; i<dstW; i++)
{
int j;
int srcPos= filterPos[i];
int val=0;
// printf("filterPos: %d\n", filterPos[i]);
for(j=0; j<filterSize; j++)
{
// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
val += ((int)src[srcPos + j])*filter[filterSize*i + j];
}
// filter += hFilterSize;
dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
// dst[i] = val>>7;
}
#endif
#endif
}
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
int32_t *mmx2FilterPos)
{
if(srcFormat==IMGFMT_YUY2)
{
RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_UYVY)
{
RENAME(uyvyToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_BGR32)
{
RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_BGR24)
{
RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_BGR16)
{
RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_BGR15)
{
RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_RGB32)
{
RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
else if(srcFormat==IMGFMT_RGB24)
{
RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
src= formatConvBuffer;
}
#ifdef HAVE_MMX
// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
if(!(flags&SWS_FAST_BILINEAR))
#endif
{
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
}
else // Fast Bilinear upscale / crap downscale
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
int i;
if(canMMX2BeUsed)
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movl %0, %%ecx \n\t"
"movl %1, %%edi \n\t"
"movl %2, %%edx \n\t"
"movl %3, %%ebx \n\t"
"xorl %%eax, %%eax \n\t" // i
PREFETCH" (%%ecx) \n\t"
PREFETCH" 32(%%ecx) \n\t"
PREFETCH" 64(%%ecx) \n\t"
#define FUNNY_Y_CODE \
"movl (%%ebx), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%ebx, %%eax), %%ecx \n\t"\
"addl %%eax, %%edi \n\t"\
"xorl %%eax, %%eax \n\t"\
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
"m" (funnyYCode)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
}
else
{
#endif
//NO MMX just normal asm ...
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
".balign 16 \n\t"
"1: \n\t"
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, (%%edi, %%eax, 2) \n\t"
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
"adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, 2(%%edi, %%eax, 2) \n\t"
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
"adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
"addl $2, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#ifdef HAVE_MMX2
} //if MMX2 can't be used
#endif
#else
int i;
unsigned int xpos=0;
for(i=0;i<dstWidth;i++)
{
register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9;
dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
xpos+=xInc;
}
#endif
}
}
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
int32_t *mmx2FilterPos)
{
if(srcFormat==IMGFMT_YUY2)
{
RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_UYVY)
{
RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_BGR32)
{
RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_BGR24)
{
RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_BGR16)
{
RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_BGR15)
{
RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_RGB32)
{
RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(srcFormat==IMGFMT_RGB24)
{
RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
src1= formatConvBuffer;
src2= formatConvBuffer+2048;
}
else if(isGray(srcFormat))
{
return;
}
#ifdef HAVE_MMX
// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
if(!(flags&SWS_FAST_BILINEAR))
#endif
{
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
}
else // Fast Bilinear upscale / crap downscale
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
int i;
if(canMMX2BeUsed)
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movl %0, %%ecx \n\t"
"movl %1, %%edi \n\t"
"movl %2, %%edx \n\t"
"movl %3, %%ebx \n\t"
"xorl %%eax, %%eax \n\t" // i
PREFETCH" (%%ecx) \n\t"
PREFETCH" 32(%%ecx) \n\t"
PREFETCH" 64(%%ecx) \n\t"
#define FUNNY_UV_CODE \
"movl (%%ebx), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%ebx, %%eax), %%ecx \n\t"\
"addl %%eax, %%edi \n\t"\
"xorl %%eax, %%eax \n\t"\
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
"xorl %%eax, %%eax \n\t" // i
"movl %5, %%ecx \n\t" // src
"movl %1, %%edi \n\t" // buf1
"addl $4096, %%edi \n\t"
PREFETCH" (%%ecx) \n\t"
PREFETCH" 32(%%ecx) \n\t"
PREFETCH" 64(%%ecx) \n\t"
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
"m" (funnyUVCode), "m" (src2)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
{
// printf("%d %d %d\n", dstWidth, i, srcW);
dst[i] = src1[srcW-1]*128;
dst[i+2048] = src2[srcW-1]*128;
}
}
else
{
#endif
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
".balign 16 \n\t"
"1: \n\t"
"movl %0, %%esi \n\t"
"movzbl (%%esi, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, (%%edi, %%eax, 2) \n\t"
"movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, 4096(%%edi, %%eax, 2)\n\t"
"addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
"adcl %3, %%ebx \n\t" //xx+= xInc>>8 + carry
"addl $1, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
"r" (src2)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#ifdef HAVE_MMX2
} //if MMX2 can't be used
#endif
#else
int i;
unsigned int xpos=0;
for(i=0;i<dstWidth;i++)
{
register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9;
dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
xpos+=xInc;
}
#endif
}
}
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
/* load a few things into local vars to make the code more readable? and faster */
const int srcW= c->srcW;
const int dstW= c->dstW;
const int dstH= c->dstH;
const int chrDstW= c->chrDstW;
const int chrSrcW= c->chrSrcW;
const int lumXInc= c->lumXInc;
const int chrXInc= c->chrXInc;
const int dstFormat= c->dstFormat;
const int srcFormat= c->srcFormat;
const int flags= c->flags;
const int canMMX2BeUsed= c->canMMX2BeUsed;
int16_t *vLumFilterPos= c->vLumFilterPos;
int16_t *vChrFilterPos= c->vChrFilterPos;
int16_t *hLumFilterPos= c->hLumFilterPos;
int16_t *hChrFilterPos= c->hChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int16_t *hLumFilter= c->hLumFilter;
int16_t *hChrFilter= c->hChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
int32_t *chrMmxFilter= c->chrMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int hLumFilterSize= c->hLumFilterSize;
const int hChrFilterSize= c->hChrFilterSize;
int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrPixBuf= c->chrPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
uint8_t *funnyYCode= c->funnyYCode;
uint8_t *funnyUVCode= c->funnyUVCode;
uint8_t *formatConvBuffer= c->formatConvBuffer;
const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
int lastDstY;
/* vars whch will change and which we need to storw back in the context */
int dstY= c->dstY;
int lumBufIndex= c->lumBufIndex;
int chrBufIndex= c->chrBufIndex;
int lastInLumBuf= c->lastInLumBuf;
int lastInChrBuf= c->lastInChrBuf;
if(isPacked(c->srcFormat)){
src[0]=
src[1]=
src[2]= src[0];
srcStride[0]=
srcStride[1]=
srcStride[2]= srcStride[0];
}
srcStride[1]<<= c->vChrDrop;
srcStride[2]<<= c->vChrDrop;
// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
// (int)dst[0], (int)dst[1], (int)dst[2]);
#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);
if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
{
static int firstTime=1; //FIXME move this into the context perhaps
if(flags & SWS_PRINT_INFO && firstTime)
{
MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
"SwScaler: ->cannot do aligned memory acesses anymore\n");
firstTime=0;
}
}
/* Note the user might start scaling the picture in the middle so this will not get executed
this is not really intended but works currently, so ppl might do it */
if(srcSliceY ==0){
lumBufIndex=0;
chrBufIndex=0;
dstY=0;
lastInLumBuf= -1;
lastInChrBuf= -1;
}
lastDstY= dstY;
for(;dstY < dstH; dstY++){
unsigned char *dest =dst[0]+dstStride[0]*dstY;
const int chrDstY= dstY>>c->chrDstVSubSample;
unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
//handle holes (FAST_BILINEAR & weird filters)
if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
// Do we have enough lines in this slice to output the dstY line
if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
{
//Do horizontal scaling
while(lastInLumBuf < lastLumSrcY)
{
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
lumBufIndex++;
// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
ASSERT(lumBufIndex < 2*vLumBufSize)
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
// printf("%d %d\n", lumBufIndex, vLumBufSize);
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
funnyYCode, c->srcFormat, formatConvBuffer,
c->lumMmx2Filter, c->lumMmx2FilterPos);
lastInLumBuf++;
}
while(lastInChrBuf < lastChrSrcY)
{
uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
chrBufIndex++;
ASSERT(chrBufIndex < 2*vChrBufSize)
ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
//FIXME replace parameters through context struct (some at least)
if(!(isGray(srcFormat) || isGray(dstFormat)))
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
funnyUVCode, c->srcFormat, formatConvBuffer,
c->chrMmx2Filter, c->chrMmx2FilterPos);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
}
else // not enough lines left in this slice -> load the rest in the buffer
{
/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
vChrBufSize, vLumBufSize);*/
//Do horizontal scaling
while(lastInLumBuf+1 < srcSliceY + srcSliceH)
{
uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
lumBufIndex++;
ASSERT(lumBufIndex < 2*vLumBufSize)
ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
funnyYCode, c->srcFormat, formatConvBuffer,
c->lumMmx2Filter, c->lumMmx2FilterPos);
lastInLumBuf++;
}
while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
{
uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
chrBufIndex++;
ASSERT(chrBufIndex < 2*vChrBufSize)
ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
if(!(isGray(srcFormat) || isGray(dstFormat)))
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
funnyUVCode, c->srcFormat, formatConvBuffer,
c->chrMmx2Filter, c->chrMmx2FilterPos);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
break; //we can't output a dstY line so let's try with the next slice
}
#ifdef HAVE_MMX
b5Dither= dither8[dstY&1];
g6Dither= dither4[dstY&1];
g5Dither= dither8[dstY&1];
r5Dither= dither8[(dstY+1)&1];
#endif
if(dstY < dstH-2)
{
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
int i;
for(i=0; i<vLumFilterSize; i++)
{
lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
}
for(i=0; i<vChrFilterSize; i++)
{
chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
}
#endif
if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
{
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
{
int16_t *lumBuf = lumPixBuf[0];
int16_t *chrBuf= chrPixBuf[0];
RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
}
else //General YV12
{
RENAME(yuv2yuvX)(c,
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
}
}
else
{
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
{
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, chrAlpha, dstFormat, flags, dstY);
}
else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
{
int lumAlpha= vLumFilter[2*dstY+1];
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, lumAlpha, chrAlpha, dstY);
}
else //General RGB
{
RENAME(yuv2packedX)(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, dstW, dstY);
}
}
}
else // hmm looks like we can't use MMX here without overwriting this array's tail
{
int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
{
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
yuv2yuvXinC(
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, uDest, vDest, dstW, chrDstW);
}
else
{
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
yuv2packedXinC(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
dest, dstW, dstY);
}
}
}
#ifdef HAVE_MMX
__asm __volatile(SFENCE:::"memory");
__asm __volatile(EMMS:::"memory");
#endif
/* store changed local vars back in the context */
c->dstY= dstY;
c->lumBufIndex= lumBufIndex;
c->chrBufIndex= chrBufIndex;
c->lastInLumBuf= lastInLumBuf;
c->lastInChrBuf= lastInChrBuf;
return dstY - lastDstY;
}
/*
* yuv2rgb.c, Software YUV to RGB coverter
*
* Copyright (C) 1999, Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
* All Rights Reserved.
*
* Functions broken out from display_x11.c and several new modes
* added by Hkan Hjort <d95hjort@dtek.chalmers.se>
*
* 15 & 16 bpp support by Franck Sicard <Franck.Sicard@solsoft.fr>
*
* This file is part of mpeg2dec, a free MPEG-2 video decoder
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
* MMX/MMX2 Template stuff from Michael Niedermayer (michaelni@gmx.at) (needed for fast movntq support)
* 1,4,8bpp support by Michael Niedermayer (michaelni@gmx.at)
* context / deglobalize stuff by Michael Niedermayer
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
//#include "video_out.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "common.h"
#ifdef HAVE_MLIB
#include "yuv2rgb_mlib.c"
#endif
#define DITHER1XBPP // only for mmx
const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
{ 1, 3, 1, 3, 1, 3, 1, 3, },
{ 2, 0, 2, 0, 2, 0, 2, 0, },
};
const uint8_t __attribute__((aligned(8))) dither_2x2_8[2][8]={
{ 6, 2, 6, 2, 6, 2, 6, 2, },
{ 0, 4, 0, 4, 0, 4, 0, 4, },
};
const uint8_t __attribute__((aligned(8))) dither_8x8_32[8][8]={
{ 17, 9, 23, 15, 16, 8, 22, 14, },
{ 5, 29, 3, 27, 4, 28, 2, 26, },
{ 21, 13, 19, 11, 20, 12, 18, 10, },
{ 0, 24, 6, 30, 1, 25, 7, 31, },
{ 16, 8, 22, 14, 17, 9, 23, 15, },
{ 4, 28, 2, 26, 5, 29, 3, 27, },
{ 20, 12, 18, 10, 21, 13, 19, 11, },
{ 1, 25, 7, 31, 0, 24, 6, 30, },
};
#if 0
const uint8_t __attribute__((aligned(8))) dither_8x8_64[8][8]={
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{ 2, 50, 14, 62, 1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58, 6, 54, 9, 57, 5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
#endif
const uint8_t __attribute__((aligned(8))) dither_8x8_73[8][8]={
{ 0, 55, 14, 68, 3, 58, 17, 72, },
{ 37, 18, 50, 32, 40, 22, 54, 35, },
{ 9, 64, 5, 59, 13, 67, 8, 63, },
{ 46, 27, 41, 23, 49, 31, 44, 26, },
{ 2, 57, 16, 71, 1, 56, 15, 70, },
{ 39, 21, 52, 34, 38, 19, 51, 33, },
{ 11, 66, 7, 62, 10, 65, 6, 60, },
{ 48, 30, 43, 25, 47, 29, 42, 24, },
};
#if 0
const uint8_t __attribute__((aligned(8))) dither_8x8_128[8][8]={
{ 68, 36, 92, 60, 66, 34, 90, 58, },
{ 20, 116, 12, 108, 18, 114, 10, 106, },
{ 84, 52, 76, 44, 82, 50, 74, 42, },
{ 0, 96, 24, 120, 6, 102, 30, 126, },
{ 64, 32, 88, 56, 70, 38, 94, 62, },
{ 16, 112, 8, 104, 22, 118, 14, 110, },
{ 80, 48, 72, 40, 86, 54, 78, 46, },
{ 4, 100, 28, 124, 2, 98, 26, 122, },
};
#endif
#if 1
const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
{117, 62, 158, 103, 113, 58, 155, 100, },
{ 34, 199, 21, 186, 31, 196, 17, 182, },
{144, 89, 131, 76, 141, 86, 127, 72, },
{ 0, 165, 41, 206, 10, 175, 52, 217, },
{110, 55, 151, 96, 120, 65, 162, 107, },
{ 28, 193, 14, 179, 38, 203, 24, 189, },
{138, 83, 124, 69, 148, 93, 134, 79, },
{ 7, 172, 48, 213, 3, 168, 45, 210, },
};
#elif 1
// tries to correct a gamma of 1.5
const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
{ 0, 143, 18, 200, 2, 156, 25, 215, },
{ 78, 28, 125, 64, 89, 36, 138, 74, },
{ 10, 180, 3, 161, 16, 195, 8, 175, },
{109, 51, 93, 38, 121, 60, 105, 47, },
{ 1, 152, 23, 210, 0, 147, 20, 205, },
{ 85, 33, 134, 71, 81, 30, 130, 67, },
{ 14, 190, 6, 171, 12, 185, 5, 166, },
{117, 57, 101, 44, 113, 54, 97, 41, },
};
#elif 1
// tries to correct a gamma of 2.0
const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
{ 0, 124, 8, 193, 0, 140, 12, 213, },
{ 55, 14, 104, 42, 66, 19, 119, 52, },
{ 3, 168, 1, 145, 6, 187, 3, 162, },
{ 86, 31, 70, 21, 99, 39, 82, 28, },
{ 0, 134, 11, 206, 0, 129, 9, 200, },
{ 62, 17, 114, 48, 58, 16, 109, 45, },
{ 5, 181, 2, 157, 4, 175, 1, 151, },
{ 95, 36, 78, 26, 90, 34, 74, 24, },
};
#else
// tries to correct a gamma of 2.5
const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
{ 0, 107, 3, 187, 0, 125, 6, 212, },
{ 39, 7, 86, 28, 49, 11, 102, 36, },
{ 1, 158, 0, 131, 3, 180, 1, 151, },
{ 68, 19, 52, 12, 81, 25, 64, 17, },
{ 0, 119, 5, 203, 0, 113, 4, 195, },
{ 45, 9, 96, 33, 42, 8, 91, 30, },
{ 2, 172, 1, 144, 2, 165, 0, 137, },
{ 77, 23, 60, 15, 72, 21, 56, 14, },
};
#endif
#ifdef ARCH_X86
/* hope these constant values are cache line aligned */
uint64_t attribute_used __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ffULL;
uint64_t attribute_used __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8ULL;
uint64_t attribute_used __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfcULL;
uint64_t attribute_used __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFULL;
uint64_t attribute_used __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00ULL;
uint64_t attribute_used __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000ULL;
// the volatile is required because gcc otherwise optimizes some writes away not knowing that these
// are read in the asm block
volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
uint64_t __attribute__((aligned(8))) dither4[2]={
0x0103010301030103LL,
0x0200020002000200LL,};
uint64_t __attribute__((aligned(8))) dither8[2]={
0x0602060206020602LL,
0x0004000400040004LL,};
#undef HAVE_MMX
#undef ARCH_X86
//MMX versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "yuv2rgb_template.c"
//MMX2 versions
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "yuv2rgb_template.c"
#endif // CAN_COMPILE_X86_ASM
const int32_t Inverse_Table_6_9[8][4] = {
{117504, 138453, 13954, 34903}, /* no sequence_display_extension */
{117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
{104597, 132201, 25675, 53279}, /* unspecified */
{104597, 132201, 25675, 53279}, /* reserved */
{104448, 132798, 24759, 53109}, /* FCC */
{104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
{104597, 132201, 25675, 53279}, /* SMPTE 170M */
{117579, 136230, 16907, 35559} /* SMPTE 240M (1987) */
};
#define RGB(i) \
U = pu[i]; \
V = pv[i]; \
r = c->table_rV[V]; \
g = c->table_gU[U] + c->table_gV[V]; \
b = c->table_bU[U];
#define DST1(i) \
Y = py_1[2*i]; \
dst_1[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y] + g[Y] + b[Y];
#define DST2(i) \
Y = py_2[2*i]; \
dst_2[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y] + g[Y] + b[Y];
#define DST1RGB(i) \
Y = py_1[2*i]; \
dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \
Y = py_1[2*i+1]; \
dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y];
#define DST2RGB(i) \
Y = py_2[2*i]; \
dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \
Y = py_2[2*i+1]; \
dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y];
#define DST1BGR(i) \
Y = py_1[2*i]; \
dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \
Y = py_1[2*i+1]; \
dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y];
#define DST2BGR(i) \
Y = py_2[2*i]; \
dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \
Y = py_2[2*i+1]; \
dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
#define PROLOG(func_name, dst_type) \
static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
int srcSliceH, uint8_t* dst[], int dstStride[]){\
int y;\
\
if(c->srcFormat == IMGFMT_422P){\
srcStride[1] *= 2;\
srcStride[2] *= 2;\
}\
for(y=0; y<srcSliceH; y+=2){\
dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY )*dstStride[0]);\
dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
dst_type *r, *g, *b;\
uint8_t *py_1= src[0] + y*srcStride[0];\
uint8_t *py_2= py_1 + srcStride[0];\
uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
unsigned int h_size= c->dstW>>3;\
while (h_size--) {\
int U, V, Y;\
#define EPILOG(dst_delta)\
pu += 4;\
pv += 4;\
py_1 += 8;\
py_2 += 8;\
dst_1 += dst_delta;\
dst_2 += dst_delta;\
}\
}\
return srcSliceH;\
}
PROLOG(yuv2rgb_c_32, uint32_t)
RGB(0);
DST1(0);
DST2(0);
RGB(1);
DST2(1);
DST1(1);
RGB(2);
DST1(2);
DST2(2);
RGB(3);
DST2(3);
DST1(3);
EPILOG(8)
PROLOG(yuv2rgb_c_24_rgb, uint8_t)
RGB(0);
DST1RGB(0);
DST2RGB(0);
RGB(1);
DST2RGB(1);
DST1RGB(1);
RGB(2);
DST1RGB(2);
DST2RGB(2);
RGB(3);
DST2RGB(3);
DST1RGB(3);
EPILOG(24)
// only trivial mods from yuv2rgb_c_24_rgb
PROLOG(yuv2rgb_c_24_bgr, uint8_t)
RGB(0);
DST1BGR(0);
DST2BGR(0);
RGB(1);
DST2BGR(1);
DST1BGR(1);
RGB(2);
DST1BGR(2);
DST2BGR(2);
RGB(3);
DST2BGR(3);
DST1BGR(3);
EPILOG(24)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_16, uint16_t)
RGB(0);
DST1(0);
DST2(0);
RGB(1);
DST2(1);
DST1(1);
RGB(2);
DST1(2);
DST2(2);
RGB(3);
DST2(3);
DST1(3);
EPILOG(8)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_8, uint8_t)
RGB(0);
DST1(0);
DST2(0);
RGB(1);
DST2(1);
DST1(1);
RGB(2);
DST1(2);
DST2(2);
RGB(3);
DST2(3);
DST1(3);
EPILOG(8)
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
const uint8_t *d32= dither_8x8_32[y&7];
const uint8_t *d64= dither_8x8_73[y&7];
#define DST1bpp8(i,o) \
Y = py_1[2*i]; \
dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \
Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];
#define DST2bpp8(i,o) \
Y = py_2[2*i]; \
dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \
Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]];
RGB(0);
DST1bpp8(0,0);
DST2bpp8(0,0);
RGB(1);
DST2bpp8(1,2);
DST1bpp8(1,2);
RGB(2);
DST1bpp8(2,4);
DST2bpp8(2,4);
RGB(3);
DST2bpp8(3,6);
DST1bpp8(3,6);
EPILOG(8)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_4, uint8_t)
int acc;
#define DST1_4(i) \
Y = py_1[2*i]; \
acc = r[Y] + g[Y] + b[Y]; \
Y = py_1[2*i+1]; \
acc |= (r[Y] + g[Y] + b[Y])<<4;\
dst_1[i] = acc;
#define DST2_4(i) \
Y = py_2[2*i]; \
acc = r[Y] + g[Y] + b[Y]; \
Y = py_2[2*i+1]; \
acc |= (r[Y] + g[Y] + b[Y])<<4;\
dst_2[i] = acc;
RGB(0);
DST1_4(0);
DST2_4(0);
RGB(1);
DST2_4(1);
DST1_4(1);
RGB(2);
DST1_4(2);
DST2_4(2);
RGB(3);
DST2_4(3);
DST1_4(3);
EPILOG(4)
PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
const uint8_t *d64= dither_8x8_73[y&7];
const uint8_t *d128=dither_8x8_220[y&7];
int acc;
#define DST1bpp4(i,o) \
Y = py_1[2*i]; \
acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \
acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4;\
dst_1[i]= acc;
#define DST2bpp4(i,o) \
Y = py_2[2*i]; \
acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \
acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4;\
dst_2[i]= acc;
RGB(0);
DST1bpp4(0,0);
DST2bpp4(0,0);
RGB(1);
DST2bpp4(1,2);
DST1bpp4(1,2);
RGB(2);
DST1bpp4(2,4);
DST2bpp4(2,4);
RGB(3);
DST2bpp4(3,6);
DST1bpp4(3,6);
EPILOG(4)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_4b, uint8_t)
RGB(0);
DST1(0);
DST2(0);
RGB(1);
DST2(1);
DST1(1);
RGB(2);
DST1(2);
DST2(2);
RGB(3);
DST2(3);
DST1(3);
EPILOG(8)
PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
const uint8_t *d64= dither_8x8_73[y&7];
const uint8_t *d128=dither_8x8_220[y&7];
#define DST1bpp4b(i,o) \
Y = py_1[2*i]; \
dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];
#define DST2bpp4b(i,o) \
Y = py_2[2*i]; \
dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]];
RGB(0);
DST1bpp4b(0,0);
DST2bpp4b(0,0);
RGB(1);
DST2bpp4b(1,2);
DST1bpp4b(1,2);
RGB(2);
DST1bpp4b(2,4);
DST2bpp4b(2,4);
RGB(3);
DST2bpp4b(3,6);
DST1bpp4b(3,6);
EPILOG(8)
PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
const uint8_t *d128=dither_8x8_220[y&7];
char out_1=0, out_2=0;
g= c->table_gU[128] + c->table_gV[128];
#define DST1bpp1(i,o) \
Y = py_1[2*i]; \
out_1+= out_1 + g[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \
out_1+= out_1 + g[Y+d128[1+o]];
#define DST2bpp1(i,o) \
Y = py_2[2*i]; \
out_2+= out_2 + g[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \
out_2+= out_2 + g[Y+d128[9+o]];
DST1bpp1(0,0);
DST2bpp1(0,0);
DST2bpp1(1,2);
DST1bpp1(1,2);
DST1bpp1(2,4);
DST2bpp1(2,4);
DST2bpp1(3,6);
DST1bpp1(3,6);
dst_1[0]= out_1;
dst_2[0]= out_2;
EPILOG(1)
SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
{
#ifdef ARCH_X86
if(c->flags & SWS_CPU_CAPS_MMX2){
switch(c->dstFormat){
case IMGFMT_BGR32: return yuv420_rgb32_MMX2;
case IMGFMT_BGR24: return yuv420_rgb24_MMX2;
case IMGFMT_BGR16: return yuv420_rgb16_MMX2;
case IMGFMT_BGR15: return yuv420_rgb15_MMX2;
}
}
if(c->flags & SWS_CPU_CAPS_MMX){
switch(c->dstFormat){
case IMGFMT_BGR32: return yuv420_rgb32_MMX;
case IMGFMT_BGR24: return yuv420_rgb24_MMX;
case IMGFMT_BGR16: return yuv420_rgb16_MMX;
case IMGFMT_BGR15: return yuv420_rgb15_MMX;
}
}
#endif
#ifdef HAVE_MLIB
{
SwsFunc t= yuv2rgb_init_mlib(c);
if(t) return t;
}
#endif
#ifdef HAVE_ALTIVEC
if (c->flags & SWS_CPU_CAPS_ALTIVEC)
{
SwsFunc t = yuv2rgb_init_altivec(c);
if(t) return t;
}
#endif
MSG_WARN("No accelerated colorspace conversion found\n");
switch(c->dstFormat){
case IMGFMT_RGB32:
case IMGFMT_BGR32: return yuv2rgb_c_32;
case IMGFMT_RGB24: return yuv2rgb_c_24_rgb;
case IMGFMT_BGR24: return yuv2rgb_c_24_bgr;
case IMGFMT_RGB16:
case IMGFMT_BGR16:
case IMGFMT_RGB15:
case IMGFMT_BGR15: return yuv2rgb_c_16;
case IMGFMT_RGB8:
case IMGFMT_BGR8: return yuv2rgb_c_8_ordered_dither;
case IMGFMT_RGB4:
case IMGFMT_BGR4: return yuv2rgb_c_4_ordered_dither;
case IMGFMT_RG4B:
case IMGFMT_BG4B: return yuv2rgb_c_4b_ordered_dither;
case IMGFMT_RGB1:
case IMGFMT_BGR1: return yuv2rgb_c_1_ordered_dither;
default:
assert(0);
}
return NULL;
}
static int div_round (int dividend, int divisor)
{
if (dividend > 0)
return (dividend + (divisor>>1)) / divisor;
else
return -((-dividend + (divisor>>1)) / divisor);
}
int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation)
{
const int isRgb = IMGFMT_IS_BGR(c->dstFormat);
const int bpp = isRgb?IMGFMT_RGB_DEPTH(c->dstFormat):IMGFMT_BGR_DEPTH(c->dstFormat);
int i;
uint8_t table_Y[1024];
uint32_t *table_32 = 0;
uint16_t *table_16 = 0;
uint8_t *table_8 = 0;
uint8_t *table_332 = 0;
uint8_t *table_121 = 0;
uint8_t *table_1 = 0;
int entry_size = 0;
void *table_r = 0, *table_g = 0, *table_b = 0;
void *table_start;
int64_t crv = inv_table[0];
int64_t cbu = inv_table[1];
int64_t cgu = -inv_table[2];
int64_t cgv = -inv_table[3];
int64_t cy = 1<<16;
int64_t oy = 0;
//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
if(!fullRange){
cy= (cy*255) / 219;
oy= 16<<16;
}
cy = (cy *contrast )>>16;
crv= (crv*contrast * saturation)>>32;
cbu= (cbu*contrast * saturation)>>32;
cgu= (cgu*contrast * saturation)>>32;
cgv= (cgv*contrast * saturation)>>32;
//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
oy -= 256*brightness;
for (i = 0; i < 1024; i++) {
int j;
j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32;
j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
table_Y[i] = j;
}
switch (bpp) {
case 32:
table_start= table_32 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));
entry_size = sizeof (uint32_t);
table_r = table_32 + 197;
table_b = table_32 + 197 + 685;
table_g = table_32 + 197 + 2*682;
for (i = -197; i < 256+197; i++)
((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
for (i = -132; i < 256+132; i++)
((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
for (i = -232; i < 256+232; i++)
((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
break;
case 24:
table_start= table_8 = malloc ((256 + 2*232) * sizeof (uint8_t));
entry_size = sizeof (uint8_t);
table_r = table_g = table_b = table_8 + 232;
for (i = -232; i < 256+232; i++)
((uint8_t * )table_b)[i] = table_Y[i+384];
break;
case 15:
case 16:
table_start= table_16 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));
entry_size = sizeof (uint16_t);
table_r = table_16 + 197;
table_b = table_16 + 197 + 685;
table_g = table_16 + 197 + 2*682;
for (i = -197; i < 256+197; i++) {
int j = table_Y[i+384] >> 3;
if (isRgb)
j <<= ((bpp==16) ? 11 : 10);
((uint16_t *)table_r)[i] = j;
}
for (i = -132; i < 256+132; i++) {
int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);
((uint16_t *)table_g)[i] = j << 5;
}
for (i = -232; i < 256+232; i++) {
int j = table_Y[i+384] >> 3;
if (!isRgb)
j <<= ((bpp==16) ? 11 : 10);
((uint16_t *)table_b)[i] = j;
}
break;
case 8:
table_start= table_332 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
entry_size = sizeof (uint8_t);
table_r = table_332 + 197;
table_b = table_332 + 197 + 685;
table_g = table_332 + 197 + 2*682;
for (i = -197; i < 256+197; i++) {
int j = (table_Y[i+384 - 16] + 18)/36;
if (isRgb)
j <<= 5;
((uint8_t *)table_r)[i] = j;
}
for (i = -132; i < 256+132; i++) {
int j = (table_Y[i+384 - 16] + 18)/36;
if (!isRgb)
j <<= 1;
((uint8_t *)table_g)[i] = j << 2;
}
for (i = -232; i < 256+232; i++) {
int j = (table_Y[i+384 - 37] + 43)/85;
if (!isRgb)
j <<= 6;
((uint8_t *)table_b)[i] = j;
}
break;
case 4:
case 4|128:
table_start= table_121 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
entry_size = sizeof (uint8_t);
table_r = table_121 + 197;
table_b = table_121 + 197 + 685;
table_g = table_121 + 197 + 2*682;
for (i = -197; i < 256+197; i++) {
int j = table_Y[i+384 - 110] >> 7;
if (isRgb)
j <<= 3;
((uint8_t *)table_r)[i] = j;
}
for (i = -132; i < 256+132; i++) {
int j = (table_Y[i+384 - 37]+ 43)/85;
((uint8_t *)table_g)[i] = j << 1;
}
for (i = -232; i < 256+232; i++) {
int j =table_Y[i+384 - 110] >> 7;
if (!isRgb)
j <<= 3;
((uint8_t *)table_b)[i] = j;
}
break;
case 1:
table_start= table_1 = malloc (256*2 * sizeof (uint8_t));
entry_size = sizeof (uint8_t);
table_g = table_1;
table_r = table_b = NULL;
for (i = 0; i < 256+256; i++) {
int j = table_Y[i + 384 - 110]>>7;
((uint8_t *)table_g)[i] = j;
}
break;
default:
table_start= NULL;
MSG_ERR("%ibpp not supported by yuv2rgb\n", bpp);
//free mem?
return -1;
}
for (i = 0; i < 256; i++) {
c->table_rV[i] = table_r + entry_size * div_round (crv * (i-128), 76309);
c->table_gU[i] = table_g + entry_size * div_round (cgu * (i-128), 76309);
c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
c->table_bU[i] = table_b + entry_size * div_round (cbu * (i-128), 76309);
}
if(c->yuvTable) free(c->yuvTable);
c->yuvTable= table_start;
return 0;
}
/*
marc.hoffman@analog.com March 8, 2004
Altivec Acceleration for Color Space Conversion revision 0.2
convert I420 YV12 to RGB in various formats,
it rejects images that are not in 420 formats
it rejects images that don't have widths of multiples of 16
it rejects images that don't have heights of multiples of 2
reject defers to C simulation codes.
lots of optimizations to be done here
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
so we currently use max min to clip
2. the inefficient use of chroma loading needs a bit of brushing up
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
MODIFIED to calculate coeffs from currently selected color space.
MODIFIED core to be a macro which you spec the output format.
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
CORRECTED algorithim selection to be strict on input formats.
ADDED runtime detection of altivec.
ADDED altivec_yuv2packedX vertical scl + RGB converter
March 27,2004
PERFORMANCE ANALYSIS
The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
720*480*30 ~10MPS
so we have roughly 10clocks per pixel this is too high something has to be wrong.
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
guaranteed to have the input video frame it was just decompressed so
it probably resides in L1 caches. However we are creating the
output video stream this needs to use the DSTST instruction to
optimize for the cache. We couple this with the fact that we are
not going to be visiting the input buffer again so we mark it Least
Recently Used. This shaves 25% of the processor cycles off.
Now MEMCPY is the largest mips consumer in the system, probably due
to the inefficient X11 stuff.
GL libraries seem to be very slow on this machine 1.33Ghz PB running
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
a versioning issues, however i have libGL.1.2.dylib for both
machines. ((We need to figure this out now))
GL2 libraries work now with patch for RGB32
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "mangle.h"
#include "img_format.h" //FIXME try to reduce dependency of such stuff
#undef PROFILE_THE_BEAST
#undef INC_SCALING
typedef unsigned char ubyte;
typedef signed char sbyte;
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
homogeneous vector registers x0,x1,x2 are interleaved with the
following technique:
o0 = vec_mergeh (x0,x1);
o1 = vec_perm (o0, x2, perm_rgb_0);
o2 = vec_perm (o0, x2, perm_rgb_1);
o3 = vec_mergel (x0,x1);
o4 = vec_perm (o3,o2,perm_rgb_2);
o5 = vec_perm (o3,o2,perm_rgb_3);
perm_rgb_0: o0(RG).h v1(B) --> o1*
0 1 2 3 4
rgbr|gbrg|brgb|rgbr
0010 0100 1001 0010
0102 3145 2673 894A
perm_rgb_1: o0(RG).h v1(B) --> o2
0 1 2 3 4
gbrg|brgb|bbbb|bbbb
0100 1001 1111 1111
B5CD 6EF7 89AB CDEF
perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
0 1 2 3 4
gbrg|brgb|rgbr|gbrg
1111 1111 0010 0100
89AB CDEF 0182 3945
perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
0 1 2 3 4
brgb|rgbr|gbrg|brgb
1001 0010 0100 1001
a67b 89cA BdCD eEFf
*/
static
const vector unsigned char
perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
#define vec_merge3(x2,x1,x0,y0,y1,y2) \
do { \
typeof(x0) o0,o2,o3; \
o0 = vec_mergeh (x0,x1); \
y0 = vec_perm (o0, x2, perm_rgb_0);\
o2 = vec_perm (o0, x2, perm_rgb_1);\
o3 = vec_mergel (x0,x1); \
y1 = vec_perm (o3,o2,perm_rgb_2); \
y2 = vec_perm (o3,o2,perm_rgb_3); \
} while(0)
#define vec_mstrgb24(x0,x1,x2,ptr) \
do { \
typeof(x0) _0,_1,_2; \
vec_merge3 (x0,x1,x2,_0,_1,_2); \
vec_st (_0, 0, ptr++); \
vec_st (_1, 0, ptr++); \
vec_st (_2, 0, ptr++); \
} while (0);
#define vec_mstbgr24(x0,x1,x2,ptr) \
do { \
typeof(x0) _0,_1,_2; \
vec_merge3 (x2,x1,x0,_0,_1,_2); \
vec_st (_0, 0, ptr++); \
vec_st (_1, 0, ptr++); \
vec_st (_2, 0, ptr++); \
} while (0);
/* pack the pixels in rgb0 format
msb R
lsb 0
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
do { \
T _0,_1,_2,_3; \
_0 = vec_mergeh (x0,x1); \
_1 = vec_mergeh (x2,x3); \
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
vec_st (_2, 0*16, (T *)ptr); \
vec_st (_3, 1*16, (T *)ptr); \
_0 = vec_mergel (x0,x1); \
_1 = vec_mergel (x2,x3); \
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
vec_st (_2, 2*16, (T *)ptr); \
vec_st (_3, 3*16, (T *)ptr); \
ptr += 4; \
} while (0);
/*
| 1 0 1.4021 | | Y |
| 1 -0.3441 -0.7142 |x| Cb|
| 1 1.7718 0 | | Cr|
Y: [-128 127]
Cb/Cr : [-128 127]
typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
*/
#define vec_unh(x) \
(vector signed short) \
vec_perm(x,(typeof(x))(0),\
(vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
#define vec_unl(x) \
(vector signed short) \
vec_perm(x,(typeof(x))(0),\
(vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
#define vec_clip(x) \
vec_max (vec_min (x, (typeof(x))(255)), (typeof(x))(0))
#define vec_packclp_a(x,y) \
(vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
#define vec_packclp(x,y) \
(vector unsigned char)vec_packs \
((vector unsigned short)vec_max (x,(vector signed short) (0)), \
(vector unsigned short)vec_max (y,(vector signed short) (0)))
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
static inline void cvtyuvtoRGB (SwsContext *c,
vector signed short Y, vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G, vector signed short *B)
{
vector signed short vx,ux,uvx;
Y = vec_mradds (Y, c->CY, c->OY);
U = vec_sub (U,(vector signed short)(128));
V = vec_sub (V,(vector signed short)(128));
// ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
ux = vec_sl (U, c->CSHIFT);
*B = vec_mradds (ux, c->CBU, Y);
// vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
vx = vec_sl (V, c->CSHIFT);
*R = vec_mradds (vx, c->CRV, Y);
// uvx = ((CGU*u) + (CGV*v))>>15;
uvx = vec_mradds (U, c->CGU, Y);
*G = vec_mradds (V, c->CGV, uvx);
}
/*
------------------------------------------------------------------------------
CS converters
------------------------------------------------------------------------------
*/
#define DEFCSP420_CVT(name,out_pixels) \
static int altivec_##name (SwsContext *c, \
unsigned char **in, int *instrides, \
int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i,j; \
int instrides_scl[3]; \
vector unsigned char y0,y1; \
\
vector signed char u,v; \
\
vector signed short Y0,Y1,Y2,Y3; \
vector signed short U,V; \
vector signed short vx,ux,uvx; \
vector signed short vx0,ux0,uvx0; \
vector signed short vx1,ux1,uvx1; \
vector signed short R0,G0,B0; \
vector signed short R1,G1,B1; \
vector unsigned char R,G,B; \
\
vector unsigned char *uivP, *vivP; \
vector unsigned char align_perm; \
\
vector signed short \
lCY = c->CY, \
lOY = c->OY, \
lCRV = c->CRV, \
lCBU = c->CBU, \
lCGU = c->CGU, \
lCGV = c->CGV; \
\
vector unsigned short lCSHIFT = c->CSHIFT; \
\
ubyte *y1i = in[0]; \
ubyte *y2i = in[0]+w; \
ubyte *ui = in[1]; \
ubyte *vi = in[2]; \
\
vector unsigned char *oute \
= (vector unsigned char *) \
(oplanes[0]+srcSliceY*outstrides[0]); \
vector unsigned char *outo \
= (vector unsigned char *) \
(oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
\
\
instrides_scl[0] = instrides[0]; \
instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
\
\
for (i=0;i<h/2;i++) { \
vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
\
for (j=0;j<w/16;j++) { \
\
y0 = vec_ldl (0,y1i); \
y1 = vec_ldl (0,y2i); \
uivP = (vector unsigned char *)ui; \
vivP = (vector unsigned char *)vi; \
\
align_perm = vec_lvsl (0, ui); \
u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
\
align_perm = vec_lvsl (0, vi); \
v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
\
u = (vector signed char)vec_sub (u, (vector signed char)(128)); \
v = (vector signed char)vec_sub (v, (vector signed char)(128)); \
U = vec_unpackh (u); \
V = vec_unpackh (v); \
\
\
Y0 = vec_unh (y0); \
Y1 = vec_unl (y0); \
Y2 = vec_unh (y1); \
Y3 = vec_unl (y1); \
\
Y0 = vec_mradds (Y0, lCY, lOY); \
Y1 = vec_mradds (Y1, lCY, lOY); \
Y2 = vec_mradds (Y2, lCY, lOY); \
Y3 = vec_mradds (Y3, lCY, lOY); \
\
/* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
ux = vec_sl (U, lCSHIFT); \
ux = vec_mradds (ux, lCBU, (vector signed short)(0)); \
ux0 = vec_mergeh (ux,ux); \
ux1 = vec_mergel (ux,ux); \
\
/* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
vx = vec_sl (V, lCSHIFT); \
vx = vec_mradds (vx, lCRV, (vector signed short)(0)); \
vx0 = vec_mergeh (vx,vx); \
vx1 = vec_mergel (vx,vx); \
\
/* uvx = ((CGU*u) + (CGV*v))>>15 */ \
uvx = vec_mradds (U, lCGU, (vector signed short)(0)); \
uvx = vec_mradds (V, lCGV, uvx); \
uvx0 = vec_mergeh (uvx,uvx); \
uvx1 = vec_mergel (uvx,uvx); \
\
R0 = vec_add (Y0,vx0); \
G0 = vec_add (Y0,uvx0); \
B0 = vec_add (Y0,ux0); \
R1 = vec_add (Y1,vx1); \
G1 = vec_add (Y1,uvx1); \
B1 = vec_add (Y1,ux1); \
\
R = vec_packclp (R0,R1); \
G = vec_packclp (G0,G1); \
B = vec_packclp (B0,B1); \
\
out_pixels(R,G,B,oute); \
\
R0 = vec_add (Y2,vx0); \
G0 = vec_add (Y2,uvx0); \
B0 = vec_add (Y2,ux0); \
R1 = vec_add (Y3,vx1); \
G1 = vec_add (Y3,uvx1); \
B1 = vec_add (Y3,ux1); \
R = vec_packclp (R0,R1); \
G = vec_packclp (G0,G1); \
B = vec_packclp (B0,B1); \
\
\
out_pixels(R,G,B,outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
\
} \
\
outo += (outstrides[0])>>4; \
oute += (outstrides[0])>>4; \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
#define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
#define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
#define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
DEFCSP420_CVT (yuv2_abgr32, out_abgr)
DEFCSP420_CVT (yuv2_bgra32, out_argb)
DEFCSP420_CVT (yuv2_rgba32, out_rgba)
DEFCSP420_CVT (yuv2_argb32, out_argb)
DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static
const vector unsigned char
demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
0x10,0x04,0x10,0x04,
0x10,0x08,0x10,0x08,
0x10,0x0c,0x10,0x0c),
demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
0x10,0x06,0x10,0x06,
0x10,0x0A,0x10,0x0A,
0x10,0x0E,0x10,0x0E),
demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
0x10,0x05,0x10,0x07,
0x10,0x09,0x10,0x0B,
0x10,0x0D,0x10,0x0F);
/*
this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
unsigned char **in, int *instrides,
int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides)
{
int w = c->srcW;
int h = srcSliceH;
int i,j;
vector unsigned char uyvy;
vector signed short Y,U,V;
vector signed short vx,ux,uvx;
vector signed short R0,G0,B0,R1,G1,B1;
vector unsigned char R,G,B;
vector unsigned char *out;
ubyte *img;
img = in[0];
out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
for (i=0;i<h;i++) {
for (j=0;j<w/16;j++) {
uyvy = vec_ld (0, img);
U = (vector signed short)
vec_perm (uyvy, (vector unsigned char)(0), demux_u);
V = (vector signed short)
vec_perm (uyvy, (vector unsigned char)(0), demux_v);
Y = (vector signed short)
vec_perm (uyvy, (vector unsigned char)(0), demux_y);
cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
uyvy = vec_ld (16, img);
U = (vector signed short)
vec_perm (uyvy, (vector unsigned char)(0), demux_u);
V = (vector signed short)
vec_perm (uyvy, (vector unsigned char)(0), demux_v);
Y = (vector signed short)
vec_perm (uyvy, (vector unsigned char)(0), demux_y);
cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
// vec_mstbgr24 (R,G,B, out);
out_rgba (R,G,B,out);
img += 32;
}
}
return srcSliceH;
}
/* Ok currently the acceleration routine only supports
inputs of widths a multiple of 16
and heights a multiple 2
So we just fall back to the C codes for this.
*/
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
{
if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
return NULL;
/*
and this seems not to matter too much I tried a bunch of
videos with abnormal widths and mplayer crashes else where.
mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
boom with X11 bad match.
*/
if ((c->srcW & 0xf) != 0) return NULL;
switch (c->srcFormat) {
case IMGFMT_YVU9:
case IMGFMT_IF09:
case IMGFMT_YV12:
case IMGFMT_I420:
case IMGFMT_IYUV:
case IMGFMT_CLPL:
case IMGFMT_Y800:
case IMGFMT_Y8:
case IMGFMT_NV12:
case IMGFMT_NV21:
if ((c->srcH & 0x1) != 0)
return NULL;
switch(c->dstFormat){
case IMGFMT_RGB24:
MSG_WARN("ALTIVEC: Color Space RGB24\n");
return altivec_yuv2_rgb24;
case IMGFMT_BGR24:
MSG_WARN("ALTIVEC: Color Space BGR24\n");
return altivec_yuv2_bgr24;
case IMGFMT_RGB32:
MSG_WARN("ALTIVEC: Color Space ARGB32\n");
return altivec_yuv2_argb32;
case IMGFMT_BGR32:
MSG_WARN("ALTIVEC: Color Space BGRA32\n");
// return profile_altivec_bgra32;
return altivec_yuv2_bgra32;
default: return NULL;
}
break;
case IMGFMT_UYVY:
switch(c->dstFormat){
case IMGFMT_RGB32:
MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
return altivec_uyvy_rgb32;
default: return NULL;
}
break;
}
return NULL;
}
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
{
vector signed short CY, CRV, CBU, CGU, CGV, OY, Y0;
int64_t crv __attribute__ ((aligned(16))) = inv_table[0];
int64_t cbu __attribute__ ((aligned(16))) = inv_table[1];
int64_t cgu __attribute__ ((aligned(16))) = inv_table[2];
int64_t cgv __attribute__ ((aligned(16))) = inv_table[3];
int64_t cy = (1<<16)-1;
int64_t oy = 0;
short tmp __attribute__ ((aligned(16)));
if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
return;
cy = (cy *c->contrast )>>17;
crv= (crv*c->contrast * c->saturation)>>32;
cbu= (cbu*c->contrast * c->saturation)>>32;
cgu= (cgu*c->contrast * c->saturation)>>32;
cgv= (cgv*c->contrast * c->saturation)>>32;
oy -= 256*c->brightness;
tmp = cy;
CY = vec_lde (0, &tmp);
CY = vec_splat (CY, 0);
tmp = oy;
OY = vec_lde (0, &tmp);
OY = vec_splat (OY, 0);
tmp = crv>>3;
CRV = vec_lde (0, &tmp);
CRV = vec_splat (CRV, 0);
tmp = cbu>>3;
CBU = vec_lde (0, &tmp);
CBU = vec_splat (CBU, 0);
tmp = -(cgu>>1);
CGU = vec_lde (0, &tmp);
CGU = vec_splat (CGU, 0);
tmp = -(cgv>>1);
CGV = vec_lde (0, &tmp);
CGV = vec_splat (CGV, 0);
c->CSHIFT = (vector unsigned short)(2);
c->CY = CY;
c->OY = OY;
c->CRV = CRV;
c->CBU = CBU;
c->CGU = CGU;
c->CGV = CGV;
#if 0
printf ("cy: %hvx\n", CY);
printf ("oy: %hvx\n", OY);
printf ("crv: %hvx\n", CRV);
printf ("cbu: %hvx\n", CBU);
printf ("cgv: %hvx\n", CGV);
printf ("cgu: %hvx\n", CGU);
#endif
return;
}
void
altivec_yuv2packedX (SwsContext *c,
int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int dstY)
{
int i,j;
short tmp __attribute__((aligned (16)));
short *p;
short *f;
vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
vector signed short R0,G0,B0,R1,G1,B1;
vector unsigned char R,G,B,pels[3];
vector unsigned char *out,*nout;
vector signed short RND = (vector signed short)(1<<3);
vector unsigned short SCL = (vector unsigned short)(4);
unsigned long scratch[16] __attribute__ ((aligned (16)));
vector signed short *vYCoeffsBank, *vCCoeffsBank;
vector signed short *YCoeffs, *CCoeffs;
vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
for (i=0;i<lumFilterSize*dstW;i++) {
tmp = c->vLumFilter[i];
p = &vYCoeffsBank[i];
for (j=0;j<8;j++)
p[j] = tmp;
}
for (i=0;i<chrFilterSize*dstW;i++) {
tmp = c->vChrFilter[i];
p = &vCCoeffsBank[i];
for (j=0;j<8;j++)
p[j] = tmp;
}
YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
out = (vector unsigned char *)dest;
for(i=0; i<dstW; i+=16){
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for(j=0; j<lumFilterSize; j++) {
X0 = vec_ld (0, &lumSrc[j][i]);
X1 = vec_ld (16, &lumSrc[j][i]);
Y0 = vec_mradds (X0, YCoeffs[j], Y0);
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for(j=0; j<chrFilterSize; j++) {
X = vec_ld (0, &chrSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+2048]);
V = vec_mradds (X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra (Y0, SCL);
Y1 = vec_sra (Y1, SCL);
U = vec_sra (U, SCL);
V = vec_sra (V, SCL);
Y0 = vec_clip (Y0);
Y1 = vec_clip (Y1);
U = vec_clip (U);
V = vec_clip (V);
/* now we have
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh (U,U);
V0 = vec_mergeh (V,V);
U1 = vec_mergel (U,U);
V1 = vec_mergel (V,V);
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
out_rgba (R,G,B,out);
}
if (i < dstW) {
i -= 16;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for(j=0; j<lumFilterSize; j++) {
X0 = vec_ld (0, &lumSrc[j][i]);
X1 = vec_ld (16, &lumSrc[j][i]);
Y0 = vec_mradds (X0, YCoeffs[j], Y0);
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for(j=0; j<chrFilterSize; j++) {
X = vec_ld (0, &chrSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+2048]);
V = vec_mradds (X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra (Y0, SCL);
Y1 = vec_sra (Y1, SCL);
U = vec_sra (U, SCL);
V = vec_sra (V, SCL);
Y0 = vec_clip (Y0);
Y1 = vec_clip (Y1);
U = vec_clip (U);
V = vec_clip (V);
/* now we have
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh (U,U);
V0 = vec_mergeh (V,V);
U1 = vec_mergel (U,U);
V1 = vec_mergel (V,V);
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
nout = (vector unsigned char *)scratch;
out_rgba (R,G,B,nout);
memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
}
if (vYCoeffsBank) free (vYCoeffsBank);
if (vCCoeffsBank) free (vCCoeffsBank);
}
/*
* yuv2rgb_mlib.c, Software YUV to RGB coverter using mediaLib
*
* Copyright (C) 2000, Hkan Hjort <d95hjort@dtek.chalmers.se>
* All Rights Reserved.
*
* This file is part of mpeg2dec, a free MPEG-2 video decoder
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <mlib_types.h>
#include <mlib_status.h>
#include <mlib_sys.h>
#include <mlib_video.h>
#include <inttypes.h>
#include <stdlib.h>
#include <assert.h>
#include "img_format.h" //FIXME try to reduce dependency of such stuff
#include "swscale.h"
static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
assert(srcStride[1] == srcStride[2]);
mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
return srcSliceH;
}
static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
assert(srcStride[1] == srcStride[2]);
mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
return srcSliceH;
}
static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
assert(srcStride[1] == srcStride[2]);
mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
return srcSliceH;
}
SwsFunc yuv2rgb_init_mlib(SwsContext *c)
{
switch(c->dstFormat){
case IMGFMT_RGB24: return mlib_YUV2RGB420_24;
case IMGFMT_RGB32: return mlib_YUV2ARGB420_32;
case IMGFMT_BGR32: return mlib_YUV2ABGR420_32;
default: return NULL;
}
}
/*
* yuv2rgb_mmx.c, Software YUV to RGB coverter with Intel MMX "technology"
*
* Copyright (C) 2000, Silicon Integrated System Corp.
* All Rights Reserved.
*
* Author: Olie Lho <ollie@sis.com.tw>
*
* This file is part of mpeg2dec, a free MPEG-2 video decoder
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
* 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
* MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
* context / deglobalize stuff by Michael Niedermayer
*/
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
#define YUV2RGB \
/* Do the multiply part of the conversion for even and odd pixels,
register usage:
mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
mm6 -> Y even, mm7 -> Y odd */\
/* convert the chroma part */\
"punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
"punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
\
"psllw $3, %%mm0;" /* Promote precision */ \
"psllw $3, %%mm1;" /* Promote precision */ \
\
"psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
"psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
\
"movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
"movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
\
"pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
"pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
\
"pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
"pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
\
"paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
\
/* convert the luma part */\
"movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
"pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
\
"psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
\
"psllw $3, %%mm6;" /* Promote precision */\
"psllw $3, %%mm7;" /* Promote precision */\
\
"psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
"psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
\
"pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
"pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
\
/* Do the addition part of the conversion for even and odd pixels,
register usage:
mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
mm6 -> Y even, mm7 -> Y odd */\
"movq %%mm0, %%mm3;" /* Copy Cblue */\
"movq %%mm1, %%mm4;" /* Copy Cred */\
"movq %%mm2, %%mm5;" /* Copy Cgreen */\
\
"paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
"paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
\
"paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
"paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
\
"paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
"paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
\
/* Limit RGB even to 0..255 */\
"packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
"packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
"packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
\
/* Limit RGB odd to 0..255 */\
"packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
"packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
"packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
\
/* Interleave RGB even and odd */\
"punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
"punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
"punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size;
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
h_size= (c->dstW+7)&~7;
if(h_size*2 > dstStride[0]) h_size-=8;
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
//srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
int index= -h_size/2;
b5Dither= dither8[y&1];
g6Dither= dither4[y&1];
g5Dither= dither8[y&1];
r5Dither= dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
__asm__ __volatile__ (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t"
"1: \n\t"
/* no speed diference on my p3@500 with prefetch,
* if it is faster for anyone with -benchmark then tell me
PREFETCH" 64(%0) \n\t"
PREFETCH" 64(%1) \n\t"
PREFETCH" 64(%2) \n\t"
*/
YUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm0;"
"paddusb "MANGLE(g6Dither)", %%mm2;"
"paddusb "MANGLE(r5Dither)", %%mm1;"
#endif
/* mask unneeded bits off */
"pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
"pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
"pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
"psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
"pxor %%mm4, %%mm4;" /* zero mm4 */
"movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
"por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
"addl $16, %1 \n\t"
"addl $4, %0 \n\t"
" js 1b \n\t"
: "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
);
}
__asm__ __volatile__ (EMMS);
return srcSliceH;
}
static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size;
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
h_size= (c->dstW+7)&~7;
if(h_size*2 > dstStride[0]) h_size-=8;
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
//srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
int index= -h_size/2;
b5Dither= dither8[y&1];
g6Dither= dither4[y&1];
g5Dither= dither8[y&1];
r5Dither= dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
__asm__ __volatile__ (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t"
"1: \n\t"
YUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm0 \n\t"
"paddusb "MANGLE(g5Dither)", %%mm2 \n\t"
"paddusb "MANGLE(r5Dither)", %%mm1 \n\t"
#endif
/* mask unneeded bits off */
"pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
"pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
"pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
"psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
"psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
"pxor %%mm4, %%mm4;" /* zero mm4 */
"movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
"por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
"addl $16, %1 \n\t"
"addl $4, %0 \n\t"
" js 1b \n\t"
: "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
);
}
__asm__ __volatile__ (EMMS);
return srcSliceH;
}
static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size;
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
h_size= (c->dstW+7)&~7;
if(h_size*3 > dstStride[0]) h_size-=8;
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
int index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
__asm__ __volatile__ (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t"
"1: \n\t"
YUV2RGB
/* mm0=B, %%mm2=G, %%mm1=R */
#ifdef HAVE_MMX2
"movq "MANGLE(M24A)", %%mm4 \n\t"
"movq "MANGLE(M24C)", %%mm7 \n\t"
"pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */
"pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */
"pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */
"pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */
"pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */
"por %%mm5, %%mm6 \n\t"
"por %%mm3, %%mm6 \n\t"
MOVNTQ" %%mm6, (%1) \n\t"
"psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */
"pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */
"pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */
"pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */
"pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */
"pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */
"por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */
"por %%mm3, %%mm6 \n\t"
MOVNTQ" %%mm6, 8(%1) \n\t"
"pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */
"pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */
"pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"pand %%mm7, %%mm5 \n\t" /* B7 B6 */
"pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
\
"por %%mm5, %%mm3 \n\t"
"por %%mm3, %%mm6 \n\t"
MOVNTQ" %%mm6, 16(%1) \n\t"
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
"pxor %%mm4, %%mm4 \n\t"
#else
"pxor %%mm4, %%mm4 \n\t"
"movq %%mm0, %%mm5 \n\t" /* B */
"movq %%mm1, %%mm6 \n\t" /* R */
"punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
"punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
"punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
"punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
"movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
"movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
"punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
"punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
"punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */
"movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
"movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
"movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
"movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */
"psllq $40, %%mm7 \n\t" /* RGB00000 0 */
"psllq $40, %%mm0 \n\t" /* RGB00000 1 */
"psllq $40, %%mm5 \n\t" /* RGB00000 2 */
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */
"punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
"punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
"punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
"punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */
"psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
"movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
"psllq $40, %%mm0 \n\t" /* GB000000 1 */
"por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
MOVNTQ" %%mm7, (%1) \n\t"
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
"movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
"psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
"por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
MOVNTQ" %%mm6, 8(%1) \n\t"
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
"psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
"por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
MOVNTQ" %%mm1, 16(%1) \n\t"
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"pxor %%mm4, %%mm4 \n\t"
#endif
"addl $24, %1 \n\t"
"addl $4, %0 \n\t"
" js 1b \n\t"
: "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
);
}
__asm__ __volatile__ (EMMS);
return srcSliceH;
}
static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size;
if(c->srcFormat == IMGFMT_422P){
srcStride[1] *= 2;
srcStride[2] *= 2;
}
h_size= (c->dstW+7)&~7;
if(h_size*4 > dstStride[0]) h_size-=8;
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
int index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
__asm__ __volatile__ (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t"
"1: \n\t"
YUV2RGB
/* convert RGB plane to RGB packed format,
mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
mm4 -> GB, mm5 -> AR pixel 4-7,
mm6 -> GB, mm7 -> AR pixel 0-3 */
"pxor %%mm3, %%mm3;" /* zero mm3 */
"movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
"movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
"punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
"punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
"punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */
"movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
"punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */
"punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
"punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
"punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */
"movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
"punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"pxor %%mm4, %%mm4;" /* zero mm4 */
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
"addl $32, %1 \n\t"
"addl $4, %0 \n\t"
" js 1b \n\t"
: "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
);
}
__asm__ __volatile__ (EMMS);
return srcSliceH;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment