Commit a18c8560 authored by michael's avatar michael

Vertical scaler with accurate rounding; some people on doom9 can see ±1 errors.

The ±1 issue is limited to vertical filters with more than 2 taps, so bilinear upscaling was unaffected.
The new code is sometimes faster and sometimes slower, but the difference is significant (~20%), so it is optional and enabled with arnd=1 (the SWS_ACCURATE_RND flag).


git-svn-id: file:///var/local/repositories/mplayer/trunk/libswscale@19177 b3059339-0415-0410-9bf9-f77b7e298cf2
parent 9a6a6693
...@@ -848,7 +848,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist ...@@ -848,7 +848,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
dist-1.0); dist-1.0);
} }
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc, static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
int srcW, int dstW, int filterAlign, int one, int flags, int srcW, int dstW, int filterAlign, int one, int flags,
SwsVector *srcFilter, SwsVector *dstFilter, double param[2]) SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
{ {
...@@ -1127,10 +1127,18 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out ...@@ -1127,10 +1127,18 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
filterAlign = 1; filterAlign = 1;
} }
if (flags & SWS_CPU_CAPS_MMX) {
// special case for unscaled vertical filtering
if(minFilterSize == 1 && filterAlign == 2)
filterAlign= 1;
}
ASSERT(minFilterSize > 0) ASSERT(minFilterSize > 0)
filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1)); filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
ASSERT(filterSize > 0) ASSERT(filterSize > 0)
filter= av_malloc(filterSize*dstW*sizeof(double)); filter= av_malloc(filterSize*dstW*sizeof(double));
if(filterSize >= MAX_FILTER_SIZE)
return -1;
*outFilterSize= filterSize; *outFilterSize= filterSize;
if(flags&SWS_PRINT_INFO) if(flags&SWS_PRINT_INFO)
...@@ -1216,6 +1224,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out ...@@ -1216,6 +1224,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
} }
av_free(filter); av_free(filter);
return 0;
} }
#if defined(ARCH_X86) || defined(ARCH_X86_64) #if defined(ARCH_X86) || defined(ARCH_X86_64)
...@@ -2115,6 +2124,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int ...@@ -2115,6 +2124,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
/* precalculate vertical scaler filter coefficients */ /* precalculate vertical scaler filter coefficients */
{ {
const int filterAlign= const int filterAlign=
(flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
(flags & SWS_CPU_CAPS_ALTIVEC) ? 8 : (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
1; 1;
......
...@@ -64,6 +64,7 @@ extern "C" { ...@@ -64,6 +64,7 @@ extern "C" {
//input subsampling info //input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000 #define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000 #define SWS_DIRECT_BGR 0x8000
#define SWS_ACCURATE_RND 0x40000
#define SWS_CPU_CAPS_MMX 0x80000000 #define SWS_CPU_CAPS_MMX 0x80000000
#define SWS_CPU_CAPS_MMX2 0x20000000 #define SWS_CPU_CAPS_MMX2 0x20000000
......
...@@ -126,6 +126,8 @@ typedef struct SwsContext{ ...@@ -126,6 +126,8 @@ typedef struct SwsContext{
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, its hardcoded in the asm #define DSTW_OFFSET "11*8+4*4*256*2" //do not change, its hardcoded in the asm
#define ESP_OFFSET "11*8+4*4*256*2+8" #define ESP_OFFSET "11*8+4*4*256*2+8"
#define VROUNDER_OFFSET "11*8+4*4*256*2+16" #define VROUNDER_OFFSET "11*8+4*4*256*2+16"
#define U_TEMP "11*8+4*4*256*2+24"
#define V_TEMP "11*8+4*4*256*2+32"
uint64_t redDither __attribute__((aligned(8))); uint64_t redDither __attribute__((aligned(8)));
uint64_t greenDither __attribute__((aligned(8))); uint64_t greenDither __attribute__((aligned(8)));
...@@ -144,6 +146,8 @@ typedef struct SwsContext{ ...@@ -144,6 +146,8 @@ typedef struct SwsContext{
int dstW; int dstW;
uint64_t esp __attribute__((aligned(8))); uint64_t esp __attribute__((aligned(8)));
uint64_t vRounder __attribute__((aligned(8))); uint64_t vRounder __attribute__((aligned(8)));
uint64_t u_temp __attribute__((aligned(8)));
uint64_t v_temp __attribute__((aligned(8)));
#ifdef HAVE_ALTIVEC #ifdef HAVE_ALTIVEC
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment