Commit 5c7c27ca authored by Naohiro KORIYAMA, committed by Jean-Baptiste Kempf

yadif: add SSSE3 and SSE2 support, ported from FFmpeg.

Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>
parent 8634f761
/***************************************************************************** /*****************************************************************************
* algo_yadif.c : Wrapper for MPlayer's Yadif algorithm * algo_yadif.c : Wrapper for FFmpeg's Yadif algorithm
***************************************************************************** *****************************************************************************
* Copyright (C) 2000-2011 the VideoLAN team * Copyright (C) 2000-2011 the VideoLAN team
* $Id$ * $Id$
...@@ -26,10 +26,6 @@ ...@@ -26,10 +26,6 @@
# include "config.h" # include "config.h"
#endif #endif
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
#endif
#include <stdint.h> #include <stdint.h>
#include <assert.h> #include <assert.h>
...@@ -47,23 +43,7 @@ ...@@ -47,23 +43,7 @@
* Yadif (Yet Another DeInterlacing Filter). * Yadif (Yet Another DeInterlacing Filter).
*****************************************************************************/ *****************************************************************************/
/* Yadif's private data struct */ /* yadif.h comes from yadif.c of FFmpeg project.
struct vf_priv_s {
/*
* 0: Output 1 frame for each frame.
* 1: Output 1 frame for each field.
* 2: Like 0 but skips spatial interlacing check.
* 3: Like 1 but skips spatial interlacing check.
*
* In vlc, only & 0x02 has meaning, as we do the & 0x01 ourself.
*/
int mode;
};
/* I am unsure it is the right one */
typedef intptr_t x86_reg;
/* yadif.h comes from vf_yadif.c of mplayer project.
Necessary preprocessor macros are defined in common.h. */ Necessary preprocessor macros are defined in common.h. */
#include "yadif.h" #include "yadif.h"
...@@ -125,15 +105,22 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, ...@@ -125,15 +105,22 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
if( p_prev && p_cur && p_next ) if( p_prev && p_cur && p_next )
{ {
/* */ /* */
void (*filter)(struct vf_priv_s *p, uint8_t *dst, void (*filter)(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next,
uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
int w, int refs, int parity);
filter = yadif_filter_line_c;
#if defined(HAVE_YADIF_MMX)
if( vlc_CPU() & CPU_CAPABILITY_MMX )
filter = yadif_filter_line_mmx;
#endif
#if defined(HAVE_YADIF_SSE2) #if defined(HAVE_YADIF_SSE2)
if( vlc_CPU() & CPU_CAPABILITY_SSE2 ) if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
filter = yadif_filter_line_mmx2; filter = yadif_filter_line_sse2;
else #endif
#if defined(HAVE_YADIF_SSSE3)
if( vlc_CPU() & CPU_CAPABILITY_SSSE3 )
filter = yadif_filter_line_ssse3;
#endif #endif
filter = yadif_filter_line_c;
for( int n = 0; n < p_dst->i_planes; n++ ) for( int n = 0; n < p_dst->i_planes; n++ )
{ {
...@@ -151,19 +138,20 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src, ...@@ -151,19 +138,20 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
} }
else else
{ {
struct vf_priv_s cfg; int mode;
/* Spatial checks only when enough data */ /* Spatial checks only when enough data */
cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2; mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch ); assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
filter( &cfg, filter( &dstp->p_pixels[y * dstp->i_pitch],
&dstp->p_pixels[y * dstp->i_pitch],
&prevp->p_pixels[y * prevp->i_pitch], &prevp->p_pixels[y * prevp->i_pitch],
&curp->p_pixels[y * curp->i_pitch], &curp->p_pixels[y * curp->i_pitch],
&nextp->p_pixels[y * nextp->i_pitch], &nextp->p_pixels[y * nextp->i_pitch],
dstp->i_visible_pitch, dstp->i_visible_pitch,
curp->i_pitch, y < dstp->i_visible_lines - 2 ? curp->i_pitch : -curp->i_pitch,
yadif_parity ); y - 1 ? -curp->i_pitch : curp->i_pitch,
yadif_parity,
mode );
} }
/* We duplicate the first and last lines */ /* We duplicate the first and last lines */
......
/* /*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
* *
* This file is part of MPlayer. * This file is part of FFmpeg.
* *
* MPlayer is free software; you can redistribute it and/or modify * FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or * the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version. * (at your option) any later version.
* *
* MPlayer is distributed in the hope that it will be useful, * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details. * GNU General Public License for more details.
* *
* You should have received a copy of the GNU General Public License along * You should have received a copy of the GNU General Public License along
* with MPlayer; if not, write to the Free Software Foundation, Inc., * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/ */
/* */ #ifdef HAVE_CONFIG_H
#if defined(CAN_COMPILE_SSE2) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ > 0)) # include "config.h"
#endif
#define HAVE_YADIF_SSE2
#define LOAD4(mem,dst) \
"movd "mem", "#dst" \n\t"\
"punpcklbw %%mm7, "#dst" \n\t"
#define PABS(tmp,dst) \
"pxor "#tmp", "#tmp" \n\t"\
"psubw "#dst", "#tmp" \n\t"\
"pmaxsw "#tmp", "#dst" \n\t"
#define CHECK(pj,mj) \
"movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\
"movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\
"movq %%mm2, %%mm4 \n\t"\
"movq %%mm2, %%mm5 \n\t"\
"pxor %%mm3, %%mm4 \n\t"\
"pavgb %%mm3, %%mm5 \n\t"\
"pand %[pb1], %%mm4 \n\t"\
"psubusb %%mm4, %%mm5 \n\t"\
"psrlq $8, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
"movq %%mm2, %%mm4 \n\t"\
"psubusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pmaxub %%mm3, %%mm2 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
"psrlq $8, %%mm3 \n\t" /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
"psrlq $16, %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm4, %%mm2 \n\t" /* score */
#define CHECK1 \
"movq %%mm0, %%mm3 \n\t"\
"pcmpgtw %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\
"pminsw %%mm2, %%mm0 \n\t" /* spatial_score= score; */\
"movq %%mm3, %%mm6 \n\t"\
"pand %%mm3, %%mm5 \n\t"\
"pandn %%mm1, %%mm3 \n\t"\
"por %%mm5, %%mm3 \n\t"\
"movq %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
hurts both quality and speed, but matches the C version. */\
"paddw %[pw1], %%mm6 \n\t"\
"psllw $14, %%mm6 \n\t"\
"paddsw %%mm6, %%mm2 \n\t"\
"movq %%mm0, %%mm3 \n\t"\
"pcmpgtw %%mm2, %%mm3 \n\t"\
"pminsw %%mm2, %%mm0 \n\t"\
"pand %%mm3, %%mm5 \n\t"\
"pandn %%mm1, %%mm3 \n\t"\
"por %%mm5, %%mm3 \n\t"\
"movq %%mm3, %%mm1 \n\t"
static void yadif_filter_line_mmx2(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){ #if defined(__GNUC__)
static const uint64_t pw_1 = 0x0001000100010001ULL; # define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
static const uint64_t pb_1 = 0x0101010101010101ULL; # if VLC_GCC_VERSION(3,1)
const int mode = p->mode; # define DECLARE_ASM_CONST(n,t,v) static const t __attribute__((used)) __attribute__ ((aligned (n))) v
uint64_t tmp0, tmp1, tmp2, tmp3; # else
int x; # define DECLARE_ASM_CONST(n,t,v) static const t __attribute__ ((aligned (n))) v
# endif
#endif
#define FILTER\ typedef intptr_t x86_reg;
for(x=0; x<w; x+=4){\ typedef struct { uint64_t a, b; } xmm_reg;
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\ DECLARE_ASM_CONST(16, const xmm_reg, pb_1) = {0x0101010101010101ULL, 0x0101010101010101ULL};
LOAD4("(%[cur],%[mrefs])", %%mm0) /* c = cur[x-refs] */\ DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 0x0001000100010001ULL};
LOAD4("(%[cur],%[prefs])", %%mm1) /* e = cur[x+refs] */\
LOAD4("(%["prev2"])", %%mm2) /* prev2[x] */\
LOAD4("(%["next2"])", %%mm3) /* next2[x] */\ #ifdef CAN_COMPILE_SSSE3
"movq %%mm3, %%mm4 \n\t"\ #if defined(__SSE__) || VLC_GCC_VERSION(4, 4)
"paddw %%mm2, %%mm3 \n\t"\ // ================ SSSE3 =================
"psraw $1, %%mm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\ #define HAVE_YADIF_SSSE3
"movq %%mm0, %[tmp0] \n\t" /* c */\ #define COMPILE_TEMPLATE_SSE 1
"movq %%mm3, %[tmp1] \n\t" /* d */\ #define COMPILE_TEMPLATE_SSSE3 1
"movq %%mm1, %[tmp2] \n\t" /* e */\ #define VLC_TARGET VLC_SSE
"psubw %%mm4, %%mm2 \n\t"\ #define RENAME(a) a ## _ssse3
PABS( %%mm4, %%mm2) /* temporal_diff0 */\ #include "yadif_template.h"
LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\ #undef COMPILE_TEMPLATE_SSE
LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\ #undef COMPILE_TEMPLATE_SSSE3
"psubw %%mm0, %%mm3 \n\t"\ #undef VLC_TARGET
"psubw %%mm1, %%mm4 \n\t"\ #undef RENAME
PABS( %%mm5, %%mm3)\ #endif
PABS( %%mm5, %%mm4)\ #endif
"paddw %%mm4, %%mm3 \n\t" /* temporal_diff1 */\
"psrlw $1, %%mm2 \n\t"\
"psrlw $1, %%mm3 \n\t"\
"pmaxsw %%mm3, %%mm2 \n\t"\
LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\
LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\
"psubw %%mm0, %%mm3 \n\t"\
"psubw %%mm1, %%mm4 \n\t"\
PABS( %%mm5, %%mm3)\
PABS( %%mm5, %%mm4)\
"paddw %%mm4, %%mm3 \n\t" /* temporal_diff2 */\
"psrlw $1, %%mm3 \n\t"\
"pmaxsw %%mm3, %%mm2 \n\t"\
"movq %%mm2, %[tmp3] \n\t" /* diff */\
\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm0, %%mm0 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psrlw $1, %%mm1 \n\t" /* spatial_pred */\
PABS( %%mm2, %%mm0) /* ABS(c-e) */\
\
"movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\
"movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\
"movq %%mm2, %%mm4 \n\t"\
"psubusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pmaxub %%mm3, %%mm2 \n\t"\
"pshufw $9,%%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
"punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psubw %[pw1], %%mm0 \n\t" /* spatial_score */\
\
CHECK(-2,0)\
CHECK1\
CHECK(-3,1)\
CHECK2\
CHECK(0,-2)\
CHECK1\
CHECK(1,-3)\
CHECK2\
\
/* if(p->mode<2) ... */\
"movq %[tmp3], %%mm6 \n\t" /* diff */\
"cmpl $2, %[mode] \n\t"\
"jge 1f \n\t"\
LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\
LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\
LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\
LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psrlw $1, %%mm2 \n\t" /* b */\
"psrlw $1, %%mm3 \n\t" /* f */\
"movq %[tmp0], %%mm4 \n\t" /* c */\
"movq %[tmp1], %%mm5 \n\t" /* d */\
"movq %[tmp2], %%mm7 \n\t" /* e */\
"psubw %%mm4, %%mm2 \n\t" /* b-c */\
"psubw %%mm7, %%mm3 \n\t" /* f-e */\
"movq %%mm5, %%mm0 \n\t"\
"psubw %%mm4, %%mm5 \n\t" /* d-c */\
"psubw %%mm7, %%mm0 \n\t" /* d-e */\
"movq %%mm2, %%mm4 \n\t"\
"pminsw %%mm3, %%mm2 \n\t"\
"pmaxsw %%mm4, %%mm3 \n\t"\
"pmaxsw %%mm5, %%mm2 \n\t"\
"pminsw %%mm5, %%mm3 \n\t"\
"pmaxsw %%mm0, %%mm2 \n\t" /* max */\
"pminsw %%mm0, %%mm3 \n\t" /* min */\
"pxor %%mm4, %%mm4 \n\t"\
"pmaxsw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm4 \n\t" /* -max */\
"pmaxsw %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\
"1: \n\t"\
\
"movq %[tmp1], %%mm2 \n\t" /* d */\
"movq %%mm2, %%mm3 \n\t"\
"psubw %%mm6, %%mm2 \n\t" /* d-diff */\
"paddw %%mm6, %%mm3 \n\t" /* d+diff */\
"pmaxsw %%mm2, %%mm1 \n\t"\
"pminsw %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
"packuswb %%mm1, %%mm1 \n\t"\
\
:[tmp0]"=m"(tmp0),\
[tmp1]"=m"(tmp1),\
[tmp2]"=m"(tmp2),\
[tmp3]"=m"(tmp3)\
:[prev] "r"(prev),\
[cur] "r"(cur),\
[next] "r"(next),\
[prefs]"r"((x86_reg)refs),\
[mrefs]"r"((x86_reg)-refs),\
[pw1] "m"(pw_1),\
[pb1] "m"(pb_1),\
[mode] "g"(mode)\
);\
__asm__ volatile("movd %%mm1, %0" :"=m"(*dst));\
dst += 4;\
prev+= 4;\
cur += 4;\
next+= 4;\
}
if(parity){ #ifdef CAN_COMPILE_SSE2
#define prev2 "prev" #if defined(__SSE__) || VLC_GCC_VERSION(4, 4)
#define next2 "cur" // ================= SSE2 =================
FILTER #define HAVE_YADIF_SSE2
#undef prev2 #define COMPILE_TEMPLATE_SSE 1
#undef next2 #define VLC_TARGET VLC_SSE
}else{ #define RENAME(a) a ## _sse2
#define prev2 "cur" #include "yadif_template.h"
#define next2 "next" #undef COMPILE_TEMPLATE_SSE
FILTER #undef VLC_TARGET
#undef prev2 #undef RENAME
#undef next2 #endif
} #endif
}
#undef LOAD4
#undef PABS
#undef CHECK
#undef CHECK1
#undef CHECK2
#undef FILTER
#ifdef CAN_COMPILE_MMX
#if defined(__MMX__) || VLC_GCC_VERSION(4, 4)
// ================ MMX =================
#define HAVE_YADIF_MMX
#define VLC_TARGET VLC_MMX
#define RENAME(a) a ## _mmx
#include "yadif_template.h"
#undef VLC_TARGET
#undef RENAME
#endif
#endif #endif
static void yadif_filter_line_c(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){ static void yadif_filter_line_c(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) {
int x; int x;
uint8_t *prev2= parity ? prev : cur ; uint8_t *prev2= parity ? prev : cur ;
uint8_t *next2= parity ? cur : next; uint8_t *next2= parity ? cur : next;
for(x=0; x<w; x++){ for(x=0; x<w; x++){
int c= cur[-refs]; int c= cur[mrefs];
int d= (prev2[0] + next2[0])>>1; int d= (prev2[0] + next2[0])>>1;
int e= cur[+refs]; int e= cur[prefs];
int temporal_diff0= FFABS(prev2[0] - next2[0]); int temporal_diff0= FFABS(prev2[0] - next2[0]);
int temporal_diff1=( FFABS(prev[-refs] - c) + FFABS(prev[+refs] - e) )>>1; int temporal_diff1=( FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e) )>>1;
int temporal_diff2=( FFABS(next[-refs] - c) + FFABS(next[+refs] - e) )>>1; int temporal_diff2=( FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1;
int diff= FFMAX3(temporal_diff0>>1, temporal_diff1, temporal_diff2); int diff= FFMAX3(temporal_diff0>>1, temporal_diff1, temporal_diff2);
int spatial_pred= (c+e)>>1; int spatial_pred= (c+e)>>1;
int spatial_score= FFABS(cur[-refs-1] - cur[+refs-1]) + FFABS(c-e) int spatial_score= FFABS(cur[mrefs-1] - cur[prefs-1]) + FFABS(c-e)
+ FFABS(cur[-refs+1] - cur[+refs+1]) - 1; + FFABS(cur[mrefs+1] - cur[prefs+1]) - 1;
#define CHECK(j)\ #define CHECK(j)\
{ int score= FFABS(cur[-refs-1+j] - cur[+refs-1-j])\ { int score= FFABS(cur[mrefs-1+j] - cur[prefs-1-j])\
+ FFABS(cur[-refs +j] - cur[+refs -j])\ + FFABS(cur[mrefs +j] - cur[prefs -j])\
+ FFABS(cur[-refs+1+j] - cur[+refs+1-j]);\ + FFABS(cur[mrefs+1+j] - cur[prefs+1-j]);\
if(score < spatial_score){\ if(score < spatial_score){\
spatial_score= score;\ spatial_score= score;\
spatial_pred= (cur[-refs +j] + cur[+refs -j])>>1;\ spatial_pred= (cur[mrefs +j] + cur[prefs -j])>>1;\
CHECK(-1) CHECK(-2) }} }} CHECK(-1) CHECK(-2) }} }}
CHECK( 1) CHECK( 2) }} }} CHECK( 1) CHECK( 2) }} }}
if(p->mode<2){ if(mode<2){
int b= (prev2[-2*refs] + next2[-2*refs])>>1; int b= (prev2[2*mrefs] + next2[2*mrefs])>>1;
int f= (prev2[+2*refs] + next2[+2*refs])>>1; int f= (prev2[2*prefs] + next2[2*prefs])>>1;
#if 0 #if 0
int a= cur[-3*refs]; int a= cur[3*mrefs];
int g= cur[+3*refs]; int g= cur[3*prefs];
int max= FFMAX3(d-e, d-c, FFMIN3(FFMAX(b-c,f-e),FFMAX(b-c,b-a),FFMAX(f-g,f-e)) ); int max= FFMAX3(d-e, d-c, FFMIN3(FFMAX(b-c,f-e),FFMAX(b-c,b-a),FFMAX(f-g,f-e)) );
int min= FFMIN3(d-e, d-c, FFMAX3(FFMIN(b-c,f-e),FFMIN(b-c,b-a),FFMIN(f-g,f-e)) ); int min= FFMIN3(d-e, d-c, FFMAX3(FFMIN(b-c,f-e),FFMIN(b-c,b-a),FFMIN(f-g,f-e)) );
#else #else
......
/*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
 * Instruction-set selection for the yadif line-filter template.
 *
 * The including file defines COMPILE_TEMPLATE_SSE (and optionally
 * COMPILE_TEMPLATE_SSSE3) before including this header.  The macros below
 * expand to assembler string fragments (AT&T operand order: source first,
 * destination last) so the same filter body can be emitted for 64-bit MMX
 * registers or 128-bit XMM registers.
 *
 *   REGMM  register name prefix, used for the clobber list
 *   MM     same prefix with the "%%" escape needed inside an asm template
 *   MOV    narrow move that transfers STEP bytes (movd = 4, movq = 8)
 *   MOVQ   full-register move; in the SSE build this is movdqa, which
 *          requires 16-byte aligned memory operands
 *   MOVQU  full-register load from memory that may be unaligned
 *   STEP   number of pixels produced per loop iteration
 */
#ifdef COMPILE_TEMPLATE_SSE
#define REGMM "xmm"
#define MM "%%"REGMM
#define MOV "movq"
#define MOVQ "movdqa"
#define MOVQU "movdqu"
#define STEP 8
/* Load STEP bytes from mem and interleave them with register 7 so that each
   byte becomes a 16-bit word.  Register 7 is expected to be zero here; the
   FILTER loop clears it with pxor before the first LOAD. */
#define LOAD(mem,dst) \
MOV" "mem", "dst" \n\t"\
"punpcklbw "MM"7, "dst" \n\t"
/* Shift the whole register right by one / two bytes. */
#define PSRL1(reg) "psrldq $1, "reg" \n\t"
#define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* NOTE: despite the parameter names, the register passed as `dst` is the one
   that is READ and the register passed as `src` is the one that is WRITTEN
   (the instruction is emitted as "movdqa dst, src" in AT&T order).  The
   written register ends up holding the read register shifted right by two
   bytes. */
#define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
"psrldq $2, "src" \n\t"
#else
/* MMX variants: 64-bit mm registers, 4 output pixels per loop iteration. */
#define REGMM "mm"
#define MM "%%"REGMM
#define MOV "movd"
#define MOVQ "movq"
#define MOVQU "movq"
#define STEP 4
/* Same contract as the SSE LOAD above: STEP bytes widened to words using the
   zeroed register 7. */
#define LOAD(mem,dst) \
MOV" "mem", "dst" \n\t"\
"punpcklbw "MM"7, "dst" \n\t"
/* Shift the whole register right by one / two bytes (8 / 16 bits). */
#define PSRL1(reg) "psrlq $8, "reg" \n\t"
#define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* Same reversed naming as the SSE PSHUF: the `dst` argument is read and the
   `src` argument is written.  Shuffle order 9 places source words 1 and 2
   in the two low words of the written register. */
#define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
#endif
/*
 * PABS(tmp, dst): replace each signed 16-bit word of dst with its absolute
 * value.
 *
 * With COMPILE_TEMPLATE_SSSE3 a single pabsw is emitted and tmp is unused.
 * Otherwise tmp is overwritten: it is zeroed, dst is subtracted from it
 * (tmp = -dst), and dst becomes the signed maximum of the two.
 */
#ifdef COMPILE_TEMPLATE_SSSE3
#define PABS(tmp,dst) \
"pabsw "dst", "dst" \n\t"
#else
#define PABS(tmp,dst) \
"pxor "tmp", "tmp" \n\t"\
"psubw "dst", "tmp" \n\t"\
"pmaxsw "tmp", "dst" \n\t"
#endif
/*
 * CHECK(pj, mj): evaluate one diagonal interpolation direction.
 *
 * pj and mj are byte displacements applied to the line above (cur + mrefs)
 * and the line below (cur + prefs); they are pasted into the addressing
 * mode as literal text, so they must be integer constants.
 *
 * Results:
 *   register 5 = candidate prediction, the per-pixel average of the two
 *                lines rounded down, widened to words.  pavgb rounds up, so
 *                the low bit of (a XOR b) is subtracted to undo the
 *                rounding; the one-byte shift then drops the leftmost
 *                loaded column so the middle column lines up with word 0.
 *   register 2 = score, the sum of absolute differences over three
 *                neighbouring columns, as words.
 *
 * Registers 3 and 4 are used as scratch.  Register 7 must be zero, since it
 * is the high half used when unpacking bytes to words.
 */
#define CHECK(pj,mj) \
MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
MOVQ" "MM"2, "MM"4 \n\t"\
MOVQ" "MM"2, "MM"5 \n\t"\
"pxor "MM"3, "MM"4 \n\t"\
"pavgb "MM"3, "MM"5 \n\t"\
"pand %[pb_1], "MM"4 \n\t"\
"psubusb "MM"4, "MM"5 \n\t"\
PSRL1(MM"5") \
"punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
MOVQ" "MM"2, "MM"4 \n\t"\
"psubusb "MM"3, "MM"2 \n\t"\
"psubusb "MM"4, "MM"3 \n\t"\
"pmaxub "MM"3, "MM"2 \n\t"\
MOVQ" "MM"2, "MM"3 \n\t"\
MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
"punpcklbw "MM"7, "MM"2 \n\t"\
"punpcklbw "MM"7, "MM"3 \n\t"\
"punpcklbw "MM"7, "MM"4 \n\t"\
"paddw "MM"3, "MM"2 \n\t"\
"paddw "MM"4, "MM"2 \n\t" /* score */
/*
 * CHECK1: accept the direction just evaluated by CHECK wherever it scores
 * better than the best so far.
 *
 * On entry: register 0 = best spatial_score, register 1 = best spatial_pred,
 *           register 2 = new score, register 5 = new prediction.
 * On exit:  register 0 = per-word minimum of old and new score,
 *           register 1 = new prediction where new score < old score,
 *                        otherwise unchanged (branch-free select through
 *                        pand / pandn / por),
 *           register 6 = the comparison mask (all ones where the new score
 *                        won), kept for the following CHECK2.
 * Registers 3 and 5 are overwritten.
 */
#define CHECK1 \
MOVQ" "MM"0, "MM"3 \n\t"\
"pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
"pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
MOVQ" "MM"3, "MM"6 \n\t"\
"pand "MM"3, "MM"5 \n\t"\
"pandn "MM"1, "MM"3 \n\t"\
"por "MM"5, "MM"3 \n\t"\
MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: like CHECK1, but only allowed to win in lanes where the preceding
 * CHECK1 also won.
 *
 * Register 6 holds the CHECK1 mask (0xFFFF = won, 0 = lost).  Adding pw_1
 * turns that into 0 / 1, and the shift by 14 into 0 / 0x4000.  The
 * saturating add then inflates the new score in every lane where CHECK1
 * lost, so the comparison against the best score fails there.  The select
 * that follows is the same pand / pandn / por sequence as in CHECK1.
 * Registers 3, 5 and 6 are overwritten.
 */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
hurts both quality and speed, but matches the C version. */\
"paddw %[pw_1], "MM"6 \n\t"\
"psllw $14, "MM"6 \n\t"\
"paddsw "MM"6, "MM"2 \n\t"\
MOVQ" "MM"0, "MM"3 \n\t"\
"pcmpgtw "MM"2, "MM"3 \n\t"\
"pminsw "MM"2, "MM"0 \n\t"\
"pand "MM"3, "MM"5 \n\t"\
"pandn "MM"1, "MM"3 \n\t"\
"por "MM"5, "MM"3 \n\t"\
MOVQ" "MM"3, "MM"1 \n\t"
/*
 * SIMD implementation of the yadif filter for one output line.  The function
 * name is produced by RENAME(), so each inclusion of this template yields a
 * distinct symbol (e.g. with an _mmx / _sse2 / _ssse3 suffix).
 *
 * Parameters:
 *   dst     output line; STEP bytes are stored per loop iteration
 *   prev    same line position in the previous picture
 *   cur     same line position in the current picture
 *   next    same line position in the next picture
 *   w       number of pixels to produce
 *   prefs   byte offset from the current line to the line used as
 *           "x+refs" (e = cur[prefs])
 *   mrefs   byte offset from the current line to the line used as
 *           "x-refs" (c = cur[mrefs])
 *   parity  non-zero: prev2 = prev and next2 = cur;
 *           zero:     prev2 = cur  and next2 = next
 *   mode    when < 2 the extra check using the lines at 2*mrefs and
 *           2*prefs of prev2/next2 is performed; when >= 2 it is skipped
 *
 * The loop always advances by STEP, so when w is not a multiple of STEP the
 * final iteration reads and writes past w; CHECK also loads bytes starting
 * up to 3 bytes before cur on the neighbouring lines.
 * NOTE(review): presumably callers guarantee enough padding around each
 * line for this -- confirm against the picture allocation code.
 *
 * NOTE(review): the result is handed from the first asm statement to the
 * second in register 1 without the compiler being told; this appears to rely
 * on the compiler not touching that register between the two statements.
 *
 * NOTE(review): the store's output constraint is "=m"(*dst), which names a
 * single uint8_t although STEP bytes are written.
 *
 * NOTE(review): no emms is issued here in the MMX build; presumably the
 * caller is responsible for it -- confirm.
 */
VLC_TARGET static void RENAME(yadif_filter_line)(uint8_t *dst,
uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int prefs, int mrefs, int parity, int mode)
{
/* 16-byte aligned spill slots written and read back with MOVQ inside the
   asm block: tmp0 = c, tmp1 = d, tmp2 = e, tmp3 = diff. */
DECLARE_ALIGNED(16, uint8_t, tmp0)[16];
DECLARE_ALIGNED(16, uint8_t, tmp1)[16];
DECLARE_ALIGNED(16, uint8_t, tmp2)[16];
DECLARE_ALIGNED(16, uint8_t, tmp3)[16];
int x;
/* FILTER is the whole per-line loop.  It is a macro so that it can be
   expanded twice below with different values of the prev2 / next2 string
   macros, which are pasted into the "%[...]" operand names of the asm
   template. */
#define FILTER\
for(x=0; x<w; x+=STEP){\
__asm__ volatile(\
"pxor "MM"7, "MM"7 \n\t"\
LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
LOAD("(%["next2"])", MM"3") /* next2[x] */\
MOVQ" "MM"3, "MM"4 \n\t"\
"paddw "MM"2, "MM"3 \n\t"\
"psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
MOVQ" "MM"0, %[tmp0] \n\t" /* c */\
MOVQ" "MM"3, %[tmp1] \n\t" /* d */\
MOVQ" "MM"1, %[tmp2] \n\t" /* e */\
"psubw "MM"4, "MM"2 \n\t"\
PABS( MM"4", MM"2") /* temporal_diff0 */\
LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
"psubw "MM"0, "MM"3 \n\t"\
"psubw "MM"1, "MM"4 \n\t"\
PABS( MM"5", MM"3")\
PABS( MM"5", MM"4")\
"paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
"psrlw $1, "MM"2 \n\t"\
"psrlw $1, "MM"3 \n\t"\
"pmaxsw "MM"3, "MM"2 \n\t"\
LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
"psubw "MM"0, "MM"3 \n\t"\
"psubw "MM"1, "MM"4 \n\t"\
PABS( MM"5", MM"3")\
PABS( MM"5", MM"4")\
"paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
"psrlw $1, "MM"3 \n\t"\
"pmaxsw "MM"3, "MM"2 \n\t"\
MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\
\
"paddw "MM"0, "MM"1 \n\t"\
"paddw "MM"0, "MM"0 \n\t"\
"psubw "MM"1, "MM"0 \n\t"\
"psrlw $1, "MM"1 \n\t" /* spatial_pred */\
PABS( MM"2", MM"0") /* ABS(c-e) */\
\
MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
MOVQ" "MM"2, "MM"4 \n\t"\
"psubusb "MM"3, "MM"2 \n\t"\
"psubusb "MM"4, "MM"3 \n\t"\
"pmaxub "MM"3, "MM"2 \n\t"\
PSHUF(MM"3", MM"2") \
"punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
"punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
"paddw "MM"2, "MM"0 \n\t"\
"paddw "MM"3, "MM"0 \n\t"\
"psubw %[pw_1], "MM"0 \n\t" /* spatial_score */\
\
CHECK(-2,0)\
CHECK1\
CHECK(-3,1)\
CHECK2\
CHECK(0,-2)\
CHECK1\
CHECK(1,-3)\
CHECK2\
\
/* if(p->mode<2) ... */\
MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\
"cmpl $2, %[mode] \n\t"\
"jge 1f \n\t"\
LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
"paddw "MM"4, "MM"2 \n\t"\
"paddw "MM"5, "MM"3 \n\t"\
"psrlw $1, "MM"2 \n\t" /* b */\
"psrlw $1, "MM"3 \n\t" /* f */\
MOVQ" %[tmp0], "MM"4 \n\t" /* c */\
MOVQ" %[tmp1], "MM"5 \n\t" /* d */\
MOVQ" %[tmp2], "MM"7 \n\t" /* e */\
"psubw "MM"4, "MM"2 \n\t" /* b-c */\
"psubw "MM"7, "MM"3 \n\t" /* f-e */\
MOVQ" "MM"5, "MM"0 \n\t"\
"psubw "MM"4, "MM"5 \n\t" /* d-c */\
"psubw "MM"7, "MM"0 \n\t" /* d-e */\
MOVQ" "MM"2, "MM"4 \n\t"\
"pminsw "MM"3, "MM"2 \n\t"\
"pmaxsw "MM"4, "MM"3 \n\t"\
"pmaxsw "MM"5, "MM"2 \n\t"\
"pminsw "MM"5, "MM"3 \n\t"\
"pmaxsw "MM"0, "MM"2 \n\t" /* max */\
"pminsw "MM"0, "MM"3 \n\t" /* min */\
"pxor "MM"4, "MM"4 \n\t"\
"pmaxsw "MM"3, "MM"6 \n\t"\
"psubw "MM"2, "MM"4 \n\t" /* -max */\
"pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
"1: \n\t"\
\
MOVQ" %[tmp1], "MM"2 \n\t" /* d */\
MOVQ" "MM"2, "MM"3 \n\t"\
"psubw "MM"6, "MM"2 \n\t" /* d-diff */\
"paddw "MM"6, "MM"3 \n\t" /* d+diff */\
"pmaxsw "MM"2, "MM"1 \n\t"\
"pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
"packuswb "MM"1, "MM"1 \n\t"\
\
:[tmp0]"=m"(tmp0),\
[tmp1]"=m"(tmp1),\
[tmp2]"=m"(tmp2),\
[tmp3]"=m"(tmp3)\
:[prev] "r"(prev),\
[cur] "r"(cur),\
[next] "r"(next),\
[prefs]"r"((x86_reg)prefs),\
[mrefs]"r"((x86_reg)mrefs),\
[pw_1] "m"(pw_1),\
[pb_1] "m"(pb_1),\
[mode] "g"(mode)\
:REGMM"0",REGMM"1",REGMM"2",REGMM"3",REGMM"4",REGMM"5",REGMM"6",REGMM"7"\
);\
__asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
dst += STEP;\
prev+= STEP;\
cur += STEP;\
next+= STEP;\
}
/* Expand the loop once per parity.  prev2 / next2 are string macros naming
   which asm operand plays the role of the temporally adjacent field. */
if (parity) {
#define prev2 "prev"
#define next2 "cur"
FILTER
#undef prev2
#undef next2
} else {
#define prev2 "cur"
#define next2 "next"
FILTER
#undef prev2
#undef next2
}
}
/* Remove every macro defined by this template so that the header can be
   included again with a different COMPILE_TEMPLATE_* / RENAME configuration
   without redefinition conflicts. */
#undef STEP
#undef REGMM
#undef MM
#undef MOV
#undef MOVQ
#undef MOVQU
#undef PSHUF
#undef PSRL1
#undef PSRL2
#undef LOAD
#undef PABS
#undef CHECK
#undef CHECK1
#undef CHECK2
#undef FILTER
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment