Commit d25c5e06 authored by Laurent Aimar's avatar Laurent Aimar

* Add post-processing modules (as defined in MPEG-4 ISO). There are C,

MMX and MMXEXT versions, but only the MMXEXT one is really usable (the others need a
 _lot_ of CPU power).

There are new options for the ffmpeg plugin:
 --ffmpeg-pp to choose the postprocessing module (c, mmx, mmxext or mmx2)
 --ffmpeg-pp-q to choose the quality (0..6)
 --ffmpeg-db-?? (where the first ? is y or c, and the second ? is v or h)
to force deblocking on luminance (y) / chrominance (c), vertically (v) or
horizontally (h).
 --ffmpeg-dr-? (where ? is y or c) to force deringing on luminance (y) or chrominance (c).
parent 25235c6d
......@@ -2,7 +2,7 @@
* ffmpeg.c: video decoder using ffmpeg library
*****************************************************************************
* Copyright (C) 1999-2001 VideoLAN
* $Id: ffmpeg.c,v 1.2 2002/08/04 18:39:41 sam Exp $
* $Id: ffmpeg.c,v 1.3 2002/08/04 22:13:05 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
......@@ -43,6 +43,9 @@
#endif
#include "avcodec.h" /* ffmpeg */
#include "postprocessing/postprocessing.h"
#include "ffmpeg.h"
/*
......@@ -71,7 +74,17 @@ static int b_ffmpeginit = 0;
"Allow the decoder to partially decode or skip frame(s) " \
"when there not enough time.\n It's usefull with low CPU power " \
"but it could produce broken pictures."
/* Long help text for the "ffmpeg-pp-q" option (post processing quality).
 * The adjacent string literals are concatenated by the compiler. */
#define POSTPROCESSING_Q_LONGTEXT \
"Quality of post processing\n"\
"Valid range is 0 to 6\n" \
"( Overridden by others setting)"
/* Long help text for the "ffmpeg-pp-auto" option.
 * Adjacent string literals are concatenated by the compiler, so every
 * fragment has to carry its own separating space or newline: the first
 * fragment used to end in "left" and was glued to "but". */
#define POSTPROCESSING_AQ_LONGTEXT \
    "Post processing quality is selected upon time left " \
    "but no more than requested quality\n" \
    "Not yet implemented !"
vlc_module_begin();
add_category_hint( N_("Miscellaneous"), NULL );
#if LIBAVCODEC_BUILD >= 4611
......@@ -81,6 +94,33 @@ vlc_module_begin();
"workaround bugs", "0-99, seems to be for msmpeg v3\n" );
#endif
add_bool( "ffmpeg-hurry-up", 0, NULL, "hurry up", HURRY_UP_LONGTEXT );
add_category_hint( N_("Post processing"), NULL );
add_module( "ffmpeg-pp", "postprocessing",NULL, NULL,
N_( "ffmpeg postprocessing module" ), NULL );
add_integer( "ffmpeg-pp-q", 0, NULL,
"Post processing quality", POSTPROCESSING_Q_LONGTEXT );
add_bool( "ffmpeg-pp-auto", 0, NULL,
"Auto-level Post processing quality", POSTPROCESSING_AQ_LONGTEXT );
add_bool( "ffmpeg-db-yv", 0, NULL,
"force vertical luminance deblocking",
"force vertical luminance deblocking (override other settings)" );
add_bool( "ffmpeg-db-yh", 0, NULL,
"force horizontal luminance deblocking",
"force horizontal luminance deblocking (override other settings)" );
add_bool( "ffmpeg-db-cv", 0, NULL,
"force vertical chrominance deblocking",
"force vertical chrominance deblocking (override other settings)" );
add_bool( "ffmpeg-db-ch", 0, NULL,
"force horizontal chrominance deblocking",
"force horizontal chrominance deblocking (override other settings) " );
add_bool( "ffmpeg-dr-y", 0, NULL,
"force luminance deringing",
"force luminance deringing (override other settings)" );
add_bool( "ffmpeg-dr-c", 0, NULL,
"force chrominance deringing",
"force chrominance deringing (override other settings)" );
set_description( _("ffmpeg video decoder((MS)MPEG4,SVQ1,H263)") );
set_capability( "decoder", 70 );
set_callbacks( OpenDecoder, NULL );
......@@ -161,6 +201,8 @@ static int RunDecoder( decoder_fifo_t *p_fifo )
( *(u8*)(p) + ( *((u8*)(p)+1) << 8 ) + \
( *((u8*)(p)+2) << 16 ) + ( *((u8*)(p)+3) << 24 ) )
/* Release a heap pointer and reset it to NULL.
 * free( NULL ) is a no-op, so no guard is needed. The do/while( 0 ) wrapper
 * makes the macro expand to exactly one statement: the previous two-statement
 * form reset the pointer unconditionally when used as the body of an unbraced
 * `if`, and could not be followed by `else`.
 * The argument is evaluated twice: pass a plain lvalue without side effects. */
#define FREE( p ) do { free( p ); ( p ) = NULL; } while( 0 )
static void ffmpeg_ParseBitMapInfoHeader( bitmapinfoheader_t *p_bh,
u8 *p_data )
{
......@@ -179,8 +221,14 @@ static void ffmpeg_ParseBitMapInfoHeader( bitmapinfoheader_t *p_bh,
if( p_bh->i_size > 40 )
{
p_bh->i_data = p_bh->i_size - 40;
p_bh->p_data = malloc( p_bh->i_data );
memcpy( p_bh->p_data, p_data + 40, p_bh->i_data );
if( ( p_bh->p_data = malloc( p_bh->i_data ) ) )
{
memcpy( p_bh->p_data, p_data + 40, p_bh->i_data );
}
else
{
p_bh->i_data = 0;
}
}
else
{
......@@ -261,7 +309,21 @@ static inline void __GetFrame( videodec_thread_t *p_vdec )
return;
}
/* get a buffer and gather all data packet */
p_vdec->p_framedata = p_buffer = malloc( p_pes->i_pes_size );
if( p_vdec->i_buffer_size < p_pes->i_pes_size )
{
if( p_vdec->p_buffer )
{
p_vdec->p_buffer = realloc( p_vdec->p_buffer,
p_pes->i_pes_size );
}
else
{
p_vdec->p_buffer = malloc( p_pes->i_pes_size );
}
p_vdec->i_buffer_size = p_pes->i_pes_size;
}
p_buffer = p_vdec->p_framedata = p_vdec->p_buffer;
p_data = p_pes->p_first;
do
{
......@@ -274,56 +336,11 @@ static inline void __GetFrame( videodec_thread_t *p_vdec )
static inline void __NextFrame( videodec_thread_t *p_vdec )
{
pes_packet_t *p_pes;
p_pes = __PES_GET( p_vdec->p_fifo );
if( p_pes->i_nb_data != 1 )
{
free( p_vdec->p_framedata ); /* FIXME keep this buffer */
}
__PES_NEXT( p_vdec->p_fifo );
}
/* Lookup table from a libavcodec pixel format to a VLC chroma FOURCC.
 * It is indexed by ( pix_fmt + 1 ) because the first slot stands for
 * PIX_FMT_ANY ( -1 ); a 0 entry means no chroma is mapped for that format.
 * The comment inside the initializer lists the format each slot stands for.
 * FIXME FIXME some of them are wrong */
static int i_ffmpeg_PixFmtToChroma[] =
{
/* PIX_FMT_ANY = -1,PIX_FMT_YUV420P, PIX_FMT_YUV422,
PIX_FMT_RGB24, PIX_FMT_BGR24, PIX_FMT_YUV422P,
PIX_FMT_YUV444P, PIX_FMT_YUV410P */
0, VLC_FOURCC('I','4','2','0'), VLC_FOURCC('I','4','2','0'),
VLC_FOURCC('R','V','2','4'), 0, VLC_FOURCC('Y','4','2','2'),
VLC_FOURCC('I','4','4','4'), 0
};
/*****************************************************************************
 * ffmpeg_PixFmtToChroma: map a libavcodec pixel format to a VLC chroma
 *****************************************************************************
 * i_ffmpegchroma is a PIX_FMT_* value. The lookup table starts at
 * PIX_FMT_ANY ( -1 ), hence the +1 offset when indexing.
 * Returns the table entry, or 0 when the value falls outside the table in
 * either direction (values below -1 used to read before the table start).
 *****************************************************************************/
static inline u32 ffmpeg_PixFmtToChroma( int i_ffmpegchroma )
{
    /* Derive the bound from the table itself instead of a magic number */
    const int i_entries = sizeof( i_ffmpeg_PixFmtToChroma ) /
                          sizeof( i_ffmpeg_PixFmtToChroma[0] );

    if( i_ffmpegchroma < -1 || i_ffmpegchroma >= i_entries - 1 )
    {
        return( 0 );
    }

    return( i_ffmpeg_PixFmtToChroma[i_ffmpegchroma + 1] );
}
/*****************************************************************************
 * ffmpeg_FfAspect: turn a libavcodec aspect code into a VLC aspect value
 *****************************************************************************
 * The 4:3 and 16:9 codes (625 and 525 line variants alike) map to a fixed
 * ratio scaled by VOUT_ASPECT_FACTOR. FF_ASPECT_SQUARE and any other code
 * fall back to the ratio i_width / i_height, scaled the same way.
 *****************************************************************************/
static inline int ffmpeg_FfAspect( int i_width, int i_height, int i_ffaspect )
{
    if( i_ffaspect == FF_ASPECT_4_3_625 || i_ffaspect == FF_ASPECT_4_3_525 )
    {
        return( VOUT_ASPECT_FACTOR * 4 / 3);
    }

    if( i_ffaspect == FF_ASPECT_16_9_625 || i_ffaspect == FF_ASPECT_16_9_525 )
    {
        return( VOUT_ASPECT_FACTOR * 16 / 9 );
    }

    /* FF_ASPECT_SQUARE and every unknown code: use the picture geometry */
    return( VOUT_ASPECT_FACTOR * i_width / i_height );
}
/* Check if we have a Vout with good parameters */
static int ffmpeg_CheckVout( vout_thread_t *p_vout,
int i_width,
int i_height,
......@@ -414,13 +431,17 @@ static vout_thread_t *ffmpeg_CreateVout( videodec_thread_t *p_vdec,
{
msg_Dbg( p_vdec->p_fifo, "no vout present, spawning one" );
p_vout = vout_CreateThread( p_vdec->p_fifo,
i_width,
i_height,
i_chroma,
i_aspect );
if( !( p_vout = vout_CreateThread( p_vdec->p_fifo,
i_width,
i_height,
i_chroma,
i_aspect ) ) )
{
return( NULL ); /* everythings have failed */
}
}
/* up to now, all this stuff is used for post-processing */
return( p_vout );
}
......@@ -429,7 +450,6 @@ static vout_thread_t *ffmpeg_CreateVout( videodec_thread_t *p_vdec,
or said to me how write a better thing
FIXME FIXME FIXME
*/
static void ffmpeg_ConvertPictureI410toI420( picture_t *p_pic,
AVPicture *p_avpicture,
videodec_thread_t *p_vdec )
......@@ -527,9 +547,9 @@ static void ffmpeg_ConvertPictureI410toI420( picture_t *p_pic,
}
static void ffmpeg_ConvertPicture( picture_t *p_pic,
AVPicture *p_avpicture,
videodec_thread_t *p_vdec )
static void ffmpeg_GetPicture( picture_t *p_pic,
AVPicture *p_avpicture,
videodec_thread_t *p_vdec )
{
int i_plane;
int i_size;
......@@ -539,10 +559,9 @@ static void ffmpeg_ConvertPicture( picture_t *p_pic,
u8 *p_src;
int i_src_stride;
int i_dst_stride;
if( ffmpeg_PixFmtToChroma( p_vdec->p_context->pix_fmt ) )
{
/* convert ffmpeg picture to our format */
for( i_plane = 0; i_plane < p_pic->i_planes; i_plane++ )
{
p_src = p_avpicture->data[i_plane];
......@@ -558,24 +577,42 @@ static void ffmpeg_ConvertPicture( picture_t *p_pic,
p_dst += i_dst_stride;
}
}
return;
if( ( p_vdec->i_pp_mode )&&
( ( p_vdec->p_vout->render.i_chroma ==
VLC_FOURCC( 'I','4','2','0' ) )||
( p_vdec->p_vout->render.i_chroma ==
VLC_FOURCC( 'Y','V','1','2' ) ) ) )
{
/* Make postproc */
#if LIBAVCODEC_BUILD > 4313
p_vdec->p_pp->pf_postprocess( p_pic,
p_vdec->p_context->quant_store,
p_vdec->p_context->qstride,
p_vdec->i_pp_mode );
#endif
}
}
/* we need to convert to I420 */
switch( p_vdec->p_context->pix_fmt )
else
{
/* we need to convert to I420 */
switch( p_vdec->p_context->pix_fmt )
{
#if LIBAVCODEC_BUILD >= 4615
case( PIX_FMT_YUV410P ):
ffmpeg_ConvertPictureI410toI420( p_pic, p_avpicture, p_vdec );
break;
case( PIX_FMT_YUV410P ):
ffmpeg_ConvertPictureI410toI420( p_pic, p_avpicture, p_vdec );
break;
#endif
default:
p_vdec->p_fifo->b_error =1;
break;
default:
p_vdec->p_fifo->b_error = 1;
break;
}
}
}
/*****************************************************************************
*
* Functions that initialize, decode and end the decoding process
......@@ -585,15 +622,19 @@ static void ffmpeg_ConvertPicture( picture_t *p_pic,
/*****************************************************************************
* InitThread: initialize vdec output thread
*****************************************************************************
* This function is called from RunDecoderoder and performs the second step
* This function is called from decoder_Run and performs the second step
* of the initialization. It returns 0 on success. Note that the thread's
* flag are not modified inside this function.
*
* ffmpeg codec will be open, some memory allocated. But Vout is not yet
* open (done after the first decoded frame)
*****************************************************************************/
static int InitThread( videodec_thread_t *p_vdec )
{
int i_ffmpeg_codec;
int i_tmp;
int i_use_pp;
if( p_vdec->p_fifo->p_demux_data )
{
......@@ -605,7 +646,7 @@ static int InitThread( videodec_thread_t *p_vdec )
msg_Warn( p_vdec->p_fifo, "display informations missing" );
}
/*init ffmpeg */
/* **** init ffmpeg library (libavcodec) ***** */
if( !b_ffmpeginit )
{
avcodec_init();
......@@ -617,6 +658,8 @@ static int InitThread( videodec_thread_t *p_vdec )
{
msg_Dbg( p_vdec->p_fifo, "library ffmpeg already initialized" );
}
/* ***** Search for codec ***** */
ffmpeg_GetFfmpegCodec( p_vdec->p_fifo->i_fourcc,
&i_ffmpeg_codec,
&p_vdec->psz_namecodec );
......@@ -630,24 +673,14 @@ static int InitThread( videodec_thread_t *p_vdec )
return( -1 );
}
/* ***** Fill p_context with init values ***** */
p_vdec->p_context = &p_vdec->context;
memset( p_vdec->p_context, 0, sizeof( AVCodecContext ) );
p_vdec->p_context->width = p_vdec->format.i_width;
p_vdec->p_context->height = p_vdec->format.i_height;
/* XXX
p_vdec->p_context->workaround_bugs
--> seems to be for msmpeg 3 but can't know what is supposed to do
p_vdec->p_context->strict_std_compliance
--> strictly follow mpeg4 standard for decoder or encoder ??
p_vdec->p_context->error_resilience
--> don't make error resilience, because of some ms encoder witch
use some wrong VLC code.
*/
/* ***** Get configuration of ffmpeg plugin ***** */
#if LIBAVCODEC_BUILD >= 4611
i_tmp = config_GetInt( p_vdec->p_fifo, "ffmpeg-workaround-bugs" );
p_vdec->p_context->workaround_bugs = __MAX( __MIN( i_tmp, 99 ), 0 );
......@@ -661,7 +694,105 @@ static int InitThread( videodec_thread_t *p_vdec )
p_vdec->p_context->flags|= CODEC_FLAG_GRAY;
}
#endif
p_vdec->b_hurry_up = config_GetInt(p_vdec->p_fifo, "ffmpeg-hurry-up");
/* ***** Load for post processing ***** */
/* get overridden settings */
p_vdec->i_pp_mode = 0;
if( config_GetInt( p_vdec->p_fifo, "ffmpeg-db-yv" ) )
p_vdec->i_pp_mode |= PP_DEBLOCK_Y_V;
if( config_GetInt( p_vdec->p_fifo, "ffmpeg-db-yh" ) )
p_vdec->i_pp_mode |= PP_DEBLOCK_Y_H;
if( config_GetInt( p_vdec->p_fifo, "ffmpeg-db-cv" ) )
p_vdec->i_pp_mode |= PP_DEBLOCK_C_V;
if( config_GetInt( p_vdec->p_fifo, "ffmpeg-db-ch" ) )
p_vdec->i_pp_mode |= PP_DEBLOCK_C_H;
if( config_GetInt( p_vdec->p_fifo, "ffmpeg-dr-y" ) )
p_vdec->i_pp_mode |= PP_DERING_Y;
if( config_GetInt( p_vdec->p_fifo, "ffmpeg-dr-c" ) )
p_vdec->i_pp_mode |= PP_DERING_C;
if( ( config_GetInt( p_vdec->p_fifo, "ffmpeg-pp-q" ) > 0 )||
( p_vdec->i_pp_mode != 0 ) )
{
i_use_pp = 1;
}
else
{
i_use_pp = 0;
}
if( i_use_pp )
{
switch( i_ffmpeg_codec )
{
#if LIBAVCODEC_BUILD > 4608
case( CODEC_ID_MSMPEG4V1 ):
case( CODEC_ID_MSMPEG4V2 ):
case( CODEC_ID_MSMPEG4V3 ):
#else
case( CODEC_ID_MSMPEG4 ):
#endif
case( CODEC_ID_MPEG4 ):
case( CODEC_ID_H263 ):
// case( CODEC_ID_H263P ): I don't use it up to now
case( CODEC_ID_H263I ):
/* Ok we can make postprocessing :)) */
break;
default:
p_vdec->i_pp_mode = 0;
i_use_pp = 0;
msg_Warn( p_vdec->p_fifo,
"Post processing unsupported for this codec" );
break;
}
}
if( i_use_pp )
{
#if LIBAVCODEC_BUILD > 4613
char *psz_name;
/* first try to get a postprocess module */
p_vdec->p_pp = vlc_object_create( p_vdec->p_fifo,
sizeof( postprocessing_t ) );
p_vdec->p_pp->psz_object_name = "postprocessing";
psz_name = config_GetPsz( p_vdec->p_pp, "ffmpeg-pp" );
p_vdec->p_pp->p_module =
module_Need( p_vdec->p_pp, "postprocessing", psz_name );
FREE( psz_name );
if( !p_vdec->p_pp->p_module )
{
msg_Warn( p_vdec->p_fifo, "no suitable postprocessing module" );
vlc_object_destroy( p_vdec->p_pp );
p_vdec->p_pp = NULL;
p_vdec->i_pp_mode = 0;
}
else
{
/* get mode upon quality */
p_vdec->i_pp_mode |=
p_vdec->p_pp->pf_getmode( config_GetInt( p_vdec->p_fifo,
"ffmpeg-pp-q" ),
config_GetInt( p_vdec->p_fifo,
"ffmpeg-pp-auto" ) );
/* allocate table for postprocess */
p_vdec->p_context->quant_store =
malloc( sizeof( int ) * ( MBR + 1 ) * ( MBC + 1 ) );
p_vdec->p_context->qstride = MBC + 1;
}
#else
msg_Warn( p_vdec->p_fifo,
"post-processing not supported, upgrade ffmpeg" );
p_vdec->i_pp_mode = 0;
#endif
}
/* ***** Open the codec ***** */
if (avcodec_open(p_vdec->p_context, p_vdec->p_codec) < 0)
{
msg_Err( p_vdec->p_fifo, "cannot open codec (%s)",
......@@ -674,7 +805,7 @@ static int InitThread( videodec_thread_t *p_vdec )
p_vdec->psz_namecodec );
}
/* first give init data */
/* ***** init this codec with special data(up to now MPEG4 only) ***** */
if( p_vdec->format.i_data )
{
AVPicture avpicture;
......@@ -693,9 +824,6 @@ static int InitThread( videodec_thread_t *p_vdec )
}
}
/* This will be created after the first decoded frame */
p_vdec->p_vout = NULL;
return( 0 );
}
......@@ -714,8 +842,8 @@ static void DecodeThread( videodec_thread_t *p_vdec )
and send the image to the output */
/* TODO implement it in a better way */
if( ( config_GetInt(p_vdec->p_fifo, "ffmpeg-hurry-up") )&&
/* A good idea could be to decode all I pictures and see for the other */
if( ( p_vdec->b_hurry_up )&&
( p_vdec->i_frame_late > 4 ) )
{
#if LIBAVCODEC_BUILD > 4603
......@@ -735,7 +863,7 @@ static void DecodeThread( videodec_thread_t *p_vdec )
#else
if( p_vdec->i_frame_late < 8 )
{
b_drawpicture = 0; /* not really good but .. */
b_drawpicture = 0; /* not really good but .. UPGRADE FFMPEG !! */
}
else
{
......@@ -809,7 +937,7 @@ static void DecodeThread( videodec_thread_t *p_vdec )
}
}
/* Send decoded frame to vout */
/* Get a new picture */
while( !(p_pic = vout_CreatePicture( p_vdec->p_vout, 0, 0, 0 ) ) )
{
if( p_vdec->p_fifo->b_die || p_vdec->p_fifo->b_error )
......@@ -818,13 +946,13 @@ static void DecodeThread( videodec_thread_t *p_vdec )
}
msleep( VOUT_OUTMEM_SLEEP );
}
ffmpeg_ConvertPicture( p_pic,
&avpicture,
p_vdec );
/* fill p_picture_t from avpicture, do I410->I420 if needed
and do post-processing if requested */
ffmpeg_GetPicture( p_pic, &avpicture, p_vdec );
/* FIXME correct avi and use i_dts */
/* Send decoded frame to vout */
vout_DatePicture( p_vdec->p_vout, p_pic, p_vdec->i_pts);
vout_DisplayPicture( p_vdec->p_vout, p_pic );
......@@ -840,13 +968,22 @@ static void DecodeThread( videodec_thread_t *p_vdec )
*****************************************************************************/
static void EndThread( videodec_thread_t *p_vdec )
{
if( !p_vdec )
{
return;
}
if( p_vdec->p_pp )
{
/* release postprocessing module */
module_Unneed( p_vdec->p_pp, p_vdec->p_pp->p_module );
vlc_object_destroy( p_vdec->p_pp );
p_vdec->p_pp = NULL;
}
if( p_vdec->p_context != NULL)
{
FREE( p_vdec->p_context->quant_store );
avcodec_close( p_vdec->p_context );
msg_Dbg( p_vdec->p_fifo, "ffmpeg codec (%s) stopped",
p_vdec->psz_namecodec );
......@@ -859,10 +996,9 @@ static void EndThread( videodec_thread_t *p_vdec )
vlc_object_attach( p_vdec->p_vout, p_vdec->p_fifo->p_vlc );
}
if( p_vdec->format.p_data != NULL)
{
free( p_vdec->format.p_data );
}
FREE( p_vdec->format.p_data );
FREE( p_vdec->p_buffer );
free( p_vdec );
}
......@@ -2,7 +2,7 @@
* ffmpeg_vdec.h: video decoder using ffmpeg library
*****************************************************************************
* Copyright (C) 2001 VideoLAN
* $Id: ffmpeg.h,v 1.1 2002/08/04 17:23:42 sam Exp $
* $Id: ffmpeg.h,v 1.2 2002/08/04 22:13:05 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
......@@ -41,6 +41,38 @@ typedef struct bitmapinfoheader_s
} bitmapinfoheader_t;
/* Per-decoder state of the ffmpeg video decoder thread. */
typedef struct videodec_thread_s
{
decoder_fifo_t *p_fifo; /* decoder fifo: source of PES packets, also used for messages and config lookups */
bitmapinfoheader_t format; /* stream format; its width/height seed the codec context */
AVCodecContext context, *p_context; /* libavcodec context; p_context is set to &context in InitThread */
AVCodec *p_codec; /* codec handed to avcodec_open() */
vout_thread_t *p_vout; /* video output receiving the decoded pictures */
/* for post processing */
u32 i_pp_mode; /* bitmask of PP_* flags, 0 = no post processing; valid only with I420 and YV12 */
postprocessing_t *p_pp; /* postprocessing module object, NULL when none is loaded */
char *psz_namecodec; /* codec name, used in log messages */
/* for frame skipping algo */
int b_hurry_up; /* value of the "ffmpeg-hurry-up" option, read once in InitThread */
int i_frame_error;
int i_frame_skip;
int i_frame_late; /* how many decoded frames are late */
/* private */
mtime_t i_pts; /* date given to vout_DatePicture() for the decoded picture */
int i_framesize;
u8 *p_framedata; /* data of the current frame; set to p_buffer when the PES data is gathered */
u8 *p_buffer; /* reusable buffer used to gather the data of a PES packet */
int i_buffer_size; /* size in bytes currently allocated for p_buffer */
} videodec_thread_t;
/* MPEG4 video */
#define FOURCC_DIVX VLC_FOURCC('D','I','V','X')
#define FOURCC_divx VLC_FOURCC('d','i','v','x')
......@@ -171,7 +203,7 @@ static int ffmpeg_GetFfmpegCodec( vlc_fourcc_t i_fourcc,
i_codec = CODEC_ID_MPEG4;
psz_name = "MPEG-4";
break;
/* FIXME FOURCC_H263P exist but what fourcc ? */
case FOURCC_H263:
case FOURCC_h263:
case FOURCC_U263:
......@@ -196,25 +228,45 @@ static int ffmpeg_GetFfmpegCodec( vlc_fourcc_t i_fourcc,
return VLC_FALSE;
}
typedef struct videodec_thread_s
/* FIXME FIXME some of them are wrong */
static int i_ffmpeg_PixFmtToChroma[] =
{
decoder_fifo_t *p_fifo;
bitmapinfoheader_t format;
/* PIX_FMT_ANY = -1, PIX_FMT_YUV420P,
PIX_FMT_YUV422, PIX_FMT_RGB24,
PIX_FMT_BGR24, PIX_FMT_YUV422P,
PIX_FMT_YUV444P, PIX_FMT_YUV410P
*/
0, VLC_FOURCC('I','4','2','0'),
VLC_FOURCC('I','4','2','0'), VLC_FOURCC('R','V','2','4'),
0, VLC_FOURCC('Y','4','2','2'),
VLC_FOURCC('I','4','4','4'), 0
};
AVCodecContext context, *p_context;
AVCodec *p_codec;
vout_thread_t *p_vout;
/*****************************************************************************
 * ffmpeg_PixFmtToChroma: map a libavcodec pixel format to a VLC chroma
 *****************************************************************************
 * i_ffmpegchroma is a PIX_FMT_* value. The lookup table starts at
 * PIX_FMT_ANY ( -1 ), hence the +1 offset when indexing.
 * Returns the table entry, or 0 when the value falls outside the table in
 * either direction (values below -1 used to read before the table start).
 *****************************************************************************/
static inline u32 ffmpeg_PixFmtToChroma( int i_ffmpegchroma )
{
    /* Derive the bound from the table itself instead of a magic number */
    const int i_entries = sizeof( i_ffmpeg_PixFmtToChroma ) /
                          sizeof( i_ffmpeg_PixFmtToChroma[0] );

    if( i_ffmpegchroma < -1 || i_ffmpegchroma >= i_entries - 1 )
    {
        return( 0 );
    }

    return( i_ffmpeg_PixFmtToChroma[i_ffmpegchroma + 1] );
}
char *psz_namecodec;
/* private */
mtime_t i_pts;
int i_framesize;
byte_t *p_framedata;
/*****************************************************************************
 * ffmpeg_FfAspect: turn a libavcodec aspect code into a VLC aspect value
 *****************************************************************************
 * The 4:3 and 16:9 codes (625 and 525 line variants alike) map to a fixed
 * ratio scaled by VOUT_ASPECT_FACTOR. FF_ASPECT_SQUARE and any other code
 * fall back to the ratio i_width / i_height, scaled the same way.
 *****************************************************************************/
static inline int ffmpeg_FfAspect( int i_width, int i_height, int i_ffaspect )
{
    if( i_ffaspect == FF_ASPECT_4_3_625 || i_ffaspect == FF_ASPECT_4_3_525 )
    {
        return( VOUT_ASPECT_FACTOR * 4 / 3);
    }

    if( i_ffaspect == FF_ASPECT_16_9_625 || i_ffaspect == FF_ASPECT_16_9_525 )
    {
        return( VOUT_ASPECT_FACTOR * 16 / 9 );
    }

    /* FF_ASPECT_SQUARE and every unknown code: use the picture geometry */
    return( VOUT_ASPECT_FACTOR * i_width / i_height );
}
int i_frame_error;
int i_frame_skip;
int i_frame_late; /* how may frame decoded are in late */
} videodec_thread_t;
postprocessing_c_SOURCES = postprocessing.c postprocessing_c.c
postprocessing_mmx_SOURCES = postprocessing.c postprocessing_mmx.c
postprocessing_mmxext_SOURCES = postprocessing.c postprocessing_mmxext.c
/*****************************************************************************
* postprocessing.c
*****************************************************************************
* Copyright (C) 1999-2001 VideoLAN
* $Id: postprocessing.c,v 1.1 2002/08/04 22:13:06 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
#include <stdlib.h>
#include <string.h>
#include <vlc/vlc.h>
#include <vlc/vout.h>
#include "postprocessing.h"
#include "postprocessing_common.h"
/* Module initializer: installs the two entry points below in the object */
static int Open ( vlc_object_t *p_this );
/* Builds a PP_* mode bitmask from a quality level and the auto-level flag */
static u32 pp_getmode( int i_quality, int b_autolevel );
/* Filters a picture in place; arguments are the picture, the QP table,
 * the QP table stride and the PP_* mode bitmask */
static int pp_postprocess( picture_t *,
QT_STORE_T *, unsigned int,
unsigned int i_mode );
/*****************************************************************************
* Module descriptor
*****************************************************************************/
/* This file is compiled once per variant (C, MMX, MMXEXT); the
 * MODULE_NAME_IS_* macro selects the description, the capability value,
 * the CPU requirement and the shortcut names of the variant being built. */
vlc_module_begin();
#ifdef MODULE_NAME_IS_postprocessing_c
/* Plain C variant: no CPU requirement */
set_description( _("C Post Processing module") );
set_capability( "postprocessing", 50 );
add_shortcut( "c" );
#elif defined( MODULE_NAME_IS_postprocessing_mmx )
/* MMX variant */
set_description( _("MMX Post Processing module") );
set_capability( "postprocessing", 100 );
add_requirement( MMX );
add_shortcut( "mmx" );
#elif defined( MODULE_NAME_IS_postprocessing_mmxext )
/* MMXEXT variant, reachable under both "mmxext" and "mmx2" */
/* NOTE(review): the capability value (50 / 100 / 150) presumably ranks the
 * variants so that the MMXEXT one is preferred - confirm against module_Need */
set_description( _("MMXEXT Post Processing module") );
set_capability( "postprocessing", 150 );
add_requirement( MMXEXT );
add_shortcut( "mmxext" );
add_shortcut( "mmx2" );
#endif
/* Open is the activation callback; no deactivation callback is registered */
set_callbacks( Open, NULL );
vlc_module_end();
/*****************************************************************************
 * Open: module initializer
 *****************************************************************************
 * Publishes this module's two entry points (mode selection and picture
 * filtering) in the postprocessing object handed in by the caller.
 * Always returns VLC_SUCCESS.
 *****************************************************************************/
static int Open ( vlc_object_t *p_this )
{
    postprocessing_t *p_post = (postprocessing_t *)p_this;

    p_post->pf_postprocess = pp_postprocess;
    p_post->pf_getmode     = pp_getmode;

    return VLC_SUCCESS;
}
/*****************************************************************************
 * pp_getmode: translate a quality level into a PP_* mode bitmask
 *****************************************************************************
 * i_quality is clamped to 0..6. Level 0 enables nothing; each further level
 * keeps the filters of the previous one and adds one more, in this order:
 * Y horizontal deblock, Y vertical deblock, C horizontal deblock,
 * C vertical deblock, Y dering, C dering.
 * A non-zero b_autolevel additionally sets PP_AUTOLEVEL.
 *****************************************************************************/
static u32 pp_getmode( int i_quality, int b_autolevel )
{
    /* Filter added by each successive quality level, level 1 first */
    static const u32 pi_level_flag[6] =
    {
        PP_DEBLOCK_Y_H, PP_DEBLOCK_Y_V,
        PP_DEBLOCK_C_H, PP_DEBLOCK_C_V,
        PP_DERING_Y,    PP_DERING_C
    };
    u32 i_flags = 0;
    int i_level;

    if( i_quality > 6 )
    {
        i_quality = 6;
    }

    /* A quality of 0 or less never enters the loop and yields no filter */
    for( i_level = 0; i_level < i_quality; i_level++ )
    {
        i_flags |= pi_level_flag[i_level];
    }

    if( b_autolevel )
    {
        i_flags |= PP_AUTOLEVEL;
    }

    return( i_flags );
}
/*****************************************************************************
 * pp_postprocess : make post-filter as defined in MPEG4-ISO
 *****************************************************************************
 * Filters p_pic in place according to the PP_* flags set in i_mode.
 *  p_pic       : picture to filter, its heap chroma must be I420 or YV12
 *  p_QP_store  : quantizer table handed to every filter, must not be NULL
 *  i_QP_stride : stride of that table, at least ( picture width >> 4 )
 *  i_mode      : bitmask of PP_DEBLOCK_* and PP_DERING_* flags
 * Returns PP_ERR_OK, PP_ERR_INVALID_PICTURE (unsupported chroma) or
 * PP_ERR_INVALID_QP (missing table or stride too small).
 * Order of the passes: vertical then horizontal deblocking on luminance,
 * the same on both chrominance planes, then deringing.
 *****************************************************************************/
static int pp_postprocess( picture_t *p_pic,
QT_STORE_T *p_QP_store, unsigned int i_QP_stride,
unsigned int i_mode )
{
/* Some sanity checks */
/* A check on width/height being multiples of 16 is disabled here:
 *   if( ( p_pic->i_height&0x0f )||( p_pic->i_width&0x0f )|| */
if( ( p_pic->p_heap->i_chroma != VLC_FOURCC( 'I', '4', '2', '0' ) )&&
( p_pic->p_heap->i_chroma != VLC_FOURCC( 'Y', 'V', '1', '2' ) ) )
{
return( PP_ERR_INVALID_PICTURE );
}
/* The shift binds tighter than '<': the stride is compared to width / 16 */
if( ( !p_QP_store )||( i_QP_stride < p_pic->p_heap->i_width >> 4 ) )
{
return( PP_ERR_INVALID_QP );
}
/* First do vertical deblocking and then horizontal */
/* Luminance */
/* The last argument of the deblocking functions is b_chroma: 0 for luma */
if( i_mode&PP_DEBLOCK_Y_V )
{
E_( pp_deblock_V )( p_pic->Y_PIXELS,
p_pic->p_heap->i_width, p_pic->p_heap->i_height, p_pic->Y_PITCH,
p_QP_store, i_QP_stride,
0 );
}
if( i_mode&PP_DEBLOCK_Y_H )
{
E_( pp_deblock_H )( p_pic->Y_PIXELS,
p_pic->p_heap->i_width, p_pic->p_heap->i_height, p_pic->Y_PITCH,
p_QP_store, i_QP_stride,
0 );
}
/* Chrominance */
/* Both chroma planes are passed with half the width and height, b_chroma = 1 */
if( i_mode&PP_DEBLOCK_C_V )
{
E_( pp_deblock_V )( p_pic->U_PIXELS,
p_pic->p_heap->i_width >> 1, p_pic->p_heap->i_height >> 1,
p_pic->U_PITCH,
p_QP_store, i_QP_stride,
1 );
E_( pp_deblock_V )( p_pic->V_PIXELS,
p_pic->p_heap->i_width >> 1, p_pic->p_heap->i_height >> 1,
p_pic->V_PITCH,
p_QP_store, i_QP_stride,
1 );
}
if( i_mode&PP_DEBLOCK_C_H )
{
E_( pp_deblock_H )( p_pic->U_PIXELS,
p_pic->p_heap->i_width >> 1, p_pic->p_heap->i_height >> 1,
p_pic->U_PITCH,
p_QP_store, i_QP_stride,
1 );
E_( pp_deblock_H )( p_pic->V_PIXELS,
p_pic->p_heap->i_width >> 1, p_pic->p_heap->i_height >> 1,
p_pic->V_PITCH,
p_QP_store, i_QP_stride,
1 );
}
/* After deblocking do dering */
/* TODO check for min size */
if( i_mode&PP_DERING_Y )
{
E_( pp_dering_Y )( p_pic->Y_PIXELS,
p_pic->p_heap->i_width, p_pic->p_heap->i_height,
p_pic->Y_PITCH,
p_QP_store, i_QP_stride );
}
if( i_mode&PP_DERING_C )
{
E_( pp_dering_C )( p_pic->U_PIXELS,
p_pic->p_heap->i_width >> 1, p_pic->p_heap->i_height >> 1,
p_pic->U_PITCH,
p_QP_store, i_QP_stride );
E_( pp_dering_C )( p_pic->V_PIXELS,
p_pic->p_heap->i_width >> 1, p_pic->p_heap->i_height >> 1,
p_pic->V_PITCH,
p_QP_store, i_QP_stride );
}
#if defined( MODULE_NAME_IS_postprocessing_mmx )||defined( MODULE_NAME_IS_postprocessing_mmxext )
/* We have used MMX so return to safe FPU state */
/* Only reached on the success path: the early error returns above happen
 * before any filter has run */
__asm__ __volatile__ ( "emms" );
#endif
return( PP_ERR_OK );
}
/*****************************************************************************
* postprocessing.h
*****************************************************************************
* Copyright (C) 2001 VideoLAN
* $Id: postprocessing.h,v 1.1 2002/08/04 22:13:06 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/* Element type of the quantizer (QP) table handed to the filters */
#define QT_STORE_T int
/* Post processing filters; OR them together to build i_mode */
#define PP_DEBLOCK_Y_H 0x00000001 /* luminance, horizontal deblocking */
#define PP_DEBLOCK_Y_V 0x00000002 /* luminance, vertical deblocking */
#define PP_DEBLOCK_C_H 0x00000004 /* chrominance, horizontal deblocking */
#define PP_DEBLOCK_C_V 0x00000008 /* chrominance, vertical deblocking */
#define PP_DERING_Y 0x00000010 /* luminance deringing */
#define PP_DERING_C 0x00000020 /* chrominance deringing */
#define PP_AUTOLEVEL 0x80000000 /* set by pf_getmode when auto-level is requested */
/* Error codes returned by pf_postprocess, not really used */
#define PP_ERR_OK 0 /* no problem */
#define PP_ERR_INVALID_PICTURE 1 /* wrong picture size or chroma */
#define PP_ERR_INVALID_QP 2 /* need valid QP to make the postprocess */
#define PP_ERR_UNKNOWN 255
/* Object through which a decoder reaches a postprocessing module. */
typedef struct postprocessing_s
{
VLC_COMMON_MEMBERS
/* Module providing the two entry points below */
module_t * p_module;
/* Returns the PP_* bitmask matching a quality level and the auto-level flag */
u32 (*pf_getmode)( int i_quality, int b_autolevel );
/* Filters p_pic in place according to i_mode; returns a PP_ERR_* code */
int (*pf_postprocess)( picture_t *p_pic,
QT_STORE_T *p_QP_store, unsigned int i_QP_stride,
unsigned int i_mode );
} postprocessing_t;
/*****************************************************************************
* postprocessing_c.c: Post Processing plugin in C
*****************************************************************************
* Copyright (C) 2001 VideoLAN
* $Id: postprocessing_c.c,v 1.1 2002/08/04 22:13:06 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include <vlc/vlc.h> /* only use u8, u32 .... */
#include "postprocessing.h"
#include "postprocessing_common.h"
/*****************************************************************************
*
* Internals functions common to pp_deblock_V and pp_deblock_H
*
*****************************************************************************/
/****************************************************************************
 * pp_deblock_isDC_mode : Check if we will use DC mode or Default mode
 ****************************************************************************
 * Counts, among the 9 pairs of neighbours in p_v[0..9], those whose
 * difference is small, and returns 1 when at least PP_THR2 pairs qualify,
 * 0 otherwise.
 *
 * Called for each pixel on a block boundary when deblocking, so it needs
 * to be fast.
 ****************************************************************************/
static inline int pp_deblock_isDC_mode( u8 *p_v )
{
    int i_flat = 0;
    int i_pair;

    for( i_pair = 0; i_pair < 9; i_pair++ )
    {
        /* Single-compare form of | v[i] - v[i+1] | <= PP_THR1: the
         * difference is biased by PP_THR1 and masked to 16 bits, so a
         * result below zero turns into a large value and fails the test
         * (assumes PP_2xTHR1 is 2 * PP_THR1 - TODO confirm in the
         * common header) */
        if( ( ( p_v[i_pair] - p_v[i_pair + 1] + PP_THR1 ) & 0xffff )
                <= PP_2xTHR1 )
        {
            i_flat++;
        }
    }

    return( i_flat >= PP_THR2 );
}
/****************************************************************************
 * pp_deblock_isMinMaxOk : check the dynamic range of p_v[1..8]
 ****************************************************************************
 * Returns 1 when ( max - min ) of the 8 pixels p_v[1] to p_v[8] is strictly
 * below 2 * i_QP, 0 otherwise. p_v[0] and p_v[9] are not looked at.
 ****************************************************************************/
static inline int pp_deblock_isMinMaxOk( u8 *p_v, int i_QP )
{
    int i_lo = p_v[1];
    int i_hi = p_v[1];
    int i;

    for( i = 2; i <= 8; i++ )
    {
        const int i_pix = p_v[i];

        if( i_pix > i_hi )
        {
            i_hi = i_pix;
        }
        if( i_pix < i_lo )
        {
            i_lo = i_pix;
        }
    }

    return( ( i_hi - i_lo ) < 2 * i_QP );
}
/****************************************************************************
 * pp_deblock_DefaultMode : default-mode deblocking of one pixel line
 ****************************************************************************
 * i_v holds 10 pixels across a block boundary; only i_v[4] and i_v[5] can
 * be modified. The correction is
 *   d = CLIP( 5 * ( a3x0' - a3x0 ) // 8, 0, ( v4 - v5 ) / 2 )
 * and is applied only when 0 < | a3x0 | < 8 * i_QP (a3x0 is kept scaled
 * by 8 here). i_stride is not used.
 ****************************************************************************/
static inline void pp_deblock_DefaultMode( u8 i_v[10], int i_stride,
                                           int i_QP )
{
    int i_a0, i_a1, i_a2, i_amin;
    int i_corr, i_limit;
    int b_a0_negative;

    /* Boundary activity; keep its sign aside and work on the magnitude */
    i_a0 = 2 * ( i_v[3] - i_v[6] ) + 5 *( i_v[5] - i_v[4] );
    b_a0_negative = ( i_a0 < 0 );
    if( b_a0_negative )
    {
        i_a0 = -i_a0;
    }

    /* Nothing to do on a flat boundary or above the QP threshold */
    if( ( i_a0 == 0 )||( i_a0 >= 8 * i_QP ) )
    {
        return;
    }

    /* Magnitude of the activity on each side of the boundary */
    i_a1 = 2 * ( i_v[1] - i_v[4] ) + 5 * ( i_v[3] - i_v[2] );
    if( i_a1 < 0 )
    {
        i_a1 = -i_a1;
    }
    i_a2 = 2 * ( i_v[5] - i_v[8] ) + 5 * ( i_v[7] - i_v[6] );
    if( i_a2 < 0 )
    {
        i_a2 = -i_a2;
    }

    i_amin = PP_MIN3( i_a0, i_a1, i_a2 );

    i_corr  = 5 *( i_a0 - i_amin ) / 8;    /* never negative */
    i_limit = ( i_v[4] - i_v[5] ) / 2;

    /* The true correction has the opposite sign of a3x0 and is clipped
     * between 0 and i_limit; when the two signs disagree it clips to 0 */
    if( i_limit < 0 )
    {
        if( b_a0_negative )
        {
            return;
        }
        i_corr = -i_corr;
        if( i_corr < i_limit )
        {
            i_corr = i_limit;
        }
    }
    else
    {
        if( !b_a0_negative )
        {
            return;
        }
        if( i_corr > i_limit )
        {
            i_corr = i_limit;
        }
    }

    i_v[4] -= i_corr;
    i_v[5] += i_corr;
}
/****************************************************************************
 * pp_deblock_DCMode : DC-mode deblocking of one pixel line
 ****************************************************************************
 * p_v points to 10 pixels across a block boundary. p_v[1] to p_v[8] are
 * replaced by a 9-tap low-pass ( 1 1 2 2 4 2 2 1 1 ) / 16 of the original
 * values; beyond each end the line is extended with a padding value, which
 * is the outer pixel ( p_v[0] or p_v[9] ) when it differs from its
 * neighbour by less than i_QP, and the neighbour itself otherwise.
 * p_v[0] and p_v[9] are left untouched.
 ****************************************************************************/
static inline void pp_deblock_DCMode( u8 *p_v, /* = int i_v[10] */
                                      int i_QP )
{
    static const int pi_tap[9] = { 1, 1, 2, 2, 4, 2, 2, 1, 1 };
    int pi_ext[16];     /* 4 x left padding, v1..v8, 4 x right padding */
    int i_left, i_right;
    int i_out, i_tap, i_sum;
    int k;

    i_left  = PP_ABS( p_v[1] - p_v[0] ) < i_QP ? p_v[0] : p_v[1];
    i_right = PP_ABS( p_v[8] - p_v[9] ) < i_QP ? p_v[9] : p_v[8];

    /* Work on a padded copy: the 8 pixels are overwritten as we go */
    for( k = 0; k < 4; k++ )
    {
        pi_ext[k]      = i_left;
        pi_ext[12 + k] = i_right;
    }
    for( k = 0; k < 8; k++ )
    {
        pi_ext[4 + k] = p_v[1 + k];
    }

    /* Output i_out is centred on pi_ext[4 + i_out] */
    for( i_out = 0; i_out < 8; i_out++ )
    {
        i_sum = 0;
        for( i_tap = 0; i_tap < 9; i_tap++ )
        {
            i_sum += pi_tap[i_tap] * pi_ext[i_out + i_tap];
        }
        p_v[1 + i_out] = i_sum >> 4;
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* ---------- filter Vertical lines so follow horizontal edges -------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*****************************************************************************
 * pp_deblock_V: deblock the horizontal block edges of a plane
 *****************************************************************************
 * For every edge row ( 8, 16, ... ) and every column, the 10 pixels
 * running from 5 rows above the edge to 4 rows below it are copied into a
 * contiguous buffer, filtered in DC or default mode, and pixels 1..8 are
 * written back to the plane.
 *****************************************************************************/
void E_( pp_deblock_V )( u8 *p_plane,
                         int i_width, int i_height, int i_stride,
                         QT_STORE_T *p_QP_store, int i_QP_stride,
                         int b_chroma )
{
    /* NOTE(review): a shift of 5 for chroma means one QP entry per 32
     * chroma pixels - confirm against the layout of p_QP_store. */
    const int i_QP_shift = b_chroma ? 5 : 4;
    u8  pi_line[10];
    u8 *p_col;
    int i_edge, i_x, i_pix;
    int i_QP;

    for( i_edge = 8; i_edge < i_height - 4; i_edge += 8 )
    {
        for( i_x = 0; i_x < i_width; i_x++ )
        {
            /* Top of the 10 pixel column that crosses this edge */
            p_col = p_plane + ( i_edge - 5 ) * i_stride + i_x;

            for( i_pix = 0; i_pix < 10; i_pix++ )
            {
                pi_line[i_pix] = p_col[i_pix * i_stride];
            }

            /* QP is taken at the edge row, i.e. for pixel 5 of the line */
            i_QP = p_QP_store[( i_edge >> i_QP_shift ) * i_QP_stride
                              + ( i_x >> i_QP_shift )];

            if( !pp_deblock_isDC_mode( pi_line ) )
            {
                pp_deblock_DefaultMode( pi_line, i_stride, i_QP );
            }
            else if( pp_deblock_isMinMaxOk( pi_line, i_QP ) )
            {
                pp_deblock_DCMode( pi_line, i_QP );
            }

            /* Only pixels 1..8 can have been modified */
            for( i_pix = 1; i_pix < 9; i_pix++ )
            {
                p_col[i_pix * i_stride] = pi_line[i_pix];
            }
        }
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* --------- filter Horizontal lines so follow vertical edges -------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*****************************************************************************
 * pp_deblock_H: deblock the vertical block edges of a plane
 *****************************************************************************
 * On every row, each edge column ( 8, 16, ... ) is filtered in place on the
 * 10 pixels that start 5 pixels before the edge.
 *****************************************************************************/
void E_( pp_deblock_H )( u8 *p_plane,
                         int i_width, int i_height, int i_stride,
                         QT_STORE_T *p_QP_store, int i_QP_stride,
                         int b_chroma )
{
    /* NOTE(review): a shift of 5 for chroma means one QP entry per 32
     * chroma pixels - confirm against the layout of p_QP_store. */
    const int i_QP_shift = b_chroma ? 5 : 4;
    u8 *p_line;
    int i_y, i_edge;
    int i_QP;

    for( i_y = 0; i_y < i_height; i_y++ )
    {
        for( i_edge = 8; i_edge < i_width - 4; i_edge += 8 )
        {
            /* 10 pixels starting 5 pixels before the block boundary */
            p_line = p_plane + i_y * i_stride + ( i_edge - 5 );

            /* QP is taken at the edge column, i.e. for pixel 5 of the line */
            i_QP = p_QP_store[( i_y >> i_QP_shift ) * i_QP_stride
                              + ( i_edge >> i_QP_shift )];

            if( !pp_deblock_isDC_mode( p_line ) )
            {
                pp_deblock_DefaultMode( p_line, i_stride, i_QP );
            }
            else if( pp_deblock_isMinMaxOk( p_line, i_QP ) )
            {
                pp_deblock_DCMode( p_line, i_QP );
            }
        }
    }
}
/*****************************************************************************
*
* Internal functions shared by pp_dering_Y and pp_dering_C
*
*****************************************************************************/
/*****************************************************************************
 * pp_dering_MinMax: smallest and largest pixel of an 8x8 block
 *****************************************************************************
 * p_block points to the top left pixel of the block and i_stride is the
 * distance between two rows. Results are stored in *pi_min and *pi_max.
 *****************************************************************************/
static inline void pp_dering_MinMax( u8 *p_block, int i_stride,
                                     int *pi_min, int *pi_max )
{
    int i_row, i_col;
    int i_lowest  = 255;
    int i_highest = 0;

    for( i_row = 0; i_row < 8; i_row++ )
    {
        for( i_col = 0; i_col < 8; i_col++ )
        {
            const int i_pix = p_block[i_col];

            if( i_pix < i_lowest )  i_lowest  = i_pix;
            if( i_pix > i_highest ) i_highest = i_pix;
        }
        p_block += i_stride;
    }

    *pi_min = i_lowest;
    *pi_max = i_highest;
}
/*****************************************************************************
 * pp_dering_BinIndex: threshold a 10x10 area into per-row bit masks
 *****************************************************************************
 * For each of the 10 rows starting at p_block, bit x of a temporary mask is
 * set when pixel x is above i_thr; the inverted mask is placed 16 bits
 * higher. The word stored in p_bin[row] keeps a bit only where it and both
 * horizontal neighbours are set, so the low half marks runs of three
 * pixels above the threshold and the high half runs of three pixels not
 * above it.
 *****************************************************************************/
static inline void pp_dering_BinIndex( u8 *p_block, int i_stride, int i_thr,
                                       u32 *p_bin )
{
    int i_row, i_col;
    u32 i_above, i_mask;

    for( i_row = 0; i_row < 10; i_row++, p_block += i_stride )
    {
        /* Built from column 9 down to 0 so that column x lands on bit x */
        i_above = 0;
        for( i_col = 9; i_col >= 0; i_col-- )
        {
            i_above <<= 1;
            if( p_block[i_col] > i_thr )
            {
                i_above |= 1;
            }
        }

        i_mask = i_above | ( ( ~i_above ) << 16 );
        p_bin[i_row] = i_mask & ( i_mask >> 1 ) & ( i_mask << 1 );
    }
}
/*****************************************************************************
 * pp_dering_Filter: adaptive 3x3 smoothing of an 8x8 block
 *****************************************************************************
 * p_bin holds the 10 row masks of the 10x10 area around the block. A pixel
 * is smoothed when the masks of its row and of the rows above and below
 * all flag it. The smoothed value is
 *      1 2 1
 *      2 4 2   ( + 8 ) >> 4
 *      1 2 1
 * limited to the original pixel +/- ( i_QP >> 1 ).
 * All results are computed from the unmodified block and written back at
 * the end.
 *****************************************************************************/
static inline void pp_dering_Filter( u8 *p_block, int i_stride,
                                     u32 *p_bin,
                                     int i_QP )
{
    const int i_limit = i_QP >> 1;
    int  pi_out[8][8];
    u8  *p_cur, *p_up, *p_down;
    u32  i_flags;
    int  i_row, i_col;
    int  i_avg, i_diff;

    for( i_row = 0; i_row < 8; i_row++ )
    {
        p_cur  = p_block + i_row * i_stride;
        p_up   = p_cur - i_stride;
        p_down = p_cur + i_stride;

        i_flags  = p_bin[i_row] & p_bin[i_row + 1] & p_bin[i_row + 2];
        i_flags |= i_flags >> 16;   /* flagged in either half of the mask */
        i_flags >>= 1;              /* masks have 10 columns, block uses 1-8 */

        for( i_col = 0; i_col < 8; i_col++, i_flags >>= 1 )
        {
            if( !( i_flags & 1 ) )
            {
                pi_out[i_row][i_col] = p_cur[i_col];
                continue;
            }

            i_avg =   p_up[i_col - 1]           + ( p_up[i_col]   << 1 )
                    + p_up[i_col + 1]
                    + ( p_cur[i_col - 1] << 1 ) + ( p_cur[i_col]  << 2 )
                    + ( p_cur[i_col + 1] << 1 )
                    + p_down[i_col - 1]         + ( p_down[i_col] << 1 )
                    + p_down[i_col + 1];
            i_avg = ( i_avg + 8 ) >> 4;

            /* Limit the change to +/- i_limit */
            i_diff = i_avg - p_cur[i_col];
            if( i_diff > i_limit )
            {
                pi_out[i_row][i_col] = p_cur[i_col] + i_limit;
            }
            else if( i_diff < -i_limit )
            {
                pi_out[i_row][i_col] = p_cur[i_col] - i_limit;
            }
            else
            {
                pi_out[i_row][i_col] = i_avg;
            }
        }
    }

    for( i_row = 0; i_row < 8; i_row++ )
    {
        p_cur = p_block + i_row * i_stride;
        for( i_col = 0; i_col < 8; i_col++ )
        {
            p_cur[i_col] = pi_out[i_row][i_col];
        }
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* ----------------- Dering filter on Y and C blocks ----------------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*****************************************************************************
 * pp_dering_Y: dering filter for a luminance plane
 *****************************************************************************
 * The plane is processed by groups of four 8x8 blocks, starting at pixel
 * (8,8):
 *      +-+-+
 *      |0|1|
 *      +-+-+
 *      |2|3|
 *      +-+-+
 * For each group:
 *  1. min/max of every block give its range and threshold; thresholds are
 *     then rearranged using the largest range of the group,
 *  2. the 10x10 area around every block is thresholded into bit masks,
 *  3. every block is smoothed with the quantizer read from p_QP_store.
 * All masks are computed before any block is filtered, since the 10x10
 * areas overlap the neighbouring blocks.
 *****************************************************************************/
void E_( pp_dering_Y )( u8 *p_plane,
                        int i_width, int i_height, int i_stride,
                        QT_STORE_T *p_QP_store, int i_QP_stride )
{
    int x, y, k;
    int i_max[4], i_min[4], i_range[4];
    int i_thr[4];
    int i_max_range, i_kmax;
    u32 i_bin[4][10];
    u8  *p_block[4];
    QT_STORE_T *p_QP;

    for( y = 8; y < i_height-8; y += 16 )
    {
        p_block[0] = p_plane + y * i_stride + 8;
        p_block[1] = p_block[0] + 8;
        p_block[2] = p_block[0] + ( i_stride << 3 );
        p_block[3] = p_block[2] + 8;

        for( x = 8; x < i_width-8; x += 16 )
        {
            /* 1: Calculate threshold */
            /* The per-block arrays have 4 entries: k must stay in 0..3 */
            i_max_range = 0;
            i_kmax = 0;
            for( k = 0; k < 4; k++ )
            {
                pp_dering_MinMax( p_block[k], i_stride,
                                  &i_min[k], &i_max[k] );

                i_range[k] = i_max[k] - i_min[k];
                i_thr[k]   = ( i_max[k] + i_min[k] + 1 )/2;

                /* Track the block with the largest range (not the one
                 * with the brightest pixel) */
                if( i_max_range < i_range[k] )
                {
                    i_max_range = i_range[k];
                    i_kmax = k;
                }
            }

            /* Now rearrange thr: flat blocks are not filtered, and when
             * the group contains a strong edge, weakly textured blocks
             * take the threshold of the block holding that edge */
            for( k = 0; k < 4; k++ )
            {
                if( i_range[k] < 16 )
                {
                    i_thr[k] = 0;
                }
                else if( i_max_range > 64 && i_range[k] < 32 )
                {
                    i_thr[k] = i_thr[i_kmax];
                }
            }

            /* 2: Index acquisition 10x10 ! so " -i_stride - 1" */
            for( k = 0; k < 4; k++ )
            {
                pp_dering_BinIndex( p_block[k] - i_stride - 1, i_stride,
                                    i_thr[k], i_bin[k] );
            }

            /* 3: adaptive smoothing */
            /* since we begin at (8,8) QP can be different for each block */
            p_QP = &( p_QP_store[( y >> 4) * i_QP_stride + (x >> 4)] );

            pp_dering_Filter( p_block[0], i_stride,
                              i_bin[0], p_QP[0] );
            pp_dering_Filter( p_block[1], i_stride,
                              i_bin[1], p_QP[1] );
            pp_dering_Filter( p_block[2], i_stride,
                              i_bin[2], p_QP[i_QP_stride] );
            pp_dering_Filter( p_block[3], i_stride,
                              i_bin[3], p_QP[i_QP_stride+1] );

            /* A group is 16 pixels wide: move on by 16, like x, so that the
             * block pointers stay in step with the QP lookup above */
            for( k = 0; k < 4; k++ )
            {
                p_block[k] += 16;
            }
        }
    }
}
/*****************************************************************************
 * pp_dering_C: dering filter for a chrominance plane
 *****************************************************************************
 * Every 8x8 block from (8,8) on is handled on its own: its threshold is
 * the rounded mean of its min and max, the surrounding 10x10 area is
 * thresholded, then the block is smoothed.
 *****************************************************************************/
void E_( pp_dering_C )( u8 *p_plane,
                        int i_width, int i_height, int i_stride,
                        QT_STORE_T *p_QP_store, int i_QP_stride )
{
    u32 pi_bin[10];
    u8 *p_cur;
    int i_bx, i_by;
    int i_lowest, i_highest;

    for( i_by = 8; i_by < i_height - 8; i_by += 8 )
    {
        for( i_bx = 8; i_bx < i_width - 8; i_bx += 8 )
        {
            p_cur = p_plane + i_by * i_stride + i_bx;

            /* 1: threshold from the min/max of the block */
            pp_dering_MinMax( p_cur, i_stride, &i_lowest, &i_highest );

            /* 2: masks of the 10x10 area that contains the 8x8 block */
            pp_dering_BinIndex( p_cur - i_stride - 1, i_stride,
                                ( i_highest + i_lowest + 1 ) / 2,
                                pi_bin );

            /* 3: adaptive smoothing */
            /* NOTE(review): >> 5 means one QP entry per 32 chroma pixels -
             * confirm against the layout of p_QP_store. */
            pp_dering_Filter( p_cur, i_stride,
                              pi_bin,
                              p_QP_store[( i_by >> 5 ) * i_QP_stride
                                         + ( i_bx >> 5 )] );
        }
    }
}
/*****************************************************************************
* postprocessing_common.h
*****************************************************************************
* Copyright (C) 2001 VideoLAN
* $Id: postprocessing_common.h,v 1.1 2002/08/04 22:13:06 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
//#define PP_USE_3DNOW /* Nothing done yet */
//#define PP_USE_MMX /* when only MMX is supported */
//#define PP_USE_MMXEXT /* when MMXEXT is also supported, imply MMX */
/* Thresholds for deblocking, values are the ones given by ISO.
 * They are ULL because the MMX code builds 64 bit constants from them. */
#define PP_THR1 2ULL /* threshold for deblocking */
#define PP_2xTHR1 ( 2 * PP_THR1 )/* internal usage */
#define PP_THR2 6ULL

/* Some useful macros.
 * Every argument is parenthesised wherever it is used, so that expressions
 * such as PP_MAX( a & b, c ) group as expected.
 * Arguments are evaluated more than once: do not pass expressions with
 * side effects. */
#define PP_MAX( a, b ) ( (a) > (b) ? (a) : (b) )
#define PP_MIN( a, b ) ( (a) < (b) ? (a) : (b) )
#define PP_ABS( x ) ( ( (x) < 0 ) ? (-(x)) : (x) )
#define PP_SGN( x ) ( ( (x) < 0 ) ? -1 : 1 )
#define PP_MIN3( a, b, c ) ( PP_MIN( (a), PP_MIN( (b), (c) ) ) )
#define PP_CLIP( x, a, b ) ( PP_MAX( (a), PP_MIN( (x), (b) ) ) )
/* Entry points of the post processing filters.
 * E_() is defined outside this header; presumably it derives a distinct
 * symbol name for each implementation (C, MMX, MMXEXT) - TODO confirm.
 * The parameter lists are left empty, so the compiler does not check the
 * arguments of calls made through these declarations. */
void E_( pp_deblock_V )();
void E_( pp_deblock_H )();
void E_( pp_dering_Y )();
void E_( pp_dering_C )();
/*****************************************************************************
* postprocessing_mmx.c: Post Processing library in MMX
*****************************************************************************
* Copyright (C) 2001 VideoLAN
* $Id: postprocessing_mmx.c,v 1.1 2002/08/04 22:13:06 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include <vlc/vlc.h> /* only use u8, u32 .... */
#include "postprocessing.h"
#include "postprocessing_common.h"
/*****************************************************************************
*
* Internal functions shared by pp_deblock_V and pp_deblock_H
*
*****************************************************************************/
/*****************************************************************************
* MMX stuff
*****************************************************************************/
/* 64 bit constants referenced by name from the inline assembly of this file.
 *
 * XXX PP_THR1 needs to be defined as ULL.
 *
 * The __asm__( #foo ) label pins the assembler-level symbol name to exactly
 * "foo", so the asm templates can use that name directly.
 * No C expression refers to these objects. They are therefore marked
 * "used": "unused" only silences the warning and still allows the compiler
 * to drop a static object it believes dead, which would leave the asm
 * with an undefined symbol. The macro keeps its historical name. */
#define UNUSED_LONGLONG( foo ) \
    static const unsigned long long foo __asm__ (#foo) __attribute__((used))

/* isDC_mode: ( 127 - THR1 ) in each of the 8 bytes */
UNUSED_LONGLONG( mmx_127_thr1 ) =
    ( 127ULL - PP_THR1 ) * 0x0101010101010101ULL;

/* isDC_mode: ( 127 - 2*THR1 - 1 ) in each of the 8 bytes */
UNUSED_LONGLONG( mmx_127_2xthr1_1 ) =
    ( 127ULL - PP_2xTHR1 - 1 ) * 0x0101010101010101ULL;

/* DefaultMode: the 16 bit words ( 2, -5, 5, -2 ), lowest word first */
UNUSED_LONGLONG( mmx_m2_5_m5_2 ) = 0xfffe0005fffb0002ULL;
#if 0
/* Disabled helpers. pminub, pmaxub and pshufw are not part of the base MMX
 * instruction set, which is presumably why they are compiled out of the
 * plain MMX version. */
/* Reduce the 8 bytes of r to their minimum, which ends up in the low byte of
 * r (the other bytes hold partial results); t is destroyed */
#define MMXEXT_GET_PMIN( r, t ) \
"movq " #r ", " #t " \n\
psrlq $8, " #t " \n\
pminub " #t ", " #r " \n\
pshufw $0xf5, " #r ", " #t " #instead of shift with tmp reg \n\
pminub " #t ", " #r " \n\
pshufw $0xfe, " #r ", " #t " \n\
pminub " #t ", " #r " \n"
/* Reduce the 8 bytes of r to their maximum, which ends up in the low byte of
 * r (the other bytes hold partial results); t is destroyed */
#define MMXEXT_GET_PMAX( r, t ) \
"movq " #r ", " #t " \n\
psrlq $8, " #t " \n\
pmaxub " #t ", " #r " \n\
pshufw $0xf5, " #r ", " #t " \n\
pmaxub " #t ", " #r " \n\
pshufw $0xfe, " #r ", " #t " \n\
pmaxub " #t ", " #r " \n"
/* Load the 8 bytes at s into t, then update the byte-wise running minimum m
 * and running maximum M with them */
#define MMXEXT_GET_LMINMAX( s, m, M, t ) \
"movq " #s ", " #t " \n\
pminub " #t ", " #m " \n\
pmaxub " #t ", " #M " \n"
/* Tip for MMX, computing |a-b| on unsigned bytes:
 *   d1 = a - b with unsigned saturation
 *   d2 = b - a with unsigned saturation
 *   |a-b| = d1 | d2    ( one of d1, d2 is always 0 )
 */
#endif
/****************************************************************************
 * pp_deblock_isDC_mode : Check if we will use DC mode or Default mode
 ****************************************************************************
 * Counts the neighbouring pairs ( v[i], v[i+1] ), i = 0..8, of the 10 pixel
 * line p_v whose difference is within +/- PP_THR1 and returns 1 when at
 * least PP_THR2 pairs qualify, 0 otherwise.
 *
 * Called for each line crossing a block boundary when deblocking, so it
 * needs to be fast: the first 8 pairs are tested at once with MMX.
 *
 * MMX algorithm:
 *   x = v[i] - v[i+1] as a wrapping byte subtraction (a saturating one
 *       cannot be used, the pixel differences themselves would saturate);
 *   x + ( 127 - M ) with wrap around maps the accepted range [-M, M] onto
 *       [ 127 - 2*M, 127 ];
 *   a signed compare "> 127 - 2*M - 1" then yields 0xff for accepted bytes.
 *   The 8 compare results ( 0 or -1 ) are summed and negated.
 * Because the subtraction wraps, differences of magnitude 254 or 255 are
 * also accepted by the MMX part.
 *
 * The asm reads the pixels through a pointer held in a register, so the
 * "memory" clobber is required: it makes the compiler store the pixels
 * before the asm statement runs.
 ****************************************************************************/
static inline int pp_deblock_isDC_mode( u8 *p_v )
{
    int i_eq_cnt;

    __asm__ __volatile__ (
        "movq    (%1), %%mm1             \n\t" /* v0 -> v7 */
        "movq    1(%1), %%mm2            \n\t" /* v1 -> v8 */
        "psubb   %%mm2, %%mm1            \n\t" /* v[i] - v[i+1] */
        "paddb   mmx_127_thr1, %%mm1     \n\t" /* + 127 - THR1 with wrap */
        "pcmpgtb mmx_127_2xthr1_1, %%mm1 \n\t" /* > 127 - 2*THR1 - 1 */
        /* Sum the 8 bytes ( 0 or -1 each ) into the low byte */
        "movq    %%mm1, %%mm0            \n\t"
        "psrlw   $8, %%mm1               \n\t"
        "paddb   %%mm1, %%mm0            \n\t"
        "movq    %%mm0, %%mm1            \n\t"
        "psrld   $16, %%mm0              \n\t"
        "paddb   %%mm0, %%mm1            \n\t"
        "movq    %%mm1, %%mm0            \n\t"
        "psrlq   $32, %%mm1              \n\t"
        "paddb   %%mm1, %%mm0            \n\t"
        /* low byte = -count */
        "movd    %%mm0, %0               \n\t"
        "negl    %0                      \n\t"
        "andl    $255, %0                \n\t"
        : "=r"(i_eq_cnt) : "r" (p_v) : "memory" );

    /* The 9th pair does not fit in one MMX register: test it in C */
    if(( ( p_v[8] - p_v[9] + PP_THR1 )&0xffff )<= PP_2xTHR1 )
    {
        i_eq_cnt++;
    }

    return( (i_eq_cnt >= PP_THR2 ) ? 1 : 0 );
}
/*****************************************************************************
 * pp_deblock_isMinMaxOk: check the dynamic of pixels 1..8 of a line
 *****************************************************************************
 * Returns 1 when ( max - min ) of p_v[1..8] is below 2 * i_QP, 0 otherwise.
 * Written in plain C in this version.
 *****************************************************************************/
static inline int pp_deblock_isMinMaxOk( u8 *p_v, int i_QP )
{
    int i_lowest  = p_v[1];
    int i_highest = p_v[1];
    int i_pix;

    for( i_pix = 2; i_pix <= 8; i_pix++ )
    {
        if( p_v[i_pix] > i_highest )
        {
            i_highest = p_v[i_pix];
        }
        else if( p_v[i_pix] < i_lowest )
        {
            i_lowest = p_v[i_pix];
        }
    }

    return( ( i_highest - i_lowest ) < 2 * i_QP );
}
/*****************************************************************************
 * pp_deblock_DefaultMode: "default mode" deblocking of one line of 10 pixels
 *****************************************************************************
 * i_v[0..9] crosses the filtered edge; only i_v[4] and i_v[5] can be
 * modified. i_stride is not used by this implementation.
 *
 *   d = CLIP( 5(a3x0' - a3x0)//8, 0, (v4-v5)/2 ).d( abs(a3x0) < QP )
 *
 * The three dot products with ( 2, -5, 5, -2 ) are computed with MMX:
 *   a3x0 = 2 * ( v3 - v6 ) + 5 * ( v5 - v4 )
 *   a3x1 = 2 * ( v1 - v4 ) + 5 * ( v3 - v2 )
 *   a3x2 = 2 * ( v5 - v8 ) + 5 * ( v7 - v6 )
 *
 * Both asm statements read the pixels through a pointer held in a register,
 * so they carry a "memory" clobber: it makes the compiler store the pixels
 * before the asm runs.
 * The second statement relies on mm6/mm7 still holding the values loaded by
 * the first one.
 *****************************************************************************/
static inline void pp_deblock_DefaultMode( u8 i_v[10], int i_stride,
                                           int i_QP )
{
    int d, i_delta;
    int a3x0, a3x0_, a3x1, a3x2;
    int b_neg;

    /* First calculate a3x0 */
    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7         \n\t" /* mm7 = 0 */
        "movq      mmx_m2_5_m5_2, %%mm6 \n\t" /* mm6 = ( 2, -5, 5, -2 ) */
        "movd      3(%1), %%mm0         \n\t" /* v3 -> v6 */
        "punpcklbw %%mm7, %%mm0         \n\t"
        "pmaddwd   %%mm6, %%mm0         \n\t"
        "movq      %%mm0, %%mm1         \n\t"
        "psrlq     $32, %%mm1           \n\t"
        "paddd     %%mm1, %%mm0         \n\t"
        "movd      %%mm0, %0            \n\t"
        : "=r"(a3x0) : "r"(i_v) : "memory" );

    if( a3x0 < 0 )
    {
        b_neg = 1;
        a3x0  = -a3x0;
    }
    else
    {
        b_neg = 0;
    }
    /* XXX Now a3x0 is abs( a3x0 ) */

    if( ( a3x0 < 8 * i_QP )&&( a3x0 != 0 ) ) /* |a3x0| < 8*i_QP */
    {
        /* calculate a3x1 and a3x2 */
        __asm__ __volatile__ (
            /* mm7 = 0 and mm6 = ( 2, -5, 5, -2 ) from the statement above */
            "movd      1(%2), %%mm0     \n\t" /* v1 -> v4 */
            "movd      5(%2), %%mm2     \n\t" /* v5 -> v8 */
            "punpcklbw %%mm7, %%mm0     \n\t"
            "punpcklbw %%mm7, %%mm2     \n\t"
            "pmaddwd   %%mm6, %%mm0     \n\t"
            "pmaddwd   %%mm6, %%mm2     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "psrlq     $32, %%mm1       \n\t"
            "paddd     %%mm1, %%mm0     \n\t" /* mm0 = a3x1 */
            "movd      %%mm0, %0        \n\t"
            "movq      %%mm2, %%mm1     \n\t"
            "psrlq     $32, %%mm1       \n\t"
            "paddd     %%mm1, %%mm2     \n\t" /* mm2 = a3x2 */
            "movd      %%mm2, %1        \n\t"
            : "=r"(a3x1), "=r"(a3x2) : "r"(i_v) : "memory" );

        if( a3x1 < 0) a3x1 = -a3x1; /* abs( a3x1 ) */
        if( a3x2 < 0) a3x2 = -a3x2; /* abs( a3x2 ) */

        a3x0_ = PP_MIN3( a3x0, a3x1, a3x2 );

        d = 5 *( a3x0 - a3x0_ ) / 8; /* never negative */

        i_delta = ( i_v[4] - i_v[5] ) / 2;
        /* clip into [0, i_delta] or [i_delta, 0] */
        if( i_delta < 0 )
        {
            if( !b_neg ) /* since true d has sgn(d) = - sgn( a3x0 ) */
            {
                d = -d;
                if( d < i_delta ) d = i_delta;
                i_v[4] -= d;
                i_v[5] += d;
            }
        }
        else
        {
            if( b_neg )
            {
                if( d > i_delta ) d = i_delta;
                i_v[4] -= d;
                i_v[5] += d;
            }
        }
    }
}
/*****************************************************************************
 * pp_deblock_DCMode: "DC mode" deblocking of one line of 10 pixels (MMX)
 *****************************************************************************
 * Replaces p_v[1..8] by a 9 tap low pass ( 1 1 2 2 4 2 2 1 1 ) / 16 of the
 * original pixels v1..v8, the line being extended with p0 on the left and
 * p9 on the right:
 *   p_v[1] = ( 6*p0 + 4*v1 + 2*(v2+v3) + v4 + v5 ) >> 4
 *   p_v[4] = ( p0 + v1 + 2*(v2+v3) + 4*v4 + 2*(v5+v6) + v7 + v8 ) >> 4
 *   p_v[5] = ( v1 + v2 + 2*(v3+v4) + 4*v5 + 2*(v6+v7) + v8 + p9 ) >> 4
 *   p_v[8] = ( v4 + v5 + 2*(v6+v7) + 4*v8 + 6*p9 ) >> 4
 *   ( p_v[2], p_v[3], p_v[6], p_v[7] follow the same pattern )
 *
 * Register usage:
 *   mm0 = the 8 unmodified pixels v1..v8
 *   mm1 = scratch, 4 words shifted by one pixel at a time
 *   mm2 = accumulator for the new p_v[1..4]
 *   mm3 = accumulator for the new p_v[5..8]
 *   mm5 = p0 ( low word ),  mm6 = p9 << 48 ( high word ),  mm7 = 0
 *
 * The asm template is written as one string literal per instruction: a
 * string literal may not contain a raw line break.
 *****************************************************************************/
static inline void pp_deblock_DCMode( u8 *p_v, /* = int i_v[10] */
                                      int i_QP )
{
    int i_p0, i_p9;

    i_p0 = PP_ABS( p_v[1] - p_v[0] ) < i_QP ? p_v[0] : p_v[1];
    i_p9 = PP_ABS( p_v[8] - p_v[9] ) < i_QP ? p_v[9] : p_v[8];

    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7 \n\t"
        "movq      1(%0), %%mm0 \n\t" /* get 8 pix */
        /* unpack v1..v4 into mm1 */
        "movq      %%mm0, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        /* get i_p0 and i_p9 */
        "movd      %1, %%mm5    \n\t"
        "movd      %2, %%mm6    \n\t"
        "psllq     $48, %%mm6   \n\t"

        "movq      %%mm1, %%mm3 \n\t" /* p_v[5-8] = v[1-4] !! */
        "movq      %%mm1, %%mm2 \n\t"
        "psllw     $2, %%mm2    \n\t" /* p_v[1-4] = 4*v[1-4] */

        "psllq     $16, %%mm1   \n\t"
        "por       %%mm5, %%mm1 \n\t" /* mm1 = ( p0, v1, v2, v3 ) */
        "paddw     %%mm1, %%mm2 \n\t"
        "paddw     %%mm1, %%mm2 \n\t"

        "psllq     $16, %%mm1   \n\t"
        "por       %%mm5, %%mm1 \n\t" /* mm1 = ( p0, p0, v1, v2 ) */
        "paddw     %%mm1, %%mm2 \n\t"
        "paddw     %%mm1, %%mm2 \n\t"

        "psllq     $16, %%mm1   \n\t"
        "por       %%mm5, %%mm1 \n\t" /* mm1 = ( p0, p0, p0, v1 ) */
        "paddw     %%mm1, %%mm2 \n\t"

        "psllq     $16, %%mm1   \n\t"
        "por       %%mm5, %%mm1 \n\t" /* mm1 = ( p0, p0, p0, p0 ) */
        "paddw     %%mm1, %%mm2 \n\t"

        /* End of the sum for mm2, beginning of the sum for mm3 */
        "movq      %%mm0, %%mm1 \n\t"
        "psrlq     $8, %%mm1    \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = ( v2, v3, v4, v5 ) */
        "paddw     %%mm1, %%mm2 \n\t"
        "paddw     %%mm1, %%mm2 \n\t"
        "paddw     %%mm1, %%mm3 \n\t"

        "movq      %%mm0, %%mm1 \n\t"
        "psrlq     $16, %%mm1   \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = ( v3, v4, v5, v6 ) */
        "psllw     $1, %%mm1    \n\t"
        "paddw     %%mm1, %%mm2 \n\t"
        "paddw     %%mm1, %%mm3 \n\t"

        "movq      %%mm0, %%mm1 \n\t"
        "psrlq     $24, %%mm1   \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = ( v4, v5, v6, v7 ) */
        "paddw     %%mm1, %%mm2 \n\t"
        "paddw     %%mm1, %%mm3 \n\t"
        "paddw     %%mm1, %%mm3 \n\t"

        "movq      %%mm0, %%mm1 \n\t"
        "psrlq     $32, %%mm1   \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = ( v5, v6, v7, v8 ) */
        "paddw     %%mm1, %%mm2 \n\t"
        "psllw     $2, %%mm1    \n\t"
        "paddw     %%mm1, %%mm3 \n\t"

        /* End of the sum for the last 4 pixels */
        "movq      %%mm0, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t" /* mm1 = ( v5, v6, v7, v8 ) */

        "psrlq     $16, %%mm1   \n\t"
        "por       %%mm6, %%mm1 \n\t" /* mm1 = ( v6, v7, v8, p9 ) */
        "paddw     %%mm1, %%mm3 \n\t"
        "paddw     %%mm1, %%mm3 \n\t"

        "psrlq     $16, %%mm1   \n\t"
        "por       %%mm6, %%mm1 \n\t" /* mm1 = ( v7, v8, p9, p9 ) */
        "paddw     %%mm1, %%mm3 \n\t"
        "paddw     %%mm1, %%mm3 \n\t"

        "psrlq     $16, %%mm1   \n\t"
        "por       %%mm6, %%mm1 \n\t" /* mm1 = ( v8, p9, p9, p9 ) */
        "paddw     %%mm1, %%mm3 \n\t"

        "psrlq     $16, %%mm1   \n\t"
        "por       %%mm6, %%mm1 \n\t" /* mm1 = ( p9, p9, p9, p9 ) */
        "paddw     %%mm1, %%mm3 \n\t"

        /* Divide by 16, pack to bytes and store the 8 new pixels */
        "psrlw     $4, %%mm2    \n\t"
        "psrlw     $4, %%mm3    \n\t"
        "packuswb  %%mm3, %%mm2 \n\t"
        "movq      %%mm2, 1(%0) \n\t"
        : : "r"(p_v), "r"(i_p0), "r"(i_p9) : "memory" );
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* ---------- filter Vertical lines so follow horizontal edges -------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*****************************************************************************
 * pp_deblock_V: deblock the horizontal block edges of a plane
 *****************************************************************************
 * For every edge row ( 8, 16, ... ) and every column, the 10 pixels
 * running from 5 rows above the edge to 4 rows below it are copied into a
 * contiguous buffer, filtered in DC or default mode, and pixels 1..8 are
 * written back to the plane.
 *
 * NOTE(review): the helpers called here use MMX registers and nothing in
 * this function executes emms - confirm that the caller does.
 *****************************************************************************/
void E_( pp_deblock_V )( u8 *p_plane,
                         int i_width, int i_height, int i_stride,
                         QT_STORE_T *p_QP_store, int i_QP_stride,
                         int b_chroma )
{
    const int i_QP_shift = b_chroma ? 5 : 4;
    u8  pi_line[10];
    u8 *p_col;
    int i_edge, i_x, i_pix;
    int i_QP;

    for( i_edge = 8; i_edge < i_height - 4; i_edge += 8 )
    {
        for( i_x = 0; i_x < i_width; i_x++ )
        {
            /* Top of the 10 pixel column that crosses this edge */
            p_col = p_plane + ( i_edge - 5 ) * i_stride + i_x;

            for( i_pix = 0; i_pix < 10; i_pix++ )
            {
                pi_line[i_pix] = p_col[i_pix * i_stride];
            }

            /* QP is taken at the edge row, i.e. for pixel 5 of the line */
            i_QP = p_QP_store[( i_edge >> i_QP_shift ) * i_QP_stride
                              + ( i_x >> i_QP_shift )];

            if( !pp_deblock_isDC_mode( pi_line ) )
            {
                pp_deblock_DefaultMode( pi_line, i_stride, i_QP );
            }
            else if( pp_deblock_isMinMaxOk( pi_line, i_QP ) )
            {
                pp_deblock_DCMode( pi_line, i_QP );
            }

            /* Only pixels 1..8 can have been modified */
            for( i_pix = 1; i_pix < 9; i_pix++ )
            {
                p_col[i_pix * i_stride] = pi_line[i_pix];
            }
        }
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* --------- filter Horizontal lines so follow vertical edges -------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*****************************************************************************
 * pp_deblock_H: deblock the vertical block edges of a plane
 *****************************************************************************
 * On every row, each edge column ( 8, 16, ... ) is filtered in place on the
 * 10 pixels that start 5 pixels before the edge.
 *
 * NOTE(review): the helpers called here use MMX registers and nothing in
 * this function executes emms - confirm that the caller does.
 *****************************************************************************/
void E_( pp_deblock_H )( u8 *p_plane,
                         int i_width, int i_height, int i_stride,
                         QT_STORE_T *p_QP_store, int i_QP_stride,
                         int b_chroma )
{
    const int i_QP_shift = b_chroma ? 5 : 4;
    u8 *p_line;
    int i_y, i_edge;
    int i_QP;

    for( i_y = 0; i_y < i_height; i_y++ )
    {
        for( i_edge = 8; i_edge < i_width - 4; i_edge += 8 )
        {
            /* 10 pixels starting 5 pixels before the block boundary */
            p_line = p_plane + i_y * i_stride + ( i_edge - 5 );

            /* QP is taken at the edge column, i.e. for pixel 5 of the line */
            i_QP = p_QP_store[( i_y >> i_QP_shift ) * i_QP_stride
                              + ( i_edge >> i_QP_shift )];

            if( !pp_deblock_isDC_mode( p_line ) )
            {
                pp_deblock_DefaultMode( p_line, i_stride, i_QP );
            }
            else if( pp_deblock_isMinMaxOk( p_line, i_QP ) )
            {
                pp_deblock_DCMode( p_line, i_QP );
            }
        }
    }
}
/*****************************************************************************
*
* Internal functions shared by pp_dering_Y and pp_dering_C
*
*****************************************************************************/
/*****************************************************************************
 * pp_dering_MinMax: smallest and largest pixel of an 8x8 block
 *****************************************************************************
 * p_block points to the top left pixel of the block and i_stride is the
 * distance between two rows. Results are stored in *pi_min and *pi_max.
 * Written in plain C in this version.
 *****************************************************************************/
static inline void pp_dering_MinMax( u8 *p_block, int i_stride,
                                     int *pi_min, int *pi_max )
{
    int i_lowest  = 255;
    int i_highest = 0;
    int i_idx, i_pix;

    /* The 64 pixels are visited in raster order: row = i_idx / 8 and
     * column = i_idx % 8 */
    for( i_idx = 0; i_idx < 64; i_idx++ )
    {
        i_pix = p_block[( i_idx >> 3 ) * i_stride + ( i_idx & 7 )];

        if( i_pix < i_lowest )  i_lowest  = i_pix;
        if( i_pix > i_highest ) i_highest = i_pix;
    }

    *pi_min = i_lowest;
    *pi_max = i_highest;
}
/*****************************************************************************
 * pp_dering_BinIndex: threshold a 10x10 area into per-row bit masks
 *****************************************************************************
 * For each of the 10 rows starting at p_block, bit x of a temporary mask is
 * set when pixel x is above i_thr; the inverted mask is placed 16 bits
 * higher. The word stored in p_bin[row] keeps a bit only where it and both
 * horizontal neighbours are set, so the low half marks runs of three
 * pixels above the threshold and the high half runs of three pixels not
 * above it.
 *****************************************************************************/
static inline void pp_dering_BinIndex( u8 *p_block, int i_stride, int i_thr,
                                       u32 *p_bin )
{
    int i_row, i_col;
    u32 i_above, i_mask;

    for( i_row = 0; i_row < 10; i_row++, p_block += i_stride )
    {
        /* Built from column 9 down to 0 so that column x lands on bit x */
        i_above = 0;
        for( i_col = 9; i_col >= 0; i_col-- )
        {
            i_above <<= 1;
            if( p_block[i_col] > i_thr )
            {
                i_above |= 1;
            }
        }

        i_mask = i_above | ( ( ~i_above ) << 16 );
        p_bin[i_row] = i_mask & ( i_mask >> 1 ) & ( i_mask << 1 );
    }
}
/*****************************************************************************
 * pp_dering_Filter: adaptive 3x3 smoothing of an 8x8 block
 *****************************************************************************
 * p_bin holds the 10 row masks of the 10x10 area around the block. A pixel
 * is smoothed when the masks of its row and of the rows above and below
 * all flag it. The smoothed value is
 *      1 2 1
 *      2 4 2   ( + 8 ) >> 4
 *      1 2 1
 * limited to the original pixel +/- ( i_QP >> 1 ).
 * All results are computed from the unmodified block and written back at
 * the end.
 *****************************************************************************/
static inline void pp_dering_Filter( u8 *p_block, int i_stride,
                                     u32 *p_bin,
                                     int i_QP )
{
    const int i_limit = i_QP >> 1;
    int  pi_out[8][8];
    u8  *p_cur, *p_up, *p_down;
    u32  i_flags;
    int  i_row, i_col;
    int  i_avg, i_diff;

    for( i_row = 0; i_row < 8; i_row++ )
    {
        p_cur  = p_block + i_row * i_stride;
        p_up   = p_cur - i_stride;
        p_down = p_cur + i_stride;

        i_flags  = p_bin[i_row] & p_bin[i_row + 1] & p_bin[i_row + 2];
        i_flags |= i_flags >> 16;   /* flagged in either half of the mask */
        i_flags >>= 1;              /* masks have 10 columns, block uses 1-8 */

        for( i_col = 0; i_col < 8; i_col++, i_flags >>= 1 )
        {
            if( !( i_flags & 1 ) )
            {
                pi_out[i_row][i_col] = p_cur[i_col];
                continue;
            }

            i_avg =   p_up[i_col - 1]           + ( p_up[i_col]   << 1 )
                    + p_up[i_col + 1]
                    + ( p_cur[i_col - 1] << 1 ) + ( p_cur[i_col]  << 2 )
                    + ( p_cur[i_col + 1] << 1 )
                    + p_down[i_col - 1]         + ( p_down[i_col] << 1 )
                    + p_down[i_col + 1];
            i_avg = ( i_avg + 8 ) >> 4;

            /* Limit the change to +/- i_limit */
            i_diff = i_avg - p_cur[i_col];
            if( i_diff > i_limit )
            {
                pi_out[i_row][i_col] = p_cur[i_col] + i_limit;
            }
            else if( i_diff < -i_limit )
            {
                pi_out[i_row][i_col] = p_cur[i_col] - i_limit;
            }
            else
            {
                pi_out[i_row][i_col] = i_avg;
            }
        }
    }

    for( i_row = 0; i_row < 8; i_row++ )
    {
        p_cur = p_block + i_row * i_stride;
        for( i_col = 0; i_col < 8; i_col++ )
        {
            p_cur[i_col] = pi_out[i_row][i_col];
        }
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* ----------------- Dering filter on Y and C blocks ----------------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*****************************************************************************
 * pp_dering_Y: dering filter for a luminance plane
 *****************************************************************************
 * The plane is processed by groups of four 8x8 blocks, starting at pixel
 * (8,8):
 *      +-+-+
 *      |0|1|
 *      +-+-+
 *      |2|3|
 *      +-+-+
 * For each group:
 *  1. min/max of every block give its range and threshold; thresholds are
 *     then rearranged using the largest range of the group,
 *  2. the 10x10 area around every block is thresholded into bit masks,
 *  3. every block is smoothed with the quantizer read from p_QP_store.
 * All masks are computed before any block is filtered, since the 10x10
 * areas overlap the neighbouring blocks.
 *****************************************************************************/
void E_( pp_dering_Y )( u8 *p_plane,
                        int i_width, int i_height, int i_stride,
                        QT_STORE_T *p_QP_store, int i_QP_stride )
{
    int x, y, k;
    int i_max[4], i_min[4], i_range[4];
    int i_thr[4];
    int i_max_range, i_kmax;
    u32 i_bin[4][10];
    u8  *p_block[4];
    QT_STORE_T *p_QP;

    for( y = 8; y < i_height-8; y += 16 )
    {
        p_block[0] = p_plane + y * i_stride + 8;
        p_block[1] = p_block[0] + 8;
        p_block[2] = p_block[0] + ( i_stride << 3 );
        p_block[3] = p_block[2] + 8;

        for( x = 8; x < i_width-8; x += 16 )
        {
            /* 1: Calculate threshold */
            /* The per-block arrays have 4 entries: k must stay in 0..3 */
            i_max_range = 0;
            i_kmax = 0;
            for( k = 0; k < 4; k++ )
            {
                pp_dering_MinMax( p_block[k], i_stride,
                                  &i_min[k], &i_max[k] );

                i_range[k] = i_max[k] - i_min[k];
                i_thr[k]   = ( i_max[k] + i_min[k] + 1 )/2;

                /* Track the block with the largest range (not the one
                 * with the brightest pixel) */
                if( i_max_range < i_range[k] )
                {
                    i_max_range = i_range[k];
                    i_kmax = k;
                }
            }

            /* Now rearrange thr: flat blocks are not filtered, and when
             * the group contains a strong edge, weakly textured blocks
             * take the threshold of the block holding that edge */
            for( k = 0; k < 4; k++ )
            {
                if( i_range[k] < 16 )
                {
                    i_thr[k] = 0;
                }
                else if( i_max_range > 64 && i_range[k] < 32 )
                {
                    i_thr[k] = i_thr[i_kmax];
                }
            }

            /* 2: Index acquisition 10x10 ! so " -i_stride - 1" */
            for( k = 0; k < 4; k++ )
            {
                pp_dering_BinIndex( p_block[k] - i_stride - 1, i_stride,
                                    i_thr[k], i_bin[k] );
            }

            /* 3: adaptive smoothing */
            /* since we begin at (8,8) QP can be different for each block */
            p_QP = &( p_QP_store[( y >> 4) * i_QP_stride + (x >> 4)] );

            pp_dering_Filter( p_block[0], i_stride,
                              i_bin[0], p_QP[0] );
            pp_dering_Filter( p_block[1], i_stride,
                              i_bin[1], p_QP[1] );
            pp_dering_Filter( p_block[2], i_stride,
                              i_bin[2], p_QP[i_QP_stride] );
            pp_dering_Filter( p_block[3], i_stride,
                              i_bin[3], p_QP[i_QP_stride+1] );

            /* A group is 16 pixels wide: move on by 16, like x, so that the
             * block pointers stay in step with the QP lookup above */
            for( k = 0; k < 4; k++ )
            {
                p_block[k] += 16;
            }
        }
    }
}
/*****************************************************************************
 * pp_dering_C: dering filter for a chrominance plane
 *****************************************************************************
 * Every 8x8 block from (8,8) on is handled on its own: its threshold is
 * the rounded mean of its min and max, the surrounding 10x10 area is
 * thresholded, then the block is smoothed.
 *****************************************************************************/
void E_( pp_dering_C )( u8 *p_plane,
                        int i_width, int i_height, int i_stride,
                        QT_STORE_T *p_QP_store, int i_QP_stride )
{
    u32 pi_bin[10];
    u8 *p_cur;
    int i_bx, i_by;
    int i_lowest, i_highest;

    for( i_by = 8; i_by < i_height - 8; i_by += 8 )
    {
        for( i_bx = 8; i_bx < i_width - 8; i_bx += 8 )
        {
            p_cur = p_plane + i_by * i_stride + i_bx;

            /* 1: threshold from the min/max of the block */
            pp_dering_MinMax( p_cur, i_stride, &i_lowest, &i_highest );

            /* 2: masks of the 10x10 area that contains the 8x8 block */
            pp_dering_BinIndex( p_cur - i_stride - 1, i_stride,
                                ( i_highest + i_lowest + 1 ) / 2,
                                pi_bin );

            /* 3: adaptive smoothing */
            /* NOTE(review): >> 5 means one QP entry per 32 chroma pixels -
             * confirm against the layout of p_QP_store. */
            pp_dering_Filter( p_cur, i_stride,
                              pi_bin,
                              p_QP_store[( i_by >> 5 ) * i_QP_stride
                                         + ( i_bx >> 5 )] );
        }
    }
}
/*****************************************************************************
* postprocessing_mmxext.c: Post Processing plugin MMXEXT
*****************************************************************************
* Copyright (C) 2001 VideoLAN
* $Id: postprocessing_mmxext.c,v 1.1 2002/08/04 22:13:06 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include <vlc/vlc.h> /* only use u8, u32 .... */
#include "postprocessing.h"
#include "postprocessing_common.h"
/*****************************************************************************
*
* Internals functions common to pp_Deblock_V and pp_Deblock_H
*
*****************************************************************************/
/*****************************************************************************
* MMX stuff
*****************************************************************************/
/* XXX PP_THR1 needs to be usable in an unsigned long long expression */
/*
 * Declare a 64-bit constant whose assembler symbol name is exactly `foo`, so
 * that the inline assembly below can address it by name as a memory operand.
 *
 * These constants are never referenced from C, only from asm text that the
 * compiler does not parse. They are therefore marked `used` (and not merely
 * `unused`, which only silences a warning): otherwise an optimising compiler
 * is free to discard the definition and the link fails with an undefined
 * symbol. The macro keeps its historical name.
 */
#define UNUSED_LONGLONG( foo ) \
    static const unsigned long long foo __asm__ (#foo) __attribute__((used))

/* to calculate isDC_mode for mmx: 8 bytes, each equal to 127 - PP_THR1 */
UNUSED_LONGLONG( mmx_127_thr1 ) = ( ( 127ULL - PP_THR1 ) << 56 )|
                                  ( ( 127ULL - PP_THR1 ) << 48 )|
                                  ( ( 127ULL - PP_THR1 ) << 40 )|
                                  ( ( 127ULL - PP_THR1 ) << 32 )|
                                  ( ( 127ULL - PP_THR1 ) << 24 )|
                                  ( ( 127ULL - PP_THR1 ) << 16 )|
                                  ( ( 127ULL - PP_THR1 ) <<  8 )|
                                  ( ( 127ULL - PP_THR1 ) );

/* 8 bytes, each equal to 127 - PP_2xTHR1 - 1 (lower bound of the signed
 * compare in pp_deblock_isDC_mode) */
UNUSED_LONGLONG( mmx_127_2xthr1_1 ) = ( ( 127ULL - PP_2xTHR1 -1 ) << 56 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) << 48 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) << 40 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) << 32 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) << 24 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) << 16 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) <<  8 )|
                                      ( ( 127ULL - PP_2xTHR1 -1 ) );

/* four signed 16-bit words, lowest first: ( 2, -5, 5, -2 ), the pmaddwd
 * coefficients used by pp_deblock_DefaultMode */
UNUSED_LONGLONG( mmx_m2_5_m5_2 ) = 0xfffe0005fffb0002ULL;
/* Horizontal minimum: after this sequence the lowest byte of MMX register r
 * holds the unsigned minimum of the 8 bytes r contained on entry. The other
 * bytes of r only hold partial results. t is a scratch MMX register and is
 * destroyed. Expands to a string literal meant to be pasted into an asm
 * template. */
#define MMXEXT_GET_PMIN( r, t ) \
"movq " #r ", " #t " \n\
psrlq $8, " #t " \n\
pminub " #t ", " #r " \n\
pshufw $0xf5, " #r ", " #t " #instead of shift with tmp reg \n\
pminub " #t ", " #r " \n\
pshufw $0xfe, " #r ", " #t " \n\
pminub " #t ", " #r " \n"
/* Horizontal maximum: same as MMXEXT_GET_PMIN but with pmaxub, so the lowest
 * byte of r ends up holding the unsigned maximum of its 8 bytes. t is
 * destroyed. */
#define MMXEXT_GET_PMAX( r, t ) \
"movq " #r ", " #t " \n\
psrlq $8, " #t " \n\
pmaxub " #t ", " #r " \n\
pshufw $0xf5, " #r ", " #t " \n\
pmaxub " #t ", " #r " \n\
pshufw $0xfe, " #r ", " #t " \n\
pmaxub " #t ", " #r " \n"
/* Line min/max accumulation: load 8 bytes from operand s into t, then update
 * m with the byte-wise unsigned minimum of m and t, and M with the byte-wise
 * unsigned maximum of M and t. t is destroyed. */
#define MMXEXT_GET_LMINMAX( s, m, M, t ) \
"movq " #s ", " #t " \n\
pminub " #t ", " #m " \n\
pmaxub " #t ", " #M " \n"
/* Some tips for MMX
* |a-b| for unsigned bytes:
d1 = a - b with unsigned saturation ( 0 when a < b )
d2 = b - a with unsigned saturation ( 0 when b < a )
|a-b| = d1 | d2 ( at most one of d1, d2 is non-zero )
*/
/****************************************************************************
 * pp_deblock_isDC_mode : Check if we will use DC mode or Default mode
 ****************************************************************************
 * Uses the constants PP_THR1 and PP_THR2 ( and PP_2xTHR1 ).
 *
 * p_v points to 10 consecutive pixels v0..v9 straddling a block boundary.
 * The function counts the 9 neighbouring pairs ( v[i], v[i+1] ) whose
 * difference lies within [-PP_THR1, PP_THR1] and returns 1 when that count
 * is at least PP_THR2, 0 otherwise.
 *
 * Called for each pixel on a block boundary when deblocking, so it needs to
 * be fast: the first 8 pairs are tested with MMXEXT, the 9th one in C.
 *
 * Caveat of the MMX path: v[i] - v[i+1] is computed modulo 256 ( psubb ), so
 * a very large difference can wrap around and be counted as a small one. The
 * C test used for the last pair does not have this limitation.
 ****************************************************************************/
static inline int pp_deblock_isDC_mode( u8 *p_v )
{
    int i_eq_cnt;

    /* algo :
     *  x = v[i] - v[i+1] with wrap around, seen as a signed byte in
     *  [-128, 127]; we have to test whether it fits in [-M, M], M = PP_THR1.
     *  Adding 127 - M ( with wrap around ) maps the good values to
     *  [127 - 2*M, 127], so x is good if x + 127 - M > 127 - 2*M - 1
     *  ( signed compare ).
     *
     * pcmpgtb leaves 0xff in every matching byte; psadbw against zero sums
     * them into 255 * count, and -( 255 * count ) & 255 == count.
     *
     * The "memory" clobber is required because the asm reads the pixels
     * through the pointer operand: without it the compiler does not know
     * that stores to p_v[] must be completed before this statement. */
    __asm__ __volatile__ (
        "movq    (%1), %%mm1                 \n" /* load v0->v7 */
        "movq    1(%1), %%mm2                \n" /* load v1->v8 */
        "psubb   %%mm2, %%mm1                \n" /* v[i] - v[i+1] */
        "paddb   mmx_127_thr1, %%mm1         \n" /* + 127 - THR1 with wrap */
        "pcmpgtb mmx_127_2xthr1_1, %%mm1     \n" /* > 127 - 2*THR1 - 1 */
        "pxor    %%mm0, %%mm0                \n" /* mm0 = 0 */
        "psadbw  %%mm1, %%mm0                \n" /* 255 * count */
        "movd    %%mm0, %0                   \n"
        "negl    %0                          \n"
        "andl    $255, %0                    \n" /* count */
        : "=r"(i_eq_cnt) : "r"(p_v) : "memory" );

    /* last test, hey, 9 don't fit in MMX */
    if(( ( p_v[8] - p_v[9] + PP_THR1 )&0xffff )<= PP_2xTHR1 )
    {
        i_eq_cnt++;
    }

    return( (i_eq_cnt >= PP_THR2 ) ? 1 : 0 );
}
/****************************************************************************
 * pp_deblock_isMinMaxOk : check the dynamic range of the 8 filtered pixels
 ****************************************************************************
 * p_v points to 10 pixels v0..v9. Returns 1 when max( v1..v8 ) -
 * min( v1..v8 ) is strictly smaller than 2 * i_QP, 0 otherwise.
 *
 * Plain C equivalent:
 *     i_min = i_max = p_v[1];
 *     for( i = 2; i < 9; i++ )
 *     {
 *         if( i_max < p_v[i] ) i_max = p_v[i];
 *         if( i_min > p_v[i] ) i_min = p_v[i];
 *     }
 *     i_range = i_max - i_min;
 ****************************************************************************/
static inline int pp_deblock_isMinMaxOk( u8 *p_v, int i_QP )
{
    int i_range;

    /* After the two reductions only the lowest byte of mm0 ( min ) and of
     * mm1 ( max ) is meaningful; since max >= min the lowest byte of the
     * 32-bit difference is exactly max - min, hence the final mask.
     * "memory" is declared because the pixels are read through the pointer
     * operand rather than through a memory operand. */
    __asm__ __volatile__ (
        "movq  1(%1), %%mm0                  \n" /* v1..v8 */
        "movq  %%mm0, %%mm1                  \n"
        MMXEXT_GET_PMIN( %%mm0, %%mm7 )
        MMXEXT_GET_PMAX( %%mm1, %%mm7 )
        "psubd %%mm0, %%mm1                  \n" /* max - min */
        "movd  %%mm1, %0                     \n"
        "andl  $255, %0                      \n"
        : "=r"(i_range) : "r"(p_v) : "memory" );

    return( i_range< 2*i_QP ? 1 : 0 );
}
/****************************************************************************
 * pp_deblock_DefaultMode : default-mode deblocking of one line of 10 pixels
 ****************************************************************************
 * i_v holds the pixels v0..v9 across a block boundary ( the boundary lies
 * between v4 and v5 ). Only i_v[4] and i_v[5] may be modified.
 *
 *   a3x0 = 2 * ( v3 - v6 ) + 5 * ( v5 - v4 )
 *   a3x1 = 2 * ( v1 - v4 ) + 5 * ( v3 - v2 )
 *   a3x2 = 2 * ( v5 - v8 ) + 5 * ( v7 - v6 )
 *
 * When 0 < |a3x0| < 8 * i_QP, a correction d is derived from
 * |a3x0| - min( |a3x0|, |a3x1|, |a3x2| ), given the sign opposite to a3x0,
 * clipped between 0 and ( v4 - v5 ) / 2, then subtracted from v4 and added
 * to v5.
 *
 * i_stride is not used; it is kept so that callers need not change.
 *
 * NOTE(review): the energies are kept unscaled ( the threshold is 8 * i_QP
 * rather than i_QP ) yet d is computed as 5 * diff / 8; this looks like it
 * could be a missing extra division by 8 - to be confirmed against the
 * MPEG-4 post-processing annex before changing anything.
 ****************************************************************************/
static inline void pp_deblock_DefaultMode( u8 i_v[10], int i_stride,
                                           int i_QP )
{
    int d, i_delta;
    int a3x0, a3x0_, a3x1, a3x2;
    int b_neg;

    (void)i_stride;

    /* First calculate a3x0: pmaddwd of ( v3, v4, v5, v6 ) with
     * ( 2, -5, 5, -2 ) gives two partial sums that are then added.
     * "memory": the pixels are read through the pointer operand. */
    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7              \n" /* mm7 = 0 */
        "movq      mmx_m2_5_m5_2, %%mm6      \n" /* mm6 = ( 2, -5, 5, -2 ) */
        "movd      3(%1), %%mm0              \n"
        "punpcklbw %%mm7, %%mm0              \n"
        "pmaddwd   %%mm6, %%mm0              \n"
        "pshufw    $0xfe, %%mm0, %%mm1       \n"
        "paddd     %%mm1, %%mm0              \n"
        "movd      %%mm0, %0                 \n"
        : "=r"(a3x0) : "r"(i_v) : "memory" );

    if( a3x0 < 0 )
    {
        b_neg = 1;
        a3x0  = -a3x0;
    }
    else
    {
        b_neg = 0;
    }
    /* XXX Now a3x0 is abs( a3x0 ) */
    if( ( a3x0 < 8 * i_QP )&&( a3x0 != 0 ) ) /* |a3x0| < 8*i_QP */
    {
        /* calculate a3x1 and a3x2.
         * mm6 / mm7 are loaded again here: nothing guarantees that the
         * compiler left them untouched between two separate asm statements,
         * so the values set up by the first statement must not be relied
         * upon. */
        __asm__ __volatile__ (
            "pxor      %%mm7, %%mm7          \n" /* mm7 = 0 */
            "movq      mmx_m2_5_m5_2, %%mm6  \n" /* mm6 = ( 2, -5, 5, -2 ) */
            "movd      1(%2), %%mm0          \n"
            "movd      5(%2), %%mm2          \n"
            "punpcklbw %%mm7, %%mm0          \n"
            "punpcklbw %%mm7, %%mm2          \n"
            "pmaddwd   %%mm6, %%mm0          \n"
            "pmaddwd   %%mm6, %%mm2          \n"
            "pshufw    $0xfe, %%mm0, %%mm1   \n"
            "paddd     %%mm1, %%mm0          \n" /* mm0 = a3x1 */
            "movd      %%mm0, %0             \n"
            "pshufw    $0xfe, %%mm2, %%mm1   \n"
            "paddd     %%mm1, %%mm2          \n" /* mm2 = a3x2 */
            "movd      %%mm2, %1             \n"
            : "=r"(a3x1), "=r"(a3x2) : "r"(i_v) : "memory" );

        if( a3x1 < 0) a3x1 = -a3x1; /* abs( a3x1 ) */
        if( a3x2 < 0) a3x2 = -a3x2; /* abs( a3x2 ) */

        a3x0_ = PP_MIN3( a3x0, a3x1, a3x2 );

        d = 5 *( a3x0 - a3x0_ ) / 8; /* always >= 0 */

        i_delta = ( i_v[4] - i_v[5] ) / 2;
        /* clip into [0, i_delta] or [i_delta, 0] */
        if( i_delta < 0 )
        {
            if( !b_neg ) /* since true d has sgn(d) = - sgn( a3x0 ) */
            {
                d = -d;
                if( d < i_delta ) d = i_delta;
                i_v[4] -= d;
                i_v[5] += d;
            }
        }
        else
        {
            if( b_neg )
            {
                if( d > i_delta ) d = i_delta;
                i_v[4] -= d;
                i_v[5] += d;
            }
        }
    }
}
/****************************************************************************
 * pp_deblock_DCMode : DC-mode deblocking of one line of 10 pixels
 ****************************************************************************
 * p_v points to the pixels v0..v9; v1..v8 are replaced in place by a 9-tap
 * low-pass ( 1 1 2 2 4 2 2 1 1 ) / 16 of their neighbourhood, where samples
 * beyond v1 are replaced by p0 and samples beyond v8 by p9:
 *   p0 = |v1 - v0| < i_QP ? v0 : v1
 *   p9 = |v8 - v9| < i_QP ? v9 : v8
 *
 * The asm template is written as adjacent string literals, one instruction
 * per literal: a string literal must not contain a raw line break.
 ****************************************************************************/
static inline void pp_deblock_DCMode( u8 *p_v, /* 10 pixels: p_v[0..9] */
                                      int i_QP )
{
    int i_p0, i_p9;

    i_p0 = PP_ABS( p_v[1] - p_v[0] ) < i_QP ? p_v[0] : p_v[1];
    i_p9 = PP_ABS( p_v[8] - p_v[9] ) < i_QP ? p_v[9] : p_v[8];

    /* Register usage ( all vectors are 4 x 16-bit words, lowest first ):
     *   mm0 = the 8 unmodified pixels v1..v8 ( bytes )
     *   mm1 = current term to accumulate
     *   mm2 = accumulator for the new v1..v4
     *   mm3 = accumulator for the new v5..v8
     *   mm4 = unused
     *   mm5 = p0 in the lowest word
     *   mm6 = p9 in the highest word ( p9 << 48 )
     *   mm7 = 0
     */
    __asm__ __volatile__ (
        "pxor      %%mm7, %%mm7              \n"
        "movq      1(%0), %%mm0              \n" /* get 8 pix */
        /* unpack v1..v4 into mm1 */
        "movq      %%mm0, %%mm1              \n"
        "punpcklbw %%mm7, %%mm1              \n"
        /* get i_p0 and i_p9 */
        "movd      %1, %%mm5                 \n"
        "movd      %2, %%mm6                 \n"
        "psllq     $48, %%mm6                \n"

        "movq      %%mm1, %%mm3              \n" /* acc[5-8] = v[1-4] !! */
        "movq      %%mm1, %%mm2              \n"
        "psllw     $2, %%mm2                 \n" /* acc[1-4] = 4*v[1-4] */

        "psllq     $16, %%mm1                \n"
        "por       %%mm5, %%mm1              \n" /* mm1 =( p0, v1, v2, v3 ) */

        "paddw     %%mm1, %%mm2              \n"
        "paddw     %%mm1, %%mm2              \n"

        "pshufw    $0x90, %%mm1, %%mm1       \n" /* mm1 =( p0, p0, v1, v2 ) */
        "paddw     %%mm1, %%mm2              \n"
        "paddw     %%mm1, %%mm2              \n"

        "pshufw    $0x90, %%mm1, %%mm1       \n" /* mm1 =( p0, p0, p0, v1 ) */
        "paddw     %%mm1, %%mm2              \n"

        "pshufw    $0x90, %%mm1, %%mm1       \n" /* mm1 =( p0, p0, p0, p0 ) */
        "paddw     %%mm1, %%mm2              \n"

        /* Terms shared by both halves: end of mm2, beginning of mm3 */
        "movq      %%mm0, %%mm1              \n"
        "psrlq     $8, %%mm1                 \n"
        "punpcklbw %%mm7, %%mm1              \n" /* mm1 =( v2, v3, v4, v5 ) */
        "paddw     %%mm1, %%mm2              \n"
        "paddw     %%mm1, %%mm2              \n"
        "paddw     %%mm1, %%mm3              \n"

        "movq      %%mm0, %%mm1              \n"
        "psrlq     $16, %%mm1                \n"
        "punpcklbw %%mm7, %%mm1              \n" /* mm1 =( v3, v4, v5, v6 ) */
        "psllw     $1, %%mm1                 \n"
        "paddw     %%mm1, %%mm2              \n"
        "paddw     %%mm1, %%mm3              \n"

        "movq      %%mm0, %%mm1              \n"
        "psrlq     $24, %%mm1                \n"
        "punpcklbw %%mm7, %%mm1              \n" /* mm1 =( v4, v5, v6, v7 ) */
        "paddw     %%mm1, %%mm2              \n"
        "paddw     %%mm1, %%mm3              \n"
        "paddw     %%mm1, %%mm3              \n"

        "movq      %%mm0, %%mm1              \n"
        "psrlq     $32, %%mm1                \n"
        "punpcklbw %%mm7, %%mm1              \n" /* mm1 =( v5, v6, v7, v8 ) */
        "paddw     %%mm1, %%mm2              \n"
        "psllw     $2, %%mm1                 \n"
        "paddw     %%mm1, %%mm3              \n"

        /* Now the terms only used by the last 4 pix */
        "movq      %%mm0, %%mm1              \n"
        "punpckhbw %%mm7, %%mm1              \n" /* mm1 =( v5, v6, v7, v8 ) */

        "psrlq     $16, %%mm1                \n"
        "por       %%mm6, %%mm1              \n" /* mm1 =( v6, v7, v8, p9 ) */

        "paddw     %%mm1, %%mm3              \n"
        "paddw     %%mm1, %%mm3              \n"

        "pshufw    $0xf9, %%mm1, %%mm1       \n" /* mm1 =( v7, v8, p9, p9 ) */
        "paddw     %%mm1, %%mm3              \n"
        "paddw     %%mm1, %%mm3              \n"

        "pshufw    $0xf9, %%mm1, %%mm1       \n" /* mm1 =( v8, p9, p9, p9 ) */
        "paddw     %%mm1, %%mm3              \n"

        "pshufw    $0xf9, %%mm1, %%mm1       \n" /* mm1 =( p9, p9, p9, p9 ) */
        "paddw     %%mm1, %%mm3              \n"

        /* divide by 16, pack back to bytes and store v1..v8 */
        "psrlw     $4, %%mm2                 \n"
        "psrlw     $4, %%mm3                 \n"
        "packuswb  %%mm3, %%mm2              \n"
        "movq      %%mm2, 1(%0)              \n"
        : : "r"(p_v), "r"(i_p0), "r"(i_p9) : "memory" );

#if 0
    /* Reference C implementation, kept for documentation only */
    for( i = 1; i < 9; i++ )
    {
        v[i] = p_v[i]; /* save 8 pix that will be modified */
    }

    p_v[1] = ( 6 * i_p0 + 4 * v[1]
               + 2 *( v[2] + v[3]) + v[4] + v[5]) >> 4;

    p_v[2] = ( 4 * i_p0 + 2 * v[1] + 4 * v[2]
               + 2 *( v[3] + v[4]) + v[5] + v[6]) >> 4;

    p_v[3] = ( 2 * i_p0 + 2 * (v[1] + v[2]) + 4 * v[3]
               + 2 *( v[4] + v[5]) + v[6] + v[7]) >> 4;

    p_v[4] = ( i_p0 + v[1] + 2 * (v[2] + v[3]) + 4 * v[4]
               + 2 *( v[5] + v[6]) + v[7] + v[8]) >> 4;

    p_v[5] = ( v[1] + v[2] + 2 * (v[3] + v[4]) + 4 * v[5]
               + 2 *( v[6] + v[7]) + v[8] + i_p9) >> 4;

    p_v[6] = ( v[2] + v[3] + 2 * (v[4] + v[5]) + 4 * v[6]
               + 2 *( v[7] + v[8]) + 2 * i_p9) >> 4;

    p_v[7] = ( v[3] + v[4] + 2 * (v[5] + v[6]) + 4 * v[7]
               + 2 * v[8] + 4 * i_p9) >> 4;

    p_v[8] = ( v[4] + v[5] + 2 * (v[6] + v[7]) + 4 * v[8]
               + 6 * i_p9) >> 4;
#endif
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* ---------- filter Vertical lines so follow horizontal edges -------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*
 * pp_deblock_V: deblock across the horizontal block edges of a plane.
 *
 * For every edge row y = 8, 16, ... ( while y < i_height - 4 ) and for every
 * column x, the 10 vertically aligned pixels of rows y-5 .. y+4 are gathered
 * into a contiguous buffer, filtered ( DC mode or default mode ), and pixels
 * 1..8 of the buffer are written back to the plane.
 *
 * The QP is read from p_QP_store[(y>>s)*i_QP_stride + (x>>s)] with s = 5
 * for chroma planes and s = 4 otherwise.
 */
void E_( pp_deblock_V )( u8 *p_plane,
                         int i_width, int i_height, int i_stride,
                         QT_STORE_T *p_QP_store, int i_QP_stride,
                         int b_chroma )
{
    const int i_shift = b_chroma ? 5 : 4; /* pixel -> QP store coordinate */
    int i_edge, i_col, k;

    for( i_edge = 8; i_edge < i_height - 4; i_edge += 8 )
    {
        /* The 10-pixel window starts 5 rows above the edge */
        u8 *p_top = p_plane + ( i_edge - 5 ) * i_stride;
        const int i_QP_line = ( i_edge >> i_shift ) * i_QP_stride;

        for( i_col = 0; i_col < i_width; i_col++ )
        {
            u8  pix[10];
            u8 *p_column = p_top + i_col;
            int i_QP;

            /* Gather the column so the helpers can ignore the stride */
            for( k = 0; k < 10; k++ )
            {
                pix[k] = p_column[k * i_stride];
            }

            /* XXX QP is for v5 */
            i_QP = p_QP_store[i_QP_line + ( i_col >> i_shift )];

            if( !pp_deblock_isDC_mode( pix ) )
            {
                pp_deblock_DefaultMode( pix, i_stride, i_QP );
            }
            else if( pp_deblock_isMinMaxOk( pix, i_QP ) )
            {
                pp_deblock_DCMode( pix, i_QP );
            }

            /* Scatter back; only pixels 1-8 can have been modified */
            for( k = 1; k < 9; k++ )
            {
                p_column[k * i_stride] = pix[k];
            }
        }
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* --------- filter Horizontal lines so follow vertical edges -------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*
 * pp_deblock_H: deblock across the vertical block edges of a plane.
 *
 * For every row y and every edge column x = 8, 16, ... ( while
 * x < i_width - 4 ), the 10 horizontally adjacent pixels of columns
 * x-5 .. x+4 are filtered in place ( DC mode or default mode ).
 *
 * The QP is read from p_QP_store[(y>>s)*i_QP_stride + (x>>s)] with s = 5
 * for chroma planes and s = 4 otherwise.
 */
void E_( pp_deblock_H )( u8 *p_plane,
                         int i_width, int i_height, int i_stride,
                         QT_STORE_T *p_QP_store, int i_QP_stride,
                         int b_chroma )
{
    const int i_shift = b_chroma ? 5 : 4; /* pixel -> QP store coordinate */
    int i_row, i_edge;

    for( i_row = 0; i_row < i_height; i_row++ )
    {
        /* p_line + i_edge points 5 pixels before the block boundary */
        u8 *p_line = p_plane + i_row * i_stride - 5;
        const int i_QP_line = ( i_row >> i_shift ) * i_QP_stride;

        for( i_edge = 8; i_edge < i_width - 4; i_edge += 8 )
        {
            u8 *p_pix = p_line + i_edge;
            /* XXX QP is for v5 */
            const int i_QP = p_QP_store[i_QP_line + ( i_edge >> i_shift )];

            if( !pp_deblock_isDC_mode( p_pix ) )
            {
                pp_deblock_DefaultMode( p_pix, i_stride, i_QP );
            }
            else if( pp_deblock_isMinMaxOk( p_pix, i_QP ) )
            {
                pp_deblock_DCMode( p_pix, i_QP );
            }
        }
    }
}
/*****************************************************************************
*
* Internals functions common to pp_Dering_Y pp_Dering_C
*
*****************************************************************************/
/*
 * pp_dering_MinMax: find the smallest and largest pixel of an 8x8 block.
 *
 * p_block  points to the top-left pixel of the block,
 * i_stride is the distance in bytes between two consecutive rows,
 * pi_min / pi_max receive the minimum / maximum pixel value ( 0..255 ).
 *
 * NOTE(review): the address arithmetic is done with leal and %eax, so this
 * presumably only assembles for 32-bit x86 targets - confirm before building
 * for anything else.
 */
static inline void pp_dering_MinMax( u8 *p_block, int i_stride,
int *pi_min, int *pi_max )
{
/* First we accumulate, column by column, the min ( mm0 ) and max ( mm1 )
over the 8 rows, then we reduce each register to a single global value.
eax starts at row 1 ( p_block + i_stride ) and is moved to row 5 half way,
so that every row can be addressed with a scale of 1, 2 or 4. */
__asm__ __volatile__(
"leal (%2,%3), %%eax \n\
movq (%2), %%mm0 #load line \n\
movq %%mm0, %%mm1 \n"
MMXEXT_GET_LMINMAX( (%%eax), %%mm0, %%mm1, %%mm7 ) /* row 1 */
MMXEXT_GET_LMINMAX( (%%eax, %3), %%mm0, %%mm1, %%mm7 ) /* row 2 */
MMXEXT_GET_LMINMAX( (%%eax, %3,2), %%mm0, %%mm1, %%mm7 ) /* row 3 */
MMXEXT_GET_LMINMAX( (%2, %3, 4), %%mm0, %%mm1, %%mm7 ) /* row 4 */
"leal (%%eax,%3,4), %%eax \n"
MMXEXT_GET_LMINMAX( (%%eax), %%mm0, %%mm1, %%mm7 ) /* row 5 */
MMXEXT_GET_LMINMAX( (%%eax, %3), %%mm0, %%mm1, %%mm7 ) /* row 6 */
MMXEXT_GET_LMINMAX( (%%eax, %3,2), %%mm0, %%mm1, %%mm7 ) /* row 7 */
MMXEXT_GET_PMIN( %%mm0, %%mm7 ) /* lowest byte of mm0 = global min */
MMXEXT_GET_PMAX( %%mm1, %%mm7 ) /* lowest byte of mm1 = global max */
"movd %%mm0, %%eax \n\
andl $255, %%eax \n\
movl %%eax, (%0) \n\
movd %%mm1, %%eax \n\
andl $255, %%eax \n\
movl %%eax, (%1) \n"
: : "r"(pi_min), "r"(pi_max), "r"(p_block), "r"(i_stride) : "%eax", "memory" );
#if 0
/* Reference C implementation, kept for documentation only */
i_min = 255; i_max = 0;
for( y = 0; y < 8; y++ )
{
for( x = 0; x < 8; x++ )
{
if( i_min > p_block[x] ) i_min = p_block[x];
if( i_max < p_block[x] ) i_max = p_block[x];
}
p_block += i_stride;
}
*pi_min = i_min;
*pi_max = i_max;
#endif
}
/*
 * pp_dering_BinIndex: binary index acquisition on a 10x10 pixel area.
 *
 * p_block  points to the top-left pixel of the 10x10 area ( one row and one
 *          column before the 8x8 block being filtered ),
 * i_stride is the distance in bytes between two consecutive rows,
 * i_thr    is the threshold the pixels are compared to,
 * p_bin    receives 10 words, one per row.
 *
 * For each row a 10-bit mask is built where bit x is set when
 * pixel[x] <= i_thr; its complement is placed 16 bits higher. In the stored
 * word, bit x ( resp. bit 16+x ) is set when pixels x-1, x and x+1 are all
 * <= i_thr ( resp. all > i_thr ). pp_dering_Filter then only has to AND
 * three consecutive rows to find the pixels whose whole 3x3 neighbourhood
 * lies on the same side of the threshold.
 *
 * The three shifted masks MUST be combined with the bitwise '&': with the
 * logical '&&' the result collapses to 0 or 1 and no pixel is ever selected
 * for smoothing.
 */
static inline void pp_dering_BinIndex( u8 *p_block, int i_stride, int i_thr,
                                       u32 *p_bin )
{
    int y;
    u32 i_bin;

    /* first create mm7 with all bytes set to thr and mm6 = 0.
     * XXX the loop below expects mm6 / mm7 to survive between the asm
     * statements */
    __asm__ __volatile__(
        "movl      %0, %%eax                 \n"
        "movb      %%al, %%ah                \n"
        "movd      %%eax, %%mm7              \n"
        "pshufw    $0x00, %%mm7, %%mm7       \n"
        "pxor      %%mm6, %%mm6              \n"
        : : "r"(i_thr) : "%eax" );

    for( y = 0; y < 10; y++ )
    {
        /* First 8 pixels of the row; "memory" because they are read through
         * the pointer operand */
        __asm__ __volatile__(
            "movq      (%1), %%mm0           \n"
            "psubusb   %%mm7, %%mm0          \n" /* sat: x <= thr --> 0 */
            "pcmpeqb   %%mm6, %%mm0          \n" /* x <= thr ? -1 : 0 */
            "pmovmskb  %%mm0, %0             \n" /* msb of each byte */
            : "=r"(i_bin) : "r"(p_block) : "memory" );

        /* Now last 2 tests */
        if( p_block[8] <= i_thr ) i_bin |= 1 << 8;
        if( p_block[9] <= i_thr ) i_bin |= 1 << 9;

        i_bin |= (~i_bin) << 16; /* to detect three 1 or three 0 */

        /* keep the positions whose left and right neighbours agree */
        *p_bin = ( i_bin >> 1 ) & i_bin & ( i_bin << 1 );

        p_block += i_stride;
        p_bin++;
    }
}
/*
 * pp_dering_Filter: adaptive smoothing of one 8x8 block.
 *
 * p_block  points to the top-left pixel of the block ( modified in place ),
 * i_stride is the distance in bytes between two consecutive rows,
 * p_bin    holds the 10 row words produced by pp_dering_BinIndex for the
 *          10x10 area enclosing the block,
 * i_QP     bounds the correction: a pixel never moves by more than i_QP / 2.
 *
 * Pass 1 computes the filtered block into a temporary buffer ( so that the
 * 3x3 kernel always reads unfiltered pixels ), pass 2 clamps every filtered
 * value to [ original - i_QP/2, original + i_QP/2 ] and stores it.
 */
static inline void pp_dering_Filter( u8 *p_block, int i_stride,
                                     u32 *p_bin,
                                     int i_QP )
{
    u8  i_flt[8][8];
    u8 *p_row;
    int i_line, i_col;

    /* Pass 1: 3x3 smoothing where the binary indexes allow it */
    for( i_line = 0, p_row = p_block; i_line < 8;
         i_line++, p_row += i_stride )
    {
        const u8 *p_up = p_row - i_stride;
        const u8 *p_dn = p_row + i_stride;
        u32 i_mask;

        /* a pixel is smoothed when the index bit is set on its own row and
         * on the rows above and below */
        i_mask  = p_bin[i_line] & p_bin[i_line+1] & p_bin[i_line+2];
        i_mask |= i_mask >> 16;   /* "all below" or "all above" threshold */
        i_mask >>= 1;             /* index 0 is the column left of the block */

        for( i_col = 0; i_col < 8; i_col++, i_mask >>= 1 )
        {
            if( i_mask & 0x01 )
            {
                /* 1 2 1
                   2 4 2   + (8)
                   1 2 1 */
                int i_sum = p_up[i_col - 1]
                          + ( p_up[i_col] << 1 )
                          + p_up[i_col + 1]
                          + ( p_row[i_col - 1] << 1 )
                          + ( p_row[i_col] << 2 )
                          + ( p_row[i_col + 1] << 1 )
                          + p_dn[i_col - 1]
                          + ( p_dn[i_col] << 1 )
                          + p_dn[i_col + 1];

                i_flt[i_line][i_col] = ( 8 + i_sum ) >> 4;
            }
            else
            {
                i_flt[i_line][i_col] = p_row[i_col];
            }
        }
    }

    /* Create mm7 with all bytes set to QP/2 */
    __asm__ __volatile__(
        "movl %0, %%eax \n"
        "shrl $1, %%eax \n"                 /* i_QP/2 */
        "movb %%al, %%ah \n"
        "movd %%eax, %%mm7 \n"
        "pshufw $0x00, %%mm7, %%mm7 \n"
        : : "r"(i_QP) : "%eax" );

    /* Pass 2: clamp the filtered values and copy them into the plane */
    for( i_line = 0, p_row = p_block; i_line < 8;
         i_line++, p_row += i_stride )
    {
        __asm__ __volatile__(
            "movq (%0), %%mm0 \n"           /* mm0 = filtered row */
            "movq (%1), %%mm1 \n"           /* mm1 = original row */
            "movq %%mm1, %%mm2 \n"
            "psubusb %%mm7, %%mm1 \n"       /* mm1 = orig - i_QP/2 ( >= 0 ) */
            "paddusb %%mm7, %%mm2 \n"       /* mm2 = orig + i_QP/2 ( <= 255 ) */
            "pmaxub %%mm1, %%mm0 \n"        /* orig - i_QP/2 <= mm0 */
            "pminub %%mm2, %%mm0 \n"        /* mm0 <= orig + i_QP/2 */
            "movq %%mm0, (%1) \n"
            : :"r"(i_flt[i_line]), "r"(p_row) : "memory" );
    }
}
/*****************************************************************************/
/*---------------------------------------------------------------------------*/
/* */
/* ----------------- Dering filter on Y and C blocks ----------------- */
/* */
/*---------------------------------------------------------------------------*/
/*****************************************************************************/
/*
 * pp_dering_Y: dering filter for a luminance plane.
 *
 * The plane is processed four 8x8 blocks at a time ( a 16x16 area starting
 * at (x, y), with x and y = 8, 24, 40, ... ), skipping the plane border:
 *
 *      +-+-+
 *      |0|1|
 *      +-+-+
 *      |2|3|
 *      +-+-+
 *
 *  1. min, max, range = max - min and threshold ( max + min + 1 ) / 2 are
 *     computed for each block; the thresholds of low-range blocks are then
 *     rearranged using the block that has the largest range,
 *  2. binary indexes are computed on the 10x10 area around each block,
 *  3. each block is smoothed with its own QP.
 *
 * The four blocks are indexed 0..3 everywhere: the rearrangement loops must
 * not run from 1 to 4, which would skip block 0 and access i_range[4] /
 * i_thr[4] past the end of the arrays.
 */
void E_( pp_dering_Y )( u8 *p_plane,
                        int i_width, int i_height, int i_stride,
                        QT_STORE_T *p_QP_store, int i_QP_stride )
{
    int x, y, k;
    int i_max[4], i_min[4], i_range[4];
    int i_thr[4];
    int i_max_range, i_kmax;
    u32 i_bin[4][10];
    u8  *p_block[4];
    QT_STORE_T *p_QP;

    /* We process 4 blocks/loop*/
    for( y = 8; y < i_height-8; y += 16 )
    {
        p_block[0] = p_plane + y * i_stride + 8;
        p_block[1] = p_block[0] + 8;
        p_block[2] = p_block[0] + ( i_stride << 3 );
        p_block[3] = p_block[2] + 8;

        for( x = 8; x < i_width-8; x += 16 )
        {
            /* 1: Calculate threshold */
            /* Calculate max/min for each block */
            for( k = 0; k < 4; k++ )
            {
                pp_dering_MinMax( p_block[k], i_stride,
                                  &i_min[k], &i_max[k] );
            }

            /* Calculate range, max_range and thr */
            i_max_range = 0; i_kmax = 0;
            for( k = 0; k < 4; k++ )
            {
                i_range[k] = i_max[k] - i_min[k];
                i_thr[k] = ( i_max[k] + i_min[k] + 1 )/2;
                /* track the block with the largest dynamic range */
                if( i_max_range < i_range[k] )
                {
                    i_max_range = i_range[k];
                    i_kmax = k;
                }
            }

            /* Now rearrange thr */
            if( i_max_range > 64 )
            {
                /* i_thr[i_kmax] is never overwritten here since its range
                 * is above 64 */
                for( k = 0; k < 4; k++ )
                {
                    if( i_range[k] < 16 )
                    {
                        i_thr[k] = 0;
                    }
                    else
                    if( i_range[k] < 32 )
                    {
                        i_thr[k] = i_thr[i_kmax];
                    }
                }
            }
            else
            {
                for( k = 0; k < 4; k++ )
                {
                    if( i_range[k] < 16 )
                    {
                        i_thr[k] = 0;
                    }
                }
            }

            /* 2: Index acquisition 10x10 ! so " -i_stride - 1"*/
            for( k = 0; k < 4; k++ )
            {
                pp_dering_BinIndex( p_block[k] - i_stride - 1, i_stride,
                                    i_thr[k], i_bin[k] );
            }

            /* 3: adaptive smoothing */
            /* since we begin at (8,8) QP can be different for each block */
            p_QP = &( p_QP_store[( y >> 4) * i_QP_stride + (x >> 4)] );

            pp_dering_Filter( p_block[0], i_stride,
                              i_bin[0], p_QP[0] );

            pp_dering_Filter( p_block[1], i_stride,
                              i_bin[1], p_QP[1] );

            pp_dering_Filter( p_block[2], i_stride,
                              i_bin[2], p_QP[i_QP_stride] );

            pp_dering_Filter( p_block[3], i_stride,
                              i_bin[3], p_QP[i_QP_stride+1] );

            p_block[0] += 8;
            p_block[1] += 8;
            p_block[2] += 8;
            p_block[3] += 8;
        }
    }
}
/*
 * pp_dering_C: dering filter for a chrominance plane.
 *
 * Every 8x8 block that does not touch the plane border is handled on its
 * own: its threshold is ( max + min + 1 ) / 2, the binary indexes are taken
 * on the enclosing 10x10 area, and the block is then smoothed using the QP
 * found at p_QP_store[(y>>5)*i_QP_stride + (x>>5)].
 */
void E_( pp_dering_C )( u8 *p_plane,
                        int i_width, int i_height, int i_stride,
                        QT_STORE_T *p_QP_store, int i_QP_stride )
{
    int i_row, i_col;

    for( i_row = 8; i_row < i_height - 8; i_row += 8 )
    {
        /* First processed block of this row of blocks */
        u8 *p_cur = p_plane + i_row * i_stride + 8;

        for( i_col = 8; i_col < i_width - 8; i_col += 8, p_cur += 8 )
        {
            u32 i_index[10];
            int i_lo, i_hi;
            int i_threshold;

            /* 1: threshold from the block extrema */
            pp_dering_MinMax( p_cur, i_stride, &i_lo, &i_hi );
            i_threshold = ( i_hi + i_lo + 1 ) / 2;

            /* 2: index acquisition; the 10x10 area starts one row and one
             *    column before the block */
            pp_dering_BinIndex( p_cur - i_stride - 1, i_stride,
                                i_threshold,
                                i_index );

            /* 3: adaptive smoothing */
            pp_dering_Filter( p_cur, i_stride,
                              i_index,
                              p_QP_store[( i_row >> 5 ) * i_QP_stride +
                                         ( i_col >> 5 )] );
        }
    }
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment