videolan / vlc-2-2

Commit 925e40a3
Authored Nov 17, 2006 by Derk-Jan Hartman

* Remove the old swscale files, and use the newer libswscale from ffmpeg.
  Untested and likely to break stuff

Parent: 3a2462f6

Showing 19 changed files with 44 additions and 12104 deletions (+44 -12104):
configure.ac                                              +6  -1
modules/codec/ffmpeg/Modules.am                           +1  -0
modules/codec/ffmpeg/ffmpeg.c                             +19 -2
modules/codec/ffmpeg/ffmpeg.h                             +6  -0
modules/codec/ffmpeg/scale.c                              +12 -79
modules/video_filter/swscale/Modules.am                   +0  -20
modules/video_filter/swscale/common.h                     +0  -136
modules/video_filter/swscale/rgb2rgb.c                    +0  -598
modules/video_filter/swscale/rgb2rgb.h                    +0  -143
modules/video_filter/swscale/rgb2rgb_template.c           +0  -2634
modules/video_filter/swscale/swscale.c                    +0  -2562
modules/video_filter/swscale/swscale.h                    +0  -130
modules/video_filter/swscale/swscale_altivec_template.c   +0  -532
modules/video_filter/swscale/swscale_internal.h           +0  -156
modules/video_filter/swscale/swscale_template.c           +0  -2834
modules/video_filter/swscale/yuv2rgb.c                    +0  -850
modules/video_filter/swscale/yuv2rgb_altivec.c            +0  -799
modules/video_filter/swscale/yuv2rgb_mlib.c               +0  -88
modules/video_filter/swscale/yuv2rgb_template.c           +0  -540
configure.ac

@@ -2789,8 +2789,14 @@ dnl Trying with pkg-config
       VLC_ADD_LDFLAGS([ffmpeg stream_out_switcher],[${FFMPEG_LIBS}])
       dnl newer ffmpeg have a separate libpostproc
       PKG_CHECK_MODULES(POSTPROC, libpostproc,[
+        VLC_ADD_CFLAGS([ffmpeg],[${POSTPROC_CFLAGS}])
         VLC_ADD_LDFLAGS([ffmpeg],[${POSTPROC_LIBS}])
       ],[ true ])
+      dnl even newer ffmpeg has a libswscale
+      PKG_CHECK_MODULES(SWSCALE, libswscale,[
+        VLC_ADD_CFLAGS([ffmpeg],[${SWSCALE_CFLAGS}])
+        VLC_ADD_LDFLAGS([ffmpeg],[${SWSCALE_LIBS}])
+      ],[ true ])
     ],[
     dnl

@@ -5637,7 +5643,6 @@ AC_CONFIG_FILES([
   modules/stream_out/transrate/Makefile
   modules/video_chroma/Makefile
   modules/video_filter/Makefile
-  modules/video_filter/swscale/Makefile
   modules/video_output/Makefile
   modules/video_output/directx/Makefile
   modules/video_output/qte/Makefile
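A note on what this check buys (context, not part of the commit): PKG_CHECK_MODULES(SWSCALE, libswscale, ...) asks pkg-config for libswscale and, on success, folds its compile and link flags into the ffmpeg plugin; the [ true ] fallback keeps configure going when the library is absent. A minimal sketch of the API this enables, with hypothetical buffers and dimensions:

#include <ffmpeg/avcodec.h>
#include <ffmpeg/swscale.h>

/* Illustrative only: scale one BGR24 frame to half size with the
 * libswscale calls made available by the check above. */
static int half_scale_bgr24( uint8_t *p_src, uint8_t *p_dst,
                             int i_width, int i_height )
{
    struct SwsContext *p_ctx =
        sws_getContext( i_width, i_height, PIX_FMT_BGR24,
                        i_width / 2, i_height / 2, PIX_FMT_BGR24,
                        SWS_BICUBIC, NULL, NULL, NULL );
    uint8_t *src[3]    = { p_src, NULL, NULL };
    uint8_t *dst[3]    = { p_dst, NULL, NULL };
    int src_stride[3]  = { 3 * i_width, 0, 0 };
    int dst_stride[3]  = { 3 * (i_width / 2), 0, 0 };

    if( !p_ctx ) return -1;
    sws_scale( p_ctx, src, src_stride, 0, i_height, dst, dst_stride );
    sws_freeContext( p_ctx );
    return 0;
}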
modules/codec/ffmpeg/Modules.am

@@ -9,6 +9,7 @@ SOURCES_ffmpeg = \
     postprocess.c \
     demux.c \
     mux.c \
+    scale.c \
     $(NULL)

 SOURCES_ffmpegaltivec = \
modules/codec/ffmpeg/ffmpeg.c

@@ -76,6 +76,14 @@ static char *nloopf_list_text[] =
 static char *enc_hq_list[] = { "rd", "bits", "simple" };
 static char *enc_hq_list_text[] = { N_("rd"), N_("bits"), N_("simple") };

+static int pi_mode_values[] = { 0, 1, 2, 4, 8, 5, 6, 9, 10 };
+static char *ppsz_mode_descriptions[] =
+{ N_("Fast bilinear"), N_("Bilinear"), N_("Bicubic (good quality)"),
+  N_("Experimental"), N_("Nearest neighbour (bad quality)"), N_("Area"),
+  N_("Luma bicubic / chroma bilinear"), N_("Gauss"), N_("SincR"),
+  N_("Lanczos"), N_("Bicubic spline") };
+
 /*****************************************************************************
  * Module descriptor
  *****************************************************************************/

@@ -216,6 +224,16 @@ vlc_module_begin();
     set_description( _("FFmpeg deinterlace video filter") );
     add_shortcut( "ffmpeg-deinterlace" );

+    /* video filter submodule */
+    add_submodule();
+    set_description( _("Video scaling filter") );
+    set_capability( "video filter2", 1000 );
+    set_category( CAT_VIDEO );
+    set_subcategory( SUBCAT_VIDEO_VFILTER );
+    set_callbacks( E_(OpenScaler), E_(CloseScaler) );
+    add_integer( "swscale-mode", 0, NULL, SCALEMODE_TEXT, SCALEMODE_LONGTEXT, VLC_TRUE );
+        change_integer_list( pi_mode_values, ppsz_mode_descriptions, 0 );
+
     var_Create( p_module->p_libvlc_global, "avcodec", VLC_VAR_MUTEX );
 vlc_module_end();

@@ -455,9 +473,8 @@ static struct
     /* Packed YUV formats */
     { VLC_FOURCC('Y','U','Y','2'), PIX_FMT_YUV422 },
-#if LIBAVCODEC_BUILD >= 4720
+    { VLC_FOURCC('Y','U','Y','V'), PIX_FMT_YUV422 },
     { VLC_FOURCC('U','Y','V','Y'), PIX_FMT_UYVY422 },
-#endif

     /* Packed RGB formats */
     { VLC_FOURCC('R','V','1','5'), PIX_FMT_RGB555 },
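The mode descriptions registered above line up with libswscale's scaler-selection flags (SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC, SWS_X, SWS_POINT, SWS_AREA, SWS_BICUBLIN, SWS_GAUSS, SWS_SINC, SWS_LANCZOS, SWS_SPLINE). A hypothetical translation helper, not code from this commit, might look like:

/* Sketch only: map a stored swscale-mode index onto one of
 * libswscale's SWS_* scaler flags before calling sws_getContext(). */
static int ModeToSwsFlags( int i_mode )
{
    static const int pi_flags[] =
    {
        SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC, SWS_X,
        SWS_POINT, SWS_AREA, SWS_BICUBLIN, SWS_GAUSS,
        SWS_SINC, SWS_LANCZOS, SWS_SPLINE
    };
    if( i_mode < 0 || i_mode >= (int)(sizeof(pi_flags)/sizeof(pi_flags[0])) )
        i_mode = 0;
    return pi_flags[i_mode];
}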
modules/codec/ffmpeg/ffmpeg.h

@@ -78,6 +78,8 @@ int E_(OpenCropPadd)( vlc_object_t * );
 void E_(CloseFilter)( vlc_object_t * );
 int  E_(OpenDeinterlace)( vlc_object_t * );
 void E_(CloseDeinterlace)( vlc_object_t * );
+int  E_(OpenScaler)( vlc_object_t * );
+void E_(CloseScaler)( vlc_object_t * );

 /* Postprocessing module */
 void *E_(OpenPostproc)( decoder_t *, vlc_bool_t * );

@@ -297,3 +299,7 @@ N_("<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...
 #define ENC_CHROMA_ELIM_LONGTEXT N_( "Eliminates chrominance blocks when " \
     "the PSNR isn't much changed (default: 0.0). The H264 specification " \
     "recommends 7." )
+
+#define SCALEMODE_TEXT N_("Scaling mode")
+#define SCALEMODE_LONGTEXT N_("Scaling mode to use.")
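For readers unfamiliar with the E_() wrapper used in these prototypes: in this era of VLC it token-pastes the function name with the module symbol so the same source can be built into different plugins without symbol clashes. Roughly (a sketch of the idea; the exact definition lives in VLC's module headers, not in this commit):

/* Rough sketch, names assumed: */
#define MODULE_SYMBOL ffmpeg
#define CONCAT_( a, b ) a ## __ ## b
#define CONCAT( a, b )  CONCAT_( a, b )
#define E_( function )  CONCAT( function, MODULE_SYMBOL )

int E_(OpenScaler)( vlc_object_t * );   /* expands to something like OpenScaler__ffmpeg */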
modules/video_filter/swscale/filter.c → modules/codec/ffmpeg/scale.c

@@ -28,8 +28,10 @@
 #include <vlc/decoder.h>

 #include "vlc_filter.h"
-#include "common.h"
-#include "swscale.h"
+#include <ffmpeg/swscale.h>
+#include <ffmpeg/avcodec.h>
+#include <ffmpeg/avutil.h>
+#include "ffmpeg.h"

 void *(*swscale_fast_memcpy)( void *, const void *, size_t );
@@ -50,41 +52,19 @@ struct filter_sys_t
 /****************************************************************************
  * Local prototypes
  ****************************************************************************/
-static int  OpenFilter ( vlc_object_t * );
-static void CloseFilter( vlc_object_t * );
 static picture_t *Filter( filter_t *, picture_t * );
 static int CheckInit( filter_t * );
-static int GetSwscaleChroma( vlc_fourcc_t );
-
-/*****************************************************************************
- * Module descriptor
- *****************************************************************************/
-#define MODE_TEXT N_("Scaling mode")
-#define MODE_LONGTEXT N_("Scaling mode to use.")
-
-static int pi_mode_values[] = { 0, 1, 2, 4, 8, 5, 6, 9, 10 };
 static char *ppsz_mode_descriptions[] =
 { N_("Fast bilinear"), N_("Bilinear"), N_("Bicubic (good quality)"),
   N_("Experimental"), N_("Nearest neighbour (bad quality)"), N_("Area"),
   N_("Luma bicubic / chroma bilinear"), N_("Gauss"), N_("SincR"),
   N_("Lanczos"), N_("Bicubic spline") };
-
-vlc_module_begin();
-    set_description( _("Video scaling filter") );
-    set_capability( "video filter2", 1000 );
-    set_category( CAT_VIDEO );
-    set_subcategory( SUBCAT_VIDEO_VFILTER );
-    set_callbacks( OpenFilter, CloseFilter );
-    add_integer( "swscale-mode", 0, NULL, MODE_TEXT, MODE_LONGTEXT, VLC_TRUE );
-        change_integer_list( pi_mode_values, ppsz_mode_descriptions, 0 );
-vlc_module_end();

 /*****************************************************************************
- * OpenFilter: probe the filter and return score
+ * OpenScaler: probe the filter and return score
  *****************************************************************************/
-static int OpenFilter( vlc_object_t *p_this )
+int E_(OpenScaler)( vlc_object_t *p_this )
 {
     filter_t *p_filter = (filter_t*)p_this;
     filter_sys_t *p_sys;
@@ -99,14 +79,14 @@ static int OpenFilter( vlc_object_t *p_this )
     /* Supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24,
      * BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09 */
-    if( !(i_fmt_in = GetSwscaleChroma( p_filter->fmt_in.video.i_chroma )) )
+    if( !(i_fmt_in = E_(GetFfmpegChroma)( p_filter->fmt_in.video.i_chroma )) )
     {
         return VLC_EGENERIC;
     }

     /* Supported output formats: YV12, I420/IYUV, YUY2, UYVY,
      * {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09 */
-    if( !(i_fmt_out = GetSwscaleChroma( p_filter->fmt_out.video.i_chroma )) )
+    if( !(i_fmt_out = E_(GetFfmpegChroma)( p_filter->fmt_out.video.i_chroma )) )
     {
         return VLC_EGENERIC;
     }
@@ -198,7 +178,7 @@ static int OpenFilter( vlc_object_t *p_this )
 /*****************************************************************************
  * CloseFilter: clean up the filter
  *****************************************************************************/
-static void CloseFilter( vlc_object_t *p_this )
+void E_(CloseScaler)( vlc_object_t *p_this )
 {
     filter_t *p_filter = (filter_t*)p_this;
     filter_sys_t *p_sys = p_filter->p_sys;
@@ -222,8 +202,8 @@ static int CheckInit( filter_t *p_filter )
 {
     unsigned int i_fmt_in, i_fmt_out;

-    if( !(i_fmt_in = GetSwscaleChroma( p_filter->fmt_in.video.i_chroma )) ||
-        !(i_fmt_out = GetSwscaleChroma( p_filter->fmt_out.video.i_chroma )) )
+    if( !(i_fmt_in = E_(GetFfmpegChroma)( p_filter->fmt_in.video.i_chroma )) ||
+        !(i_fmt_out = E_(GetFfmpegChroma)( p_filter->fmt_out.video.i_chroma )) )
     {
         msg_Err( p_filter, "format not supported" );
         return VLC_EGENERIC;
@@ -237,7 +217,7 @@ static int CheckInit( filter_t *p_filter )
                          p_filter->fmt_out.video.i_width,
                          p_filter->fmt_out.video.i_height, i_fmt_out,
                          p_sys->i_sws_flags | p_sys->i_cpu_mask,
-                         p_sys->p_src_filter, p_sys->p_dst_filter );
+                         p_sys->p_src_filter, p_sys->p_dst_filter, 0 );
         if( !p_sys->ctx )
         {
             msg_Err( p_filter, "could not init SwScaler" );
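The extra trailing 0 matches libswscale's sws_getContext(), whose final parameter is a tuning array used by some scalers (for example the Gaussian and bicubic filters); passing 0/NULL selects the defaults. For reference, the signature as of this era (a paraphrase for context, not a quote from the commit):

struct SwsContext *sws_getContext( int srcW, int srcH, int srcFormat,
                                   int dstW, int dstH, int dstFormat,
                                   int flags, SwsFilter *srcFilter,
                                   SwsFilter *dstFilter, double *param );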
@@ -308,50 +288,3 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
     return p_pic_dst;
 }
-
-/*****************************************************************************
- * Chroma fourcc -> ffmpeg_id mapping
- *****************************************************************************/
-static struct
-{
-    vlc_fourcc_t i_chroma;
-    unsigned int i_swscale_chroma;
-}
-chroma_table[] =
-{
-    /* Planar YUV formats */
-    { VLC_FOURCC('Y','V','1','2'), IMGFMT_YV12 },
-    { VLC_FOURCC('I','4','2','0'), IMGFMT_I420 },
-    { VLC_FOURCC('I','Y','U','V'), IMGFMT_IYUV },
-    { VLC_FOURCC('I','4','4','4'), IMGFMT_444P },
-    { VLC_FOURCC('Y','U','V','A'), IMGFMT_444P },
-    { VLC_FOURCC('I','4','2','2'), IMGFMT_422P },
-    { VLC_FOURCC('I','4','1','1'), IMGFMT_411P },
-#if 0
-    { VLC_FOURCC('Y','U','V','P'), IMGFMT_Y800 },
-#endif
-
-    /* Packed YUV formats */
-    { VLC_FOURCC('U','Y','V','Y'), IMGFMT_UYVY },
-    { VLC_FOURCC('Y','U','Y','2'), IMGFMT_YUY2 },
-
-    /* Packed RGB formats */
-    { VLC_FOURCC('R','V','1','5'), IMGFMT_RGB15 },
-    { VLC_FOURCC('R','V','1','6'), IMGFMT_RGB16 },
-    { VLC_FOURCC('R','V','2','4'), IMGFMT_RGB24 },
-    { VLC_FOURCC('R','V','3','2'), IMGFMT_RGB32 },
-    { VLC_FOURCC('G','R','E','Y'), IMGFMT_RGB8 },
-
-    { 0 }
-};
-
-static int GetSwscaleChroma( vlc_fourcc_t i_chroma )
-{
-    int i;
-
-    for( i = 0; chroma_table[i].i_chroma != 0; i++ )
-    {
-        if( chroma_table[i].i_chroma == i_chroma )
-            return chroma_table[i].i_swscale_chroma;
-    }
-    return 0;
-}
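With the private table gone, scale.c leans on the fourcc-to-PIX_FMT table the ffmpeg module already maintains. A sketch of what the shared lookup plausibly looks like (shape and field names assumed, not copied from the commit):

/* Assumed sketch of E_(GetFfmpegChroma): a linear scan over the
 * ffmpeg module's VLC fourcc -> PIX_FMT table. */
int E_(GetFfmpegChroma)( vlc_fourcc_t i_chroma )
{
    int i;
    for( i = 0; chroma_table[i].i_chroma != 0; i++ )
    {
        if( chroma_table[i].i_chroma == i_chroma )
            return chroma_table[i].i_chroma_id;
    }
    return 0;
}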
modules/video_filter/swscale/Modules.am (deleted)
SOURCES_swscale = \
swscale.c \
swscale.h \
swscale_internal.h \
yuv2rgb.c \
rgb2rgb.c \
rgb2rgb.h \
common.h \
filter.c \
$(NULL)
EXTRA_DIST += \
swscale_template.c \
swscale_altivec_template.c \
yuv2rgb_template.c \
yuv2rgb_altivec.c \
yuv2rgb_mlib.c \
rgb2rgb_template.c \
$(NULL)
modules/video_filter/swscale/common.h (deleted)
/*****************************************************************************
* common.h: a few defines and wrappers for swscale
*****************************************************************************
* Copyright (C) 1999-2004 the VideoLAN team
* $Id$
*
* Authors: Gildas Bazin <gbazin@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#define mp_msg(a,b,args... ) //fprintf(stderr, ##args)
#define vo_format_name(a) ""
#ifndef _VLC_FILTER_H
extern void *(*swscale_fast_memcpy)( void *, const void *, int );
#endif
#define memcpy(a,b,c) swscale_fast_memcpy(a,b,c)

#if defined(__CYGWIN__) || defined(__MINGW32__) || defined(__OS2__) || \
    (defined(__OpenBSD__) && !defined(__ELF__))
#define MANGLE(a) "_" #a
#else
#define MANGLE(a) #a
#endif

#ifdef ARCH_X86
static inline unsigned short ByteSwap16(unsigned short x)
{
    __asm("xchgb %b0,%h0"
          : "=q" (x)
          : "0" (x));
    return x;
}
#define bswap_16(x) ByteSwap16(x)
#else
#define bswap_16(x) (((x) & 0x00ff) << 8 | ((x) & 0xff00) >> 8)
#endif /* !ARCH_X86 */
/* SWSCALE image formats */
#define IMGFMT_RGB_MASK 0xFFFFFF00
#define IMGFMT_RGB (('R'<<24)|('G'<<16)|('B'<<8))
#define IMGFMT_RGB1 (IMGFMT_RGB|1)
#define IMGFMT_RGB4 (IMGFMT_RGB|4)
#define IMGFMT_RGB4_CHAR (IMGFMT_RGB|4|128) // RGB4 with 1 pixel per byte
#define IMGFMT_RGB8 (IMGFMT_RGB|8)
#define IMGFMT_RGB15 (IMGFMT_RGB|15)
#define IMGFMT_RGB16 (IMGFMT_RGB|16)
#define IMGFMT_RGB24 (IMGFMT_RGB|24)
#define IMGFMT_RGB32 (IMGFMT_RGB|32)
#define IMGFMT_BGR_MASK 0xFFFFFF00
#define IMGFMT_BGR (('B'<<24)|('G'<<16)|('R'<<8))
#define IMGFMT_BGR1 (IMGFMT_BGR|1)
#define IMGFMT_BGR4 (IMGFMT_BGR|4)
#define IMGFMT_BGR4_CHAR (IMGFMT_BGR|4|128) // BGR4 with 1 pixel per byte
#define IMGFMT_BGR8 (IMGFMT_BGR|8)
#define IMGFMT_BGR15 (IMGFMT_BGR|15)
#define IMGFMT_BGR16 (IMGFMT_BGR|16)
#define IMGFMT_BGR24 (IMGFMT_BGR|24)
#define IMGFMT_BGR32 (IMGFMT_BGR|32)
#ifdef WORDS_BIGENDIAN
#define IMGFMT_ABGR IMGFMT_RGB32
#define IMGFMT_BGRA (IMGFMT_RGB32|64)
#define IMGFMT_ARGB IMGFMT_BGR32
#define IMGFMT_RGBA (IMGFMT_BGR32|64)
#else
#define IMGFMT_ABGR (IMGFMT_BGR32|64)
#define IMGFMT_BGRA IMGFMT_BGR32
#define IMGFMT_ARGB (IMGFMT_RGB32|64)
#define IMGFMT_RGBA IMGFMT_RGB32
#endif
/* old names for compatibility */
#define IMGFMT_RG4B IMGFMT_RGB4_CHAR
#define IMGFMT_BG4B IMGFMT_BGR4_CHAR
#define IMGFMT_IS_RGB(fmt) (((fmt)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define IMGFMT_IS_BGR(fmt) (((fmt)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
#define IMGFMT_RGB_DEPTH(fmt) ((fmt)&0x3F)
#define IMGFMT_BGR_DEPTH(fmt) ((fmt)&0x3F)
/* Planar YUV Formats */
#define IMGFMT_YVU9 0x39555659
#define IMGFMT_IF09 0x39304649
#define IMGFMT_YV12 0x32315659
#define IMGFMT_I420 0x30323449
#define IMGFMT_IYUV 0x56555949
#define IMGFMT_CLPL 0x4C504C43
#define IMGFMT_Y800 0x30303859
#define IMGFMT_Y8 0x20203859
#define IMGFMT_NV12 0x3231564E
#define IMGFMT_NV21 0x3132564E
/* unofficial Planar Formats, FIXME if official 4CC exists */
#define IMGFMT_444P 0x50343434
#define IMGFMT_422P 0x50323234
#define IMGFMT_411P 0x50313134
#define IMGFMT_HM12 0x32314D48
/* Packed YUV Formats */
#define IMGFMT_IUYV 0x56595549
#define IMGFMT_IY41 0x31435949
#define IMGFMT_IYU1 0x31555949
#define IMGFMT_IYU2 0x32555949
#define IMGFMT_UYVY 0x59565955
#define IMGFMT_UYNV 0x564E5955
#define IMGFMT_cyuv 0x76757963
#define IMGFMT_Y422 0x32323459
#define IMGFMT_YUY2 0x32595559
#define IMGFMT_YUNV 0x564E5559
#define IMGFMT_YVYU 0x55595659
#define IMGFMT_Y41P 0x50313459
#define IMGFMT_Y211 0x31313259
#define IMGFMT_Y41T 0x54313459
#define IMGFMT_Y42T 0x54323459
#define IMGFMT_V422 0x32323456
#define IMGFMT_V655 0x35353656
#define IMGFMT_CLJR 0x524A4C43
#define IMGFMT_YUVP 0x50565559
#define IMGFMT_UYVP 0x50565955
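The planar-format constants above are fourcc codes stored little-endian, so the hex values decode back to their four-character names. A quick standalone check (illustration, not part of the original header):

#include <stdio.h>

int main( void )
{
    unsigned fmt = 0x32315659;  /* IMGFMT_YV12 */
    printf( "%c%c%c%c\n", fmt & 0xFF, (fmt >> 8) & 0xFF,
            (fmt >> 16) & 0xFF, (fmt >> 24) & 0xFF );  /* prints "YV12" */
    return 0;
}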
modules/video_filter/swscale/rgb2rgb.c (deleted)
/*
*
* rgb2rgb.c, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* Written by Nick Kurshev.
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
*/
#include <inttypes.h>
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "common.h"
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
void (*rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
//void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
//void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);

void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride);
void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
    int srcStride, int dstStride);
void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
    unsigned width, unsigned height, int src1Stride,
    int src2Stride, int dstStride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
    uint8_t *dst1, uint8_t *dst2,
    unsigned width, unsigned height,
    int srcStride1, int srcStride2,
    int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
    uint8_t *dst,
    unsigned width, unsigned height,
    int srcStride1, int srcStride2,
    int srcStride3, int dstStride);
#ifdef ARCH_X86
static const uint64_t mmx_null     __attribute__((aligned(8))) = 0x0000000000000000ULL;
static const uint64_t mmx_one      __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
static const uint64_t mask32b      attribute_used __attribute__((aligned(8))) = 0x000000FF000000FFULL;
static const uint64_t mask32g      attribute_used __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
static const uint64_t mask32r      attribute_used __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
static const uint64_t mask32       __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
static const uint64_t mask3216br   __attribute__((aligned(8))) = 0x00F800F800F800F8ULL;
static const uint64_t mask3216g    __attribute__((aligned(8))) = 0x0000FC000000FC00ULL;
static const uint64_t mask3215g    __attribute__((aligned(8))) = 0x0000F8000000F800ULL;
static const uint64_t mul3216      __attribute__((aligned(8))) = 0x2000000420000004ULL;
static const uint64_t mul3215      __attribute__((aligned(8))) = 0x2000000820000008ULL;
static const uint64_t mask24b      attribute_used __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL;
static const uint64_t mask24g      attribute_used __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL;
static const uint64_t mask24r      attribute_used __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL;
static const uint64_t mask24l      __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h      __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh     __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh    __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh   __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
static const uint64_t mask15b      __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
static const uint64_t mask15rg     __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
static const uint64_t mask15s      __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
static const uint64_t mask15g      __attribute__((aligned(8))) = 0x03E003E003E003E0ULL;
static const uint64_t mask15r      __attribute__((aligned(8))) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
static const uint64_t mask16g      __attribute__((aligned(8))) = 0x07E007E007E007E0ULL;
static const uint64_t mask16r      __attribute__((aligned(8))) = 0xF800F800F800F800ULL;
static const uint64_t red_16mask   __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
static const uint64_t blue_16mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
static const uint64_t red_15mask   __attribute__((aligned(8))) = 0x00007c000000f800ULL;
static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000007e0ULL;
static const uint64_t blue_15mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;

#ifdef FAST_BGR2YV12
static const uint64_t bgr2YCoeff   attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL;
static const uint64_t bgr2UCoeff   attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
static const uint64_t bgr2VCoeff   attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
#else
static const uint64_t bgr2YCoeff   attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL;
static const uint64_t bgr2UCoeff   attribute_used __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
static const uint64_t bgr2VCoeff   attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
#endif
static const uint64_t bgr2YOffset  attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL;
static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8))) = 0x8080808080808080ULL;
static const uint64_t w1111        attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL;
#if 0
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;
static uint64_t __attribute__((aligned(8))) dither4[2]={
0x0103010301030103LL,
0x0200020002000200LL,};
static uint64_t __attribute__((aligned(8))) dither8[2]={
0x0602060206020602LL,
0x0004000400040004LL,};
#endif
#endif
#define RGB2YUV_SHIFT 8
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
//Plain C versions
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _C
#include "rgb2rgb_template.c"
#ifdef ARCH_X86
//MMX versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"
//MMX2 versions
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _MMX2
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _3DNOW
#include "rgb2rgb_template.c"
#endif //ARCH_X86
/*
rgb15->rgb16 Original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMX2, 3DNOW optimization by Nick Kurshev
32bit c version, and and&add trick by Michael Niedermayer
*/
void sws_rgb2rgb_init(int flags){
#ifdef ARCH_X86
    if(flags & SWS_CPU_CAPS_MMX2){
        rgb15to16= rgb15to16_MMX2;
        rgb15to24= rgb15to24_MMX2;
        rgb15to32= rgb15to32_MMX2;
        rgb16to24= rgb16to24_MMX2;
        rgb16to32= rgb16to32_MMX2;
        rgb16to15= rgb16to15_MMX2;
        rgb24to16= rgb24to16_MMX2;
        rgb24to15= rgb24to15_MMX2;
        rgb24to32= rgb24to32_MMX2;
        rgb32to16= rgb32to16_MMX2;
        rgb32to15= rgb32to15_MMX2;
        rgb32to24= rgb32to24_MMX2;
        rgb24tobgr15= rgb24tobgr15_MMX2;
        rgb24tobgr16= rgb24tobgr16_MMX2;
        rgb24tobgr24= rgb24tobgr24_MMX2;
        rgb32tobgr32= rgb32tobgr32_MMX2;
        rgb32tobgr16= rgb32tobgr16_MMX2;
        rgb32tobgr15= rgb32tobgr15_MMX2;
        yv12toyuy2= yv12toyuy2_MMX2;
        yv12touyvy= yv12touyvy_MMX2;
        yuv422ptoyuy2= yuv422ptoyuy2_MMX2;
        yuy2toyv12= yuy2toyv12_MMX2;
//      uyvytoyv12= uyvytoyv12_MMX2;
//      yvu9toyv12= yvu9toyv12_MMX2;
        planar2x= planar2x_MMX2;
        rgb24toyv12= rgb24toyv12_MMX2;
        interleaveBytes= interleaveBytes_MMX2;
        vu9_to_vu12= vu9_to_vu12_MMX2;
        yvu9_to_yuy2= yvu9_to_yuy2_MMX2;
    }else if(flags & SWS_CPU_CAPS_3DNOW){
        rgb15to16= rgb15to16_3DNOW;
        rgb15to24= rgb15to24_3DNOW;
        rgb15to32= rgb15to32_3DNOW;
        rgb16to24= rgb16to24_3DNOW;
        rgb16to32= rgb16to32_3DNOW;
        rgb16to15= rgb16to15_3DNOW;
        rgb24to16= rgb24to16_3DNOW;
        rgb24to15= rgb24to15_3DNOW;
        rgb24to32= rgb24to32_3DNOW;
        rgb32to16= rgb32to16_3DNOW;
        rgb32to15= rgb32to15_3DNOW;
        rgb32to24= rgb32to24_3DNOW;
        rgb24tobgr15= rgb24tobgr15_3DNOW;
        rgb24tobgr16= rgb24tobgr16_3DNOW;
        rgb24tobgr24= rgb24tobgr24_3DNOW;
        rgb32tobgr32= rgb32tobgr32_3DNOW;
        rgb32tobgr16= rgb32tobgr16_3DNOW;
        rgb32tobgr15= rgb32tobgr15_3DNOW;
        yv12toyuy2= yv12toyuy2_3DNOW;
        yv12touyvy= yv12touyvy_3DNOW;
        yuv422ptoyuy2= yuv422ptoyuy2_3DNOW;
        yuy2toyv12= yuy2toyv12_3DNOW;
//      uyvytoyv12= uyvytoyv12_3DNOW;
//      yvu9toyv12= yvu9toyv12_3DNOW;
        planar2x= planar2x_3DNOW;
        rgb24toyv12= rgb24toyv12_3DNOW;
        interleaveBytes= interleaveBytes_3DNOW;
        vu9_to_vu12= vu9_to_vu12_3DNOW;
        yvu9_to_yuy2= yvu9_to_yuy2_3DNOW;
    }else if(flags & SWS_CPU_CAPS_MMX){
        rgb15to16= rgb15to16_MMX;
        rgb15to24= rgb15to24_MMX;
        rgb15to32= rgb15to32_MMX;
        rgb16to24= rgb16to24_MMX;
        rgb16to32= rgb16to32_MMX;
        rgb16to15= rgb16to15_MMX;
        rgb24to16= rgb24to16_MMX;
        rgb24to15= rgb24to15_MMX;
        rgb24to32= rgb24to32_MMX;
        rgb32to16= rgb32to16_MMX;
        rgb32to15= rgb32to15_MMX;
        rgb32to24= rgb32to24_MMX;
        rgb24tobgr15= rgb24tobgr15_MMX;
        rgb24tobgr16= rgb24tobgr16_MMX;
        rgb24tobgr24= rgb24tobgr24_MMX;
        rgb32tobgr32= rgb32tobgr32_MMX;
        rgb32tobgr16= rgb32tobgr16_MMX;
        rgb32tobgr15= rgb32tobgr15_MMX;
        yv12toyuy2= yv12toyuy2_MMX;
        yv12touyvy= yv12touyvy_MMX;
        yuv422ptoyuy2= yuv422ptoyuy2_MMX;
        yuy2toyv12= yuy2toyv12_MMX;
//      uyvytoyv12= uyvytoyv12_MMX;
//      yvu9toyv12= yvu9toyv12_MMX;
        planar2x= planar2x_MMX;
        rgb24toyv12= rgb24toyv12_MMX;
        interleaveBytes= interleaveBytes_MMX;
        vu9_to_vu12= vu9_to_vu12_MMX;
        yvu9_to_yuy2= yvu9_to_yuy2_MMX;
    }else
#endif
    {
        rgb15to16= rgb15to16_C;
        rgb15to24= rgb15to24_C;
        rgb15to32= rgb15to32_C;
        rgb16to24= rgb16to24_C;
        rgb16to32= rgb16to32_C;
        rgb16to15= rgb16to15_C;
        rgb24to16= rgb24to16_C;
        rgb24to15= rgb24to15_C;
        rgb24to32= rgb24to32_C;
        rgb32to16= rgb32to16_C;
        rgb32to15= rgb32to15_C;
        rgb32to24= rgb32to24_C;
        rgb24tobgr15= rgb24tobgr15_C;
        rgb24tobgr16= rgb24tobgr16_C;
        rgb24tobgr24= rgb24tobgr24_C;
        rgb32tobgr32= rgb32tobgr32_C;
        rgb32tobgr16= rgb32tobgr16_C;
        rgb32tobgr15= rgb32tobgr15_C;
        yv12toyuy2= yv12toyuy2_C;
        yv12touyvy= yv12touyvy_C;
        yuv422ptoyuy2= yuv422ptoyuy2_C;
        yuy2toyv12= yuy2toyv12_C;
//      uyvytoyv12= uyvytoyv12_C;
//      yvu9toyv12= yvu9toyv12_C;
        planar2x= planar2x_C;
        rgb24toyv12= rgb24toyv12_C;
        interleaveBytes= interleaveBytes_C;
        vu9_to_vu12= vu9_to_vu12_C;
        yvu9_to_yuy2= yvu9_to_yuy2_C;
    }
}
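Usage sketch for the dispatch table above (not from the file): callers initialise it once with the detected CPU capabilities, then call through the global pointers; the SWS_CPU_CAPS_* constants come from swscale.h.

#include "swscale.h"   /* SWS_CPU_CAPS_* */
#include "rgb2rgb.h"

static void convert_frame( const uint8_t *in, uint8_t *out, unsigned n_bytes )
{
    sws_rgb2rgb_init( SWS_CPU_CAPS_MMX2 );  /* or _3DNOW, _MMX, or 0 for plain C */
    rgb15to16( in, out, n_bytes );          /* now bound to rgb15to16_MMX2 */
}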
/**
 * Pallete is assumed to contain bgr32
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;

/*
    for(i=0; i<num_pixels; i++)
        ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
*/

    for(i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[ src[i]*4+2 ];
        dst[1]= palette[ src[i]*4+1 ];
        dst[2]= palette[ src[i]*4+0 ];
//      dst[3]= 0; /* do we need this cleansing? */
        dst+= 4;
    }
}

void palette8tobgr32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[ src[i]*4+0 ];
        dst[1]= palette[ src[i]*4+1 ];
        dst[2]= palette[ src[i]*4+2 ];
//      dst[3]= 0; /* do we need this cleansing? */
        dst+= 4;
    }
}

/**
 * Pallete is assumed to contain bgr32
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
/*
    writes 1 byte o much and might cause alignment issues on some architectures?
    for(i=0; i<num_pixels; i++)
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
*/
    for(i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[ src[i]*4+2 ];
        dst[1]= palette[ src[i]*4+1 ];
        dst[2]= palette[ src[i]*4+0 ];
        dst+= 3;
    }
}

void palette8tobgr24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
/*
    writes 1 byte o much and might cause alignment issues on some architectures?
    for(i=0; i<num_pixels; i++)
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
*/
    for(i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[ src[i]*4+0 ];
        dst[1]= palette[ src[i]*4+1 ];
        dst[2]= palette[ src[i]*4+2 ];
        dst+= 3;
    }
}

/**
 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
void palette8tobgr16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]);
}

/**
 * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}
void palette8tobgr15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
    unsigned i;
    for(i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = bswap_16(((uint16_t *)palette)[ src[i] ]);
}
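For clarity on the indexing above (an illustrative fragment, not from the file): the palette holds 4 bytes per entry in bgr32 order, so entry v occupies palette[v*4 + 0..3] = { B, G, R, x }. The rgb32 writer therefore reads bytes 2, 1, 0 while the bgr32 writer copies 0, 1, 2; byte 3 is left untouched, as the commented-out "cleansing" line notes.

/* Hypothetical single-pixel version of palette8torgb32's inner loop: */
static void one_pixel_rgb32( const uint8_t *palette, uint8_t v, uint8_t *dst )
{
    dst[0] = palette[v*4 + 2];  /* R */
    dst[1] = palette[v*4 + 1];  /* G */
    dst[2] = palette[v*4 + 0];  /* B */
    /* dst[3] deliberately left as-is, matching the original */
}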
void rgb32tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
        dst[3*i + 0] = src[4*i + 2];
        dst[3*i + 1] = src[4*i + 1];
        dst[3*i + 2] = src[4*i + 0];
    }
}

void rgb24tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    for(i=0; 3*i<src_size; i++)
    {
        dst[4*i + 0] = src[3*i + 2];
        dst[4*i + 1] = src[3*i + 1];
        dst[4*i + 2] = src[3*i + 0];
        dst[4*i + 3] = 0;
    }
}

void rgb16tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint16_t *end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
        *d++ = 0;
    }
}

void rgb16tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint16_t *end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
    }
}

void rgb16tobgr16(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    unsigned num_pixels = src_size >> 1;
    for(i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = src[2*i];
        r = rgb&0x1F;
        g = (rgb&0x7E0)>>5;
        b = (rgb&0xF800)>>11;
        dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
    }
}

void rgb16tobgr15(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    unsigned num_pixels = src_size >> 1;
    for(i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = src[2*i];
        r = rgb&0x1F;
        g = (rgb&0x7E0)>>5;
        b = (rgb&0xF800)>>11;
        dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
    }
}

void rgb15tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint16_t *end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
        *d++ = 0;
    }
}

void rgb15tobgr24(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint16_t *end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
    }
}

void rgb15tobgr16(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    unsigned num_pixels = src_size >> 1;
    for(i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = src[2*i];
        r = rgb&0x1F;
        g = (rgb&0x3E0)>>5;
        b = (rgb&0x7C00)>>10;
        dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
    }
}

void rgb15tobgr15(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    unsigned num_pixels = src_size >> 1;
    for(i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = src[2*i];
        r = rgb&0x1F;
        g = (rgb&0x3E0)>>5;
        b = (rgb&0x7C00)>>10;
        dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
    }
}

void rgb8tobgr8(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
    unsigned num_pixels = src_size;
    for(i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint8_t rgb;
        rgb = src[i];
        r = (rgb&0x07);
        g = (rgb&0x38)>>3;
        b = (rgb&0xC0)>>6;
        dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | ((r&0x03)<<6);
    }
}
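A worked note on the 555/565 bit layouts the conversions above juggle (illustration, not file content): RGB555 keeps five bits per channel at bits 0-4/5-9/10-14, while RGB565 widens green to six bits, so red and green shift up one position. The "and&add" trick credited to Michael Niedermayer in the init comment, x + (x & 0x7FE0), is exactly this shift:

#include <stdint.h>

static uint16_t px555to565( uint16_t x )
{
    return (uint16_t)( (x & 0x001F)            /* blue keeps bits 0-4        */
                     | ((x & 0x7FE0) << 1) );  /* green+red move up one bit; */
                                               /* green's new low bit is 0   */
}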
modules/video_filter/swscale/rgb2rgb.h (deleted)
/*
*
* rgb2rgb.h, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
*/
#ifndef RGB2RGB_INCLUDED
#define RGB2RGB_INCLUDED
// Note: do not fix the dependence on stdio.h
/* A full collection of rgb to rgb(bgr) convertors */
extern void (*rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size);

extern void rgb24tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb32tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr16(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb16tobgr15(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr24(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, unsigned src_size);
extern void rgb8tobgr8(const uint8_t *src, uint8_t *dst, unsigned src_size);

extern void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
*/
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride);

/**
 *
 * width should be a multiple of 16
 */
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride);

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 */
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride);

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 */
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride);

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
 */
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride);

extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
    int srcStride, int dstStride);

extern void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
    unsigned width, unsigned height, int src1Stride,
    int src2Stride, int dstStride);

extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
    uint8_t *dst1, uint8_t *dst2,
    unsigned width, unsigned height,
    int srcStride1, int srcStride2,
    int dstStride1, int dstStride2);

extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
    uint8_t *dst,
    unsigned width, unsigned height,
    int srcStride1, int srcStride2,
    int srcStride3, int dstStride);

#define MODE_RGB  0x1
#define MODE_BGR  0x2

static void yuv2rgb(uint8_t * image, uint8_t * py,
    uint8_t * pu, uint8_t * pv,
    unsigned h_size, unsigned v_size,
    int rgb_stride, int y_stride, int uv_stride){
printf("broken, this should use the swscaler\n");
}

static void yuv2rgb_init (unsigned bpp, int mode){
printf("broken, this should use the swscaler\n");
}

void sws_rgb2rgb_init(int flags);

#endif
modules/video_filter/swscale/rgb2rgb_template.c (deleted)
/*
*
* rgb2rgb.c, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* Written by Nick Kurshev.
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
*/
#include <stddef.h>
#include <inttypes.h>
/* for __WORDSIZE */
#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
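/* How this template is instantiated (context for readers, not new
 * behaviour): rgb2rgb.c defines RENAME before each #include of this
 * file, e.g.
 *
 *     #define RENAME(a) a ## _MMX2
 *     #include "rgb2rgb_template.c"
 *
 * so RENAME(rgb24to32) expands to rgb24to32_MMX2 and each inclusion
 * compiles one CPU-specific set of the functions below, selected at
 * runtime by sws_rgb2rgb_init(). */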
static
inline
void
RENAME
(
rgb24to32
)(
const
uint8_t
*
src
,
uint8_t
*
dst
,
unsigned
src_size
)
{
uint8_t
*
dest
=
dst
;
const
uint8_t
*
s
=
src
;
const
uint8_t
*
end
;
#ifdef HAVE_MMX
const
uint8_t
*
mm_end
;
#endif
end
=
s
+
src_size
;
#ifdef HAVE_MMX
__asm
__volatile
(
PREFETCH
" %0"
::
"m"
(
*
s
)
:
"memory"
);
mm_end
=
end
-
23
;
__asm
__volatile
(
"movq %0, %%mm7"
::
"m"
(
mask32
)
:
"memory"
);
while
(
s
<
mm_end
)
{
__asm
__volatile
(
PREFETCH
" 32%1
\n\t
"
"movd %1, %%mm0
\n\t
"
"punpckldq 3%1, %%mm0
\n\t
"
"movd 6%1, %%mm1
\n\t
"
"punpckldq 9%1, %%mm1
\n\t
"
"movd 12%1, %%mm2
\n\t
"
"punpckldq 15%1, %%mm2
\n\t
"
"movd 18%1, %%mm3
\n\t
"
"punpckldq 21%1, %%mm3
\n\t
"
"pand %%mm7, %%mm0
\n\t
"
"pand %%mm7, %%mm1
\n\t
"
"pand %%mm7, %%mm2
\n\t
"
"pand %%mm7, %%mm3
\n\t
"
MOVNTQ
" %%mm0, %0
\n\t
"
MOVNTQ
" %%mm1, 8%0
\n\t
"
MOVNTQ
" %%mm2, 16%0
\n\t
"
MOVNTQ
" %%mm3, 24%0"
:
"=m"
(
*
dest
)
:
"m"
(
*
s
)
:
"memory"
);
dest
+=
32
;
s
+=
24
;
}
__asm
__volatile
(
SFENCE
:::
"memory"
);
__asm
__volatile
(
EMMS
:::
"memory"
);
#endif
while
(
s
<
end
)
{
*
dest
++
=
*
s
++
;
*
dest
++
=
*
s
++
;
*
dest
++
=
*
s
++
;
*
dest
++
=
0
;
}
}
static
inline
void
RENAME
(
rgb32to24
)(
const
uint8_t
*
src
,
uint8_t
*
dst
,
unsigned
src_size
)
{
uint8_t
*
dest
=
dst
;
const
uint8_t
*
s
=
src
;
const
uint8_t
*
end
;
#ifdef HAVE_MMX
const
uint8_t
*
mm_end
;
#endif
end
=
s
+
src_size
;
#ifdef HAVE_MMX
__asm
__volatile
(
PREFETCH
" %0"
::
"m"
(
*
s
)
:
"memory"
);
mm_end
=
end
-
31
;
while
(
s
<
mm_end
)
{
__asm
__volatile
(
PREFETCH
" 32%1
\n\t
"
"movq %1, %%mm0
\n\t
"
"movq 8%1, %%mm1
\n\t
"
"movq 16%1, %%mm4
\n\t
"
"movq 24%1, %%mm5
\n\t
"
"movq %%mm0, %%mm2
\n\t
"
"movq %%mm1, %%mm3
\n\t
"
"movq %%mm4, %%mm6
\n\t
"
"movq %%mm5, %%mm7
\n\t
"
"psrlq $8, %%mm2
\n\t
"
"psrlq $8, %%mm3
\n\t
"
"psrlq $8, %%mm6
\n\t
"
"psrlq $8, %%mm7
\n\t
"
"pand %2, %%mm0
\n\t
"
"pand %2, %%mm1
\n\t
"
"pand %2, %%mm4
\n\t
"
"pand %2, %%mm5
\n\t
"
"pand %3, %%mm2
\n\t
"
"pand %3, %%mm3
\n\t
"
"pand %3, %%mm6
\n\t
"
"pand %3, %%mm7
\n\t
"
"por %%mm2, %%mm0
\n\t
"
"por %%mm3, %%mm1
\n\t
"
"por %%mm6, %%mm4
\n\t
"
"por %%mm7, %%mm5
\n\t
"
"movq %%mm1, %%mm2
\n\t
"
"movq %%mm4, %%mm3
\n\t
"
"psllq $48, %%mm2
\n\t
"
"psllq $32, %%mm3
\n\t
"
"pand %4, %%mm2
\n\t
"
"pand %5, %%mm3
\n\t
"
"por %%mm2, %%mm0
\n\t
"
"psrlq $16, %%mm1
\n\t
"
"psrlq $32, %%mm4
\n\t
"
"psllq $16, %%mm5
\n\t
"
"por %%mm3, %%mm1
\n\t
"
"pand %6, %%mm5
\n\t
"
"por %%mm5, %%mm4
\n\t
"
MOVNTQ
" %%mm0, %0
\n\t
"
MOVNTQ
" %%mm1, 8%0
\n\t
"
MOVNTQ
" %%mm4, 16%0"
:
"=m"
(
*
dest
)
:
"m"
(
*
s
),
"m"
(
mask24l
),
"m"
(
mask24h
),
"m"
(
mask24hh
),
"m"
(
mask24hhh
),
"m"
(
mask24hhhh
)
:
"memory"
);
dest
+=
24
;
s
+=
32
;
}
__asm
__volatile
(
SFENCE
:::
"memory"
);
__asm
__volatile
(
EMMS
:::
"memory"
);
#endif
while
(
s
<
end
)
{
*
dest
++
=
*
s
++
;
*
dest
++
=
*
s
++
;
*
dest
++
=
*
s
++
;
s
++
;
}
}
/*
Original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMX2, 3DNOW optimization by Nick Kurshev
32bit c version, and and&add trick by Michael Niedermayer
*/
static
inline
void
RENAME
(
rgb15to16
)(
const
uint8_t
*
src
,
uint8_t
*
dst
,
unsigned
src_size
)
{
register
const
uint8_t
*
s
=
src
;
register
uint8_t
*
d
=
dst
;
register
const
uint8_t
*
end
;
const
uint8_t
*
mm_end
;
end
=
s
+
src_size
;
#ifdef HAVE_MMX
__asm
__volatile
(
PREFETCH
" %0"
::
"m"
(
*
s
));
__asm
__volatile
(
"movq %0, %%mm4"
::
"m"
(
mask15s
));
mm_end
=
end
-
15
;
while
(
s
<
mm_end
)
{
__asm
__volatile
(
PREFETCH
" 32%1
\n\t
"
"movq %1, %%mm0
\n\t
"
"movq 8%1, %%mm2
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"pand %%mm4, %%mm0
\n\t
"
"pand %%mm4, %%mm2
\n\t
"
"paddw %%mm1, %%mm0
\n\t
"
"paddw %%mm3, %%mm2
\n\t
"
MOVNTQ
" %%mm0, %0
\n\t
"
MOVNTQ
" %%mm2, 8%0"
:
"=m"
(
*
d
)
:
"m"
(
*
s
)
);
d
+=
16
;
s
+=
16
;
}
__asm
__volatile
(
SFENCE
:::
"memory"
);
__asm
__volatile
(
EMMS
:::
"memory"
);
#endif
mm_end
=
end
-
3
;
while
(
s
<
mm_end
)
{
register
unsigned
x
=
*
((
uint32_t
*
)
s
);
*
((
uint32_t
*
)
d
)
=
(
x
&
0x7FFF7FFF
)
+
(
x
&
0x7FE07FE0
);
d
+=
4
;
s
+=
4
;
}
if
(
s
<
end
)
{
register
unsigned
short
x
=
*
((
uint16_t
*
)
s
);
*
((
uint16_t
*
)
d
)
=
(
x
&
0x7FFF
)
+
(
x
&
0x7FE0
);
}
}
static
inline
void
RENAME
(
rgb16to15
)(
const
uint8_t
*
src
,
uint8_t
*
dst
,
unsigned
src_size
)
{
register
const
uint8_t
*
s
=
src
;
register
uint8_t
*
d
=
dst
;
register
const
uint8_t
*
end
;
const
uint8_t
*
mm_end
;
end
=
s
+
src_size
;
#ifdef HAVE_MMX
__asm
__volatile
(
PREFETCH
" %0"
::
"m"
(
*
s
));
__asm
__volatile
(
"movq %0, %%mm7"
::
"m"
(
mask15rg
));
__asm
__volatile
(
"movq %0, %%mm6"
::
"m"
(
mask15b
));
mm_end
=
end
-
15
;
while
(
s
<
mm_end
)
{
__asm
__volatile
(
PREFETCH
" 32%1
\n\t
"
"movq %1, %%mm0
\n\t
"
"movq 8%1, %%mm2
\n\t
"
"movq %%mm0, %%mm1
\n\t
"
"movq %%mm2, %%mm3
\n\t
"
"psrlq $1, %%mm0
\n\t
"
"psrlq $1, %%mm2
\n\t
"
"pand %%mm7, %%mm0
\n\t
"
"pand %%mm7, %%mm2
\n\t
"
"pand %%mm6, %%mm1
\n\t
"
"pand %%mm6, %%mm3
\n\t
"
"por %%mm1, %%mm0
\n\t
"
"por %%mm3, %%mm2
\n\t
"
MOVNTQ
" %%mm0, %0
\n\t
"
MOVNTQ
" %%mm2, 8%0"
:
"=m"
(
*
d
)
:
"m"
(
*
s
)
);
d
+=
16
;
s
+=
16
;
}
__asm
__volatile
(
SFENCE
:::
"memory"
);
__asm
__volatile
(
EMMS
:::
"memory"
);
#endif
mm_end
=
end
-
3
;
while
(
s
<
mm_end
)
{
register
uint32_t
x
=
*
((
uint32_t
*
)
s
);
*
((
uint32_t
*
)
d
)
=
((
x
>>
1
)
&
0x7FE07FE0
)
|
(
x
&
0x001F001F
);
s
+=
4
;
d
+=
4
;
}
if
(
s
<
end
)
{
register
uint16_t
x
=
*
((
uint16_t
*
)
s
);
*
((
uint16_t
*
)
d
)
=
((
x
>>
1
)
&
0x7FE0
)
|
(
x
&
0x001F
);
s
+=
2
;
d
+=
2
;
}
}
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
    asm volatile(
        "movq %3, %%mm5\n\t"
        "movq %4, %%mm6\n\t"
        "movq %5, %%mm7\n\t"
        ".balign 16\n\t"
        "1:\n\t"
        PREFETCH" 32(%1)\n\t"
        "movd (%1), %%mm0\n\t"
        "movd 4(%1), %%mm3\n\t"
        "punpckldq 8(%1), %%mm0\n\t"
        "punpckldq 12(%1), %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm3, %%mm4\n\t"
        "pand %%mm6, %%mm0\n\t"
        "pand %%mm6, %%mm3\n\t"
        "pmaddwd %%mm7, %%mm0\n\t"
        "pmaddwd %%mm7, %%mm3\n\t"
        "pand %%mm5, %%mm1\n\t"
        "pand %%mm5, %%mm4\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "psrld $5, %%mm0\n\t"
        "pslld $11, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, (%0)\n\t"
        "addl $16, %1\n\t"
        "addl $8, %0\n\t"
        "cmpl %2, %1\n\t"
        " jb 1b\n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src = *s; s += 4;
        *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
        // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
    }
}
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src = *s; s += 4;
        *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
    }
}
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
    asm volatile(
        "movq %3, %%mm5\n\t"
        "movq %4, %%mm6\n\t"
        "movq %5, %%mm7\n\t"
        ".balign 16\n\t"
        "1:\n\t"
        PREFETCH" 32(%1)\n\t"
        "movd (%1), %%mm0\n\t"
        "movd 4(%1), %%mm3\n\t"
        "punpckldq 8(%1), %%mm0\n\t"
        "punpckldq 12(%1), %%mm3\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm3, %%mm4\n\t"
        "pand %%mm6, %%mm0\n\t"
        "pand %%mm6, %%mm3\n\t"
        "pmaddwd %%mm7, %%mm0\n\t"
        "pmaddwd %%mm7, %%mm3\n\t"
        "pand %%mm5, %%mm1\n\t"
        "pand %%mm5, %%mm4\n\t"
        "por %%mm1, %%mm0\n\t"
        "por %%mm4, %%mm3\n\t"
        "psrld $6, %%mm0\n\t"
        "pslld $10, %%mm3\n\t"
        "por %%mm3, %%mm0\n\t"
        MOVNTQ" %%mm0, (%0)\n\t"
        "addl $16, %1\n\t"
        "addl $8, %0\n\t"
        "cmpl %2, %1\n\t"
        " jb 1b\n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src = *s; s += 4;
        *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
    }
}
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src = *s; s += 4;
        *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
    }
}
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
   I use here a less accurate approximation: simply left-shifting the input
   value and filling the low-order bits with zeroes. This method improves
   PNG compression, but the scheme cannot reproduce white exactly, since it
   does not generate an all-ones maximum value; the net effect is to darken
   the image slightly.

   The better method would be "left bit replication" (see the illustrative
   sketch after RENAME(rgb15to24) below):

    4 3 2 1 0
    ---------
    1 1 0 1 1

    7 6 5 4 3  2 1 0
    ----------------
    1 1 0 1 1  1 1 0
    |=======|  |===|
        |      Leftmost Bits Repeated to Fill Open Bits
        |
    Original Bits
*/
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
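
/* Editor's note (not part of the original file): the "left bit replication"
 * scheme described in the comment before RENAME(rgb15to24) above, sketched
 * for a 5-bit channel expanded to 8 bits. Unlike the plain left shift the
 * code uses, it maps 31 to 255 exactly. Illustration only, hence #if 0;
 * the helper names are made up here. */
#if 0
#include <stdint.h>

static uint8_t expand5to8_shift(uint8_t c5)
{
    return (uint8_t)(c5 << 3);               /* 31 -> 248: slightly dark */
}

static uint8_t expand5to8_replicate(uint8_t c5)
{
    return (uint8_t)((c5 << 3) | (c5 >> 2)); /* 31 -> 255: full white */
}
#endif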
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
#if 0 //slightly slower on athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        //FIXME this is very likely wrong for bigendian (and the following converters too)
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif
    }
}
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
    }
}
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
    asm volatile (
        "xorl %%eax, %%eax\n\t"
        ".balign 16\n\t"
        "1:\n\t"
        PREFETCH" 32(%0, %%eax)\n\t"
        "movq (%0, %%eax), %%mm0\n\t"
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm2\n\t"
        "pslld $16, %%mm0\n\t"
        "psrld $16, %%mm1\n\t"
        "pand "MANGLE(mask32r)", %%mm0\n\t"
        "pand "MANGLE(mask32g)", %%mm2\n\t"
        "pand "MANGLE(mask32b)", %%mm1\n\t"
        "por %%mm0, %%mm2\n\t"
        "por %%mm1, %%mm2\n\t"
        MOVNTQ" %%mm2, (%1, %%eax)\n\t"
        "addl $8, %%eax\n\t"
        "cmpl %2, %%eax\n\t"
        " jb 1b\n\t"
        :: "r" (src), "r"(dst), "r" (src_size-7)
        : "%eax"
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
#ifdef WORDS_BIGENDIAN
        dst[4*i + 1] = src[4*i + 3];
        dst[4*i + 2] = src[4*i + 2];
        dst[4*i + 3] = src[4*i + 1];
#else
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
#endif
    }
#endif
}
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    int mmx_size = 23 - src_size;
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5\n\t"
        "movq "MANGLE(mask24g)", %%mm6\n\t"
        "movq "MANGLE(mask24b)", %%mm7\n\t"
        ".balign 16\n\t"
        "1:\n\t"
        PREFETCH" 32(%1, %%eax)\n\t"
        "movq (%1, %%eax), %%mm0\n\t"    // BGR BGR BG
        "movq (%1, %%eax), %%mm1\n\t"    // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2\n\t"   // R BGR BGR B
        "psllq $16, %%mm0\n\t"           // 00 BGR BGR
        "pand %%mm5, %%mm0\n\t"
        "pand %%mm6, %%mm1\n\t"
        "pand %%mm7, %%mm2\n\t"
        "por %%mm0, %%mm1\n\t"
        "por %%mm2, %%mm1\n\t"
        "movq 6(%1, %%eax), %%mm0\n\t"   // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax)\n\t"  // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1\n\t"   // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2\n\t"  // GR BGR BGR
        "pand %%mm7, %%mm0\n\t"
        "pand %%mm5, %%mm1\n\t"
        "pand %%mm6, %%mm2\n\t"
        "por %%mm0, %%mm1\n\t"
        "por %%mm2, %%mm1\n\t"
        "movq 14(%1, %%eax), %%mm0\n\t"  // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax)\n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1\n\t"  // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2\n\t"  // BGR BGR BG
        "pand %%mm6, %%mm0\n\t"
        "pand %%mm7, %%mm1\n\t"
        "pand %%mm5, %%mm2\n\t"
        "por %%mm0, %%mm1\n\t"
        "por %%mm2, %%mm1\n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax)\n\t"
        "addl $24, %%eax\n\t"
        " js 1b\n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
    if(mmx_size==23) return; //finished, was a multiple of 8
    src += src_size;
    dst += src_size;
    src_size = 23 - mmx_size;
    src -= src_size;
    dst -= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth = width >> 1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xorl %%eax, %%eax\n\t"
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 32(%1, %%eax, 2)\n\t"
            PREFETCH" 32(%2, %%eax)\n\t"
            PREFETCH" 32(%3, %%eax)\n\t"
            "movq (%2, %%eax), %%mm0\n\t"    // U(0)
            "movq %%mm0, %%mm2\n\t"          // U(0)
            "movq (%3, %%eax), %%mm1\n\t"    // V(0)
            "punpcklbw %%mm1, %%mm0\n\t"     // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2\n\t"     // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3\n\t"  // Y(0)
            "movq 8(%1, %%eax,2), %%mm5\n\t" // Y(8)
            "movq %%mm3, %%mm4\n\t"          // Y(0)
            "movq %%mm5, %%mm6\n\t"          // Y(8)
            "punpcklbw %%mm0, %%mm3\n\t"     // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4\n\t"     // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5\n\t"     // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6\n\t"     // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%eax, 4)\n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4)\n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4)\n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4)\n\t"

            "addl $8, %%eax\n\t"
            "cmpl %4, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%eax");
#else
#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n) \
	y1 = yc[n]; \
	y2 = yc2[n]; \
	u = uc[n]; \
	v = vc[n]; \
	asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
	asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
	asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
	asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
	yuv1 = (u << 8) + (v << 24); \
	yuv2 = yuv1 + y2; \
	yuv1 += y1; \
	qdst[n] = yuv1; \
	qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for(i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            asm("ldq $31,64(%0)" :: "r"(yc));
            asm("ldq $31,64(%0)" :: "r"(yc2));
            asm("ldq $31,64(%0)" :: "r"(uc));
            asm("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif __WORDSIZE >= 64
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24) + (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if((y&(vertLumPerChroma-1)) == (vertLumPerChroma-1))
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
/**
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
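
/* Editor's note (not part of the original file): why vertLumPerChroma is 2
 * here but 1 in RENAME(yuv422ptoyuy2) below. YV12 carries one chroma line
 * per two luma lines while 4:2:2 carries one per line; the shared worker
 * above advances the chroma pointers only at the end of each chroma group,
 * using the power-of-two bitmask test sketched here. Illustration only,
 * hence #if 0; the helper name is made up. */
#if 0
#include <assert.h>

static int advance_chroma(unsigned y, int vertLumPerChroma)
{
    /* same condition as in RENAME(yuvPlanartoyuy2) above */
    return (y & (vertLumPerChroma - 1)) == (unsigned)(vertLumPerChroma - 1);
}

static void advance_chroma_check(void)
{
    assert(!advance_chroma(0, 2) && advance_chroma(1, 2)); /* every 2nd line */
    assert(advance_chroma(0, 1) && advance_chroma(1, 1));  /* every line */
}
#endif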
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth = width >> 1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xorl %%eax, %%eax\n\t"
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 32(%1, %%eax, 2)\n\t"
            PREFETCH" 32(%2, %%eax)\n\t"
            PREFETCH" 32(%3, %%eax)\n\t"
            "movq (%2, %%eax), %%mm0\n\t"    // U(0)
            "movq %%mm0, %%mm2\n\t"          // U(0)
            "movq (%3, %%eax), %%mm1\n\t"    // V(0)
            "punpcklbw %%mm1, %%mm0\n\t"     // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2\n\t"     // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3\n\t"  // Y(0)
            "movq 8(%1, %%eax,2), %%mm5\n\t" // Y(8)
            "movq %%mm0, %%mm4\n\t"          // Y(0)
            "movq %%mm2, %%mm6\n\t"          // Y(8)
            "punpcklbw %%mm3, %%mm0\n\t"     // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4\n\t"     // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2\n\t"     // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6\n\t"     // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%eax, 4)\n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4)\n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 4)\n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4)\n\t"

            "addl $8, %%eax\n\t"
            "cmpl %4, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%eax");
#else
//FIXME adapt the alpha asm code from yv12->yuy2
#if __WORDSIZE >= 64
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }
#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24) + (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if((y&(vertLumPerChroma-1)) == (vertLumPerChroma-1))
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
/**
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
/**
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
/**
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride)
{
    unsigned y;
    const unsigned chromWidth = width >> 1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax\n\t"
            "pcmpeqw %%mm7, %%mm7\n\t"
            "psrlw $8, %%mm7\n\t" // FF,00,FF,00...
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 64(%0, %%eax, 4)\n\t"
            "movq (%0, %%eax, 4), %%mm0\n\t"   // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1\n\t"  // YUYV YUYV(4)
            "movq %%mm0, %%mm2\n\t"            // YUYV YUYV(0)
            "movq %%mm1, %%mm3\n\t"            // YUYV YUYV(4)
            "psrlw $8, %%mm0\n\t"              // U0V0 U0V0(0)
            "psrlw $8, %%mm1\n\t"              // U0V0 U0V0(4)
            "pand %%mm7, %%mm2\n\t"            // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3\n\t"            // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0\n\t"        // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2\n\t"        // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2)\n\t"

            "movq 16(%0, %%eax, 4), %%mm1\n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2\n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3\n\t"            // YUYV YUYV(8)
            "movq %%mm2, %%mm4\n\t"            // YUYV YUYV(12)
            "psrlw $8, %%mm1\n\t"              // U0V0 U0V0(8)
            "psrlw $8, %%mm2\n\t"              // U0V0 U0V0(12)
            "pand %%mm7, %%mm3\n\t"            // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4\n\t"            // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1\n\t"        // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3\n\t"        // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2)\n\t"

            "movq %%mm0, %%mm2\n\t"            // UVUV UVUV(0)
            "movq %%mm1, %%mm3\n\t"            // UVUV UVUV(8)
            "psrlw $8, %%mm0\n\t"              // V0V0 V0V0(0)
            "psrlw $8, %%mm1\n\t"              // V0V0 V0V0(8)
            "pand %%mm7, %%mm2\n\t"            // U0U0 U0U0(0)
            "pand %%mm7, %%mm3\n\t"            // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0\n\t"        // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2\n\t"        // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax)\n\t"
            MOVNTQ" %%mm2, (%2, %%eax)\n\t"

            "addl $8, %%eax\n\t"
            "cmpl %4, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax");

        ydst += lumStride;
        src  += srcStride;

        asm volatile(
            "xorl %%eax, %%eax\n\t"
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 64(%0, %%eax, 4)\n\t"
            "movq (%0, %%eax, 4), %%mm0\n\t"   // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1\n\t"  // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2\n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3\n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0\n\t"            // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1\n\t"            // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2\n\t"            // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3\n\t"            // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0\n\t"        // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2\n\t"        // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2)\n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)\n\t"

            "addl $8, %%eax\n\t"
            "cmpl %4, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax");
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
    uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x, y;

    dst[0] = src[0];

    // first line
    for(x=0; x<srcWidth-1; x++){
        dst[2*x+1] = (3*src[x] + src[x+1])>>2;
        dst[2*x+2] = (src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    dst += dstStride;

    for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const int mmxSize = srcWidth&~15;
        asm volatile(
            "movl %4, %%eax\n\t"
            "1:\n\t"
            "movq (%0, %%eax), %%mm0\n\t"
            "movq (%1, %%eax), %%mm1\n\t"
            "movq 1(%0, %%eax), %%mm2\n\t"
            "movq 1(%1, %%eax), %%mm3\n\t"
            "movq -1(%0, %%eax), %%mm4\n\t"
            "movq -1(%1, %%eax), %%mm5\n\t"
            PAVGB" %%mm0, %%mm5\n\t"
            PAVGB" %%mm0, %%mm3\n\t"
            PAVGB" %%mm0, %%mm5\n\t"
            PAVGB" %%mm0, %%mm3\n\t"
            PAVGB" %%mm1, %%mm4\n\t"
            PAVGB" %%mm1, %%mm2\n\t"
            PAVGB" %%mm1, %%mm4\n\t"
            PAVGB" %%mm1, %%mm2\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm4, %%mm6\n\t"
            "punpcklbw %%mm3, %%mm5\n\t"
            "punpckhbw %%mm3, %%mm7\n\t"
            "punpcklbw %%mm2, %%mm4\n\t"
            "punpckhbw %%mm2, %%mm6\n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%eax, 2)\n\t"
            MOVNTQ" %%mm7, 8(%2, %%eax, 2)\n\t"
            MOVNTQ" %%mm4, (%3, %%eax, 2)\n\t"
            MOVNTQ" %%mm6, 8(%3, %%eax, 2)\n\t"
#else
            "movq %%mm5, (%2, %%eax, 2)\n\t"
            "movq %%mm7, 8(%2, %%eax, 2)\n\t"
            "movq %%mm4, (%3, %%eax, 2)\n\t"
            "movq %%mm6, 8(%3, %%eax, 2)\n\t"
#endif
            "addl $8, %%eax\n\t"
            " js 1b\n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%eax"
        );
#else
        const int mmxSize = 1;
#endif
        dst[0        ] = (3*src[0] +   src[srcStride])>>2;
        dst[dstStride] = (  src[0] + 3*src[srcStride])>>2;

        for(x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1] = (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2] = (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1] = (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2] = (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ] = (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride] = (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst += dstStride*2;
        src += srcStride;
    }

    // last line
#if 1
    dst[0] = src[0];

    for(x=0; x<srcWidth-1; x++){
        dst[2*x+1] = (3*src[x] + src[x+1])>>2;
        dst[2*x+2] = (src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];
#else
    for(x=0; x<srcWidth; x++){
        dst[2*x+0] =
        dst[2*x+1] = src[x];
    }
#endif

#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
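
/* Editor's note (not part of the original file): the 3:1 weights used
 * throughout RENAME(planar2x) above come from 2x bilinear upsampling with
 * the output samples placed at quarter-pixel offsets between two source
 * samples, so each output is (3*near + far)/4. A minimal scalar sketch of
 * one horizontal line, illustration only; the function name is made up. */
#if 0
#include <stdint.h>

static void upsample2x_line(const uint8_t *src, uint8_t *dst, int srcWidth)
{
    int x;
    dst[0] = src[0];                                       /* edge: replicate */
    for (x = 0; x < srcWidth - 1; x++) {
        dst[2*x + 1] = (uint8_t)((3*src[x] + src[x+1]) >> 2);
        dst[2*x + 2] = (uint8_t)((src[x] + 3*src[x+1]) >> 2);
    }
    dst[2*srcWidth - 1] = src[srcWidth - 1];
}
#endif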
/**
 * height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it)
 * Chrominance data is only taken from every second line; the others are
 * ignored. FIXME: write an HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride)
{
    unsigned y;
    const unsigned chromWidth = width >> 1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax\n\t"
            "pcmpeqw %%mm7, %%mm7\n\t"
            "psrlw $8, %%mm7\n\t" // FF,00,FF,00...
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 64(%0, %%eax, 4)\n\t"
            "movq (%0, %%eax, 4), %%mm0\n\t"   // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1\n\t"  // UYVY UYVY(4)
            "movq %%mm0, %%mm2\n\t"            // UYVY UYVY(0)
            "movq %%mm1, %%mm3\n\t"            // UYVY UYVY(4)
            "pand %%mm7, %%mm0\n\t"            // U0V0 U0V0(0)
            "pand %%mm7, %%mm1\n\t"            // U0V0 U0V0(4)
            "psrlw $8, %%mm2\n\t"              // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3\n\t"              // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0\n\t"        // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2\n\t"        // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2)\n\t"

            "movq 16(%0, %%eax, 4), %%mm1\n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2\n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3\n\t"            // UYVY UYVY(8)
            "movq %%mm2, %%mm4\n\t"            // UYVY UYVY(12)
            "pand %%mm7, %%mm1\n\t"            // U0V0 U0V0(8)
            "pand %%mm7, %%mm2\n\t"            // U0V0 U0V0(12)
            "psrlw $8, %%mm3\n\t"              // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4\n\t"              // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1\n\t"        // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3\n\t"        // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2)\n\t"

            "movq %%mm0, %%mm2\n\t"            // UVUV UVUV(0)
            "movq %%mm1, %%mm3\n\t"            // UVUV UVUV(8)
            "psrlw $8, %%mm0\n\t"              // V0V0 V0V0(0)
            "psrlw $8, %%mm1\n\t"              // V0V0 V0V0(8)
            "pand %%mm7, %%mm2\n\t"            // U0U0 U0U0(0)
            "pand %%mm7, %%mm3\n\t"            // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0\n\t"        // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2\n\t"        // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax)\n\t"
            MOVNTQ" %%mm2, (%2, %%eax)\n\t"

            "addl $8, %%eax\n\t"
            "cmpl %4, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax");

        ydst += lumStride;
        src  += srcStride;

        asm volatile(
            "xorl %%eax, %%eax\n\t"
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 64(%0, %%eax, 4)\n\t"
            "movq (%0, %%eax, 4), %%mm0\n\t"   // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1\n\t"  // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2\n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3\n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm0\n\t"              // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1\n\t"              // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2\n\t"              // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3\n\t"              // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0\n\t"        // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2\n\t"        // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2)\n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2)\n\t"

            "addl $8, %%eax\n\t"
            "cmpl %4, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax");
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}
/**
 * height should be a multiple of 2 and width should be a multiple of 2
 * (if this is a problem for anyone then tell me, and I'll fix it)
 * Chrominance data is only taken from every second line; the others are
 * ignored in the C version. FIXME: write an HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
    unsigned int width, unsigned int height,
    int lumStride, int chromStride, int srcStride)
{
    unsigned y;
    const unsigned chromWidth = width >> 1;
#ifdef HAVE_MMX
    for(y=0; y<height-2; y+=2)
    {
        unsigned i;
        for(i=0; i<2; i++)
        {
            asm volatile(
                "movl %2, %%eax\n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6\n\t"
                "movq "MANGLE(w1111)", %%mm5\n\t"
                "pxor %%mm7, %%mm7\n\t"
                "leal (%%eax, %%eax, 2), %%ebx\n\t"
                ".balign 16\n\t"
                "1:\n\t"
                PREFETCH" 64(%0, %%ebx)\n\t"
                "movd (%0, %%ebx), %%mm0\n\t"
                "movd 3(%0, %%ebx), %%mm1\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "punpcklbw %%mm7, %%mm1\n\t"
                "movd 6(%0, %%ebx), %%mm2\n\t"
                "movd 9(%0, %%ebx), %%mm3\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm3\n\t"
                "pmaddwd %%mm6, %%mm0\n\t"
                "pmaddwd %%mm6, %%mm1\n\t"
                "pmaddwd %%mm6, %%mm2\n\t"
                "pmaddwd %%mm6, %%mm3\n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0\n\t"
                "psrad $8, %%mm1\n\t"
                "psrad $8, %%mm2\n\t"
                "psrad $8, %%mm3\n\t"
#endif
                "packssdw %%mm1, %%mm0\n\t"
                "packssdw %%mm3, %%mm2\n\t"
                "pmaddwd %%mm5, %%mm0\n\t"
                "pmaddwd %%mm5, %%mm2\n\t"
                "packssdw %%mm2, %%mm0\n\t"
                "psraw $7, %%mm0\n\t"

                "movd 12(%0, %%ebx), %%mm4\n\t"
                "movd 15(%0, %%ebx), %%mm1\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "punpcklbw %%mm7, %%mm1\n\t"
                "movd 18(%0, %%ebx), %%mm2\n\t"
                "movd 21(%0, %%ebx), %%mm3\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm3\n\t"
                "pmaddwd %%mm6, %%mm4\n\t"
                "pmaddwd %%mm6, %%mm1\n\t"
                "pmaddwd %%mm6, %%mm2\n\t"
                "pmaddwd %%mm6, %%mm3\n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4\n\t"
                "psrad $8, %%mm1\n\t"
                "psrad $8, %%mm2\n\t"
                "psrad $8, %%mm3\n\t"
#endif
                "packssdw %%mm1, %%mm4\n\t"
                "packssdw %%mm3, %%mm2\n\t"
                "pmaddwd %%mm5, %%mm4\n\t"
                "pmaddwd %%mm5, %%mm2\n\t"
                "addl $24, %%ebx\n\t"
                "packssdw %%mm2, %%mm4\n\t"
                "psraw $7, %%mm4\n\t"

                "packuswb %%mm4, %%mm0\n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0\n\t"

                MOVNTQ" %%mm0, (%1, %%eax)\n\t"
                "addl $8, %%eax\n\t"
                " js 1b\n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                : "%eax", "%ebx"
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        asm volatile(
            "movl %4, %%eax\n\t"
            "movq "MANGLE(w1111)", %%mm5\n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6\n\t"
            "pxor %%mm7, %%mm7\n\t"
            "leal (%%eax, %%eax, 2), %%ebx\n\t"
            "addl %%ebx, %%ebx\n\t"
            ".balign 16\n\t"
            "1:\n\t"
            PREFETCH" 64(%0, %%ebx)\n\t"
            PREFETCH" 64(%1, %%ebx)\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq (%0, %%ebx), %%mm0\n\t"
            "movq (%1, %%ebx), %%mm1\n\t"
            "movq 6(%0, %%ebx), %%mm2\n\t"
            "movq 6(%1, %%ebx), %%mm3\n\t"
            PAVGB" %%mm1, %%mm0\n\t"
            PAVGB" %%mm3, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $24, %%mm0\n\t"
            "psrlq $24, %%mm2\n\t"
            PAVGB" %%mm1, %%mm0\n\t"
            PAVGB" %%mm3, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
#else
            "movd (%0, %%ebx), %%mm0\n\t"
            "movd (%1, %%ebx), %%mm1\n\t"
            "movd 3(%0, %%ebx), %%mm2\n\t"
            "movd 3(%1, %%ebx), %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm3\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            "paddw %%mm2, %%mm0\n\t"
            "movd 6(%0, %%ebx), %%mm4\n\t"
            "movd 6(%1, %%ebx), %%mm1\n\t"
            "movd 9(%0, %%ebx), %%mm2\n\t"
            "movd 9(%1, %%ebx), %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm3\n\t"
            "paddw %%mm1, %%mm4\n\t"
            "paddw %%mm3, %%mm2\n\t"
            "paddw %%mm4, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1\n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3\n\t"
            "pmaddwd %%mm0, %%mm1\n\t"
            "pmaddwd %%mm2, %%mm3\n\t"
            "pmaddwd %%mm6, %%mm0\n\t"
            "pmaddwd %%mm6, %%mm2\n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0\n\t"
            "psrad $8, %%mm1\n\t"
            "psrad $8, %%mm2\n\t"
            "psrad $8, %%mm3\n\t"
#endif
            "packssdw %%mm2, %%mm0\n\t"
            "packssdw %%mm3, %%mm1\n\t"
            "pmaddwd %%mm5, %%mm0\n\t"
            "pmaddwd %%mm5, %%mm1\n\t"
            "packssdw %%mm1, %%mm0\n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%ebx), %%mm4\n\t"
            "movq 12(%1, %%ebx), %%mm1\n\t"
            "movq 18(%0, %%ebx), %%mm2\n\t"
            "movq 18(%1, %%ebx), %%mm3\n\t"
            PAVGB" %%mm1, %%mm4\n\t"
            PAVGB" %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $24, %%mm4\n\t"
            "psrlq $24, %%mm2\n\t"
            PAVGB" %%mm1, %%mm4\n\t"
            PAVGB" %%mm3, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
#else
            "movd 12(%0, %%ebx), %%mm4\n\t"
            "movd 12(%1, %%ebx), %%mm1\n\t"
            "movd 15(%0, %%ebx), %%mm2\n\t"
            "movd 15(%1, %%ebx), %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm3\n\t"
            "paddw %%mm1, %%mm4\n\t"
            "paddw %%mm3, %%mm2\n\t"
            "paddw %%mm2, %%mm4\n\t"
            "movd 18(%0, %%ebx), %%mm5\n\t"
            "movd 18(%1, %%ebx), %%mm1\n\t"
            "movd 21(%0, %%ebx), %%mm2\n\t"
            "movd 21(%1, %%ebx), %%mm3\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm3\n\t"
            "paddw %%mm1, %%mm5\n\t"
            "paddw %%mm3, %%mm2\n\t"
            "paddw %%mm5, %%mm2\n\t"
            "movq "MANGLE(w1111)", %%mm5\n\t"
            "psrlw $2, %%mm4\n\t"
            "psrlw $2, %%mm2\n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1\n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3\n\t"
            "pmaddwd %%mm4, %%mm1\n\t"
            "pmaddwd %%mm2, %%mm3\n\t"
            "pmaddwd %%mm6, %%mm4\n\t"
            "pmaddwd %%mm6, %%mm2\n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4\n\t"
            "psrad $8, %%mm1\n\t"
            "psrad $8, %%mm2\n\t"
            "psrad $8, %%mm3\n\t"
#endif
            "packssdw %%mm2, %%mm4\n\t"
            "packssdw %%mm3, %%mm1\n\t"
            "pmaddwd %%mm5, %%mm4\n\t"
            "pmaddwd %%mm5, %%mm1\n\t"
            "addl $24, %%ebx\n\t"
            "packssdw %%mm1, %%mm4\n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4\n\t"

            "movq %%mm0, %%mm1\n\t"
            "punpckldq %%mm4, %%mm0\n\t"
            "punpckhdq %%mm4, %%mm1\n\t"
            "packsswb %%mm1, %%mm0\n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0\n\t"

            "movd %%mm0, (%2, %%eax)\n\t"
            "punpckhdq %%mm0, %%mm0\n\t"
            "movd %%mm0, (%3, %%eax)\n\t"
            "addl $4, %%eax\n\t"
            " js 1b\n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%eax", "%ebx"
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#else
    y=0;
#endif
    for(; y<height; y+=2)
    {
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i]     = U;
            vdst[i]     = V;
            ydst[2*i]   = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
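
/* Editor's note (not part of the original file): the C fallback above
 * evaluates the usual ITU-R BT.601 conversion in fixed point, with the
 * RY/GY/BY (etc.) coefficients pre-scaled by 1<<RGB2YUV_SHIFT. A
 * floating-point sketch of the same studio-swing luma formula, for
 * reference only; the constants here are the textbook BT.601 values, not
 * copied from this file's coefficient tables. */
#if 0
static unsigned char rgb_to_y_float(unsigned char r, unsigned char g, unsigned char b)
{
    /* 219-level "studio swing" luma with a +16 offset, as in the code above */
    double y = 16.0 + 0.257*r + 0.504*g + 0.098*b;
    return (unsigned char)(y + 0.5);
}
#endif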
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
    unsigned width, unsigned height, int src1Stride,
    int src2Stride, int dstStride){
    unsigned h;

    for(h=0; h < height; h++)
    {
        unsigned w;
#ifdef HAVE_MMX
#ifdef HAVE_SSE2
        asm(
            "xorl %%eax, %%eax\n\t"
            "1:\n\t"
            PREFETCH" 64(%1, %%eax)\n\t"
            PREFETCH" 64(%2, %%eax)\n\t"
            "movdqa (%1, %%eax), %%xmm0\n\t"
            "movdqa (%1, %%eax), %%xmm1\n\t"
            "movdqa (%2, %%eax), %%xmm2\n\t"
            "punpcklbw %%xmm2, %%xmm0\n\t"
            "punpckhbw %%xmm2, %%xmm1\n\t"
            "movntdq %%xmm0, (%0, %%eax, 2)\n\t"
            "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
            "addl $16, %%eax\n\t"
            "cmpl %3, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#else
        asm(
            "xorl %%eax, %%eax\n\t"
            "1:\n\t"
            PREFETCH" 64(%1, %%eax)\n\t"
            PREFETCH" 64(%2, %%eax)\n\t"
            "movq (%1, %%eax), %%mm0\n\t"
            "movq 8(%1, %%eax), %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq (%2, %%eax), %%mm4\n\t"
            "movq 8(%2, %%eax), %%mm5\n\t"
            "punpcklbw %%mm4, %%mm0\n\t"
            "punpckhbw %%mm4, %%mm1\n\t"
            "punpcklbw %%mm5, %%mm2\n\t"
            "punpckhbw %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, (%0, %%eax, 2)\n\t"
            MOVNTQ" %%mm1, 8(%0, %%eax, 2)\n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 2)\n\t"
            MOVNTQ" %%mm3, 24(%0, %%eax, 2)\n\t"
            "addl $16, %%eax\n\t"
            "cmpl %3, %%eax\n\t"
            " jb 1b\n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#endif
        for(w = (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for(w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory"
    );
#endif
}
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
    uint8_t *dst1, uint8_t *dst2,
    unsigned width, unsigned height,
    int srcStride1, int srcStride2,
    int dstStride1, int dstStride2)
{
    unsigned int y, x, h;
    int w;
    w = width/2; h = height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for(y=0; y<h; y++){
        const uint8_t *s1 = src1 + srcStride1*(y>>1);
        uint8_t *d = dst1 + dstStride1*y;
        x = 0;
#ifdef HAVE_MMX
        for(; x<w-31; x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq %1, %%mm0\n\t"
                "movq 8%1, %%mm2\n\t"
                "movq 16%1, %%mm4\n\t"
                "movq 24%1, %%mm6\n\t"
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "movq %%mm4, %%mm5\n\t"
                "movq %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for(; x<w; x++) d[2*x] = d[2*x+1] = s1[x];
    }
    for(y=0; y<h; y++){
        const uint8_t *s2 = src2 + srcStride2*(y>>1);
        uint8_t *d = dst2 + dstStride2*y;
        x = 0;
#ifdef HAVE_MMX
        for(; x<w-31; x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq %1, %%mm0\n\t"
                "movq 8%1, %%mm2\n\t"
                "movq 16%1, %%mm4\n\t"
                "movq 24%1, %%mm6\n\t"
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "movq %%mm4, %%mm5\n\t"
                "movq %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm1, 8%0\n\t"
                MOVNTQ" %%mm2, 16%0\n\t"
                MOVNTQ" %%mm3, 24%0\n\t"
                MOVNTQ" %%mm4, 32%0\n\t"
                MOVNTQ" %%mm5, 40%0\n\t"
                MOVNTQ" %%mm6, 48%0\n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(; x<w; x++) d[2*x] = d[2*x+1] = s2[x];
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory"
    );
#endif
}
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
            uint8_t *dst,
            unsigned width, unsigned height,
            int srcStride1, int srcStride2,
            int srcStride3, int dstStride)
{
    unsigned y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)          \n\t"
                PREFETCH" 32(%2, %0)          \n\t"
                PREFETCH" 32(%3, %0)          \n\t"
                "movq   (%1, %0, 4), %%mm0    \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   (%2, %0), %%mm1       \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq   (%3, %0), %%mm2       \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq   %%mm0, %%mm3          \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   %%mm1, %%mm4          \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq   %%mm2, %%mm5          \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1       \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2       \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4       \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5       \n\t" /* V4V4 V5V5 V6V6 V7V7 */
                "movq   %%mm1, %%mm6          \n\t"
                "punpcklbw %%mm2, %%mm1       \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0       \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3       \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)   \n\t"
                "punpckhbw %%mm2, %%mm6       \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq   8(%1, %0, 4), %%mm0   \n\t"
                "movq   %%mm0, %%mm3          \n\t"
                "punpcklbw %%mm6, %%mm0       \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3       \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8)  \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)  \n\t"
                "movq   %%mm4, %%mm6          \n\t"
                "movq   16(%1, %0, 4), %%mm0  \n\t"
                "movq   %%mm0, %%mm3          \n\t"
                "punpcklbw %%mm5, %%mm4       \n\t"
                "punpcklbw %%mm4, %%mm0       \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3       \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8)  \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)  \n\t"
                "punpckhbw %%mm5, %%mm6       \n\t"
                "movq   24(%1, %0, 4), %%mm0  \n\t"
                "movq   %%mm0, %%mm3          \n\t"
                "punpcklbw %%mm6, %%mm0       \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3       \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8)  \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)  \n\t"
                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        for(; x<w; x++)
        {
            const int x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    asm(
        EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory"
    );
#endif
}
modules/video_filter/swscale/swscale.c deleted 100644 → 0
/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
 supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09
 supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
 {BGR,RGB}{1,4,8,15,16} support dithering

 unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  YV12 -> {BGR,RGB}{1,4,8,15,16,24,32}
  x -> x
  YUV9 -> YV12
  YUV9/YV12 -> Y800
  Y800 -> YUV9/YV12
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24
  BGR15 -> BGR16
*/
/*
 tested special converters (most are tested actually, but I didn't write it down ...)
  YV12 -> BGR16
  YV12 -> YV12
  BGR15 -> BGR16
  BGR16 -> BGR16
  YVU9 -> YV12

 untested special converters
  YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be OK)
  YV12/I420 -> YV12/I420
  YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24
  BGR24 -> YV12
*/
#include <inttypes.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include "config.h"
#include <assert.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include "swscale.h"
#include "swscale_internal.h"
#include "common.h"
#include "rgb2rgb.h"
#define RUNTIME_CPUDETECT 1
#undef MOVNTQ
#undef PAVGB
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
//#undef ARCH_X86
//#define WORDS_BIGENDIAN
#define DITHER1XBPP
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
#define RET 0xC3 //near return opcode for X86
#ifdef MP_DEBUG
#define ASSERT(x) assert(x);
#else
#define ASSERT(x) ;
#endif
#ifdef M_PI
#define PI M_PI
#else
#define PI 3.14159265358979323846
#endif
//FIXME replace this with something faster
#define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YVU9 \
|| (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
#define isYUV(x) ((x)==IMGFMT_UYVY || (x)==IMGFMT_YUY2 || isPlanarYUV(x))
#define isGray(x) ((x)==IMGFMT_Y800)
#define isRGB(x) (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
#define isBGR(x) (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
#define isSupportedIn(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY\
|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
|| (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
|| (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9\
|| (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
#define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY\
|| (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P\
|| isRGB(x) || isBGR(x)\
|| (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
#define isPacked(x) ((x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY ||isRGB(x) || isBGR(x))
#define RGB2YUV_SHIFT 16
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
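These are the BT.601 RGB->YUV weights scaled into 16.16 fixed point. A worked sketch of the luma computation they imply (illustrative only; the real conversion lives in the asm/template code, and the +16 limited-range offset is applied elsewhere):

/* Y in the 16..235 range from 8-bit r,g,b using the RY/GY/BY weights above.
 * RY+GY+BY ~= 0.859, so white (255,255,255) maps to ~219+16 = 235. */
static int example_rgb_to_y(int r, int g, int b)
{
    return ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16;
}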
extern const int32_t Inverse_Table_6_9[8][4];
/*
NOTES
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
TODO
more intelligent misalignment avoidance for the horizontal scaler
write special vertical cubic upscale version
Optimize C code (yv12 / minmax)
add support for packed pixel yuv input & output
add support for Y8 output
optimize bgr24 & bgr32
add BGR4 output support
write special BGR->BGR scaler
*/
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#ifdef ARCH_X86
static uint64_t attribute_used __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t attribute_used __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t                __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t attribute_used __attribute__((aligned(8))) w02=       0x0002000200020002LL;
static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t attribute_used __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t attribute_used __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
static uint64_t attribute_used __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;

static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};

static uint64_t __attribute__((aligned(8))) b16Mask=                   0x001F001F001F001FLL;
static uint64_t attribute_used __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t attribute_used __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=                   0x001F001F001F001FLL;
static uint64_t attribute_used __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t attribute_used __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t attribute_used __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t attribute_used __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t attribute_used __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

#ifdef FAST_BGR2YV12
static const uint64_t bgr2YCoeff  attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL;
static const uint64_t bgr2UCoeff  attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
static const uint64_t bgr2VCoeff  attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
#else
static const uint64_t bgr2YCoeff  attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL;
static const uint64_t bgr2UCoeff  attribute_used __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
static const uint64_t bgr2VCoeff  attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
#endif
static const uint64_t bgr2YOffset  attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL;
static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8))) = 0x8080808080808080ULL;
static const uint64_t w1111        attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL;
#endif
// clipping helper table for C implementations:
static unsigned char clip_table[768];

static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b);

extern const uint8_t dither_2x2_4[2][8];
extern const uint8_t dither_2x2_8[2][8];
extern const uint8_t dither_8x8_32[8][8];
extern const uint8_t dither_8x8_73[8][8];
extern const uint8_t dither_8x8_220[8][8];
#ifdef ARCH_X86
void in_asm_used_var_warning_killer()
{
    volatile int i= bF8+bFC+w10+
        bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+
        M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
    if(i) i=0;
}
#endif
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
    //FIXME Optimize (just quickly written, not optimized..)
    int i;
    for(i=0; i<dstW; i++)
    {
        int val=1<<18;
        int j;
        for(j=0; j<lumFilterSize; j++)
            val += lumSrc[j][i] * lumFilter[j];

        dest[i]= MIN(MAX(val>>19, 0), 255);
    }

    if(uDest != NULL)
        for(i=0; i<chrDstW; i++)
        {
            int u=1<<18;
            int v=1<<18;
            int j;
            for(j=0; j<chrFilterSize; j++)
            {
                u += chrSrc[j][i] * chrFilter[j];
                v += chrSrc[j][i + 2048] * chrFilter[j];
            }

            uDest[i]= MIN(MAX(u>>19, 0), 255);
            vDest[i]= MIN(MAX(v>>19, 0), 255);
        }
}
#define YSCALE_YUV_2_PACKEDX_C(type) \
for(i=0; i<(dstW>>1); i++){\
int j;\
int Y1=1<<18;\
int Y2=1<<18;\
int U=1<<18;\
int V=1<<18;\
type *r, *b, *g;\
const int i2= 2*i;\
\
for(j=0; j<lumFilterSize; j++)\
{\
Y1 += lumSrc[j][i2] * lumFilter[j];\
Y2 += lumSrc[j][i2+1] * lumFilter[j];\
}\
for(j=0; j<chrFilterSize; j++)\
{\
U += chrSrc[j][i] * chrFilter[j];\
V += chrSrc[j][i+2048] * chrFilter[j];\
}\
Y1>>=19;\
Y2>>=19;\
U >>=19;\
V >>=19;\
if((Y1|Y2|U|V)&256)\
{\
if(Y1>255) Y1=255;\
else if(Y1<0)Y1=0;\
if(Y2>255) Y2=255;\
else if(Y2<0)Y2=0;\
if(U>255) U=255;\
else if(U<0) U=0;\
if(V>255) V=255;\
else if(V<0) V=0;\
}
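The (Y1|Y2|U|V)&256 test above is a cheap overflow check: after the >>19 the values are expected to land roughly in -256..511, and within that range bit 8 is set exactly when a value has left 0..255, so one OR and one AND replace four range tests in the common (in-range) case. A small standalone illustration of the idea, not from the original file:

/* Illustration of the clamp-detection trick used in the macro above. */
static int any_out_of_range(int y1, int y2, int u, int v)
{
    /* valid only when inputs stay within roughly -256..511 */
    return ((y1 | y2 | u | v) & 256) != 0;
}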
#define YSCALE_YUV_2_RGBX_C(type) \
YSCALE_YUV_2_PACKEDX_C(type)\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_PACKED2_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19;\
int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;\
int U= (uvbuf0[i ]*uvalpha1+uvbuf1[i ]*uvalpha)>>19;\
int V= (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;\
#define YSCALE_YUV_2_RGB2_C(type) \
YSCALE_YUV_2_PACKED2_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_PACKED1_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\
int Y2= buf0[i2+1]>>7;\
int U= (uvbuf1[i ])>>7;\
int V= (uvbuf1[i+2048])>>7;\
#define YSCALE_YUV_2_RGB1_C(type) \
YSCALE_YUV_2_PACKED1_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_PACKED1B_C \
for(i=0; i<(dstW>>1); i++){\
const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\
int Y2= buf0[i2+1]>>7;\
int U= (uvbuf0[i ] + uvbuf1[i ])>>8;\
int V= (uvbuf0[i+2048] + uvbuf1[i+2048])>>8;\
#define YSCALE_YUV_2_RGB1B_C(type) \
YSCALE_YUV_2_PACKED1B_C\
type *r, *b, *g;\
r = c->table_rV[V];\
g = c->table_gU[U] + c->table_gV[V];\
b = c->table_bU[U];\
#define YSCALE_YUV_2_ANYRGB_C(func, func2)\
switch(c->dstFormat)\
{\
case IMGFMT_BGR32:\
case IMGFMT_RGB32:\
func(uint32_t)\
((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
} \
break;\
case IMGFMT_RGB24:\
func(uint8_t)\
((uint8_t*)dest)[0]= r[Y1];\
((uint8_t*)dest)[1]= g[Y1];\
((uint8_t*)dest)[2]= b[Y1];\
((uint8_t*)dest)[3]= r[Y2];\
((uint8_t*)dest)[4]= g[Y2];\
((uint8_t*)dest)[5]= b[Y2];\
dest+=6;\
}\
break;\
case IMGFMT_BGR24:\
func(uint8_t)\
((uint8_t*)dest)[0]= b[Y1];\
((uint8_t*)dest)[1]= g[Y1];\
((uint8_t*)dest)[2]= r[Y1];\
((uint8_t*)dest)[3]= b[Y2];\
((uint8_t*)dest)[4]= g[Y2];\
((uint8_t*)dest)[5]= r[Y2];\
dest+=6;\
}\
break;\
case IMGFMT_RGB16:\
case IMGFMT_BGR16:\
{\
const int dr1= dither_2x2_8[y&1 ][0];\
const int dg1= dither_2x2_4[y&1 ][0];\
const int db1= dither_2x2_8[(y&1)^1][0];\
const int dr2= dither_2x2_8[y&1 ][1];\
const int dg2= dither_2x2_4[y&1 ][1];\
const int db2= dither_2x2_8[(y&1)^1][1];\
func(uint16_t)\
((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
}\
}\
break;\
case IMGFMT_RGB15:\
case IMGFMT_BGR15:\
{\
const int dr1= dither_2x2_8[y&1 ][0];\
const int dg1= dither_2x2_8[y&1 ][1];\
const int db1= dither_2x2_8[(y&1)^1][0];\
const int dr2= dither_2x2_8[y&1 ][1];\
const int dg2= dither_2x2_8[y&1 ][0];\
const int db2= dither_2x2_8[(y&1)^1][1];\
func(uint16_t)\
((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
}\
}\
break;\
case IMGFMT_RGB8:\
case IMGFMT_BGR8:\
{\
const uint8_t * const d64= dither_8x8_73[y&7];\
const uint8_t * const d32= dither_8x8_32[y&7];\
func(uint8_t)\
((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
}\
}\
break;\
case IMGFMT_RGB4:\
case IMGFMT_BGR4:\
{\
const uint8_t * const d64= dither_8x8_73 [y&7];\
const uint8_t * const d128=dither_8x8_220[y&7];\
func(uint8_t)\
((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
+ ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
}\
}\
break;\
case IMGFMT_RG4B:\
case IMGFMT_BG4B:\
{\
const uint8_t * const d64= dither_8x8_73 [y&7];\
const uint8_t * const d128=dither_8x8_220[y&7];\
func(uint8_t)\
((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
}\
}\
break;\
case IMGFMT_RGB1:\
case IMGFMT_BGR1:\
{\
const uint8_t * const d128=dither_8x8_220[y&7];\
uint8_t *g= c->table_gU[128] + c->table_gV[128];\
for(i=0; i<dstW-7; i+=8){\
int acc;\
acc = g[((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19) + d128[0]];\
acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
((uint8_t*)dest)[0]= acc;\
dest++;\
}\
\
/*\
((uint8_t*)dest)-= dstW>>4;\
{\
int acc=0;\
int left=0;\
static int top[1024];\
static int last_new[1024][1024];\
static int last_in3[1024][1024];\
static int drift[1024][1024];\
int topLeft=0;\
int shift=0;\
int count=0;\
const uint8_t * const d128=dither_8x8_220[y&7];\
int error_new=0;\
int error_in3=0;\
int f=0;\
\
for(i=dstW>>1; i<dstW; i++){\
int in= ((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19);\
int in2 = (76309 * (in - 16) + 32768) >> 16;\
int in3 = (in2 < 0) ? 0 : ((in2 > 255) ? 255 : in2);\
int old= (left*7 + topLeft + top[i]*5 + top[i+1]*3)/20 + in3\
+ (last_new[y][i] - in3)*f/256;\
int new= old> 128 ? 255 : 0;\
\
error_new+= ABS(last_new[y][i] - new);\
error_in3+= ABS(last_in3[y][i] - in3);\
f= error_new - error_in3*4;\
if(f<0) f=0;\
if(f>256) f=256;\
\
topLeft= top[i];\
left= top[i]= old - new;\
last_new[y][i]= new;\
last_in3[y][i]= in3;\
\
acc+= acc + (new&1);\
if((i&7)==6){\
((uint8_t*)dest)[0]= acc;\
((uint8_t*)dest)++;\
}\
}\
}\
*/
\
}\
break;\
case IMGFMT_YUY2:\
func2\
((uint8_t*)dest)[2*i2+0]= Y1;\
((uint8_t*)dest)[2*i2+1]= U;\
((uint8_t*)dest)[2*i2+2]= Y2;\
((uint8_t*)dest)[2*i2+3]= V;\
} \
break;\
case IMGFMT_UYVY:\
func2\
((uint8_t*)dest)[2*i2+0]= U;\
((uint8_t*)dest)[2*i2+1]= Y1;\
((uint8_t*)dest)[2*i2+2]= V;\
((uint8_t*)dest)[2*i2+3]= Y2;\
} \
break;\
}\
static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                  int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                  uint8_t *dest, int dstW, int y)
{
    int i;
    switch(c->dstFormat)
    {
    case IMGFMT_RGB32:
    case IMGFMT_BGR32:
        YSCALE_YUV_2_RGBX_C(uint32_t)
            ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];
            ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];
        }
        break;
    case IMGFMT_RGB24:
        YSCALE_YUV_2_RGBX_C(uint8_t)
            ((uint8_t*)dest)[0]= r[Y1];
            ((uint8_t*)dest)[1]= g[Y1];
            ((uint8_t*)dest)[2]= b[Y1];
            ((uint8_t*)dest)[3]= r[Y2];
            ((uint8_t*)dest)[4]= g[Y2];
            ((uint8_t*)dest)[5]= b[Y2];
            dest+=6;
        }
        break;
    case IMGFMT_BGR24:
        YSCALE_YUV_2_RGBX_C(uint8_t)
            ((uint8_t*)dest)[0]= b[Y1];
            ((uint8_t*)dest)[1]= g[Y1];
            ((uint8_t*)dest)[2]= r[Y1];
            ((uint8_t*)dest)[3]= b[Y2];
            ((uint8_t*)dest)[4]= g[Y2];
            ((uint8_t*)dest)[5]= r[Y2];
            dest+=6;
        }
        break;
    case IMGFMT_RGB16:
    case IMGFMT_BGR16:
        {
            const int dr1= dither_2x2_8[y&1    ][0];
            const int dg1= dither_2x2_4[y&1    ][0];
            const int db1= dither_2x2_8[(y&1)^1][0];
            const int dr2= dither_2x2_8[y&1    ][1];
            const int dg2= dither_2x2_4[y&1    ][1];
            const int db2= dither_2x2_8[(y&1)^1][1];
            YSCALE_YUV_2_RGBX_C(uint16_t)
                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
            }
        }
        break;
    case IMGFMT_RGB15:
    case IMGFMT_BGR15:
        {
            const int dr1= dither_2x2_8[y&1    ][0];
            const int dg1= dither_2x2_8[y&1    ][1];
            const int db1= dither_2x2_8[(y&1)^1][0];
            const int dr2= dither_2x2_8[y&1    ][1];
            const int dg2= dither_2x2_8[y&1    ][0];
            const int db2= dither_2x2_8[(y&1)^1][1];
            YSCALE_YUV_2_RGBX_C(uint16_t)
                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
            }
        }
        break;
    case IMGFMT_RGB8:
    case IMGFMT_BGR8:
        {
            const uint8_t * const d64= dither_8x8_73[y&7];
            const uint8_t * const d32= dither_8x8_32[y&7];
            YSCALE_YUV_2_RGBX_C(uint8_t)
                ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];
                ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];
            }
        }
        break;
    case IMGFMT_RGB4:
    case IMGFMT_BGR4:
        {
            const uint8_t * const d64= dither_8x8_73 [y&7];
            const uint8_t * const d128=dither_8x8_220[y&7];
            YSCALE_YUV_2_RGBX_C(uint8_t)
                ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]
                                 + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);
            }
        }
        break;
    case IMGFMT_RG4B:
    case IMGFMT_BG4B:
        {
            const uint8_t * const d64= dither_8x8_73 [y&7];
            const uint8_t * const d128=dither_8x8_220[y&7];
            YSCALE_YUV_2_RGBX_C(uint8_t)
                ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];
                ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];
            }
        }
        break;
    case IMGFMT_RGB1:
    case IMGFMT_BGR1:
        {
            const uint8_t * const d128=dither_8x8_220[y&7];
            uint8_t *g= c->table_gU[128] + c->table_gV[128];
            int acc=0;
            for(i=0; i<dstW-1; i+=2){
                int j;
                int Y1=1<<18;
                int Y2=1<<18;

                for(j=0; j<lumFilterSize; j++)
                {
                    Y1 += lumSrc[j][i] * lumFilter[j];
                    Y2 += lumSrc[j][i+1] * lumFilter[j];
                }
                Y1>>=19;
                Y2>>=19;
                if((Y1|Y2)&256)
                {
                    if(Y1>255)   Y1=255;
                    else if(Y1<0)Y1=0;
                    if(Y2>255)   Y2=255;
                    else if(Y2<0)Y2=0;
                }
                acc+= acc + g[Y1+d128[(i+0)&7]];
                acc+= acc + g[Y2+d128[(i+1)&7]];
                if((i&7)==6){
                    ((uint8_t*)dest)[0]= acc;
                    dest++;
                }
            }
        }
        break;
    case IMGFMT_YUY2:
        YSCALE_YUV_2_PACKEDX_C(void)
            ((uint8_t*)dest)[2*i2+0]= Y1;
            ((uint8_t*)dest)[2*i2+1]= U;
            ((uint8_t*)dest)[2*i2+2]= Y2;
            ((uint8_t*)dest)[2*i2+3]= V;
        }
        break;
    case IMGFMT_UYVY:
        YSCALE_YUV_2_PACKEDX_C(void)
            ((uint8_t*)dest)[2*i2+0]= U;
            ((uint8_t*)dest)[2*i2+1]= Y1;
            ((uint8_t*)dest)[2*i2+2]= V;
            ((uint8_t*)dest)[2*i2+3]= Y2;
        }
        break;
    }
}
//Note: we have C, X86, MMX, MMX2, 3DNOW versions; there is no 3DNOW+MMX2 one
//Plain C versions
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif
#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC
#ifdef ARCH_X86
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif
#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif
#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif //ARCH_X86
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif
#ifdef ARCH_POWERPC
#ifdef COMPILE_ALTIVEC
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "swscale_template.c"
#endif
#endif //ARCH_POWERPC
#ifdef ARCH_X86
//X86 versions
/*
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _X86
#include "swscale_template.c"
*/
//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif
//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif
//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif
#endif //ARCH_X86
// minor note: the HAVE_xyz is messed up after that line so don't use it
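The block above is the heart of the template scheme: swscale_template.c is compiled once per CPU flavour, and RENAME() pastes a suffix onto every function name via token pasting. A reduced standalone illustration of the same pattern (hypothetical names, not from this tree; the real code re-includes the template file between redefinitions):

/* Sketch of the RENAME() multi-compilation pattern. */
#define RENAME(a) a ## _C
static int RENAME(add_one)(int x) { return x + 1; }  /* expands to add_one_C */
#undef RENAME
#define RENAME(a) a ## _MMX
static int RENAME(add_one)(int x) { return x + 1; }  /* expands to add_one_MMX */
#undef RENAME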
static double getSplineCoeff(double a, double b, double c, double d, double dist)
{
//    printf("%f %f %f %f %f\n", a,b,c,d,dist);
    if(dist<=1.0) return ((d*dist + c)*dist + b)*dist + a;
    else          return getSplineCoeff( 0.0,
                                         b + 2.0*c + 3.0*d,
                                             c + 3.0*d,
                                        -b - 3.0*c - 6.0*d,
                                        dist-1.0);
}
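The cubic is evaluated in Horner form, and the dist>1.0 branch re-bases the same polynomial onto the next unit interval. A small sanity demo (not in the original) of the kernel the SWS_SPLINE path builds from this:

/* The spline kernel is 1.0 at the sample point and 0.0 at distance 1. */
static void spline_demo(void)
{
    const double p = -2.196152422706632;
    double at0 = getSplineCoeff(1.0, 0.0, p, -p-1.0, 0.0); /* == 1.0 (reduces to 'a') */
    double at1 = getSplineCoeff(1.0, 0.0, p, -p-1.0, 1.0); /* == a+b+c+d == 0.0 */
    (void)at0; (void)at1;
}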
static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
                              int srcW, int dstW, int filterAlign, int one, int flags,
                              SwsVector *srcFilter, SwsVector *dstFilter)
{
    int i;
    int filterSize;
    int filter2Size;
    int minFilterSize;
    double *filter=NULL;
    double *filter2=NULL;
#ifdef ARCH_X86
    if(flags & SWS_CPU_CAPS_MMX)
        asm volatile("emms\n\t"::: "memory"); //FIXME this shouldn't be required but it IS (even for non mmx versions)
#endif

    // Note the +1 is for the MMXscaler which reads over the end
    *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));

    if(ABS(xInc - 0x10000) <10) // unscaled
    {
        int i;
        filterSize= 1;
        filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
        for(i=0; i<dstW*filterSize; i++) filter[i]=0;

        for(i=0; i<dstW; i++)
        {
            filter[i*filterSize]=1;
            (*filterPos)[i]=i;
        }

    }
    else if(flags&SWS_POINT) // lame looking point sampling mode
    {
        int i;
        int xDstInSrc;
        filterSize= 1;
        filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);

        xDstInSrc= xInc/2 - 0x8000;
        for(i=0; i<dstW; i++)
        {
            int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;

            (*filterPos)[i]= xx;
            filter[i]= 1.0;
            xDstInSrc+= xInc;
        }
    }
    else if((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) // bilinear upscale
    {
        int i;
        int xDstInSrc;
        if     (flags&SWS_BICUBIC) filterSize= 4;
        else if(flags&SWS_X      ) filterSize= 4;
        else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA
        filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);

        xDstInSrc= xInc/2 - 0x8000;
        for(i=0; i<dstW; i++)
        {
            int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
            int j;

            (*filterPos)[i]= xx;
            //Bilinear upscale / linear interpolate / Area averaging
            for(j=0; j<filterSize; j++)
            {
                double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
                double coeff= 1.0 - d;
                if(coeff<0) coeff=0;
                filter[i*filterSize + j]= coeff;
                xx++;
            }
            xDstInSrc+= xInc;
        }
    }
    else
    {
        double xDstInSrc;
        double sizeFactor, filterSizeInSrc;
        const double xInc1= (double)xInc / (double)(1<<16);
        int param= (flags&SWS_PARAM_MASK)>>SWS_PARAM_SHIFT;

        if     (flags&SWS_BICUBIC)  sizeFactor= 4.0;
        else if(flags&SWS_X)        sizeFactor= 8.0;
        else if(flags&SWS_AREA)     sizeFactor= 1.0; //downscale only, for upscale it is bilinear
        else if(flags&SWS_GAUSS)    sizeFactor= 8.0;   // infinite ;)
        else if(flags&SWS_LANCZOS)  sizeFactor= param ? 2.0*param : 6.0;
        else if(flags&SWS_SINC)     sizeFactor= 20.0; // infinite ;)
        else if(flags&SWS_SPLINE)   sizeFactor= 20.0;  // infinite ;)
        else if(flags&SWS_BILINEAR) sizeFactor= 2.0;
        else {
            sizeFactor= 0.0; //GCC warning killer
            ASSERT(0)
        }

        if(xInc1 <= 1.0)  filterSizeInSrc= sizeFactor; // upscale
        else              filterSizeInSrc= sizeFactor*srcW / (double)dstW;

        filterSize= (int)ceil(1 + filterSizeInSrc); // will be reduced later if possible
        if(filterSize > srcW-2) filterSize=srcW-2;

        filter= (double*)memalign(16, dstW*sizeof(double)*filterSize);

        xDstInSrc= xInc1 / 2.0 - 0.5;
        for(i=0; i<dstW; i++)
        {
            int xx= (int)(xDstInSrc - (filterSize-1)*0.5 + 0.5);
            int j;
            (*filterPos)[i]= xx;
            for(j=0; j<filterSize; j++)
            {
                double d= ABS(xx - xDstInSrc)/filterSizeInSrc*sizeFactor;
                double coeff;
                if(flags & SWS_BICUBIC)
                {
                    double A= param ? -param*0.01 : -0.60;

                    // Equation is from VirtualDub
                    if(d<1.0)
                        coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
                    else if(d<2.0)
                        coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
                    else
                        coeff=0.0;
                }
/*              else if(flags & SWS_X)
                {
                    double p= param ? param*0.01 : 0.3;
                    coeff = d ? sin(d*PI)/(d*PI) : 1.0;
                    coeff*= pow(2.0, - p*d*d);
                }*/
                else if(flags & SWS_X)
                {
                    double A= param ? param*0.1 : 1.0;

                    if(d<1.0)
                        coeff = cos(d*PI);
                    else
                        coeff=-1.0;
                    if(coeff<0.0) coeff= -pow(-coeff, A);
                    else          coeff=  pow( coeff, A);
                    coeff= coeff*0.5 + 0.5;
                }
                else if(flags & SWS_AREA)
                {
                    double srcPixelSize= 1.0/xInc1;
                    if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
                    else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
                    else coeff=0.0;
                }
                else if(flags & SWS_GAUSS)
                {
                    double p= param ? param*0.1 : 3.0;
                    coeff = pow(2.0, - p*d*d);
                }
                else if(flags & SWS_SINC)
                {
                    coeff = d ? sin(d*PI)/(d*PI) : 1.0;
                }
                else if(flags & SWS_LANCZOS)
                {
                    double p= param ? param : 3.0;
                    coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
                    if(d>p) coeff=0;
                }
                else if(flags & SWS_BILINEAR)
                {
                    coeff= 1.0 - d;
                    if(coeff<0) coeff=0;
                }
                else if(flags & SWS_SPLINE)
                {
                    double p=-2.196152422706632;
                    coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, d);
                }
                else {
                    coeff= 0.0; //GCC warning killer
                    ASSERT(0)
                }

                filter[i*filterSize + j]= coeff;
                xx++;
            }
            xDstInSrc+= xInc1;
        }
    }

    /* apply src & dst Filter to filter -> filter2
       free(filter);
    */
    ASSERT(filterSize>0)
    filter2Size= filterSize;
    if(srcFilter) filter2Size+= srcFilter->length - 1;
    if(dstFilter) filter2Size+= dstFilter->length - 1;
    ASSERT(filter2Size>0)
    filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));

    for(i=0; i<dstW; i++)
    {
        int j;
        SwsVector scaleFilter;
        SwsVector *outVec;

        scaleFilter.coeff= filter + i*filterSize;
        scaleFilter.length= filterSize;

        if(srcFilter) outVec= sws_getConvVec(srcFilter, &scaleFilter);
        else          outVec= &scaleFilter;

        ASSERT(outVec->length == filter2Size)
        //FIXME dstFilter

        for(j=0; j<outVec->length; j++)
        {
            filter2[i*filter2Size + j]= outVec->coeff[j];
        }

        (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;

        if(outVec != &scaleFilter) sws_freeVec(outVec);
    }
    free(filter); filter=NULL;

    /* try to reduce the filter-size (step1 find size and shift left) */
    // Assume it's near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
    minFilterSize= 0;
    for(i=dstW-1; i>=0; i--)
    {
        int min= filter2Size;
        int j;
        double cutOff=0.0;

        /* get rid of near zero elements on the left by shifting left */
        for(j=0; j<filter2Size; j++)
        {
            int k;
            cutOff += ABS(filter2[i*filter2Size]);

            if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;

            /* preserve Monotonicity because the core can't handle the filter otherwise */
            if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;

            // Move filter coeffs left
            for(k=1; k<filter2Size; k++)
                filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
            filter2[i*filter2Size + k - 1]= 0.0;
            (*filterPos)[i]++;
        }

        cutOff=0.0;
        /* count near zeros on the right */
        for(j=filter2Size-1; j>0; j--)
        {
            cutOff += ABS(filter2[i*filter2Size + j]);

            if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
            min--;
        }

        if(min>minFilterSize) minFilterSize= min;
    }

    if (flags & SWS_CPU_CAPS_ALTIVEC) {
        // we can handle the special case 4,
        // so we don't want to go to the full 8
        if (minFilterSize < 5)
            filterAlign = 4;

        // we really don't want to waste our time
        // doing useless computation, so fall-back on
        // the scalar C code for very small filter.
        // vectorizing is worth it only if you have
        // decent-sized vector.
        if (minFilterSize < 3)
            filterAlign = 1;
    }

    ASSERT(minFilterSize > 0)
    filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
    ASSERT(filterSize > 0)
    filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
    *outFilterSize= filterSize;

    if(flags&SWS_PRINT_INFO)
        MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
    /* try to reduce the filter-size (step2 reduce it) */
    for(i=0; i<dstW; i++)
    {
        int j;

        for(j=0; j<filterSize; j++)
        {
            if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
            else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
        }
    }
    free(filter2); filter2=NULL;

    //FIXME try to align filterpos if possible

    //fix borders
    for(i=0; i<dstW; i++)
    {
        int j;
        if((*filterPos)[i] < 0)
        {
            // Move filter coeffs left to compensate for filterPos
            for(j=1; j<filterSize; j++)
            {
                int left= MAX(j + (*filterPos)[i], 0);
                filter[i*filterSize + left] += filter[i*filterSize + j];
                filter[i*filterSize + j]=0;
            }
            (*filterPos)[i]= 0;
        }

        if((*filterPos)[i] + filterSize > srcW)
        {
            int shift= (*filterPos)[i] + filterSize - srcW;
            // Move filter coeffs right to compensate for filterPos
            for(j=filterSize-2; j>=0; j--)
            {
                int right= MIN(j + shift, filterSize-1);
                filter[i*filterSize +right] += filter[i*filterSize +j];
                filter[i*filterSize +j]=0;
            }
            (*filterPos)[i]= srcW - filterSize;
        }
    }

    // Note the +1 is for the MMXscaler which reads over the end
    *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
    memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));

    /* Normalize & Store in outFilter */
    for(i=0; i<dstW; i++)
    {
        int j;
        double error=0;
        double sum=0;
        double scale= one;

        for(j=0; j<filterSize; j++)
        {
            sum+= filter[i*filterSize + j];
        }
        scale/= sum;
        for(j=0; j<*outFilterSize; j++)
        {
            double v= filter[i*filterSize + j]*scale + error;
            int intV= floor(v + 0.5);
            (*outFilter)[i*(*outFilterSize) + j]= intV;
            error = v - intV;
        }
    }

    (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
    for(i=0; i<*outFilterSize; i++)
    {
        int j= dstW*(*outFilterSize);
        (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
    }

    free(filter);
}
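The "Normalize & Store" loop above quantizes each row of doubles to int16 while carrying the rounding error forward, so each integer row still sums to (very nearly) exactly 'one' despite per-coefficient rounding. The same idea in isolation (a sketch, not the original code; assumes each src row sums to 1.0):

/* Error-diffused quantization of one filter row. */
static void quantize_row(const double *src, int16_t *dst, int n, int one)
{
    double error = 0.0;
    int j;
    for (j = 0; j < n; j++) {
        double v = src[j] * one + error;   /* scale plus leftover error */
        int intV = (int)floor(v + 0.5);    /* round to nearest */
        dst[j] = intV;
        error = v - intV;                  /* push the residual to the next tap */
    }
}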
#ifdef ARCH_X86
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
    uint8_t *fragmentA;
    int imm8OfPShufW1A;
    int imm8OfPShufW2A;
    int fragmentLengthA;
    uint8_t *fragmentB;
    int imm8OfPShufW1B;
    int imm8OfPShufW2B;
    int fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine

    //code fragment

    asm volatile(
        "jmp 9f                         \n\t"
    // Begin
        "0:                             \n\t"
        "movq (%%edx, %%eax), %%mm3     \n\t"
        "movd (%%ecx, %%esi), %%mm0     \n\t"
        "movd 1(%%ecx, %%esi), %%mm1    \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "pshufw $0xFF, %%mm1, %%mm1     \n\t"
        "1:                             \n\t"
        "pshufw $0xFF, %%mm0, %%mm0     \n\t"
        "2:                             \n\t"
        "psubw %%mm1, %%mm0             \n\t"
        "movl 8(%%ebx, %%eax), %%esi    \n\t"
        "pmullw %%mm3, %%mm0            \n\t"
        "psllw $7, %%mm1                \n\t"
        "paddw %%mm1, %%mm0             \n\t"

        "movq %%mm0, (%%edi, %%eax)     \n\t"

        "addl $8, %%eax                 \n\t"
    // End
        "9:                             \n\t"
//      "int $3\n\t"
        "leal 0b, %0                    \n\t"
        "leal 1b, %1                    \n\t"
        "leal 2b, %2                    \n\t"
        "decl %1                        \n\t"
        "decl %2                        \n\t"
        "subl %0, %1                    \n\t"
        "subl %0, %2                    \n\t"
        "leal 9b, %3                    \n\t"
        "subl %0, %3                    \n\t"
        :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
         "=r" (fragmentLengthA)
    );

    asm volatile(
        "jmp 9f                         \n\t"
    // Begin
        "0:                             \n\t"
        "movq (%%edx, %%eax), %%mm3     \n\t"
        "movd (%%ecx, %%esi), %%mm0     \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "pshufw $0xFF, %%mm0, %%mm1     \n\t"
        "1:                             \n\t"
        "pshufw $0xFF, %%mm0, %%mm0     \n\t"
        "2:                             \n\t"
        "psubw %%mm1, %%mm0             \n\t"
        "movl 8(%%ebx, %%eax), %%esi    \n\t"
        "pmullw %%mm3, %%mm0            \n\t"
        "psllw $7, %%mm1                \n\t"
        "paddw %%mm1, %%mm0             \n\t"

        "movq %%mm0, (%%edi, %%eax)     \n\t"

        "addl $8, %%eax                 \n\t"
    // End
        "9:                             \n\t"
//      "int $3\n\t"
        "leal 0b, %0                    \n\t"
        "leal 1b, %1                    \n\t"
        "leal 2b, %2                    \n\t"
        "decl %1                        \n\t"
        "decl %2                        \n\t"
        "subl %0, %1                    \n\t"
        "subl %0, %2                    \n\t"
        "leal 9b, %3                    \n\t"
        "subl %0, %3                    \n\t"
        :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
         "=r" (fragmentLengthB)
    );

    xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos=0;

    for(i=0; i<dstW/numSplits; i++)
    {
        int xx=xpos>>16;

        if((i&3) == 0)
        {
            int a=0;
            int b=((xpos+xInc)>>16) - xx;
            int c=((xpos+xInc*2)>>16) - xx;
            int d=((xpos+xInc*3)>>16) - xx;

            filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
            filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
            filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
            filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
            filterPos[i/2]= xx;

            if(d+1<4)
            {
                int maxShift= 3-(d+1);
                int shift=0;

                memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);

                funnyCode[fragmentPos + imm8OfPShufW1B]=
                    (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
                funnyCode[fragmentPos + imm8OfPShufW2B]=
                    a | (b<<2) | (c<<4) | (d<<6);

                if(i+3>=dstW) shift=maxShift; //avoid overread
                else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align

                if(shift && i>=shift)
                {
                    funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
                    funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
                    filterPos[i/2]-=shift;
                }

                fragmentPos+= fragmentLengthB;
            }
            else
            {
                int maxShift= 3-d;
                int shift=0;

                memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);

                funnyCode[fragmentPos + imm8OfPShufW1A]=
                funnyCode[fragmentPos + imm8OfPShufW2A]=
                    a | (b<<2) | (c<<4) | (d<<6);

                if(i+4>=dstW) shift=maxShift; //avoid overread
                else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align

                if(shift && i>=shift)
                {
                    funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
                    funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
                    filterPos[i/2]-=shift;
                }

                fragmentPos+= fragmentLengthA;
            }

            funnyCode[fragmentPos]= RET;
        }
        xpos+=xInc;
    }
    filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
#endif // ARCH_X86
static void globalInit(){
    // generating tables:
    int i;
    for(i=0; i<768; i++){
        int c= MIN(MAX(i-256, 0), 255);
        clip_table[i]=c;
    }
}
static SwsFunc getSwsFunc(int flags){

#ifdef RUNTIME_CPUDETECT
#ifdef ARCH_X86
    // ordered per speed, fastest first
    if(flags & SWS_CPU_CAPS_MMX2)
        return swScale_MMX2;
    else if(flags & SWS_CPU_CAPS_3DNOW)
        return swScale_3DNow;
    else if(flags & SWS_CPU_CAPS_MMX)
        return swScale_MMX;
    else
        return swScale_C;

#else
#ifdef ARCH_POWERPC
    if(flags & SWS_CPU_CAPS_ALTIVEC)
        return swScale_altivec;
    else
        return swScale_C;
#endif
    return swScale_C;
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    return swScale_MMX2;
#elif defined (HAVE_3DNOW)
    return swScale_3DNow;
#elif defined (HAVE_MMX)
    return swScale_MMX;
#elif defined (HAVE_ALTIVEC)
    return swScale_altivec;
#else
    return swScale_C;
#endif
#endif //!RUNTIME_CPUDETECT
}
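With RUNTIME_CPUDETECT the dispatch is driven purely by the SWS_CPU_CAPS_* bits the caller puts into flags; the compiled-in variants are only a fallback. A hedged sketch of the caller side (the detection helper is hypothetical, the flag names are from swscale.h):

/* Build a flags word for getSwsFunc()/sws_getContext(); has_mmx/has_mmx2
 * would come from the application's own CPU detection. */
static int example_flags(int has_mmx, int has_mmx2)
{
    int flags = SWS_BILINEAR;
    if (has_mmx)  flags |= SWS_CPU_CAPS_MMX;
    if (has_mmx2) flags |= SWS_CPU_CAPS_MMX | SWS_CPU_CAPS_MMX2;
    return flags;
}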
static int PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
    uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
    /* Copy Y plane */
    if(dstStride[0]==srcStride[0])
        memcpy(dst, src[0], srcSliceH*dstStride[0]);
    else
    {
        int i;
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst;
        for(i=0; i<srcSliceH; i++)
        {
            memcpy(dstPtr, srcPtr, srcStride[0]);
            srcPtr+= srcStride[0];
            dstPtr+= dstStride[0];
        }
    }
    dst = dstParam[1] + dstStride[1]*srcSliceY;
    interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );

    return srcSliceH;
}
static int PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
    uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;

    yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );

    return srcSliceH;
}
static int PlanarToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
    uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;

    yv12touyvy( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );

    return srcSliceH;
}
/* {RGB,BGR}{15,16,24,32} -> {RGB,BGR}{15,16,24,32} */
static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
               int srcSliceH, uint8_t* dst[], int dstStride[]){
    const int srcFormat= c->srcFormat;
    const int dstFormat= c->dstFormat;
    const int srcBpp= ((srcFormat&0xFF) + 7)>>3;
    const int dstBpp= ((dstFormat&0xFF) + 7)>>3;
    const int srcId= (srcFormat&0xFF)>>2; // 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8
    const int dstId= (dstFormat&0xFF)>>2;
    void (*conv)(const uint8_t *src, uint8_t *dst, unsigned src_size)=NULL;

    /* BGR -> BGR */
    if(   (isBGR(srcFormat) && isBGR(dstFormat))
       || (isRGB(srcFormat) && isRGB(dstFormat))){
        switch(srcId | (dstId<<4)){
        case 0x34: conv= rgb16to15; break;
        case 0x36: conv= rgb24to15; break;
        case 0x38: conv= rgb32to15; break;
        case 0x43: conv= rgb15to16; break;
        case 0x46: conv= rgb24to16; break;
        case 0x48: conv= rgb32to16; break;
        case 0x63: conv= rgb15to24; break;
        case 0x64: conv= rgb16to24; break;
        case 0x68: conv= rgb32to24; break;
        case 0x83: conv= rgb15to32; break;
        case 0x84: conv= rgb16to32; break;
        case 0x86: conv= rgb24to32; break;
        default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
                 vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
        }
    }else if(   (isBGR(srcFormat) && isRGB(dstFormat))
             || (isRGB(srcFormat) && isBGR(dstFormat))){
        switch(srcId | (dstId<<4)){
        case 0x33: conv= rgb15tobgr15; break;
        case 0x34: conv= rgb16tobgr15; break;
        case 0x36: conv= rgb24tobgr15; break;
        case 0x38: conv= rgb32tobgr15; break;
        case 0x43: conv= rgb15tobgr16; break;
        case 0x44: conv= rgb16tobgr16; break;
        case 0x46: conv= rgb24tobgr16; break;
        case 0x48: conv= rgb32tobgr16; break;
        case 0x63: conv= rgb15tobgr24; break;
        case 0x64: conv= rgb16tobgr24; break;
        case 0x66: conv= rgb24tobgr24; break;
        case 0x68: conv= rgb32tobgr24; break;
        case 0x83: conv= rgb15tobgr32; break;
        case 0x84: conv= rgb16tobgr32; break;
        case 0x86: conv= rgb24tobgr32; break;
        case 0x88: conv= rgb32tobgr32; break;
        default: MSG_ERR("swScaler: internal error %s -> %s converter\n",
                 vo_format_name(srcFormat), vo_format_name(dstFormat)); break;
        }
    }else{
        MSG_ERR("swScaler: internal error %s -> %s converter\n",
             vo_format_name(srcFormat), vo_format_name(dstFormat));
    }

    if(dstStride[0]*srcBpp == srcStride[0]*dstBpp)
        conv(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
    else
    {
        int i;
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        for(i=0; i<srcSliceH; i++)
        {
            conv(srcPtr, dstPtr, c->srcW*srcBpp);
            srcPtr+= srcStride[0];
            dstPtr+= dstStride[0];
        }
    }
    return srcSliceH;
}
static int bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

    rgb24toyv12(
        src[0],
        dst[0]+ srcSliceY    *dstStride[0],
        dst[1]+(srcSliceY>>1)*dstStride[1],
        dst[2]+(srcSliceY>>1)*dstStride[2],
        c->srcW, srcSliceH,
        dstStride[0], dstStride[1], srcStride[0]);
    return srcSliceH;
}
static int yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int i;

    /* copy Y */
    if(srcStride[0]==dstStride[0])
        memcpy(dst[0]+ srcSliceY*dstStride[0], src[0], srcStride[0]*srcSliceH);
    else{
        uint8_t *srcPtr= src[0];
        uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

        for(i=0; i<srcSliceH; i++)
        {
            memcpy(dstPtr, srcPtr, c->srcW);
            srcPtr+= srcStride[0];
            dstPtr+= dstStride[0];
        }
    }

    if(c->dstFormat==IMGFMT_YV12){
        planar2x(src[1], dst[1], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[1]);
        planar2x(src[2], dst[2], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[2]);
    }else{
        planar2x(src[1], dst[2], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[2]);
        planar2x(src[2], dst[1], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[1]);
    }
    return srcSliceH;
}
/**
* bring pointers in YUV order instead of YVU
*/
static inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
    if(format == IMGFMT_YV12 || format == IMGFMT_YVU9
       || format == IMGFMT_444P || format == IMGFMT_422P || format == IMGFMT_411P){
        sortedP[0]= p[0];
        sortedP[1]= p[2];
        sortedP[2]= p[1];
        sortedStride[0]= stride[0];
        sortedStride[1]= stride[2];
        sortedStride[2]= stride[1];
    }
    else if(isPacked(format) || isGray(format) || format == IMGFMT_Y8)
    {
        sortedP[0]= p[0];
        sortedP[1]=
        sortedP[2]= NULL;
        sortedStride[0]= stride[0];
        sortedStride[1]=
        sortedStride[2]= 0;
    }
    else if(format == IMGFMT_I420 || format == IMGFMT_IYUV)
    {
        sortedP[0]= p[0];
        sortedP[1]= p[1];
        sortedP[2]= p[2];
        sortedStride[0]= stride[0];
        sortedStride[1]= stride[1];
        sortedStride[2]= stride[2];
    }else{
        MSG_ERR("internal error in orderYUV\n");
    }
}
/* unscaled copy like stuff (assumes nearly identical formats) */
static int simpleCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

    if(isPacked(c->srcFormat))
    {
        if(dstStride[0]==srcStride[0])
            memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
        else
        {
            int i;
            uint8_t *srcPtr= src[0];
            uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
            int length=0;

            /* universal length finder */
            while(length+c->srcW <= ABS(dstStride[0])
               && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
            ASSERT(length!=0);

            for(i=0; i<srcSliceH; i++)
            {
                memcpy(dstPtr, srcPtr, length);
                srcPtr+= srcStride[0];
                dstPtr+= dstStride[0];
            }
        }
    }
    else
    { /* Planar YUV or gray */
        int plane;
        for(plane=0; plane<3; plane++)
        {
            int length= plane==0 ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
            int y=      plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
            int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);

            if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
            {
                if(!isGray(c->dstFormat))
                    memset(dst[plane], 128, dstStride[plane]*height);
            }
            else
            {
                if(dstStride[plane]==srcStride[plane])
                    memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
                else
                {
                    int i;
                    uint8_t *srcPtr= src[plane];
                    uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
                    for(i=0; i<height; i++)
                    {
                        memcpy(dstPtr, srcPtr, length);
                        srcPtr+= srcStride[plane];
                        dstPtr+= dstStride[plane];
                    }
                }
            }
        }
    }
    return srcSliceH;
}
static int remove_dup_fourcc(int fourcc)
{
    switch(fourcc)
    {
        case IMGFMT_I420:
        case IMGFMT_IYUV: return IMGFMT_YV12;
        case IMGFMT_Y8  : return IMGFMT_Y800;
        case IMGFMT_IF09: return IMGFMT_YVU9;
        default: return fourcc;
    }
}
static void getSubSampleFactors(int *h, int *v, int format){
    switch(format){
    case IMGFMT_UYVY:
    case IMGFMT_YUY2:
        *h=1;
        *v=0;
        break;
    case IMGFMT_YV12:
    case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
        *h=1;
        *v=1;
        break;
    case IMGFMT_YVU9:
        *h=2;
        *v=2;
        break;
    case IMGFMT_444P:
        *h=0;
        *v=0;
        break;
    case IMGFMT_422P:
        *h=1;
        *v=0;
        break;
    case IMGFMT_411P:
        *h=2;
        *v=0;
        break;
    default:
        *h=0;
        *v=0;
        break;
    }
}
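Note that the factors are shifts, not divisors: h=1, v=1 for YV12 means chroma is stored at half resolution in both directions. Combined with the -((-x)>>s) idiom used in sws_getContext() below, odd sizes round up. A worked sketch (not in the original):

/* Chroma plane extent for a given subsample shift; -((-x)>>shift) rounds
 * toward +infinity, so a 175-wide YV12 image gets 88 chroma columns. */
static int chroma_extent(int x, int shift)
{
    return -((-x) >> shift);
}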
static uint16_t roundToInt16(int64_t f){
    int r= (f + (1<<15))>>16;

         if(r<-0x7FFF) return 0x8000;
    else if(r> 0x7FFF) return 0x7FFF;
    else               return r;
}
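roundToInt16() turns a 16.16 fixed-point value into a saturated int16 with round-to-nearest. A few illustrative evaluations (not in the original):

static void roundToInt16_demo(void)
{
    /* the argument is 16.16 fixed point: 3<<15 represents 1.5 */
    uint16_t two     = roundToInt16(3 << 15);              /* 2: rounds to nearest */
    uint16_t pos_sat = roundToInt16((int64_t)1 << 31);     /* 0x7FFF: clips at +32767 */
    uint16_t neg_sat = roundToInt16(-((int64_t)1 << 31));  /* 0x8000: clips at -32768 */
    (void)two; (void)pos_sat; (void)neg_sat;
}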
/**
* @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
* @param fullRange if 1 then the luma range is 0..255 if 0 its 16..235
* @return -1 if not supported
*/
int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){
    int64_t crv =  inv_table[0];
    int64_t cbu =  inv_table[1];
    int64_t cgu = -inv_table[2];
    int64_t cgv = -inv_table[3];
    int64_t cy  = 1<<16;
    int64_t oy  = 0;

    if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
    memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
    memcpy(c->dstColorspaceTable,     table, sizeof(int)*4);

    c->brightness= brightness;
    c->contrast  = contrast;
    c->saturation= saturation;
    c->srcRange  = srcRange;
    c->dstRange  = dstRange;

    c->uOffset=   0x0400040004000400LL;
    c->vOffset=   0x0400040004000400LL;

    if(!srcRange){
        cy= (cy*255) / 219;
        oy= 16<<16;
    }

    cy = (cy *contrast             )>>16;
    crv= (crv*contrast * saturation)>>32;
    cbu= (cbu*contrast * saturation)>>32;
    cgu= (cgu*contrast * saturation)>>32;
    cgv= (cgv*contrast * saturation)>>32;

    oy -= 256*brightness;

    c->yCoeff=    roundToInt16(cy *8192) * 0x0001000100010001ULL;
    c->vrCoeff=   roundToInt16(crv*8192) * 0x0001000100010001ULL;
    c->ubCoeff=   roundToInt16(cbu*8192) * 0x0001000100010001ULL;
    c->vgCoeff=   roundToInt16(cgv*8192) * 0x0001000100010001ULL;
    c->ugCoeff=   roundToInt16(cgu*8192) * 0x0001000100010001ULL;
    c->yOffset=   roundToInt16(oy *   8) * 0x0001000100010001ULL;

    yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
    //FIXME factorize

#ifdef HAVE_ALTIVEC
    yuv2rgb_altivec_init_tables (c, inv_table);
#endif
    return 0;
}
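A sketch of a neutral call, matching the defaults applied by sws_getContext() below (contrast and saturation are 16.16 fixed point, so 1<<16 means 1.0; illustrative only):

static int set_neutral_colorspace(SwsContext *c)
{
    return sws_setColorspaceDetails(c,
                Inverse_Table_6_9[SWS_CS_DEFAULT], 0,  /* src table, limited range */
                Inverse_Table_6_9[SWS_CS_DEFAULT], 0,  /* dst table, limited range */
                0, 1<<16, 1<<16);  /* brightness, contrast, saturation */
}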
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){
    if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;

    *inv_table = c->srcColorspaceTable;
    *table     = c->dstColorspaceTable;
    *srcRange  = c->srcRange;
    *dstRange  = c->dstRange;
    *brightness= c->brightness;
    *contrast  = c->contrast;
    *saturation= c->saturation;

    return 0;
}
SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int dstH, int origDstFormat, int flags,
                           SwsFilter *srcFilter, SwsFilter *dstFilter){

    SwsContext *c;
    int i;
    int usesVFilter, usesHFilter;
    int unscaled, needsDither;
    int srcFormat, dstFormat;
    SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
#ifdef ARCH_X86
    if(flags & SWS_CPU_CAPS_MMX)
        asm volatile("emms\n\t"::: "memory");
#endif

#ifndef RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
    flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC);
#ifdef HAVE_MMX2
    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
#elif defined (HAVE_3DNOW)
    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
#elif defined (HAVE_MMX)
    flags |= SWS_CPU_CAPS_MMX;
#elif defined (HAVE_ALTIVEC)
    flags |= SWS_CPU_CAPS_ALTIVEC;
#endif
#endif
    if(clip_table[512] != 255) globalInit();
    if(rgb15to16 == NULL) sws_rgb2rgb_init(flags);

    /* avoid duplicate Formats, so we don't need to check too much */
    srcFormat = remove_dup_fourcc(origSrcFormat);
    dstFormat = remove_dup_fourcc(origDstFormat);

    unscaled = (srcW == dstW && srcH == dstH);
    needsDither= (isBGR(dstFormat) || isRGB(dstFormat))
             && (dstFormat&0xFF)<24
             && ((dstFormat&0xFF)<(srcFormat&0xFF) || (!(isRGB(srcFormat) || isBGR(srcFormat))));

    if(!isSupportedIn(srcFormat))
    {
        MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
        return NULL;
    }
    if(!isSupportedOut(dstFormat))
    {
        MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
        return NULL;
    }

    /* sanity check */
    if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lower them after fixing the relevant parts of the code
    {
        MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
            srcW, srcH, dstW, dstH);
        return NULL;
    }

    if(!dstFilter) dstFilter= &dummyFilter;
    if(!srcFilter) srcFilter= &dummyFilter;

    c= memalign(64, sizeof(SwsContext));
    memset(c, 0, sizeof(SwsContext));

    c->srcW= srcW;
    c->srcH= srcH;
    c->dstW= dstW;
    c->dstH= dstH;
    c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
    c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
    c->flags= flags;
    c->dstFormat= dstFormat;
    c->srcFormat= srcFormat;
    c->origDstFormat= origDstFormat;
    c->origSrcFormat= origSrcFormat;
    c->vRounder= 4* 0x0001000100010001ULL;

    usesHFilter= usesVFilter= 0;
    if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesVFilter=1;
    if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesHFilter=1;
    if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesVFilter=1;
    if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesHFilter=1;
    if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesVFilter=1;
    if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesHFilter=1;
    if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesVFilter=1;
    if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesHFilter=1;

    getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
    getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);

    // reuse chroma for 2 pixels rgb/bgr unless user wants full chroma interpolation
    if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;

    // drop some chroma lines if the user wants it
    c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
    c->chrSrcVSubSample+= c->vChrDrop;

    // drop every 2nd pixel for chroma calculation unless user wants full chroma
    if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP))
        c->chrSrcHSubSample=1;

    c->chrIntHSubSample= c->chrDstHSubSample;
    c->chrIntVSubSample= c->chrSrcVSubSample;

    // note the -((-x)>>y) is so that we always round toward +inf
    c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
    c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
    c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
    c->chrDstH= -((-dstH) >> c->chrDstVSubSample);

    sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, 0, 0, 1<<16, 1<<16);

    /* unscaled special Cases */
    if(unscaled && !usesHFilter && !usesVFilter)
    {
        /* yv12_to_nv12 */
        if(srcFormat == IMGFMT_YV12 && dstFormat == IMGFMT_NV12)
        {
            c->swScale= PlanarToNV12Wrapper;
        }
        /* yuv2bgr */
        if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
        {
            c->swScale= yuv2rgb_get_func_ptr(c);
        }

        if( srcFormat==IMGFMT_YVU9 && dstFormat==IMGFMT_YV12 )
        {
            c->swScale= yvu9toyv12Wrapper;
        }

        /* bgr24toYV12 */
        if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
            c->swScale= bgr24toyv12Wrapper;

        /* rgb/bgr -> rgb/bgr (no dither needed forms) */
        if(   (isBGR(srcFormat) || isRGB(srcFormat))
           && (isBGR(dstFormat) || isRGB(dstFormat))
           && !needsDither)
            c->swScale= rgb2rgbWrapper;

        /* LQ converters if -sws 0 or -sws 4*/
        if(c->flags&(SWS_FAST_BILINEAR|SWS_POINT)){
            /* rgb/bgr -> rgb/bgr (dither needed forms) */
            if(  (isBGR(srcFormat) || isRGB(srcFormat))
              && (isBGR(dstFormat) || isRGB(dstFormat))
              && needsDither)
                c->swScale= rgb2rgbWrapper;

            /* yv12_to_yuy2 */
            if(srcFormat == IMGFMT_YV12 &&
                (dstFormat == IMGFMT_YUY2 || dstFormat == IMGFMT_UYVY))
            {
                if (dstFormat == IMGFMT_YUY2)
                    c->swScale= PlanarToYuy2Wrapper;
                else
                    c->swScale= PlanarToUyvyWrapper;
            }
        }

#ifdef HAVE_ALTIVEC
        if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
            ((srcFormat == IMGFMT_YV12 &&
              (dstFormat == IMGFMT_YUY2 || dstFormat == IMGFMT_UYVY)))) {
            // unscaled YV12 -> packed YUV, we want speed
            if (dstFormat == IMGFMT_YUY2)
                c->swScale= yv12toyuy2_unscaled_altivec;
            else
                c->swScale= yv12touyvy_unscaled_altivec;
        }
#endif

        /* simple copy */
        if(   srcFormat == dstFormat
           || (isPlanarYUV(srcFormat) && isGray(dstFormat))
           || (isPlanarYUV(dstFormat) && isGray(srcFormat))
          )
        {
            c->swScale= simpleCopy;
        }

        if(c->swScale){
            if(flags&SWS_PRINT_INFO)
                MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
                    vo_format_name(srcFormat), vo_format_name(dstFormat));
            return c;
        }
    }

    if(flags & SWS_CPU_CAPS_MMX2)
    {
        c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
        if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
        {
            if(flags&SWS_PRINT_INFO)
                MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
        }
        if(usesHFilter) c->canMMX2BeUsed=0;
    }
    else
        c->canMMX2BeUsed=0;

    c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
    c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;

    // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// but only for the FAST_BILINEAR mode otherwise do correct scaling
// n-2 is the last chrominance sample available
// this is not perfect, but noone shuld notice the difference, the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if
(
flags
&
SWS_FAST_BILINEAR
)
{
if
(
c
->
canMMX2BeUsed
)
{
c
->
lumXInc
+=
20
;
c
->
chrXInc
+=
20
;
}
//we don't use the x86asm scaler if mmx is available
else
if
(
flags
&
SWS_CPU_CAPS_MMX
)
{
c
->
lumXInc
=
((
srcW
-
2
)
<<
16
)
/
(
dstW
-
2
)
-
20
;
c
->
chrXInc
=
((
c
->
chrSrcW
-
2
)
<<
16
)
/
(
c
->
chrDstW
-
2
)
-
20
;
}
}
/* precalculate horizontal scaler filter coefficients */
{
const
int
filterAlign
=
(
flags
&
SWS_CPU_CAPS_MMX
)
?
4
:
(
flags
&
SWS_CPU_CAPS_ALTIVEC
)
?
8
:
1
;
initFilter
(
&
c
->
hLumFilter
,
&
c
->
hLumFilterPos
,
&
c
->
hLumFilterSize
,
c
->
lumXInc
,
srcW
,
dstW
,
filterAlign
,
1
<<
14
,
(
flags
&
SWS_BICUBLIN
)
?
(
flags
|
SWS_BICUBIC
)
:
flags
,
srcFilter
->
lumH
,
dstFilter
->
lumH
);
initFilter
(
&
c
->
hChrFilter
,
&
c
->
hChrFilterPos
,
&
c
->
hChrFilterSize
,
c
->
chrXInc
,
c
->
chrSrcW
,
c
->
chrDstW
,
filterAlign
,
1
<<
14
,
(
flags
&
SWS_BICUBLIN
)
?
(
flags
|
SWS_BILINEAR
)
:
flags
,
srcFilter
->
chrH
,
dstFilter
->
chrH
);
#ifdef ARCH_X86
// can't downscale !!!
if
(
c
->
canMMX2BeUsed
&&
(
flags
&
SWS_FAST_BILINEAR
))
{
c
->
lumMmx2Filter
=
(
int16_t
*
)
memalign
(
8
,
(
dstW
/
8
+
8
)
*
sizeof
(
int16_t
));
c
->
chrMmx2Filter
=
(
int16_t
*
)
memalign
(
8
,
(
c
->
chrDstW
/
4
+
8
)
*
sizeof
(
int16_t
));
c
->
lumMmx2FilterPos
=
(
int32_t
*
)
memalign
(
8
,
(
dstW
/
2
/
8
+
8
)
*
sizeof
(
int32_t
));
c
->
chrMmx2FilterPos
=
(
int32_t
*
)
memalign
(
8
,
(
c
->
chrDstW
/
2
/
4
+
8
)
*
sizeof
(
int32_t
));
initMMX2HScaler
(
dstW
,
c
->
lumXInc
,
c
->
funnyYCode
,
c
->
lumMmx2Filter
,
c
->
lumMmx2FilterPos
,
8
);
initMMX2HScaler
(
c
->
chrDstW
,
c
->
chrXInc
,
c
->
funnyUVCode
,
c
->
chrMmx2Filter
,
c
->
chrMmx2FilterPos
,
4
);
}
#endif
}
// Init Horizontal stuff
/* precalculate vertical scaler filter coefficients */
{
const
int
filterAlign
=
(
flags
&
SWS_CPU_CAPS_ALTIVEC
)
?
8
:
1
;
initFilter
(
&
c
->
vLumFilter
,
&
c
->
vLumFilterPos
,
&
c
->
vLumFilterSize
,
c
->
lumYInc
,
srcH
,
dstH
,
filterAlign
,
(
1
<<
12
)
-
4
,
(
flags
&
SWS_BICUBLIN
)
?
(
flags
|
SWS_BICUBIC
)
:
flags
,
srcFilter
->
lumV
,
dstFilter
->
lumV
);
initFilter
(
&
c
->
vChrFilter
,
&
c
->
vChrFilterPos
,
&
c
->
vChrFilterSize
,
c
->
chrYInc
,
c
->
chrSrcH
,
c
->
chrDstH
,
filterAlign
,
(
1
<<
12
)
-
4
,
(
flags
&
SWS_BICUBLIN
)
?
(
flags
|
SWS_BILINEAR
)
:
flags
,
srcFilter
->
chrV
,
dstFilter
->
chrV
);
}
// Calculate Buffer Sizes so that they won't run out while handling these damn slices
c
->
vLumBufSize
=
c
->
vLumFilterSize
;
c
->
vChrBufSize
=
c
->
vChrFilterSize
;
for
(
i
=
0
;
i
<
dstH
;
i
++
)
{
int
chrI
=
i
*
c
->
chrDstH
/
dstH
;
int
nextSlice
=
MAX
(
c
->
vLumFilterPos
[
i
]
+
c
->
vLumFilterSize
-
1
,
((
c
->
vChrFilterPos
[
chrI
]
+
c
->
vChrFilterSize
-
1
)
<<
c
->
chrSrcVSubSample
));
nextSlice
>>=
c
->
chrSrcVSubSample
;
nextSlice
<<=
c
->
chrSrcVSubSample
;
if
(
c
->
vLumFilterPos
[
i
]
+
c
->
vLumBufSize
<
nextSlice
)
c
->
vLumBufSize
=
nextSlice
-
c
->
vLumFilterPos
[
i
];
if
(
c
->
vChrFilterPos
[
chrI
]
+
c
->
vChrBufSize
<
(
nextSlice
>>
c
->
chrSrcVSubSample
))
c
->
vChrBufSize
=
(
nextSlice
>>
c
->
chrSrcVSubSample
)
-
c
->
vChrFilterPos
[
chrI
];
}
// allocate pixbufs (we use dynamic allocation because otherwise we would need to
c
->
lumPixBuf
=
(
int16_t
**
)
memalign
(
4
,
c
->
vLumBufSize
*
2
*
sizeof
(
int16_t
*
));
c
->
chrPixBuf
=
(
int16_t
**
)
memalign
(
4
,
c
->
vChrBufSize
*
2
*
sizeof
(
int16_t
*
));
//Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
for
(
i
=
0
;
i
<
c
->
vLumBufSize
;
i
++
)
c
->
lumPixBuf
[
i
]
=
c
->
lumPixBuf
[
i
+
c
->
vLumBufSize
]
=
(
uint16_t
*
)
memalign
(
8
,
4000
);
for
(
i
=
0
;
i
<
c
->
vChrBufSize
;
i
++
)
c
->
chrPixBuf
[
i
]
=
c
->
chrPixBuf
[
i
+
c
->
vChrBufSize
]
=
(
uint16_t
*
)
memalign
(
8
,
8000
);
//try to avoid drawing green stuff between the right end and the stride end
for
(
i
=
0
;
i
<
c
->
vLumBufSize
;
i
++
)
memset
(
c
->
lumPixBuf
[
i
],
0
,
4000
);
for
(
i
=
0
;
i
<
c
->
vChrBufSize
;
i
++
)
memset
(
c
->
chrPixBuf
[
i
],
64
,
8000
);
ASSERT
(
c
->
chrDstH
<=
dstH
)
if
(
flags
&
SWS_PRINT_INFO
)
{
#ifdef DITHER1XBPP
char
*
dither
=
" dithered"
;
#else
char
*
dither
=
""
;
#endif
if
(
flags
&
SWS_FAST_BILINEAR
)
MSG_INFO
(
"
\n
SwScaler: FAST_BILINEAR scaler, "
);
else
if
(
flags
&
SWS_BILINEAR
)
MSG_INFO
(
"
\n
SwScaler: BILINEAR scaler, "
);
else
if
(
flags
&
SWS_BICUBIC
)
MSG_INFO
(
"
\n
SwScaler: BICUBIC scaler, "
);
else
if
(
flags
&
SWS_X
)
MSG_INFO
(
"
\n
SwScaler: Experimental scaler, "
);
else
if
(
flags
&
SWS_POINT
)
MSG_INFO
(
"
\n
SwScaler: Nearest Neighbor / POINT scaler, "
);
else
if
(
flags
&
SWS_AREA
)
MSG_INFO
(
"
\n
SwScaler: Area Averageing scaler, "
);
else
if
(
flags
&
SWS_BICUBLIN
)
MSG_INFO
(
"
\n
SwScaler: luma BICUBIC / chroma BILINEAR scaler, "
);
else
if
(
flags
&
SWS_GAUSS
)
MSG_INFO
(
"
\n
SwScaler: Gaussian scaler, "
);
else
if
(
flags
&
SWS_SINC
)
MSG_INFO
(
"
\n
SwScaler: Sinc scaler, "
);
else
if
(
flags
&
SWS_LANCZOS
)
MSG_INFO
(
"
\n
SwScaler: Lanczos scaler, "
);
else
if
(
flags
&
SWS_SPLINE
)
MSG_INFO
(
"
\n
SwScaler: Bicubic spline scaler, "
);
else
MSG_INFO
(
"
\n
SwScaler: ehh flags invalid?! "
);
if
(
dstFormat
==
IMGFMT_BGR15
||
dstFormat
==
IMGFMT_BGR16
)
MSG_INFO
(
"from %s to%s %s "
,
vo_format_name
(
srcFormat
),
dither
,
vo_format_name
(
dstFormat
));
else
MSG_INFO
(
"from %s to %s "
,
vo_format_name
(
srcFormat
),
vo_format_name
(
dstFormat
));
if
(
flags
&
SWS_CPU_CAPS_MMX2
)
MSG_INFO
(
"using MMX2
\n
"
);
else
if
(
flags
&
SWS_CPU_CAPS_3DNOW
)
MSG_INFO
(
"using 3DNOW
\n
"
);
else
if
(
flags
&
SWS_CPU_CAPS_MMX
)
MSG_INFO
(
"using MMX
\n
"
);
else
if
(
flags
&
SWS_CPU_CAPS_ALTIVEC
)
MSG_INFO
(
"using AltiVec
\n
"
);
else
MSG_INFO
(
"using C
\n
"
);
}
if
(
flags
&
SWS_PRINT_INFO
)
{
if
(
flags
&
SWS_CPU_CAPS_MMX
)
{
if
(
c
->
canMMX2BeUsed
&&
(
flags
&
SWS_FAST_BILINEAR
))
MSG_V
(
"SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling
\n
"
);
else
{
if
(
c
->
hLumFilterSize
==
4
)
MSG_V
(
"SwScaler: using 4-tap MMX scaler for horizontal luminance scaling
\n
"
);
else
if
(
c
->
hLumFilterSize
==
8
)
MSG_V
(
"SwScaler: using 8-tap MMX scaler for horizontal luminance scaling
\n
"
);
else
MSG_V
(
"SwScaler: using n-tap MMX scaler for horizontal luminance scaling
\n
"
);
if
(
c
->
hChrFilterSize
==
4
)
MSG_V
(
"SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling
\n
"
);
else
if
(
c
->
hChrFilterSize
==
8
)
MSG_V
(
"SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling
\n
"
);
else
MSG_V
(
"SwScaler: using n-tap MMX scaler for horizontal chrominance scaling
\n
"
);
}
}
else
{
#ifdef ARCH_X86
MSG_V
(
"SwScaler: using X86-Asm scaler for horizontal scaling
\n
"
);
#else
if
(
flags
&
SWS_FAST_BILINEAR
)
MSG_V
(
"SwScaler: using FAST_BILINEAR C scaler for horizontal scaling
\n
"
);
else
MSG_V
(
"SwScaler: using C scaler for horizontal scaling
\n
"
);
#endif
}
if
(
isPlanarYUV
(
dstFormat
))
{
if
(
c
->
vLumFilterSize
==
1
)
MSG_V
(
"SwScaler: using 1-tap %s
\"
scaler
\"
for vertical scaling (YV12 like)
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
else
MSG_V
(
"SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
}
else
{
if
(
c
->
vLumFilterSize
==
1
&&
c
->
vChrFilterSize
==
2
)
MSG_V
(
"SwScaler: using 1-tap %s
\"
scaler
\"
for vertical luminance scaling (BGR)
\n
"
"SwScaler: 2-tap scaler for vertical chrominance scaling (BGR)
\n
"
,(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
else
if
(
c
->
vLumFilterSize
==
2
&&
c
->
vChrFilterSize
==
2
)
MSG_V
(
"SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
else
MSG_V
(
"SwScaler: using n-tap %s scaler for vertical scaling (BGR)
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
}
if
(
dstFormat
==
IMGFMT_BGR24
)
MSG_V
(
"SwScaler: using %s YV12->BGR24 Converter
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX2
)
?
"MMX2"
:
((
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
));
else
if
(
dstFormat
==
IMGFMT_BGR32
)
MSG_V
(
"SwScaler: using %s YV12->BGR32 Converter
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
else
if
(
dstFormat
==
IMGFMT_BGR16
)
MSG_V
(
"SwScaler: using %s YV12->BGR16 Converter
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
else
if
(
dstFormat
==
IMGFMT_BGR15
)
MSG_V
(
"SwScaler: using %s YV12->BGR15 Converter
\n
"
,
(
flags
&
SWS_CPU_CAPS_MMX
)
?
"MMX"
:
"C"
);
MSG_V
(
"SwScaler: %dx%d -> %dx%d
\n
"
,
srcW
,
srcH
,
dstW
,
dstH
);
}
if
(
flags
&
SWS_PRINT_INFO
)
{
MSG_DBG2
(
"SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d
\n
"
,
c
->
srcW
,
c
->
srcH
,
c
->
dstW
,
c
->
dstH
,
c
->
lumXInc
,
c
->
lumYInc
);
MSG_DBG2
(
"SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d
\n
"
,
c
->
chrSrcW
,
c
->
chrSrcH
,
c
->
chrDstW
,
c
->
chrDstH
,
c
->
chrXInc
,
c
->
chrYInc
);
}
c
->
swScale
=
getSwsFunc
(
flags
);
return
c
;
}
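Aside (illustrative only, not part of this commit): the -((-x) >> s) idiom used above for the chroma sizes is worth a worked example. With an arithmetic right shift, a plain x >> s rounds toward -inf, while -((-x) >> s) rounds toward +inf, i.e. it is a ceiling division by 2^s:

/* standalone demo of the rounding trick */
#include <stdio.h>

int main(void)
{
    int srcW = 101;                        /* odd width, 4:2:0 subsampling */
    int sub  = 1;
    printf("%d\n", srcW >> sub);           /* 50 = floor(101/2) */
    printf("%d\n", -((-srcW) >> sub));     /* 51 = ceil(101/2)  */
    return 0;
}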
/**
 * swscale wrapper, so we don't need to export the SwsContext.
 * assumes planar YUV to be in YUV order instead of YVU
 */
int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                      int srcSliceH, uint8_t* dst[], int dstStride[]){
	//copy strides, so they can safely be modified
	int srcStride2[3]= {srcStride[0], srcStride[1], srcStride[2]};
	int dstStride2[3]= {dstStride[0], dstStride[1], dstStride[2]};
	return c->swScale(c, src, srcStride2, srcSliceY, srcSliceH, dst, dstStride2);
}
/**
 * swscale wrapper, so we don't need to export the SwsContext
 */
int sws_scale(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
              int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
	int srcStride[3];
	int dstStride[3];
	uint8_t *src[3];
	uint8_t *dst[3];
	sws_orderYUV(c->origSrcFormat, src, srcStride, srcParam, srcStrideParam);
	sws_orderYUV(c->origDstFormat, dst, dstStride, dstParam, dstStrideParam);
//printf("sws: slice %d %d\n", srcSliceY, srcSliceH);

	return c->swScale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
}
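A minimal caller sketch of the difference between the two wrappers (the pic fields below are hypothetical, not from this patch): YV12 stores its planes in Y, V, U order, and sws_scale() reorders them via sws_orderYUV() based on origSrcFormat, whereas sws_scale_ordered() expects the caller to pass Y, U, V already.

/* hypothetical YV12 caller: planes passed in the format's native Y,V,U order */
uint8_t *planes[3]  = { pic_y, pic_v, pic_u };
int      strides[3] = { y_stride, c_stride, c_stride };
sws_scale(ctx, planes, strides, 0, srcH, dst_planes, dst_strides);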
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
				float lumaSharpen, float chromaSharpen,
				float chromaHShift, float chromaVShift,
				int verbose)
{
	SwsFilter *filter= malloc(sizeof(SwsFilter));

	if(lumaGBlur!=0.0){
		filter->lumH= sws_getGaussianVec(lumaGBlur, 3.0);
		filter->lumV= sws_getGaussianVec(lumaGBlur, 3.0);
	}else{
		filter->lumH= sws_getIdentityVec();
		filter->lumV= sws_getIdentityVec();
	}

	if(chromaGBlur!=0.0){
		filter->chrH= sws_getGaussianVec(chromaGBlur, 3.0);
		filter->chrV= sws_getGaussianVec(chromaGBlur, 3.0);
	}else{
		filter->chrH= sws_getIdentityVec();
		filter->chrV= sws_getIdentityVec();
	}

	if(chromaSharpen!=0.0){
		SwsVector *g= sws_getConstVec(-1.0, 3);
		SwsVector *id= sws_getConstVec(10.0/chromaSharpen, 1);
		g->coeff[1]=2.0;
		sws_addVec(id, g);
		sws_convVec(filter->chrH, id);
		sws_convVec(filter->chrV, id);
		sws_freeVec(g);
		sws_freeVec(id);
	}

	if(lumaSharpen!=0.0){
		SwsVector *g= sws_getConstVec(-1.0, 3);
		SwsVector *id= sws_getConstVec(10.0/lumaSharpen, 1);
		g->coeff[1]=2.0;
		sws_addVec(id, g);
		sws_convVec(filter->lumH, id);
		sws_convVec(filter->lumV, id);
		sws_freeVec(g);
		sws_freeVec(id);
	}

	if(chromaHShift != 0.0)
		sws_shiftVec(filter->chrH, (int)(chromaHShift+0.5));

	if(chromaVShift != 0.0)
		sws_shiftVec(filter->chrV, (int)(chromaVShift+0.5));

	sws_normalizeVec(filter->chrH, 1.0);
	sws_normalizeVec(filter->chrV, 1.0);
	sws_normalizeVec(filter->lumH, 1.0);
	sws_normalizeVec(filter->lumV, 1.0);

	if(verbose) sws_printVec(filter->chrH);
	if(verbose) sws_printVec(filter->lumH);

	return filter;
}
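A usage sketch, assuming only the API shown here (the parameter choices are illustrative): build a filter with a slight luma Gaussian blur and a half-pixel chroma shift, hand it to the scaler as the source filter, then free it.

/* illustrative filter setup, not part of the patch */
SwsFilter *f = sws_getDefaultFilter(0.5 /*lumaGBlur*/, 0.0, 0.0, 0.0,
                                    0.5 /*chromaHShift*/, 0.0, 0 /*verbose*/);
/* ... pass f as srcFilter to sws_getContext() ... */
sws_freeFilter(f);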
/**
 * returns a normalized gaussian curve used to filter stuff
 * quality=3 is high quality, lower is lower quality
 */
SwsVector *sws_getGaussianVec(double variance, double quality){
	const int length= (int)(variance*quality + 0.5) | 1;
	int i;
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	double middle= (length-1)*0.5;
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= length;

	for(i=0; i<length; i++)
	{
		double dist= i-middle;
		coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
	}

	sws_normalizeVec(vec, 1.0);

	return vec;
}
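For reference, each coefficient above is a sample of a Gaussian that is then renormalized to sum to 1, so the constant in front cancels (note the parameter named variance is effectively used as the standard deviation sigma, and |1 forces the length odd so the filter has a center tap):

c_i \propto \exp\!\left(-\frac{(i-m)^2}{2\,\sigma^2}\right),\qquad
m=\frac{L-1}{2},\quad L=\bigl(\lfloor \sigma q + 0.5\rfloor\bigr)\,\vert\,1,\qquad
\sum_i c_i = 1 \text{ after normalization.}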
SwsVector *sws_getConstVec(double c, int length){
	int i;
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= length;

	for(i=0; i<length; i++)
		coeff[i]= c;

	return vec;
}
SwsVector *sws_getIdentityVec(void){
	double *coeff= memalign(sizeof(double), sizeof(double));
	SwsVector *vec= malloc(sizeof(SwsVector));
	coeff[0]= 1.0;

	vec->coeff= coeff;
	vec->length= 1;

	return vec;
}
void sws_normalizeVec(SwsVector *a, double height){
	int i;
	double sum=0;
	double inv;

	for(i=0; i<a->length; i++)
		sum+= a->coeff[i];

	inv= height/sum;

	for(i=0; i<a->length; i++)
		a->coeff[i]*= inv;
}
void sws_scaleVec(SwsVector *a, double scalar){
	int i;

	for(i=0; i<a->length; i++)
		a->coeff[i]*= scalar;
}
static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b){
	int length= a->length + b->length - 1;
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	int i, j;
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= length;

	for(i=0; i<length; i++) coeff[i]= 0.0;

	for(i=0; i<a->length; i++)
	{
		for(j=0; j<b->length; j++)
		{
			coeff[i+j]+= a->coeff[i]*b->coeff[j];
		}
	}

	return vec;
}
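The double loop above is plain discrete convolution; the result has length |a|+|b|-1:

(a * b)[k] \;=\; \sum_{i+j=k} a[i]\,b[j], \qquad 0 \le k \le |a|+|b|-2.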
static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b){
	int length= MAX(a->length, b->length);
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	int i;
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= length;

	for(i=0; i<length; i++) coeff[i]= 0.0;

	for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
	for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];

	return vec;
}
static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b){
	int length= MAX(a->length, b->length);
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	int i;
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= length;

	for(i=0; i<length; i++) coeff[i]= 0.0;

	for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
	for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];

	return vec;
}
/* shift left / or right if "shift" is negative */
static SwsVector *sws_getShiftedVec(SwsVector *a, int shift){
	int length= a->length + ABS(shift)*2;
	double *coeff= memalign(sizeof(double), length*sizeof(double));
	int i;
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= length;

	for(i=0; i<length; i++) coeff[i]= 0.0;

	for(i=0; i<a->length; i++)
	{
		coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
	}

	return vec;
}
void sws_shiftVec(SwsVector *a, int shift){
	SwsVector *shifted= sws_getShiftedVec(a, shift);
	free(a->coeff);
	a->coeff= shifted->coeff;
	a->length= shifted->length;
	free(shifted);
}
void sws_addVec(SwsVector *a, SwsVector *b){
	SwsVector *sum= sws_sumVec(a, b);
	free(a->coeff);
	a->coeff= sum->coeff;
	a->length= sum->length;
	free(sum);
}
void sws_subVec(SwsVector *a, SwsVector *b){
	SwsVector *diff= sws_diffVec(a, b);
	free(a->coeff);
	a->coeff= diff->coeff;
	a->length= diff->length;
	free(diff);
}
void sws_convVec(SwsVector *a, SwsVector *b){
	SwsVector *conv= sws_getConvVec(a, b);
	free(a->coeff);
	a->coeff= conv->coeff;
	a->length= conv->length;
	free(conv);
}
SwsVector *sws_cloneVec(SwsVector *a){
	double *coeff= memalign(sizeof(double), a->length*sizeof(double));
	int i;
	SwsVector *vec= malloc(sizeof(SwsVector));

	vec->coeff= coeff;
	vec->length= a->length;

	for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];

	return vec;
}
void sws_printVec(SwsVector *a){
	int i;
	double max=0;
	double min=0;
	double range;

	for(i=0; i<a->length; i++)
		if(a->coeff[i]>max) max= a->coeff[i];

	for(i=0; i<a->length; i++)
		if(a->coeff[i]<min) min= a->coeff[i];

	range= max - min;

	for(i=0; i<a->length; i++)
	{
		int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
		MSG_DBG2("%1.3f ", a->coeff[i]);
		for(;x>0; x--) MSG_DBG2(" ");
		MSG_DBG2("|\n");
	}
}
void sws_freeVec(SwsVector *a){
	if(!a) return;
	if(a->coeff) free(a->coeff);
	a->coeff=NULL;
	a->length=0;
	free(a);
}
void sws_freeFilter(SwsFilter *filter){
	if(!filter) return;

	if(filter->lumH) sws_freeVec(filter->lumH);
	if(filter->lumV) sws_freeVec(filter->lumV);
	if(filter->chrH) sws_freeVec(filter->chrH);
	if(filter->chrV) sws_freeVec(filter->chrV);
	free(filter);
}
void sws_freeContext(SwsContext *c){
	int i;
	if(!c) return;

	if(c->lumPixBuf)
	{
		for(i=0; i<c->vLumBufSize; i++)
		{
			if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
			c->lumPixBuf[i]=NULL;
		}
		free(c->lumPixBuf);
		c->lumPixBuf=NULL;
	}

	if(c->chrPixBuf)
	{
		for(i=0; i<c->vChrBufSize; i++)
		{
			if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
			c->chrPixBuf[i]=NULL;
		}
		free(c->chrPixBuf);
		c->chrPixBuf=NULL;
	}

	if(c->vLumFilter) free(c->vLumFilter);
	c->vLumFilter = NULL;
	if(c->vChrFilter) free(c->vChrFilter);
	c->vChrFilter = NULL;
	if(c->hLumFilter) free(c->hLumFilter);
	c->hLumFilter = NULL;
	if(c->hChrFilter) free(c->hChrFilter);
	c->hChrFilter = NULL;

	if(c->vLumFilterPos) free(c->vLumFilterPos);
	c->vLumFilterPos = NULL;
	if(c->vChrFilterPos) free(c->vChrFilterPos);
	c->vChrFilterPos = NULL;
	if(c->hLumFilterPos) free(c->hLumFilterPos);
	c->hLumFilterPos = NULL;
	if(c->hChrFilterPos) free(c->hChrFilterPos);
	c->hChrFilterPos = NULL;

	if(c->lumMmx2Filter) free(c->lumMmx2Filter);
	c->lumMmx2Filter=NULL;
	if(c->chrMmx2Filter) free(c->chrMmx2Filter);
	c->chrMmx2Filter=NULL;
	if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
	c->lumMmx2FilterPos=NULL;
	if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
	c->chrMmx2FilterPos=NULL;
	if(c->yuvTable) free(c->yuvTable);
	c->yuvTable=NULL;

	free(c);
}
modules/video_filter/swscale/swscale.h  deleted 100644 → 0
/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef SWSCALE_H
#define SWSCALE_H

/**
 * @file swscale.h
 * @brief
 *     external api for the swscale stuff
 */

#ifdef __cplusplus
extern "C" {
#endif

/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC  4
#define SWS_X        8
#define SWS_POINT    0x10
#define SWS_AREA     0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS    0x80
#define SWS_SINC     0x100
#define SWS_LANCZOS  0x200
#define SWS_SPLINE   0x400

#define SWS_SRC_V_CHR_DROP_MASK  0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16

#define SWS_PARAM_MASK  0x3FC0000
#define SWS_PARAM_SHIFT 18

#define SWS_PRINT_INFO 0x1000

//the following 3 flags are not completely implemented
//internal chrominance subsampling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR     0x8000

#define SWS_CPU_CAPS_MMX     0x80000000
#define SWS_CPU_CAPS_MMX2    0x20000000
#define SWS_CPU_CAPS_3DNOW   0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000

#define SWS_MAX_REDUCE_CUTOFF 0.002

#define SWS_CS_ITU709   1
#define SWS_CS_FCC      4
#define SWS_CS_ITU601   5
#define SWS_CS_ITU624   5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT  5

// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct {
	double *coeff;
	int length;
} SwsVector;

// vectors can be shared
typedef struct {
	SwsVector *lumH;
	SwsVector *lumV;
	SwsVector *chrH;
	SwsVector *chrV;
} SwsFilter;

struct SwsContext;

void sws_freeContext(struct SwsContext *swsContext);

struct SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
                                  SwsFilter *srcFilter, SwsFilter *dstFilter);
int sws_scale(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]);
int sws_scale_ordered(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
                      int srcSliceH, uint8_t* dst[], int dstStride[]);

int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);
SwsVector *sws_getGaussianVec(double variance, double quality);
SwsVector *sws_getConstVec(double c, int length);
SwsVector *sws_getIdentityVec(void);
void sws_scaleVec(SwsVector *a, double scalar);
void sws_normalizeVec(SwsVector *a, double height);
void sws_convVec(SwsVector *a, SwsVector *b);
void sws_addVec(SwsVector *a, SwsVector *b);
void sws_subVec(SwsVector *a, SwsVector *b);
void sws_shiftVec(SwsVector *a, int shift);
SwsVector *sws_cloneVec(SwsVector *a);

void sws_printVec(SwsVector *a);
void sws_freeVec(SwsVector *a);

SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
				float lumaSharpen, float chromaSharpen,
				float chromaHShift, float chromaVShift,
				int verbose);
void sws_freeFilter(SwsFilter *filter);

#ifdef __cplusplus
}
#endif

#endif
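A minimal usage sketch of this header's API (illustrative, not from the patch; srcFmt, dstFmt, and the plane/stride arrays are assumed to be set up by the caller for a supported format): scale a whole 640x480 picture to 320x240 with bicubic interpolation, passing it as a single slice.

struct SwsContext *ctx =
    sws_getContext(640, 480, srcFmt,
                   320, 240, dstFmt,
                   SWS_BICUBIC, NULL, NULL);   /* no extra src/dst filters */
if (ctx) {
    sws_scale(ctx, src_planes, src_strides,
              0, 480,                          /* whole picture, one slice */
              dst_planes, dst_strides);
    sws_freeContext(ctx);
}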
modules/video_filter/swscale/swscale_altivec_template.c  deleted 100644 → 0
/*
  AltiVec-enhanced yuv2yuvX

    Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
    based on the equivalent C code in "postproc/swscale.c"

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

static const vector signed int vzero =
  (const vector signed int)AVV(0, 0, 0, 0);
static const vector unsigned int altivec_vectorShiftInt19 =
  (const vector unsigned int)AVV(19, 19, 19, 19);

static inline void
altivec_packIntArrayToCharArray(int *val, uint8_t *dest, int dstW) {
  register int i;
  if ((unsigned long)dest % 16) {
    /* badly aligned store, we force store alignment */
    /* and will handle load misalignment on val w/ vec_perm */
    for (i = 0 ; (i < dstW) &&
	   (((unsigned long)dest + i) % 16) ; i++) {
      int t = val[i] >> 19;
      dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
    }
    vector unsigned char perm1 = vec_lvsl(i << 2, val);
    vector signed int v1 = vec_ld(i << 2, val);
    for ( ; i < (dstW - 15); i += 16) {
      int offset = i << 2;
      vector signed int v2 = vec_ld(offset + 16, val);
      vector signed int v3 = vec_ld(offset + 32, val);
      vector signed int v4 = vec_ld(offset + 48, val);
      vector signed int v5 = vec_ld(offset + 64, val);
      vector signed int v12 = vec_perm(v1, v2, perm1);
      vector signed int v23 = vec_perm(v2, v3, perm1);
      vector signed int v34 = vec_perm(v3, v4, perm1);
      vector signed int v45 = vec_perm(v4, v5, perm1);

      vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
      vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
      vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
      vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
      vector unsigned short vs1 = vec_packsu(vA, vB);
      vector unsigned short vs2 = vec_packsu(vC, vD);
      vector unsigned char vf = vec_packsu(vs1, vs2);
      vec_st(vf, i, dest);
      v1 = v5;
    }
  } else { // dest is properly aligned, great
    for (i = 0; i < (dstW - 15); i += 16) {
      int offset = i << 2;
      vector signed int v1 = vec_ld(offset, val);
      vector signed int v2 = vec_ld(offset + 16, val);
      vector signed int v3 = vec_ld(offset + 32, val);
      vector signed int v4 = vec_ld(offset + 48, val);
      vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
      vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
      vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
      vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
      vector unsigned short vs1 = vec_packsu(v5, v6);
      vector unsigned short vs2 = vec_packsu(v7, v8);
      vector unsigned char vf = vec_packsu(vs1, vs2);
      vec_st(vf, i, dest);
    }
  }
  for ( ; i < dstW ; i++) {
    int t = val[i] >> 19;
    dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
  }
}

static inline void
yuv2yuvX_altivec_real(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                      uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
  const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
  register int i, j;
  {
    int __attribute__ ((aligned (16))) val[dstW];

    for (i = 0; i < (dstW - 7); i += 4) {
      vec_st(vini, i << 2, val);
    }
    for (; i < dstW; i++) {
      val[i] = (1 << 18);
    }

    for (j = 0; j < lumFilterSize; j++) {
      vector signed short vLumFilter = vec_ld(j << 1, lumFilter);
      vector unsigned char perm0 = vec_lvsl(j << 1, lumFilter);
      vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
      vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

      vector unsigned char perm = vec_lvsl(0, lumSrc[j]);
      vector signed short l1 = vec_ld(0, lumSrc[j]);

      for (i = 0; i < (dstW - 7); i += 8) {
        int offset = i << 2;
        vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]);

        vector signed int v1 = vec_ld(offset, val);
        vector signed int v2 = vec_ld(offset + 16, val);

        vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]

        vector signed int i1 = vec_mule(vLumFilter, ls);
        vector signed int i2 = vec_mulo(vLumFilter, ls);

        vector signed int vf1 = vec_mergeh(i1, i2);
        vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]

        vector signed int vo1 = vec_add(v1, vf1);
        vector signed int vo2 = vec_add(v2, vf2);

        vec_st(vo1, offset, val);
        vec_st(vo2, offset + 16, val);

        l1 = l2;
      }
      for ( ; i < dstW; i++) {
        val[i] += lumSrc[j][i] * lumFilter[j];
      }
    }
    altivec_packIntArrayToCharArray(val, dest, dstW);
  }
  if (uDest != 0) {
    int __attribute__ ((aligned (16))) u[chrDstW];
    int __attribute__ ((aligned (16))) v[chrDstW];

    for (i = 0; i < (chrDstW - 7); i += 4) {
      vec_st(vini, i << 2, u);
      vec_st(vini, i << 2, v);
    }
    for (; i < chrDstW; i++) {
      u[i] = (1 << 18);
      v[i] = (1 << 18);
    }

    for (j = 0; j < chrFilterSize; j++) {
      vector signed short vChrFilter = vec_ld(j << 1, chrFilter);
      vector unsigned char perm0 = vec_lvsl(j << 1, chrFilter);
      vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0);
      vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter

      vector unsigned char perm = vec_lvsl(0, chrSrc[j]);
      vector signed short l1 = vec_ld(0, chrSrc[j]);
      vector signed short l1_V = vec_ld(2048 << 1, chrSrc[j]);

      for (i = 0; i < (chrDstW - 7); i += 8) {
        int offset = i << 2;
        vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]);
        vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]);

        vector signed int v1 = vec_ld(offset, u);
        vector signed int v2 = vec_ld(offset + 16, u);
        vector signed int v1_V = vec_ld(offset, v);
        vector signed int v2_V = vec_ld(offset + 16, v);

        vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7]
        vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055]

        vector signed int i1 = vec_mule(vChrFilter, ls);
        vector signed int i2 = vec_mulo(vChrFilter, ls);
        vector signed int i1_V = vec_mule(vChrFilter, ls_V);
        vector signed int i2_V = vec_mulo(vChrFilter, ls_V);

        vector signed int vf1 = vec_mergeh(i1, i2);
        vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]
        vector signed int vf1_V = vec_mergeh(i1_V, i2_V);
        vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]

        vector signed int vo1 = vec_add(v1, vf1);
        vector signed int vo2 = vec_add(v2, vf2);
        vector signed int vo1_V = vec_add(v1_V, vf1_V);
        vector signed int vo2_V = vec_add(v2_V, vf2_V);

        vec_st(vo1, offset, u);
        vec_st(vo2, offset + 16, u);
        vec_st(vo1_V, offset, v);
        vec_st(vo2_V, offset + 16, v);

        l1 = l2;
        l1_V = l2_V;
      }
      for ( ; i < chrDstW; i++) {
        u[i] += chrSrc[j][i] * chrFilter[j];
        v[i] += chrSrc[j][i + 2048] * chrFilter[j];
      }
    }
    altivec_packIntArrayToCharArray(u, uDest, chrDstW);
    altivec_packIntArrayToCharArray(v, vDest, chrDstW);
  }
}

static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) {
  register int i;
  int __attribute__ ((aligned (16))) tempo[4];

  if (filterSize % 4) {
    for (i = 0; i < dstW; i++) {
      register int j;
      register int srcPos = filterPos[i];
      register int val = 0;
      for (j = 0; j < filterSize; j++) {
        val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
      }
      dst[i] = MIN(MAX(0, val >> 7), (1 << 15) - 1);
    }
  }
  else
  switch (filterSize) {
  case 4:
    {
      for (i = 0; i < dstW; i++) {
        register int j;
        register int srcPos = filterPos[i];

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char src_v1;
        if ((((int)src + srcPos) % 16) > 12) {
          src_v1 = vec_ld(srcPos + 16, src);
        }
        vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

        vector signed short src_v = // vec_unpackh sign-extends...
          (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
        // now put our elements in the even slots
        src_v = vec_mergeh(src_v, (vector signed short)vzero);

        vector signed short filter_v = vec_ld(i << 3, filter);
        // the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)

        // the neat trick : we only care for half the elements,
        // high or low depending on (i<<3)%16 (it's 0 or 8 here),
        // and we're going to use vec_mule, so we chose
        // carefully how to "unpack" the elements into the even slots
        if ((i << 3) % 16)
          filter_v = vec_mergel(filter_v, (vector signed short)vzero);
        else
          filter_v = vec_mergeh(filter_v, (vector signed short)vzero);

        vector signed int val_vEven = vec_mule(src_v, filter_v);
        vector signed int val_s = vec_sums(val_vEven, vzero);
        vec_st(val_s, 0, tempo);
        dst[i] = MIN(MAX(0, tempo[3] >> 7), (1 << 15) - 1);
      }
    }
    break;

  case 8:
    {
      for (i = 0; i < dstW; i++) {
        register int srcPos = filterPos[i];

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char src_v1;
        if ((((int)src + srcPos) % 16) > 8) {
          src_v1 = vec_ld(srcPos + 16, src);
        }
        vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

        vector signed short src_v = // vec_unpackh sign-extends...
          (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
        vector signed short filter_v = vec_ld(i << 4, filter);
        // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)

        vector signed int val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
        vector signed int val_s = vec_sums(val_v, vzero);
        vec_st(val_s, 0, tempo);
        dst[i] = MIN(MAX(0, tempo[3] >> 7), (1 << 15) - 1);
      }
    }
    break;

  case 16:
    {
      for (i = 0; i < dstW; i++) {
        register int srcPos = filterPos[i];

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
        vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

        vector signed short src_vA = // vec_unpackh sign-extends...
          (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
        vector signed short src_vB = // vec_unpackh sign-extends...
          (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

        vector signed short filter_v0 = vec_ld(i << 5, filter);
        vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
        // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)

        vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
        vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);

        vector signed int val_s = vec_sums(val_v, vzero);

        vec_st(val_s, 0, tempo);
        dst[i] = MIN(MAX(0, tempo[3] >> 7), (1 << 15) - 1);
      }
    }
    break;

  default:
    {
      for (i = 0; i < dstW; i++) {
        register int j;
        register int srcPos = filterPos[i];

        vector signed int val_v = (vector signed int)vzero;
        vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
        vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);

        vector unsigned char src_v0 = vec_ld(srcPos, src);
        vector unsigned char permS = vec_lvsl(srcPos, src);

        for (j = 0 ; j < filterSize - 15; j += 16) {
          vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
          vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

          vector signed short src_vA = // vec_unpackh sign-extends...
            (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
          vector signed short src_vB = // vec_unpackh sign-extends...
            (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

          vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
          vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
          vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
          vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);

          vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
          val_v = vec_msums(src_vB, filter_v1, val_acc);

          filter_v0R = filter_v2R;
          src_v0 = src_v1;
        }

        if (j < (filterSize - 7)) {
          // loading src_v0 is useless, it's already done above
          //vector unsigned char src_v0 = vec_ld(srcPos + j, src);
          vector unsigned char src_v1;
          if ((((int)src + srcPos) % 16) > 8) {
            src_v1 = vec_ld(srcPos + j + 16, src);
          }
          vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

          vector signed short src_v = // vec_unpackh sign-extends...
            (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
          // loading filter_v0R is useless, it's already done above
          //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
          vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
          vector signed short filter_v = vec_perm(filter_v0R, filter_v1R, permF);

          val_v = vec_msums(src_v, filter_v, val_v);
        }

        vector signed int val_s = vec_sums(val_v, vzero);

        vec_st(val_s, 0, tempo);
        dst[i] = MIN(MAX(0, tempo[3] >> 7), (1 << 15) - 1);
      }
    }
  }
}

static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
     int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
  uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
  // yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
  uint8_t *ysrc = src[0];
  uint8_t *usrc = src[1];
  uint8_t *vsrc = src[2];
  const int width = c->srcW;
  const int height = srcSliceH;
  const int lumStride = srcStride[0];
  const int chromStride = srcStride[1];
  const int dstStride = dstStride_a[0];
  const vector unsigned char yperm = vec_lvsl(0, ysrc);
  const int vertLumPerChroma = 2;
  register unsigned int y;

  /* this code assumes:

  1) dst is 16 bytes-aligned
  2) dstStride is a multiple of 16
  3) width is a multiple of 16
  4) lum&chrom stride are multiple of 8
  */

  for (y = 0; y < height; y++) {
    int i;
    for (i = 0; i < width - 31; i += 32) {
      const unsigned int j = i >> 1;
      vector unsigned char v_yA = vec_ld(i, ysrc);
      vector unsigned char v_yB = vec_ld(i + 16, ysrc);
      vector unsigned char v_yC = vec_ld(i + 32, ysrc);
      vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
      vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
      vector unsigned char v_uA = vec_ld(j, usrc);
      vector unsigned char v_uB = vec_ld(j + 16, usrc);
      vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
      vector unsigned char v_vA = vec_ld(j, vsrc);
      vector unsigned char v_vB = vec_ld(j + 16, vsrc);
      vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
      vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
      vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
      vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
      vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
      vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
      vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
      vec_st(v_yuy2_0, (i << 1), dst);
      vec_st(v_yuy2_1, (i << 1) + 16, dst);
      vec_st(v_yuy2_2, (i << 1) + 32, dst);
      vec_st(v_yuy2_3, (i << 1) + 48, dst);
    }
    if (i < width) {
      const unsigned int j = i >> 1;
      vector unsigned char v_y1 = vec_ld(i, ysrc);
      vector unsigned char v_u = vec_ld(j, usrc);
      vector unsigned char v_v = vec_ld(j, vsrc);
      vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
      vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
      vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
      vec_st(v_yuy2_0, (i << 1), dst);
      vec_st(v_yuy2_1, (i << 1) + 16, dst);
    }
    if ((y & (vertLumPerChroma - 1)) == (vertLumPerChroma - 1)) {
      usrc += chromStride;
      vsrc += chromStride;
    }
    ysrc += lumStride;
    dst += dstStride;
  }
  return srcSliceH;
}

static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
     int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
  uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
  // yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
  uint8_t *ysrc = src[0];
  uint8_t *usrc = src[1];
  uint8_t *vsrc = src[2];
  const int width = c->srcW;
  const int height = srcSliceH;
  const int lumStride = srcStride[0];
  const int chromStride = srcStride[1];
  const int dstStride = dstStride_a[0];
  const int vertLumPerChroma = 2;
  const vector unsigned char yperm = vec_lvsl(0, ysrc);
  register unsigned int y;

  /* this code assumes:

  1) dst is 16 bytes-aligned
  2) dstStride is a multiple of 16
  3) width is a multiple of 16
  4) lum&chrom stride are multiple of 8
  */

  for (y = 0; y < height; y++) {
    int i;
    for (i = 0; i < width - 31; i += 32) {
      const unsigned int j = i >> 1;
      vector unsigned char v_yA = vec_ld(i, ysrc);
      vector unsigned char v_yB = vec_ld(i + 16, ysrc);
      vector unsigned char v_yC = vec_ld(i + 32, ysrc);
      vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
      vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
      vector unsigned char v_uA = vec_ld(j, usrc);
      vector unsigned char v_uB = vec_ld(j + 16, usrc);
      vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
      vector unsigned char v_vA = vec_ld(j, vsrc);
      vector unsigned char v_vB = vec_ld(j + 16, vsrc);
      vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
      vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
      vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
      vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
      vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
      vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
      vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
      vec_st(v_uyvy_0, (i << 1), dst);
      vec_st(v_uyvy_1, (i << 1) + 16, dst);
      vec_st(v_uyvy_2, (i << 1) + 32, dst);
      vec_st(v_uyvy_3, (i << 1) + 48, dst);
    }
    if (i < width) {
      const unsigned int j = i >> 1;
      vector unsigned char v_y1 = vec_ld(i, ysrc);
      vector unsigned char v_u = vec_ld(j, usrc);
      vector unsigned char v_v = vec_ld(j, vsrc);
      vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
      vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
      vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
      vec_st(v_uyvy_0, (i << 1), dst);
      vec_st(v_uyvy_1, (i << 1) + 16, dst);
    }
    if ((y & (vertLumPerChroma - 1)) == (vertLumPerChroma - 1)) {
      usrc += chromStride;
      vsrc += chromStride;
    }
    ysrc += lumStride;
    dst += dstStride;
  }
  return srcSliceH;
}
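For readers unfamiliar with the AltiVec merge pattern, this scalar equivalent (illustrative, assuming an even width) shows what the vec_mergeh/vec_mergel sequences above compute for the YUY2 case: two luma samples share one U and one V sample in Y0 U Y1 V order.

/* scalar sketch of one output row of the YV12 -> YUY2 packing */
for (int x = 0; x < width; x += 2) {
    dst[2*x + 0] = ysrc[x];         /* Y0 */
    dst[2*x + 1] = usrc[x >> 1];    /* U  */
    dst[2*x + 2] = ysrc[x + 1];     /* Y1 */
    dst[2*x + 3] = vsrc[x >> 1];    /* V  */
}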
modules/video_filter/swscale/swscale_internal.h  deleted 100644 → 0
/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef SWSCALE_INTERNAL_H
#define SWSCALE_INTERNAL_H

#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
#define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
#define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
#define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
#define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
#define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )

#define MAX_FILTER_SIZE 256

typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
                       int srcSliceH, uint8_t* dst[], int dstStride[]);

/* this struct should be aligned on at least 32-byte boundary */
typedef struct SwsContext{
	/**
	 * Note: src, dst, srcStride, dstStride will be copied in the sws_scale() wrapper, so they can freely be modified here
	 */
	SwsFunc swScale;
	int srcW, srcH, dstH;
	int chrSrcW, chrSrcH, chrDstW, chrDstH;
	int lumXInc, chrXInc;
	int lumYInc, chrYInc;
	int dstFormat, srcFormat;               ///< format 4:2:0 type is always YV12
	int origDstFormat, origSrcFormat;       ///< format
	int chrSrcHSubSample, chrSrcVSubSample;
	int chrIntHSubSample, chrIntVSubSample;
	int chrDstHSubSample, chrDstVSubSample;
	int vChrDrop;

	int16_t **lumPixBuf;
	int16_t **chrPixBuf;
	int16_t *hLumFilter;
	int16_t *hLumFilterPos;
	int16_t *hChrFilter;
	int16_t *hChrFilterPos;
	int16_t *vLumFilter;
	int16_t *vLumFilterPos;
	int16_t *vChrFilter;
	int16_t *vChrFilterPos;

	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful

	int hLumFilterSize;
	int hChrFilterSize;
	int vLumFilterSize;
	int vChrFilterSize;
	int vLumBufSize;
	int vChrBufSize;

	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
	int32_t *lumMmx2FilterPos;
	int32_t *chrMmx2FilterPos;
	int16_t *lumMmx2Filter;
	int16_t *chrMmx2Filter;
	int canMMX2BeUsed;

	int lastInLumBuf;
	int lastInChrBuf;
	int lumBufIndex;
	int chrBufIndex;
	int dstY;
	int flags;
	void *yuvTable;         // pointer to the yuv->rgb table start so it can be freed()
	void *table_rV[256];
	void *table_gU[256];
	int   table_gV[256];
	void *table_bU[256];

	//Colorspace stuff
	int contrast, brightness, saturation;   // for sws_getColorspaceDetails
	int srcColorspaceTable[4];
	int dstColorspaceTable[4];
	int srcRange, dstRange;

#define RED_DITHER   "0*8"
#define GREEN_DITHER "1*8"
#define BLUE_DITHER  "2*8"
#define Y_COEFF      "3*8"
#define VR_COEFF     "4*8"
#define UB_COEFF     "5*8"
#define VG_COEFF     "6*8"
#define UG_COEFF     "7*8"
#define Y_OFFSET     "8*8"
#define U_OFFSET     "9*8"
#define V_OFFSET     "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
#define DSTW_OFFSET  "11*8+4*4*256*2" //do not change, it's hardcoded in the asm
#define ESP_OFFSET   "11*8+4*4*256*2+4"
#define VROUNDER_OFFSET "11*8+4*4*256*2+8"

	uint64_t redDither   __attribute__((aligned(8)));
	uint64_t greenDither __attribute__((aligned(8)));
	uint64_t blueDither  __attribute__((aligned(8)));

	uint64_t yCoeff      __attribute__((aligned(8)));
	uint64_t vrCoeff     __attribute__((aligned(8)));
	uint64_t ubCoeff     __attribute__((aligned(8)));
	uint64_t vgCoeff     __attribute__((aligned(8)));
	uint64_t ugCoeff     __attribute__((aligned(8)));
	uint64_t yOffset     __attribute__((aligned(8)));
	uint64_t uOffset     __attribute__((aligned(8)));
	uint64_t vOffset     __attribute__((aligned(8)));
	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
	int dstW;
	int esp;
	uint64_t vRounder    __attribute__((aligned(8)));

#ifdef HAVE_ALTIVEC
	vector signed short   CY;
	vector signed short   CRV;
	vector signed short   CBU;
	vector signed short   CGU;
	vector signed short   CGV;
	vector signed short   OY;
	vector unsigned short CSHIFT;
#endif

} SwsContext;
//FIXME check init (where 0)

SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);

#endif
modules/video_filter/swscale/swscale_template.c  deleted 100644 → 0
/*
Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
#define YSCALEYUV2YV12X(x, offset) \
"xorl %%eax, %%eax \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"leal " offset "(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
".balign 16 \n\t"
/* FIXME Unroll? */
\
"1: \n\t"\
"movq 8(%%edx), %%mm0 \n\t"
/* filterCoeff */
\
"movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t"
/* srcData */
\
"movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t"
/* srcData */
\
"addl $16, %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"testl %%esi, %%esi \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%eax))\
"addl $8, %%eax \n\t"\
"cmpl %2, %%eax \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
"leal " offset "(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"jb 1b \n\t"
#define YSCALEYUV2YV121 \
"movl %2, %%eax \n\t"\
".balign 16 \n\t"
/* FIXME Unroll? */
\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t"\
"movq 8(%0, %%eax, 2), %%mm1 \n\t"\
"psraw $7, %%mm0 \n\t"\
"psraw $7, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
MOVNTQ(%%mm0, (%1, %%eax))\
"addl $8, %%eax \n\t"\
"jnc 1b \n\t"
/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX \
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"nop \n\t"\
"1: \n\t"\
"leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%edx), %%mm0 \n\t"
/* filterCoeff */
\
"movq (%%esi, %%eax), %%mm2 \n\t"
/* UsrcData */
\
"movq 4096(%%esi, %%eax), %%mm5 \n\t"
/* VsrcData */
\
"addl $16, %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"testl %%esi, %%esi \n\t"\
" jnz 2b \n\t"\
\
"leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
".balign 16 \n\t"\
"2: \n\t"\
"movq 8(%%edx), %%mm0 \n\t"
/* filterCoeff */
\
"movq (%%esi, %%eax, 2), %%mm2 \n\t"
/* Y1srcData */
\
"movq 8(%%esi, %%eax, 2), %%mm5 \n\t"
/* Y2srcData */
\
"addl $16, %%edx \n\t"\
"movl (%%edx), %%esi \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm1 \n\t"\
"paddw %%mm5, %%mm7 \n\t"\
"testl %%esi, %%esi \n\t"\
" jnz 2b \n\t"\
#define YSCALEYUV2RGBX \
YSCALEYUV2PACKEDX\
"psubw "U_OFFSET"(%0), %%mm3 \n\t"
/* (U-128)8*/
\
"psubw "V_OFFSET"(%0), %%mm4 \n\t"
/* (V-128)8*/
\
"movq %%mm3, %%mm2 \n\t"
/* (U-128)8*/
\
"movq %%mm4, %%mm5 \n\t"
/* (V-128)8*/
\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t"
/* 8(Y-16)*/
\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t"
/* 8(Y-16)*/
\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xorl %%eax, %%eax \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"
#endif
#define YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t"
/* uvbuf0[eax]*/
\
"movq (%3, "#index"), %%mm3 \n\t"
/* uvbuf1[eax]*/
\
"movq 4096(%2, "#index"), %%mm5 \n\t"
/* uvbuf0[eax+2048]*/
\
"movq 4096(%3, "#index"), %%mm4 \n\t"
/* uvbuf1[eax+2048]*/
\
"psubw %%mm3, %%mm2 \n\t"
/* uvbuf0[eax] - uvbuf1[eax]*/
\
"psubw %%mm4, %%mm5 \n\t"
/* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/
\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t"
/* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/
\
"pmulhw %%mm0, %%mm5 \n\t"
/* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/
\
"psraw $7, %%mm3 \n\t"
/* uvbuf0[eax] - uvbuf1[eax] >>4*/
\
"psraw $7, %%mm4 \n\t"
/* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/
\
"paddw %%mm2, %%mm3 \n\t"
/* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/
\
"paddw %%mm5, %%mm4 \n\t"
/* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/
\
"movq (%0, "#index", 2), %%mm0 \n\t"
/*buf0[eax]*/
\
"movq (%1, "#index", 2), %%mm1 \n\t"
/*buf1[eax]*/
\
"movq 8(%0, "#index", 2), %%mm6 \n\t"
/*buf0[eax]*/
\
"movq 8(%1, "#index", 2), %%mm7 \n\t"
/*buf1[eax]*/
\
"psubw %%mm1, %%mm0 \n\t"
/* buf0[eax] - buf1[eax]*/
\
"psubw %%mm7, %%mm6 \n\t"
/* buf0[eax] - buf1[eax]*/
\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"
/* (buf0[eax] - buf1[eax])yalpha1>>16*/
\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t"
/* (buf0[eax] - buf1[eax])yalpha1>>16*/
\
"psraw $7, %%mm1 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"psraw $7, %%mm7 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"paddw %%mm0, %%mm1 \n\t"
/* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/
\
"paddw %%mm6, %%mm7 \n\t"
/* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/
\
#define YSCALEYUV2RGB(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t"
/* uvbuf0[eax]*/
\
"movq (%3, "#index"), %%mm3 \n\t"
/* uvbuf1[eax]*/
\
"movq 4096(%2, "#index"), %%mm5\n\t"
/* uvbuf0[eax+2048]*/
\
"movq 4096(%3, "#index"), %%mm4\n\t"
/* uvbuf1[eax+2048]*/
\
"psubw %%mm3, %%mm2 \n\t"
/* uvbuf0[eax] - uvbuf1[eax]*/
\
"psubw %%mm4, %%mm5 \n\t"
/* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/
\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
"pmulhw %%mm0, %%mm2 \n\t"
/* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/
\
"pmulhw %%mm0, %%mm5 \n\t"
/* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/
\
"psraw $4, %%mm3 \n\t"
/* uvbuf0[eax] - uvbuf1[eax] >>4*/
\
"psraw $4, %%mm4 \n\t"
/* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/
\
"paddw %%mm2, %%mm3 \n\t"
/* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/
\
"paddw %%mm5, %%mm4 \n\t"
/* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/
\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t"
/* (U-128)8*/
\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t"
/* (V-128)8*/
\
"movq %%mm3, %%mm2 \n\t"
/* (U-128)8*/
\
"movq %%mm4, %%mm5 \n\t"
/* (V-128)8*/
\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
\
"movq (%0, "#index", 2), %%mm0 \n\t"
/*buf0[eax]*/
\
"movq (%1, "#index", 2), %%mm1 \n\t"
/*buf1[eax]*/
\
"movq 8(%0, "#index", 2), %%mm6\n\t"
/*buf0[eax]*/
\
"movq 8(%1, "#index", 2), %%mm7\n\t"
/*buf1[eax]*/
\
"psubw %%mm1, %%mm0 \n\t"
/* buf0[eax] - buf1[eax]*/
\
"psubw %%mm7, %%mm6 \n\t"
/* buf0[eax] - buf1[eax]*/
\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"
/* (buf0[eax] - buf1[eax])yalpha1>>16*/
\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t"
/* (buf0[eax] - buf1[eax])yalpha1>>16*/
\
"psraw $4, %%mm1 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"psraw $4, %%mm7 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"paddw %%mm0, %%mm1 \n\t"
/* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/
\
"paddw %%mm6, %%mm7 \n\t"
/* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/
\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t"
/* 8(Y-16)*/
\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t"
/* 8(Y-16)*/
\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */
\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t"
/* uvbuf0[eax]*/
\
"movq 4096(%2, "#index"), %%mm4 \n\t"
/* uvbuf0[eax+2048]*/
\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t"
/*buf0[eax]*/
\
"movq 8(%0, "#index", 2), %%mm7 \n\t"
/*buf0[eax]*/
\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
#define YSCALEYUV2RGB1(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t"
/* uvbuf0[eax]*/
\
"movq 4096(%2, "#index"), %%mm4 \n\t"
/* uvbuf0[eax+2048]*/
\
"psraw $4, %%mm3 \n\t"
/* uvbuf0[eax] - uvbuf1[eax] >>4*/
\
"psraw $4, %%mm4 \n\t"
/* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/
\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t"
/* (U-128)8*/
\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t"
/* (V-128)8*/
\
"movq %%mm3, %%mm2 \n\t"
/* (U-128)8*/
\
"movq %%mm4, %%mm5 \n\t"
/* (V-128)8*/
\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
\
"movq (%0, "#index", 2), %%mm1 \n\t"
/*buf0[eax]*/
\
"movq 8(%0, "#index", 2), %%mm7 \n\t"
/*buf0[eax]*/
\
"psraw $4, %%mm1 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"psraw $4, %%mm7 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t"
/* 8(Y-16)*/
\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t"
/* 8(Y-16)*/
\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */
\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t"
/* uvbuf0[eax]*/
\
"movq (%3, "#index"), %%mm3 \n\t"
/* uvbuf1[eax]*/
\
"movq 4096(%2, "#index"), %%mm5 \n\t"
/* uvbuf0[eax+2048]*/
\
"movq 4096(%3, "#index"), %%mm4 \n\t"
/* uvbuf1[eax+2048]*/
\
"paddw %%mm2, %%mm3 \n\t"
/* uvbuf0[eax] + uvbuf1[eax]*/
\
"paddw %%mm5, %%mm4 \n\t"
/* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/
\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t"
/*buf0[eax]*/
\
"movq 8(%0, "#index", 2), %%mm7 \n\t"
/*buf0[eax]*/
\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b(index, c) \
"xorl "#index", "#index" \n\t"\
".balign 16 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t"
/* uvbuf0[eax]*/
\
"movq (%3, "#index"), %%mm3 \n\t"
/* uvbuf1[eax]*/
\
"movq 4096(%2, "#index"), %%mm5 \n\t"
/* uvbuf0[eax+2048]*/
\
"movq 4096(%3, "#index"), %%mm4 \n\t"
/* uvbuf1[eax+2048]*/
\
"paddw %%mm2, %%mm3 \n\t"
/* uvbuf0[eax] + uvbuf1[eax]*/
\
"paddw %%mm5, %%mm4 \n\t"
/* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/
\
"psrlw $5, %%mm3 \n\t"
/*FIXME might overflow*/
\
"psrlw $5, %%mm4 \n\t"
/*FIXME might overflow*/
\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t"
/* (U-128)8*/
\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t"
/* (V-128)8*/
\
"movq %%mm3, %%mm2 \n\t"
/* (U-128)8*/
\
"movq %%mm4, %%mm5 \n\t"
/* (V-128)8*/
\
"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
\
"movq (%0, "#index", 2), %%mm1 \n\t"
/*buf0[eax]*/
\
"movq 8(%0, "#index", 2), %%mm7 \n\t"
/*buf0[eax]*/
\
"psraw $4, %%mm1 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"psraw $4, %%mm7 \n\t"
/* buf0[eax] - buf1[eax] >>4*/
\
"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t"
/* 8(Y-16)*/
\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t"
/* 8(Y-16)*/
\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */
\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"
#define WRITEBGR32(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (dst, index, 4))\
MOVNTQ(%%mm2, 8(dst, index, 4))\
MOVNTQ(%%mm1, 16(dst, index, 4))\
MOVNTQ(%%mm3, 24(dst, index, 4))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t"
/* B */
\
"pand "MANGLE(bFC)", %%mm4 \n\t"
/* G */
\
"pand "MANGLE(bF8)", %%mm5 \n\t"
/* R */
\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR15(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t"
/* B */
\
"pand "MANGLE(bF8)", %%mm4 \n\t"
/* G */
\
"pand "MANGLE(bF8)", %%mm5 \n\t"
/* R */
\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24OLD(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
	"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
	"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
	"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
	"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
	"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
	"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
	"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
	"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
	"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
	"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
	"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
	"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
	"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
	"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
	"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
	"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
	"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
	"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
	"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
	"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
	"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
	"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
MOVNTQ(%%mm0, (dst))\
MOVNTQ(%%mm2, 8(dst))\
MOVNTQ(%%mm3, 16(dst))\
"addl $24, "#dst" \n\t"\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq %%mm2, %%mm1 \n\t" /* B */\
	"movq %%mm5, %%mm6 \n\t" /* R */\
	"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
	"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
	"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
	"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
	"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
	"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
	"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
	"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
	"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
	"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
	"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
	"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
	"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
	"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
	"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
	"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
	"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
	"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
	"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
	"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
	"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
	"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
	"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
	"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
	"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
	"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
	MOVNTQ(%%mm0, (dst))\
\
	"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
	"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
	"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
	"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
	MOVNTQ(%%mm6, 8(dst))\
\
	"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
	"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
	"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(dst))\
\
"addl $24, "#dst" \n\t"\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
	"movq "MANGLE(M24A)", %%mm0 \n\t"\
	"movq "MANGLE(M24C)", %%mm7 \n\t"\
	"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
	"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
	"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
	"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
	"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
	"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
	"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
	"por %%mm1, %%mm6 \n\t"\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, (dst))\
\
	"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
	"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
	"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
	"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
	"pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
	"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
	"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
	"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
	"por %%mm3, %%mm6 \n\t"\
	MOVNTQ(%%mm6, 8(dst))\
\
	"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
	"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
	"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
	"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
	"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
	"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(dst))\
\
"addl $24, "#dst" \n\t"\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
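/* Editor's note: the WRITEBGR24* variants above repack eight pixels held as
 * packed B/G/R byte vectors (mm2/mm4/mm5) into three 8-byte stores of 24bpp
 * pixels. A minimal scalar sketch of the same repacking; the helper name is
 * hypothetical: */
#if 0 /* illustrative only, not part of the original file */
static void write_bgr24_scalar(uint8_t *dst, const uint8_t *b, const uint8_t *g, const uint8_t *r, int n)
{
    int i;
    for(i=0; i<n; i++){ /* one 3-byte pixel per iteration, B-G-R byte order */
        dst[3*i+0]= b[i];
        dst[3*i+1]= g[i];
        dst[3*i+2]= r[i];
    }
}
#endif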
#define WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"addl $8, "#index" \n\t"\
"cmpl "#dstw", "#index" \n\t"\
" jb 1b \n\t"
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				   "r" (uDest), "m" (chrDstW)
				: "%eax", "%edx", "%esi"
			);

		asm volatile(
				YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				   "r" (vDest), "m" (chrDstW)
				: "%eax", "%edx", "%esi"
			);
	}

	asm volatile(
			YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
			:: "r" (&c->redDither),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
		);
#else
#ifdef HAVE_ALTIVEC
	yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
			      chrFilter, chrSrc, chrFilterSize,
			      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
	yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
		    chrFilter, chrSrc, chrFilterSize,
		    dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
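/* Editor's note: the MMX block and both fallbacks above compute the same
 * vertical FIR: every output byte is a clipped, rounded weighted sum of the
 * corresponding samples from several source lines. A sketch of the per-pixel
 * arithmetic as read from the asm (pmulhw is >>16, psraw $3 makes it >>19
 * total); the explicit rounding constant is an assumption standing in for the
 * VROUNDER_OFFSET value: */
#if 0 /* illustrative only, not part of the original file */
static void vfilter_scalar(uint8_t *dest, int16_t **src, int16_t *filter, int filterSize, int dstW)
{
    int i, j;
    for(i=0; i<dstW; i++){
        int val= 1<<18;                 /* rounding term before >>19 */
        for(j=0; j<filterSize; j++)
            val += src[j][i]*filter[j];
        val >>= 19;
        dest[i]= val<0 ? 0 : (val>255 ? 255 : val);
    }
}
#endif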
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV121
				:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
				   "g" (-chrDstW)
				: "%eax"
			);

		asm volatile(
				YSCALEYUV2YV121
				:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
				   "g" (-chrDstW)
				: "%eax"
			);
	}

	asm volatile(
			YSCALEYUV2YV121
			:: "r" (lumSrc + dstW), "r" (dest + dstW),
			   "g" (-dstW)
			: "%eax"
		);
#else
	int i;
	for(i=0; i<dstW; i++)
	{
		int val= lumSrc[i]>>7;

		if(val&256){
			if(val<0) val=0;
			else      val=255;
		}

		dest[i]= val;
	}

	if(uDest != NULL)
		for(i=0; i<chrDstW; i++)
		{
			int u=chrSrc[i]>>7;
			int v=chrSrc[i + 2048]>>7;

			if((u|v)&256){
				if(u<0)        u=0;
				else if(u>255) u=255;
				if(v<0)        v=0;
				else if(v>255) v=255;
			}

			uDest[i]= u;
			vDest[i]= v;
		}
#endif
}
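/* Editor's note on the C fallback above: after the arithmetic >>7, a 16-bit
 * sample lands in [-256..255], so bit 8 of the result is set exactly when the
 * value is outside 0..255 -- e.g. 300 (100101100b) and -5 (...111111011b) both
 * have bit 8 set, while 128 (010000000b) does not. That is all the `val&256`
 * and `(u|v)&256` tests check before the explicit clamp. */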
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				       uint8_t *dest, int dstW, int dstY)
{
	int dummy=0;
	switch(c->dstFormat)
	{
#ifdef HAVE_MMX
	case IMGFMT_BGR32:
		{
			asm volatile(
				YSCALEYUV2RGBX
				WRITEBGR32(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR24:
		{
			asm volatile(
				YSCALEYUV2RGBX
				"leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
				"addl %4, %%ebx \n\t"
				WRITEBGR24(%%ebx, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
			);
		}
		break;
	case IMGFMT_BGR15:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

				WRITEBGR15(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_BGR16:
		{
			asm volatile(
				YSCALEYUV2RGBX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

				WRITEBGR16(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
	case IMGFMT_YUY2:
		{
			asm volatile(
				YSCALEYUV2PACKEDX
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

				"psraw $3, %%mm3 \n\t"
				"psraw $3, %%mm4 \n\t"
				"psraw $3, %%mm1 \n\t"
				"psraw $3, %%mm7 \n\t"
				WRITEYUY2(%4, %5, %%eax)

			:: "r" (&c->redDither),
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
			: "%eax", "%edx", "%esi"
			);
		}
		break;
#endif
	default:
#ifdef HAVE_ALTIVEC
		altivec_yuv2packedX(c, lumFilter, lumSrc, lumFilterSize,
				    chrFilter, chrSrc, chrFilterSize,
				    dest, dstW, dstY);
#else
		yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
			       chrFilter, chrSrc, chrFilterSize,
			       dest, dstW, dstY);
#endif
		break;
	}
}
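/* Editor's note: the YSCALEYUV2RGB* macros used above are fixed-point forms of
 * the usual BT.601-style YUV->RGB conversion, with the per-context constants
 * (Y_COEFF, UB_COEFF, ...) fetched from the SwsContext. A scalar sketch using
 * generic 8.8 fixed-point coefficients -- approximate stand-ins, not the exact
 * values this code loads: */
#if 0 /* illustrative only, not part of the original file */
static void yuv2rgb_pixel(int Y, int U, int V, uint8_t *r, uint8_t *g, uint8_t *b)
{
    int y = (Y-16)*298;                               /* ~1.164*(Y-16) */
    int rr= (y + 409*(V-128) + 128)>>8;               /* ~1.596*(V-128) */
    int gg= (y - 100*(U-128) - 208*(V-128) + 128)>>8;
    int bb= (y + 516*(U-128) + 128)>>8;               /* ~2.018*(U-128) */
    *r= rr<0 ? 0 : (rr>255 ? 255 : rr);
    *g= gg<0 ? 0 : (gg>255 ? 255 : gg);
    *b= bb<0 ? 0 : (bb>255 ? 255 : bb);
}
#endif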
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
				       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;
#if 0 //isn't used
if(flags&SWS_FULL_CHR_H_INT)
{
switch(dstFormat)
{
#ifdef HAVE_MMX
case IMGFMT_BGR32:
asm volatile(
FULL_YSCALEYUV2RGB
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
MOVNTQ(%%mm3, (%4, %%eax, 4))
MOVNTQ(%%mm1, 8(%4, %%eax, 4))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
case IMGFMT_BGR24:
asm volatile(
FULL_YSCALEYUV2RGB
// lsb ... msb
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
"psrlq $8, %%mm3 \n\t" // GR0BGR00
"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
"por %%mm2, %%mm3 \n\t" // BGRBGR00
"movq %%mm1, %%mm2 \n\t"
"psllq $48, %%mm1 \n\t" // 000000BG
"por %%mm1, %%mm3 \n\t" // BGRBGRBG
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
"psrld $16, %%mm2 \n\t" // R000R000
"psrlq $24, %%mm1 \n\t" // 0BGR0000
"por %%mm2, %%mm1 \n\t" // RBGRR000
"movl %4, %%ebx \n\t"
"addl %%eax, %%ebx \n\t"
#ifdef HAVE_MMX2
//FIXME Alignment
"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
"movd %%mm3, (%%ebx, %%eax, 2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
break;
case IMGFMT_BGR15:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $2, %%mm1 \n\t"
"psllw $7, %%mm0 \n\t"
"pand "MANGLE(g15Mask)", %%mm1 \n\t"
"pand "MANGLE(r15Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%eax, 2))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
case IMGFMT_BGR16:
asm volatile(
FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $3, %%mm1 \n\t"
"psllw $8, %%mm0 \n\t"
"pand "MANGLE(g16Mask)", %%mm1 \n\t"
"pand "MANGLE(r16Mask)", %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
MOVNTQ(%%mm0, (%4, %%eax, 2))
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
break;
#endif
	case IMGFMT_RGB32:
#ifndef HAVE_MMX
	case IMGFMT_BGR32:
#endif
	if(dstFormat==IMGFMT_BGR32)
	{
		int i;
#ifdef WORDS_BIGENDIAN
		dest++;
#endif
		for(i=0; i<dstW; i++){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
			dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
			dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
			dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
			dest+= 4;
		}
	}
	else if(dstFormat==IMGFMT_BGR24)
	{
		int i;
		for(i=0; i<dstW; i++){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
			dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
			dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
			dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
			dest+= 3;
		}
	}
	else if(dstFormat==IMGFMT_BGR16)
	{
		int i;
		for(i=0; i<dstW; i++){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

			((uint16_t*)dest)[i] =
				clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
				clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
				clip_table16r[(Y + yuvtab_3343[V]) >>13];
		}
	}
	else if(dstFormat==IMGFMT_BGR15)
	{
		int i;
		for(i=0; i<dstW; i++){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

			((uint16_t*)dest)[i] =
				clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
				clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
				clip_table15r[(Y + yuvtab_3343[V]) >>13];
		}
	}
	}//FULL_UV_IPOL
	else
	{
#endif // if 0
#ifdef HAVE_MMX
	switch(c->dstFormat)
	{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
	case IMGFMT_BGR32:
		asm volatile(
			"movl %%esp, "ESP_OFFSET"(%5) \n\t"
			"movl %4, %%esp \n\t"
			YSCALEYUV2RGB(%%eax, %5)
			WRITEBGR32(%%esp, 8280(%5), %%eax)
			"movl "ESP_OFFSET"(%5), %%esp \n\t"

		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
		   "r" (&c->redDither)
		: "%eax"
		);
		return;
	case IMGFMT_BGR24:
		asm volatile(
			"movl %%esp, "ESP_OFFSET"(%5) \n\t"
			"movl %4, %%esp \n\t"
			YSCALEYUV2RGB(%%eax, %5)
			WRITEBGR24(%%esp, 8280(%5), %%eax)
			"movl "ESP_OFFSET"(%5), %%esp \n\t"
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
		   "r" (&c->redDither)
		: "%eax"
		);
		return;
	case IMGFMT_BGR15:
		asm volatile(
			"movl %%esp, "ESP_OFFSET"(%5) \n\t"
			"movl %4, %%esp \n\t"
			YSCALEYUV2RGB(%%eax, %5)
			/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
			"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
			"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
			"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

			WRITEBGR15(%%esp, 8280(%5), %%eax)
			"movl "ESP_OFFSET"(%5), %%esp \n\t"
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
		   "r" (&c->redDither)
		: "%eax"
		);
		return;
	case IMGFMT_BGR16:
		asm volatile(
			"movl %%esp, "ESP_OFFSET"(%5) \n\t"
			"movl %4, %%esp \n\t"
			YSCALEYUV2RGB(%%eax, %5)
			/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
			"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
			"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
			"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

			WRITEBGR16(%%esp, 8280(%5), %%eax)
			"movl "ESP_OFFSET"(%5), %%esp \n\t"
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
		   "r" (&c->redDither)
		: "%eax"
		);
		return;
	case IMGFMT_YUY2:
		asm volatile(
			"movl %%esp, "ESP_OFFSET"(%5) \n\t"
			"movl %4, %%esp \n\t"
			YSCALEYUV2PACKED(%%eax, %5)
			WRITEYUY2(%%esp, 8280(%5), %%eax)
			"movl "ESP_OFFSET"(%5), %%esp \n\t"
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
		   "r" (&c->redDither)
		: "%eax"
		);
		return;
	default: break;
	}
#endif //HAVE_MMX
	YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
				       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
	const int yalpha1=0;
	int i;

	uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
	const int yalpha= 4096; //FIXME ...

	if(flags&SWS_FULL_CHR_H_INT)
	{
		RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
		return;
	}

#ifdef HAVE_MMX
	if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels), but it's a bit faster
	{
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2PACKED1(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		}
	}
	else
	{
		switch(dstFormat)
		{
		case IMGFMT_BGR32:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				WRITEBGR32(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR24:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				WRITEBGR24(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR15:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
				WRITEBGR15(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_BGR16:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2RGB1b(%%eax, %5)
				/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
				WRITEBGR16(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		case IMGFMT_YUY2:
			asm volatile(
				"movl %%esp, "ESP_OFFSET"(%5) \n\t"
				"movl %4, %%esp \n\t"
				YSCALEYUV2PACKED1b(%%eax, %5)
				WRITEYUY2(%%esp, 8280(%5), %%eax)
				"movl "ESP_OFFSET"(%5), %%esp \n\t"
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			   "r" (&c->redDither)
			: "%eax"
			);
			return;
		}
	}
#endif
	if( uvalpha < 2048 )
	{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
	}else{
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
	}
}
//FIXME the yuy2* functions can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2 \n\t"
		"movl %0, %%eax \n\t"
		"1: \n\t"
		"movq (%1, %%eax,2), %%mm0 \n\t"
		"movq 8(%1, %%eax,2), %%mm1 \n\t"
		"pand %%mm2, %%mm0 \n\t"
		"pand %%mm2, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%2, %%eax) \n\t"
		"addl $8, %%eax \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4 \n\t"
		"movl %0, %%eax \n\t"
		"1: \n\t"
		"movq (%1, %%eax,4), %%mm0 \n\t"
		"movq 8(%1, %%eax,4), %%mm1 \n\t"
		"movq (%2, %%eax,4), %%mm2 \n\t"
		"movq 8(%2, %%eax,4), %%mm3 \n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0 \n\t"
		"psrlw $8, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm0, %%mm0 \n\t"
		"packuswb %%mm1, %%mm1 \n\t"
		"movd %%mm0, (%4, %%eax) \n\t"
		"movd %%mm1, (%3, %%eax) \n\t"
		"addl $4, %%eax \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}
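/* Editor's note: YUY2 stores 4:2:2 data as Y0 U0 Y1 V0 | Y2 U1 Y3 V1 ..., so
 * luma is every even byte (src[2*i] in yuy2ToY) and the chroma for output
 * column i sits at bytes 4*i+1 (U) and 4*i+3 (V); averaging the two source
 * lines above is what turns the 4:2:2 chroma into 4:2:0. */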
//this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %0, %%eax \n\t"
		"1: \n\t"
		"movq (%1, %%eax,2), %%mm0 \n\t"
		"movq 8(%1, %%eax,2), %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"psrlw $8, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, (%2, %%eax) \n\t"
		"addl $8, %%eax \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4 \n\t"
		"movl %0, %%eax \n\t"
		"1: \n\t"
		"movq (%1, %%eax,4), %%mm0 \n\t"
		"movq 8(%1, %%eax,4), %%mm1 \n\t"
		"movq (%2, %%eax,4), %%mm2 \n\t"
		"movq 8(%2, %%eax,4), %%mm3 \n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"pand %%mm4, %%mm0 \n\t"
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"pand %%mm4, %%mm1 \n\t"
		"packuswb %%mm0, %%mm0 \n\t"
		"packuswb %%mm1, %%mm1 \n\t"
		"movd %%mm0, (%4, %%eax) \n\t"
		"movd %%mm1, (%3, %%eax) \n\t"
		"addl $4, %%eax \n\t"
		" js 1b \n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMXFIXME
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= ((uint32_t*)src)[i]&0xFF;
		int g= (((uint32_t*)src)[i]>>8)&0xFF;
		int r= (((uint32_t*)src)[i]>>16)&0xFF;

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
	}
#endif
}
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
	int i;
	for(i=0; i<width; i++)
	{
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
		const int b= l&0x3FF;
		const int g= h>>8;
		const int r= l>>16;

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}
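/* Editor's note: the masking in bgr32ToUV above is a SWAR trick: summing four
 * (x&0xFF00FF) terms keeps B in bits 0..9 and R in bits 16..25 (a sum of four
 * 8-bit values needs 10 bits), while (x&0x00FF00) accumulates G in bits 8..17,
 * so b=l&0x3FF, g=h>>8, r=l>>16 recover the four-pixel sums without unpacking.
 * Quick check: */
#if 0 /* illustrative only, not part of the original file */
	uint32_t p= 0x00FF80FFu;        /* one xRGB pixel: R=0xFF G=0x80 B=0xFF */
	uint32_t l= 4*(p&0xFF00FF);     /* == 0x03FC03FC */
	/* (l&0x3FF)==0x3FC==4*B and (l>>16)==0x3FC==4*R, as the code expects */
#endif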
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %2, %%eax \n\t"
		"movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"leal (%%eax, %%eax, 2), %%ebx \n\t"
		".balign 16 \n\t"
		"1: \n\t"
		PREFETCH" 64(%0, %%ebx) \n\t"
		"movd (%0, %%ebx), %%mm0 \n\t"
		"movd 3(%0, %%ebx), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 6(%0, %%ebx), %%mm2 \n\t"
		"movd 9(%0, %%ebx), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm0 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm1, %%mm0 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		"packssdw %%mm2, %%mm0 \n\t"
		"psraw $7, %%mm0 \n\t"

		"movd 12(%0, %%ebx), %%mm4 \n\t"
		"movd 15(%0, %%ebx), %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"movd 18(%0, %%ebx), %%mm2 \n\t"
		"movd 21(%0, %%ebx), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm1 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
		"pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm1, %%mm4 \n\t"
		"packssdw %%mm3, %%mm2 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm2 \n\t"
		"addl $24, %%ebx \n\t"
		"packssdw %%mm2, %%mm4 \n\t"
		"psraw $7, %%mm4 \n\t"

		"packuswb %%mm4, %%mm0 \n\t"
		"paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

		"movq %%mm0, (%1, %%eax) \n\t"
		"addl $8, %%eax \n\t"
		" js 1b \n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
	}
#endif
}
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %4, %%eax \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
		"pxor %%mm7, %%mm7 \n\t"
		"leal (%%eax, %%eax, 2), %%ebx \n\t"
		"addl %%ebx, %%ebx \n\t"
		".balign 16 \n\t"
		"1: \n\t"
		PREFETCH" 64(%0, %%ebx) \n\t"
		PREFETCH" 64(%1, %%ebx) \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq (%0, %%ebx), %%mm0 \n\t"
		"movq (%1, %%ebx), %%mm1 \n\t"
		"movq 6(%0, %%ebx), %%mm2 \n\t"
		"movq 6(%1, %%ebx), %%mm3 \n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"psrlq $24, %%mm0 \n\t"
		"psrlq $24, %%mm2 \n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
#else
		"movd (%0, %%ebx), %%mm0 \n\t"
		"movd (%1, %%ebx), %%mm1 \n\t"
		"movd 3(%0, %%ebx), %%mm2 \n\t"
		"movd 3(%1, %%ebx), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm0 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm0 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm2, %%mm0 \n\t"
		"movd 6(%0, %%ebx), %%mm4 \n\t"
		"movd 6(%1, %%ebx), %%mm1 \n\t"
		"movd 9(%0, %%ebx), %%mm2 \n\t"
		"movd 9(%1, %%ebx), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm4 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm4, %%mm2 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"psrlw $2, %%mm2 \n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

		"pmaddwd %%mm0, %%mm1 \n\t"
		"pmaddwd %%mm2, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm0 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm2, %%mm0 \n\t"
		"packssdw %%mm3, %%mm1 \n\t"
		"pmaddwd %%mm5, %%mm0 \n\t"
		"pmaddwd %%mm5, %%mm1 \n\t"
		"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0 \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq 12(%0, %%ebx), %%mm4 \n\t"
		"movq 12(%1, %%ebx), %%mm1 \n\t"
		"movq 18(%0, %%ebx), %%mm2 \n\t"
		"movq 18(%1, %%ebx), %%mm3 \n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1 \n\t"
		"movq %%mm2, %%mm3 \n\t"
		"psrlq $24, %%mm4 \n\t"
		"psrlq $24, %%mm2 \n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
#else
		"movd 12(%0, %%ebx), %%mm4 \n\t"
		"movd 12(%1, %%ebx), %%mm1 \n\t"
		"movd 15(%0, %%ebx), %%mm2 \n\t"
		"movd 15(%1, %%ebx), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm4 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm4 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm2, %%mm4 \n\t"
		"movd 18(%0, %%ebx), %%mm5 \n\t"
		"movd 18(%1, %%ebx), %%mm1 \n\t"
		"movd 21(%0, %%ebx), %%mm2 \n\t"
		"movd 21(%1, %%ebx), %%mm3 \n\t"
		"punpcklbw %%mm7, %%mm5 \n\t"
		"punpcklbw %%mm7, %%mm1 \n\t"
		"punpcklbw %%mm7, %%mm2 \n\t"
		"punpcklbw %%mm7, %%mm3 \n\t"
		"paddw %%mm1, %%mm5 \n\t"
		"paddw %%mm3, %%mm2 \n\t"
		"paddw %%mm5, %%mm2 \n\t"
		"movq "MANGLE(w1111)", %%mm5 \n\t"
		"psrlw $2, %%mm4 \n\t"
		"psrlw $2, %%mm2 \n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

		"pmaddwd %%mm4, %%mm1 \n\t"
		"pmaddwd %%mm2, %%mm3 \n\t"
		"pmaddwd %%mm6, %%mm4 \n\t"
		"pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4 \n\t"
		"psrad $8, %%mm1 \n\t"
		"psrad $8, %%mm2 \n\t"
		"psrad $8, %%mm3 \n\t"
#endif
		"packssdw %%mm2, %%mm4 \n\t"
		"packssdw %%mm3, %%mm1 \n\t"
		"pmaddwd %%mm5, %%mm4 \n\t"
		"pmaddwd %%mm5, %%mm1 \n\t"
		"addl $24, %%ebx \n\t"
		"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4 \n\t"

		"movq %%mm0, %%mm1 \n\t"
		"punpckldq %%mm4, %%mm0 \n\t"
		"punpckhdq %%mm4, %%mm1 \n\t"
		"packsswb %%mm1, %%mm0 \n\t"
		"paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

		"movd %%mm0, (%2, %%eax) \n\t"
		"punpckhdq %%mm0, %%mm0 \n\t"
		"movd %%mm0, (%3, %%eax) \n\t"
		"addl $4, %%eax \n\t"
		" js 1b \n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d= ((uint16_t*)src)[i];
		int b= d&0x1F;
		int g= (d>>5)&0x3F;
		int r= (d>>11)&0x1F;

		dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
	}
}
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];

		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>11)&0x7F;
		int g= d>>21;
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}
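/* Editor's note: bgr16ToUV above is the RGB565 analogue of the SWAR sum in
 * bgr32ToUV: 0x07E0F81F keeps the G field (bits 5..10) apart from the R and B
 * fields (bits 11..15 and 0..4) for the two pixels in each 32-bit load, so two
 * masked adds plus the shift/merge accumulate four pixels per channel before
 * the final shifts split out the r/g/b sums. */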
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d= ((uint16_t*)src)[i];
		int b= d&0x1F;
		int g= (d>>5)&0x1F;
		int r= (d>>10)&0x1F;

		dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
	}
}
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];

		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>10)&0x7F;
		int g= d>>21;
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= ((uint32_t*)src)[i]&0xFF;
		int g= (((uint32_t*)src)[i]>>8)&0xFF;
		int b= (((uint32_t*)src)[i]>>16)&0xFF;

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
	}
}
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
		const int r= l&0x3FF;
		const int g= h>>8;
		const int b= l>>16;

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*3+0];
		int g= src[i*3+1];
		int b= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
	}
}
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}
// Bilinear / Bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if(filterSize==4) // always true for upscaling, sometimes for down too
    {
        int counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq "MANGLE(w02)", %%mm6      \n\t"
            "pushl %%ebp                    \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%ebp), %%eax      \n\t"
            "movzwl 2(%2, %%ebp), %%ebx     \n\t"
            "movq (%1, %%ebp, 4), %%mm1     \n\t"
            "movq 8(%1, %%ebp, 4), %%mm3    \n\t"
            "movd (%3, %%eax), %%mm0        \n\t"
            "movd (%3, %%ebx), %%mm2        \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm3                \n\t"
            "packssdw %%mm3, %%mm0          \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%ebp)        \n\t"
            "addl $4, %%ebp                 \n\t"
            " jnc 1b                        \n\t"
            "popl %%ebp                     \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else if(filterSize==8)
    {
        int counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq "MANGLE(w02)", %%mm6      \n\t"
            "pushl %%ebp                    \n\t" // we use 7 regs here ...
            "movl %%eax, %%ebp              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzwl (%2, %%ebp), %%eax      \n\t"
            "movzwl 2(%2, %%ebp), %%ebx     \n\t"
            "movq (%1, %%ebp, 8), %%mm1     \n\t"
            "movq 16(%1, %%ebp, 8), %%mm3   \n\t"
            "movd (%3, %%eax), %%mm0        \n\t"
            "movd (%3, %%ebx), %%mm2        \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"

            "movq 8(%1, %%ebp, 8), %%mm1    \n\t"
            "movq 24(%1, %%ebp, 8), %%mm5   \n\t"
            "movd 4(%3, %%eax), %%mm4       \n\t"
            "movd 4(%3, %%ebx), %%mm2       \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"

            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm3                \n\t"
            "packssdw %%mm3, %%mm0          \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%ebp)        \n\t"
            "addl $4, %%ebp                 \n\t"
            " jnc 1b                        \n\t"
            "popl %%ebp                     \n\t"
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
            : "%ebx"
        );
    }
    else
    {
        int counter= -2*dstW;
//      filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
            "pxor %%mm7, %%mm7              \n\t"
            "movq "MANGLE(w02)", %%mm6      \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movl %2, %%ecx                 \n\t"
            "movzwl (%%ecx, %0), %%eax      \n\t"
            "movzwl 2(%%ecx, %0), %%ebx     \n\t"
            "movl %5, %%ecx                 \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movd (%%ecx, %%eax), %%mm0     \n\t"
            "movd (%%ecx, %%ebx), %%mm2     \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "addl $8, %1                    \n\t"
            "addl $4, %%ecx                 \n\t"
            "cmpl %4, %%ecx                 \n\t"
            " jb 2b                         \n\t"
            "addl %6, %1                    \n\t"
            "psrad $8, %%mm4                \n\t"
            "psrad $8, %%mm5                \n\t"
            "packssdw %%mm5, %%mm4          \n\t"
            "pmaddwd %%mm6, %%mm4           \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "movl %3, %%eax                 \n\t"
            "movd %%mm4, (%%eax, %0)        \n\t"
            "addl $4, %0                    \n\t"
            " jnc 1b                        \n\t"
            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m" (src+filterSize),
              "m" (src), "r" (filterSize*2)
            : "%ebx", "%eax", "%ecx"
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for(i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
//      printf("filterPos: %d\n", filterPos[i]);
        for(j=0; j<filterSize; j++)
        {
//          printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
//      filter += hFilterSize;
        dst[i]= MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//      dst[i]= val>>7;
    }
#endif
#endif
}
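/*
 * Minimal sketch (a standalone toy, not from the deleted source) of the
 * filter layout hScale() consumes: filterPos[i] is the first source pixel
 * feeding output i, and filter[filterSize*i + j] holds filterSize
 * fixed-point taps whose weighted sum is renormalized with ">> 7" exactly
 * like the C fallback above.  Here: a 2x downscale with 2 averaging taps.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint8_t src[8] = {0, 32, 64, 96, 128, 160, 192, 224};
    enum { dstW = 4, filterSize = 2 };
    int16_t filter[dstW * filterSize];
    int16_t filterPos[dstW];
    int16_t dst[dstW];

    /* taps sum to 1<<8, so dst comes out as 2*average (15-bit range) */
    for (int i = 0; i < dstW; i++) {
        filterPos[i] = 2 * i;
        filter[filterSize*i + 0] = 128;
        filter[filterSize*i + 1] = 128;
    }
    for (int i = 0; i < dstW; i++) {          /* same loop as the C path */
        int val = 0;
        for (int j = 0; j < filterSize; j++)
            val += src[filterPos[i] + j] * filter[filterSize*i + j];
        dst[i] = val >> 7;
    }
    for (int i = 0; i < dstW; i++)
        printf("%d ", dst[i]);                /* prints: 32 160 288 416 */
    printf("\n");
    return 0;
}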
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
            asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "movl %0, %%ecx                 \n\t"
                "movl %1, %%edi                 \n\t"
                "movl %2, %%edx                 \n\t"
                "movl %3, %%ebx                 \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                PREFETCH" (%%ecx)               \n\t"
                PREFETCH" 32(%%ecx)             \n\t"
                PREFETCH" 64(%%ecx)             \n\t"

#define FUNNY_Y_CODE \
			"movl (%%ebx), %%esi		\n\t"\
			"call *%4			\n\t"\
			"addl (%%ebx, %%eax), %%ecx	\n\t"\
			"addl %%eax, %%edi		\n\t"\
			"xorl %%eax, %%eax		\n\t"\

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                   "m" (funnyYCode)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
            );
            for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
        asm volatile(
            "xorl %%eax, %%eax              \n\t" // i
            "xorl %%ebx, %%ebx              \n\t" // xx
            "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
            "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
            "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                \n\t"
            "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "movl %1, %%edi                 \n\t"
            "shrl $9, %%esi                 \n\t"
            "movw %%si, (%%edi, %%eax, 2)   \n\t"
            "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
            "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry

            "movzbl  (%0, %%ebx), %%edi     \n\t" //src[xx]
            "movzbl 1(%0, %%ebx), %%esi     \n\t" //src[xx+1]
            "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                \n\t"
            "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "movl %1, %%edi                 \n\t"
            "shrl $9, %%esi                 \n\t"
            "movw %%si, 2(%%edi, %%eax, 2)  \n\t"
            "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
            "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry

            "addl $2, %%eax                 \n\t"
            "cmpl %2, %%eax                 \n\t"
            " jb 1b                         \n\t"

            :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
            : "%eax", "%ebx", "%ecx", "%edi", "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif
    }
}
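/*
 * Sketch (illustrative, not from the deleted file) of the 16.16
 * fixed-point walk used by the fast-bilinear fallback above: xInc is
 * roughly (srcW<<16)/dstW, the integer part of xpos selects the source
 * pixel, and the top 7 bits of the fraction become the blend weight.
 */
#include <stdio.h>

int main(void)
{
    unsigned char src[4] = {0, 100, 200, 250};
    int srcW = 4, dstW = 8;
    unsigned xInc = ((unsigned)srcW << 16) / dstW;  /* 0.5 in 16.16 */
    unsigned xpos = 0;

    for (int i = 0; i < dstW - 2; i++) {            /* keep xx+1 in range */
        unsigned xx = xpos >> 16;
        unsigned xalpha = (xpos & 0xFFFF) >> 9;     /* 7-bit weight */
        int out = (src[xx] << 7) + (src[xx+1] - src[xx]) * (int)xalpha;
        printf("dst[%d] = %d (value x128)\n", i, out);
        xpos += xInc;
    }
    return 0;
}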
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed,
                                   int16_t *hChrFilter, int16_t *hChrFilterPos, int hChrFilterSize,
                                   void *funnyUVCode, int srcFormat, uint8_t *formatConvBuffer,
                                   int16_t *mmx2Filter, int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
        return;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
            asm volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "movl %0, %%ecx                 \n\t"
                "movl %1, %%edi                 \n\t"
                "movl %2, %%edx                 \n\t"
                "movl %3, %%ebx                 \n\t"
                "xorl %%eax, %%eax              \n\t" // i
                PREFETCH" (%%ecx)               \n\t"
                PREFETCH" 32(%%ecx)             \n\t"
                PREFETCH" 64(%%ecx)             \n\t"

#define FUNNY_UV_CODE \
			"movl (%%ebx), %%esi		\n\t"\
			"call *%4			\n\t"\
			"addl (%%ebx, %%eax), %%ecx	\n\t"\
			"addl %%eax, %%edi		\n\t"\
			"xorl %%eax, %%eax		\n\t"\

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
                "xorl %%eax, %%eax              \n\t" // i
                "movl %5, %%ecx                 \n\t" // src
                "movl %1, %%edi                 \n\t" // buf1
                "addl $4096, %%edi              \n\t"
                PREFETCH" (%%ecx)               \n\t"
                PREFETCH" 32(%%ecx)             \n\t"
                PREFETCH" 64(%%ecx)             \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

                :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                   "m" (funnyUVCode), "m" (src2)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
            );
            for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
//              printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+2048] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif
        asm volatile(
            "xorl %%eax, %%eax              \n\t" // i
            "xorl %%ebx, %%ebx              \n\t" // xx
            "xorl %%ecx, %%ecx              \n\t" // 2*xalpha
            ".balign 16                     \n\t"
            "1:                             \n\t"
            "movl %0, %%esi                 \n\t"
            "movzbl  (%%esi, %%ebx), %%edi  \n\t" //src[xx]
            "movzbl 1(%%esi, %%ebx), %%esi  \n\t" //src[xx+1]
            "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                \n\t"
            "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "movl %1, %%edi                 \n\t"
            "shrl $9, %%esi                 \n\t"
            "movw %%si, (%%edi, %%eax, 2)   \n\t"

            "movzbl  (%5, %%ebx), %%edi     \n\t" //src[xx]
            "movzbl 1(%5, %%ebx), %%esi     \n\t" //src[xx+1]
            "subl %%edi, %%esi              \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi             \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                \n\t"
            "addl %%edi, %%esi              \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "movl %1, %%edi                 \n\t"
            "shrl $9, %%esi                 \n\t"
            "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

            "addw %4, %%cx                  \n\t" //2*xalpha += xInc&0xFF
            "adcl %3, %%ebx                 \n\t" //xx+= xInc>>8 + carry
            "addl $1, %%eax                 \n\t"
            "cmpl %2, %%eax                 \n\t"
            " jb 1b                         \n\t"

            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
               "r" (src2)
            : "%eax", "%ebx", "%ecx", "%edi", "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
            dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
            /* slower
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
            dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
            */
            xpos+=xInc;
        }
#endif
    }
}
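/*
 * Side note (standalone check, not from the deleted file): for a 7-bit
 * weight 0 <= xalpha <= 127, the (xalpha ^ 127) used in the chroma
 * fallback above equals 127 - xalpha, so the expression is an ordinary
 * linear blend; XOR is simply a cheap complement for a full bit range.
 */
#include <assert.h>

int main(void)
{
    for (int a = 0; a <= 127; a++)
        assert((a ^ 127) == 127 - a);
    return 0;
}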
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if(isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

//  printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//      (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
	selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

    if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if(flags & SWS_PRINT_INFO && firstTime)
        {
            MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
                     "SwScaler: ->cannot do aligned memory accesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this will not get executed
       this is not really intended but works currently, so ppl might do it */
    if(srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for(;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);

        //handle holes (FAST_BILINEAR & weird filters)
        if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
        ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

        // Do we have enough lines in this slice to output the dstY line
        if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
//              printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//              printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                //FIXME replace parameters through context struct (some at least)

                if(!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
            if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
/*          printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                if(!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
            if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        b5Dither= dither8[dstY&1];
        g6Dither= dither4[dstY&1];
        g5Dither= dither8[dstY&1];
        r5Dither= dither8[(dstY+1)&1];
#endif
        if(dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
            for(i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for(i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
#endif
            if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                }
                else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                        dest, dstW, lumAlpha, chrAlpha, dstY);
                }
                else //General RGB
                {
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                yuv2packedXinC(c,
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, dstW, dstY);
            }
        }
    }

#ifdef HAVE_MMX
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
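/*
 * Worked example, not from the original file: swScale() above computes
 * chrSrcSliceH as -((-srcSliceH) >> c->chrSrcVSubSample).  Assuming the
 * arithmetic (sign-preserving) right shift gcc implements, -((-h) >> s)
 * is h/2^s rounded up, whereas h >> s rounds down -- an odd number of
 * luma lines must still cover ceil(h/2) chroma lines.
 */
#include <stdio.h>

int main(void)
{
    for (int h = 1; h <= 5; h++)
        printf("h=%d  floor=%d  ceil=%d\n", h, h >> 1, -((-h) >> 1));
    return 0;
}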
modules/video_filter/swscale/yuv2rgb.c deleted 100644 → 0
/*
* yuv2rgb.c, Software YUV to RGB converter
*
* Copyright (C) 1999, Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
* All Rights Reserved.
*
* Functions broken out from display_x11.c and several new modes
* added by Håkan Hjort <d95hjort@dtek.chalmers.se>
*
* 15 & 16 bpp support by Franck Sicard <Franck.Sicard@solsoft.fr>
*
* This file is part of mpeg2dec, a free MPEG-2 video decoder
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
* MMX/MMX2 Template stuff from Michael Niedermayer (michaelni@gmx.at) (needed for fast movntq support)
* 1,4,8bpp support by Michael Niedermayer (michaelni@gmx.at)
* context / deglobalize stuff by Michael Niedermayer
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
//#include "video_out.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "common.h"
#ifdef HAVE_MLIB
#include "yuv2rgb_mlib.c"
#endif
#define DITHER1XBPP // only for mmx
const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
{  1,   3,   1,   3,   1,   3,   1,   3, },
{  2,   0,   2,   0,   2,   0,   2,   0, },
};

const uint8_t  __attribute__((aligned(8))) dither_2x2_8[2][8]={
{  6,   2,   6,   2,   6,   2,   6,   2, },
{  0,   4,   0,   4,   0,   4,   0,   4, },
};

const uint8_t  __attribute__((aligned(8))) dither_8x8_32[8][8]={
{ 17,   9,  23,  15,  16,   8,  22,  14, },
{  5,  29,   3,  27,   4,  28,   2,  26, },
{ 21,  13,  19,  11,  20,  12,  18,  10, },
{  0,  24,   6,  30,   1,  25,   7,  31, },
{ 16,   8,  22,  14,  17,   9,  23,  15, },
{  4,  28,   2,  26,   5,  29,   3,  27, },
{ 20,  12,  18,  10,  21,  13,  19,  11, },
{  1,  25,   7,  31,   0,  24,   6,  30, },
};
#if 0
const uint8_t __attribute__((aligned(8))) dither_8x8_64[8][8]={
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{ 2, 50, 14, 62, 1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58, 6, 54, 9, 57, 5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
#endif
const uint8_t  __attribute__((aligned(8))) dither_8x8_73[8][8]={
{  0,  55,  14,  68,   3,  58,  17,  72, },
{ 37,  18,  50,  32,  40,  22,  54,  35, },
{  9,  64,   5,  59,  13,  67,   8,  63, },
{ 46,  27,  41,  23,  49,  31,  44,  26, },
{  2,  57,  16,  71,   1,  56,  15,  70, },
{ 39,  21,  52,  34,  38,  19,  51,  33, },
{ 11,  66,   7,  62,  10,  65,   6,  60, },
{ 48,  30,  43,  25,  47,  29,  42,  24, },
};
#if 0
const uint8_t __attribute__((aligned(8))) dither_8x8_128[8][8]={
{ 68, 36, 92, 60, 66, 34, 90, 58, },
{ 20, 116, 12, 108, 18, 114, 10, 106, },
{ 84, 52, 76, 44, 82, 50, 74, 42, },
{ 0, 96, 24, 120, 6, 102, 30, 126, },
{ 64, 32, 88, 56, 70, 38, 94, 62, },
{ 16, 112, 8, 104, 22, 118, 14, 110, },
{ 80, 48, 72, 40, 86, 54, 78, 46, },
{ 4, 100, 28, 124, 2, 98, 26, 122, },
};
#endif
#if 1
const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
{117,  62, 158, 103, 113,  58, 155, 100, },
{ 34, 199,  21, 186,  31, 196,  17, 182, },
{144,  89, 131,  76, 141,  86, 127,  72, },
{  0, 165,  41, 206,  10, 175,  52, 217, },
{110,  55, 151,  96, 120,  65, 162, 107, },
{ 28, 193,  14, 179,  38, 203,  24, 189, },
{138,  83, 124,  69, 148,  93, 134,  79, },
{  7, 172,  48, 213,   3, 168,  45, 210, },
};
#elif 1
// tries to correct a gamma of 1.5
const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
{  0, 143,  18, 200,   2, 156,  25, 215, },
{ 78,  28, 125,  64,  89,  36, 138,  74, },
{ 10, 180,   3, 161,  16, 195,   8, 175, },
{109,  51,  93,  38, 121,  60, 105,  47, },
{  1, 152,  23, 210,   0, 147,  20, 205, },
{ 85,  33, 134,  71,  81,  30, 130,  67, },
{ 14, 190,   6, 171,  12, 185,   5, 166, },
{117,  57, 101,  44, 113,  54,  97,  41, },
};
#elif 1
// tries to correct a gamma of 2.0
const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
{  0, 124,   8, 193,   0, 140,  12, 213, },
{ 55,  14, 104,  42,  66,  19, 119,  52, },
{  3, 168,   1, 145,   6, 187,   3, 162, },
{ 86,  31,  70,  21,  99,  39,  82,  28, },
{  0, 134,  11, 206,   0, 129,   9, 200, },
{ 62,  17, 114,  48,  58,  16, 109,  45, },
{  5, 181,   2, 157,   4, 175,   1, 151, },
{ 95,  36,  78,  26,  90,  34,  74,  24, },
};
#else
// tries to correct a gamma of 2.5
const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
{  0, 107,   3, 187,   0, 125,   6, 212, },
{ 39,   7,  86,  28,  49,  11, 102,  36, },
{  1, 158,   0, 131,   3, 180,   1, 151, },
{ 68,  19,  52,  12,  81,  25,  64,  17, },
{  0, 119,   5, 203,   0, 113,   4, 195, },
{ 45,   9,  96,  33,  42,   8,  91,  30, },
{  2, 172,   1, 144,   2, 165,   0, 137, },
{ 77,  23,  60,  15,  72,  21,  56,  14, },
};
#endif
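/*
 * Illustration (standalone toy, matrix and values invented) of how an
 * ordered-dither matrix like those above is applied: the bias for pixel
 * (x,y) is matrix[y&7][x&7], added before the precision-dropping shift,
 * so quantization error is spread spatially instead of banding.
 */
#include <stdio.h>
#include <stdint.h>

static const uint8_t d2[2][2] = { {0, 2}, {3, 1} };  /* tiny Bayer matrix */

int main(void)
{
    for (int y = 0; y < 2; y++) {
        for (int x = 0; x < 8; x++) {
            int v = 100;                               /* flat 8-bit input */
            int q = (v + d2[y & 1][x & 1] * 16) >> 6;  /* quantize to 2 bits */
            printf("%d ", q);                          /* alternates 1 and 2 */
        }
        printf("\n");
    }
    return 0;
}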
#ifdef ARCH_X86
/* hope these constant values are cache line aligned */
uint64_t attribute_used __attribute__((aligned(8))) mmx_00ffw=   0x00ff00ff00ff00ffULL;
uint64_t attribute_used __attribute__((aligned(8))) mmx_redmask= 0xf8f8f8f8f8f8f8f8ULL;
uint64_t attribute_used __attribute__((aligned(8))) mmx_grnmask= 0xfcfcfcfcfcfcfcfcULL;

uint64_t attribute_used __attribute__((aligned(8))) M24A= 0x00FF0000FF0000FFULL;
uint64_t attribute_used __attribute__((aligned(8))) M24B= 0xFF0000FF0000FF00ULL;
uint64_t attribute_used __attribute__((aligned(8))) M24C= 0x0000FF0000FF0000ULL;

// the volatile is required because gcc otherwise optimizes some writes away not knowing that these
// are read in the asm block
volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;

uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};

uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};
#undef HAVE_MMX
#undef ARCH_X86
//MMX versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "yuv2rgb_template.c"
//MMX2 versions
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "yuv2rgb_template.c"
#endif // CAN_COMPILE_X86_ASM
const int32_t Inverse_Table_6_9[8][4] = {
    {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
    {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
    {104597, 132201, 25675, 53279}, /* unspecified */
    {104597, 132201, 25675, 53279}, /* reserved */
    {104448, 132798, 24759, 53109}, /* FCC */
    {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
    {104597, 132201, 25675, 53279}, /* SMPTE 170M */
    {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
};
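/*
 * Sanity check (standalone, added for illustration): the entries above
 * are fixed-point gains scaled by 2^16, consistent with the 1<<16 / >>16
 * arithmetic in yuv2rgb_c_init_tables() below.  Dividing the crv column
 * back out recovers the familiar Cr->R gains.
 */
#include <stdio.h>

int main(void)
{
    /* rows copied from the table above: {crv, cbu, cgu, cgv} */
    int rec709[4] = {117504, 138453, 13954, 34903};
    int rec601[4] = {104597, 132201, 25675, 53279};
    printf("709 crv=%.4f  601 crv=%.4f\n",
           rec709[0] / 65536.0, rec601[0] / 65536.0);
    /* prints: 709 crv=1.7930  601 crv=1.5960 */
    return 0;
}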
#define RGB(i) \
U = pu[i]; \
V = pv[i]; \
r = c->table_rV[V]; \
g = c->table_gU[U] + c->table_gV[V]; \
b = c->table_bU[U];
#define DST1(i) \
Y = py_1[2*i]; \
dst_1[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y] + g[Y] + b[Y];
#define DST2(i) \
Y = py_2[2*i]; \
dst_2[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y] + g[Y] + b[Y];
#define DST1RGB(i) \
Y = py_1[2*i]; \
dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \
Y = py_1[2*i+1]; \
dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y];
#define DST2RGB(i) \
Y = py_2[2*i]; \
dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \
Y = py_2[2*i+1]; \
dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y];
#define DST1BGR(i) \
Y = py_1[2*i]; \
dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \
Y = py_1[2*i+1]; \
dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y];
#define DST2BGR(i) \
Y = py_2[2*i]; \
dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \
Y = py_2[2*i+1]; \
dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
#define PROLOG(func_name, dst_type) \
static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
int srcSliceH, uint8_t* dst[], int dstStride[]){\
int y;\
\
if(c->srcFormat == IMGFMT_422P){\
srcStride[1] *= 2;\
srcStride[2] *= 2;\
}\
for(y=0; y<srcSliceH; y+=2){\
dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY )*dstStride[0]);\
dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
dst_type *r, *g, *b;\
uint8_t *py_1= src[0] + y*srcStride[0];\
uint8_t *py_2= py_1 + srcStride[0];\
uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
unsigned int h_size= c->dstW>>3;\
while (h_size--) {\
int U, V, Y;\
#define EPILOG(dst_delta)\
pu += 4;\
pv += 4;\
py_1 += 8;\
py_2 += 8;\
dst_1 += dst_delta;\
dst_2 += dst_delta;\
}\
}\
return srcSliceH;\
}
PROLOG(yuv2rgb_c_32, uint32_t)
    RGB(0);
    DST1(0);
    DST2(0);

    RGB(1);
    DST2(1);
    DST1(1);

    RGB(2);
    DST1(2);
    DST2(2);

    RGB(3);
    DST2(3);
    DST1(3);
EPILOG(8)
PROLOG(yuv2rgb_c_24_rgb, uint8_t)
    RGB(0);
    DST1RGB(0);
    DST2RGB(0);

    RGB(1);
    DST2RGB(1);
    DST1RGB(1);

    RGB(2);
    DST1RGB(2);
    DST2RGB(2);

    RGB(3);
    DST2RGB(3);
    DST1RGB(3);
EPILOG(24)
// only trivial mods from yuv2rgb_c_24_rgb
PROLOG(yuv2rgb_c_24_bgr, uint8_t)
    RGB(0);
    DST1BGR(0);
    DST2BGR(0);

    RGB(1);
    DST2BGR(1);
    DST1BGR(1);

    RGB(2);
    DST1BGR(2);
    DST2BGR(2);

    RGB(3);
    DST2BGR(3);
    DST1BGR(3);
EPILOG(24)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_16, uint16_t)
    RGB(0);
    DST1(0);
    DST2(0);

    RGB(1);
    DST2(1);
    DST1(1);

    RGB(2);
    DST1(2);
    DST2(2);

    RGB(3);
    DST2(3);
    DST1(3);
EPILOG(8)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_8, uint8_t)
    RGB(0);
    DST1(0);
    DST2(0);

    RGB(1);
    DST2(1);
    DST1(1);

    RGB(2);
    DST1(2);
    DST2(2);

    RGB(3);
    DST2(3);
    DST1(3);
EPILOG(8)
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
    const uint8_t *d32= dither_8x8_32[y&7];
    const uint8_t *d64= dither_8x8_73[y&7];
#define DST1bpp8(i,o) \
Y = py_1[2*i]; \
dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \
Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];
#define DST2bpp8(i,o) \
Y = py_2[2*i]; \
dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \
Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]];
    RGB(0);
    DST1bpp8(0,0);
    DST2bpp8(0,0);

    RGB(1);
    DST2bpp8(1,2);
    DST1bpp8(1,2);

    RGB(2);
    DST1bpp8(2,4);
    DST2bpp8(2,4);

    RGB(3);
    DST2bpp8(3,6);
    DST1bpp8(3,6);
EPILOG(8)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_4, uint8_t)
    int acc;
#define DST1_4(i) \
Y = py_1[2*i]; \
acc = r[Y] + g[Y] + b[Y]; \
Y = py_1[2*i+1]; \
acc |= (r[Y] + g[Y] + b[Y])<<4;\
dst_1[i] = acc;
#define DST2_4(i) \
Y = py_2[2*i]; \
acc = r[Y] + g[Y] + b[Y]; \
Y = py_2[2*i+1]; \
acc |= (r[Y] + g[Y] + b[Y])<<4;\
dst_2[i] = acc;
    RGB(0);
    DST1_4(0);
    DST2_4(0);

    RGB(1);
    DST2_4(1);
    DST1_4(1);

    RGB(2);
    DST1_4(2);
    DST2_4(2);

    RGB(3);
    DST2_4(3);
    DST1_4(3);
EPILOG(4)
PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
    const uint8_t *d64=  dither_8x8_73[y&7];
    const uint8_t *d128= dither_8x8_220[y&7];
    int acc;
#define DST1bpp4(i,o) \
Y = py_1[2*i]; \
acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \
acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4;\
dst_1[i]= acc;
#define DST2bpp4(i,o) \
Y = py_2[2*i]; \
acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \
acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4;\
dst_2[i]= acc;
    RGB(0);
    DST1bpp4(0,0);
    DST2bpp4(0,0);

    RGB(1);
    DST2bpp4(1,2);
    DST1bpp4(1,2);

    RGB(2);
    DST1bpp4(2,4);
    DST2bpp4(2,4);

    RGB(3);
    DST2bpp4(3,6);
    DST1bpp4(3,6);
EPILOG(4)
// This is exactly the same code as yuv2rgb_c_32 except for the types of
// r, g, b, dst_1, dst_2
PROLOG(yuv2rgb_c_4b, uint8_t)
    RGB(0);
    DST1(0);
    DST2(0);

    RGB(1);
    DST2(1);
    DST1(1);

    RGB(2);
    DST1(2);
    DST2(2);

    RGB(3);
    DST2(3);
    DST1(3);
EPILOG(8)
PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
    const uint8_t *d64=  dither_8x8_73[y&7];
    const uint8_t *d128= dither_8x8_220[y&7];
#define DST1bpp4b(i,o) \
Y = py_1[2*i]; \
dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];
#define DST2bpp4b(i,o) \
Y = py_2[2*i]; \
dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]];
    RGB(0);
    DST1bpp4b(0,0);
    DST2bpp4b(0,0);

    RGB(1);
    DST2bpp4b(1,2);
    DST1bpp4b(1,2);

    RGB(2);
    DST1bpp4b(2,4);
    DST2bpp4b(2,4);

    RGB(3);
    DST2bpp4b(3,6);
    DST1bpp4b(3,6);
EPILOG(8)
PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
    const uint8_t *d128= dither_8x8_220[y&7];
    char out_1=0, out_2=0;
    g= c->table_gU[128] + c->table_gV[128];
#define DST1bpp1(i,o) \
Y = py_1[2*i]; \
out_1+= out_1 + g[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \
out_1+= out_1 + g[Y+d128[1+o]];
#define DST2bpp1(i,o) \
Y = py_2[2*i]; \
out_2+= out_2 + g[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \
out_2+= out_2 + g[Y+d128[9+o]];
    DST1bpp1(0,0);
    DST2bpp1(0,0);

    DST2bpp1(1,2);
    DST1bpp1(1,2);

    DST1bpp1(2,4);
    DST2bpp1(2,4);

    DST2bpp1(3,6);
    DST1bpp1(3,6);

    dst_1[0]= out_1;
    dst_2[0]= out_2;
EPILOG(1)
SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
{
#ifdef ARCH_X86
    if(c->flags & SWS_CPU_CAPS_MMX2){
        switch(c->dstFormat){
        case IMGFMT_BGR32: return yuv420_rgb32_MMX2;
        case IMGFMT_BGR24: return yuv420_rgb24_MMX2;
        case IMGFMT_BGR16: return yuv420_rgb16_MMX2;
        case IMGFMT_BGR15: return yuv420_rgb15_MMX2;
        }
    }
    if(c->flags & SWS_CPU_CAPS_MMX){
        switch(c->dstFormat){
        case IMGFMT_BGR32: return yuv420_rgb32_MMX;
        case IMGFMT_BGR24: return yuv420_rgb24_MMX;
        case IMGFMT_BGR16: return yuv420_rgb16_MMX;
        case IMGFMT_BGR15: return yuv420_rgb15_MMX;
        }
    }
#endif
#ifdef HAVE_MLIB
    {
        SwsFunc t= yuv2rgb_init_mlib(c);
        if(t) return t;
    }
#endif
#ifdef HAVE_ALTIVEC
    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
    {
        SwsFunc t = yuv2rgb_init_altivec(c);
        if(t) return t;
    }
#endif

    MSG_WARN("No accelerated colorspace conversion found\n");

    switch(c->dstFormat){
    case IMGFMT_RGB32:
    case IMGFMT_BGR32: return yuv2rgb_c_32;
    case IMGFMT_RGB24: return yuv2rgb_c_24_rgb;
    case IMGFMT_BGR24: return yuv2rgb_c_24_bgr;
    case IMGFMT_RGB16:
    case IMGFMT_BGR16:
    case IMGFMT_RGB15:
    case IMGFMT_BGR15: return yuv2rgb_c_16;
    case IMGFMT_RGB8:
    case IMGFMT_BGR8:  return yuv2rgb_c_8_ordered_dither;
    case IMGFMT_RGB4:
    case IMGFMT_BGR4:  return yuv2rgb_c_4_ordered_dither;
    case IMGFMT_RG4B:
    case IMGFMT_BG4B:  return yuv2rgb_c_4b_ordered_dither;
    case IMGFMT_RGB1:
    case IMGFMT_BGR1:  return yuv2rgb_c_1_ordered_dither;
    default: assert(0);
    }
    return NULL;
}
static int div_round (int dividend, int divisor)
{
    if (dividend > 0)
        return (dividend + (divisor>>1)) / divisor;
    else
        return -((-dividend + (divisor>>1)) / divisor);
}
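/*
 * Usage note (standalone example): div_round() above rounds to nearest
 * with ties away from zero, symmetrically for negative dividends --
 * which plain C division (truncation toward zero) does not do.
 */
#include <stdio.h>

static int div_round(int dividend, int divisor)   /* copy for a runnable demo */
{
    if (dividend > 0)
        return (dividend + (divisor >> 1)) / divisor;
    else
        return -((-dividend + (divisor >> 1)) / divisor);
}

int main(void)
{
    printf("%d %d\n", div_round(7, 4), div_round(-7, 4));   /* 2 -2 */
    printf("%d %d\n", 7 / 4, -7 / 4);                       /* 1 -1 */
    return 0;
}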
int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
                           int brightness, int contrast, int saturation)
{
    const int isRgb = IMGFMT_IS_BGR(c->dstFormat);
    const int bpp = isRgb?IMGFMT_RGB_DEPTH(c->dstFormat):IMGFMT_BGR_DEPTH(c->dstFormat);
    int i;
    uint8_t table_Y[1024];
    uint32_t *table_32 = 0;
    uint16_t *table_16 = 0;
    uint8_t *table_8 = 0;
    uint8_t *table_332 = 0;
    uint8_t *table_121 = 0;
    uint8_t *table_1 = 0;
    int entry_size = 0;
    void *table_r = 0, *table_g = 0, *table_b = 0;
    void *table_start;

    int64_t crv =  inv_table[0];
    int64_t cbu =  inv_table[1];
    int64_t cgu = -inv_table[2];
    int64_t cgv = -inv_table[3];
    int64_t cy  = 1<<16;
    int64_t oy  = 0;

//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
    if(!fullRange){
        cy= (cy*255) / 219;
        oy= 16<<16;
    }

    cy = (cy *contrast             )>>16;
    crv= (crv*contrast * saturation)>>32;
    cbu= (cbu*contrast * saturation)>>32;
    cgu= (cgu*contrast * saturation)>>32;
    cgv= (cgv*contrast * saturation)>>32;
//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
    oy -= 256*brightness;

    for (i = 0; i < 1024; i++) {
        int j;

        j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32;
        j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
        table_Y[i] = j;
    }

    switch (bpp) {
    case 32:
        table_start= table_32 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));
        entry_size = sizeof (uint32_t);
        table_r = table_32 + 197;
        table_b = table_32 + 197 + 685;
        table_g = table_32 + 197 + 2*682;

        for (i = -197; i < 256+197; i++)
            ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
        for (i = -132; i < 256+132; i++)
            ((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
        for (i = -232; i < 256+232; i++)
            ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
        break;

    case 24:
        table_start= table_8 = malloc ((256 + 2*232) * sizeof (uint8_t));
        entry_size = sizeof (uint8_t);
        table_r = table_g = table_b = table_8 + 232;

        for (i = -232; i < 256+232; i++)
            ((uint8_t *)table_b)[i] = table_Y[i+384];
        break;

    case 15:
    case 16:
        table_start= table_16 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));
        entry_size = sizeof (uint16_t);
        table_r = table_16 + 197;
        table_b = table_16 + 197 + 685;
        table_g = table_16 + 197 + 2*682;

        for (i = -197; i < 256+197; i++) {
            int j = table_Y[i+384] >> 3;

            if (isRgb)
                j <<= ((bpp==16) ? 11 : 10);

            ((uint16_t *)table_r)[i] = j;
        }
        for (i = -132; i < 256+132; i++) {
            int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);

            ((uint16_t *)table_g)[i] = j << 5;
        }
        for (i = -232; i < 256+232; i++) {
            int j = table_Y[i+384] >> 3;

            if (!isRgb)
                j <<= ((bpp==16) ? 11 : 10);

            ((uint16_t *)table_b)[i] = j;
        }
        break;

    case 8:
        table_start= table_332 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
        entry_size = sizeof (uint8_t);
        table_r = table_332 + 197;
        table_b = table_332 + 197 + 685;
        table_g = table_332 + 197 + 2*682;

        for (i = -197; i < 256+197; i++) {
            int j = (table_Y[i+384 - 16] + 18)/36;

            if (isRgb)
                j <<= 5;

            ((uint8_t *)table_r)[i] = j;
        }
        for (i = -132; i < 256+132; i++) {
            int j = (table_Y[i+384 - 16] + 18)/36;

            if (!isRgb)
                j <<= 1;

            ((uint8_t *)table_g)[i] = j << 2;
        }
        for (i = -232; i < 256+232; i++) {
            int j = (table_Y[i+384 - 37] + 43)/85;

            if (!isRgb)
                j <<= 6;

            ((uint8_t *)table_b)[i] = j;
        }
        break;

    case 4:
    case 4|128:
        table_start= table_121 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
        entry_size = sizeof (uint8_t);
        table_r = table_121 + 197;
        table_b = table_121 + 197 + 685;
        table_g = table_121 + 197 + 2*682;

        for (i = -197; i < 256+197; i++) {
            int j = table_Y[i+384 - 110] >> 7;

            if (isRgb)
                j <<= 3;

            ((uint8_t *)table_r)[i] = j;
        }
        for (i = -132; i < 256+132; i++) {
            int j = (table_Y[i+384 - 37] + 43)/85;

            ((uint8_t *)table_g)[i] = j << 1;
        }
        for (i = -232; i < 256+232; i++) {
            int j = table_Y[i+384 - 110] >> 7;

            if (!isRgb)
                j <<= 3;

            ((uint8_t *)table_b)[i] = j;
        }
        break;

    case 1:
        table_start= table_1 = malloc (256*2 * sizeof (uint8_t));
        entry_size = sizeof (uint8_t);
        table_g = table_1;
        table_r = table_b = NULL;

        for (i = 0; i < 256+256; i++) {
            int j = table_Y[i + 384 - 110]>>7;

            ((uint8_t *)table_g)[i] = j;
        }
        break;

    default:
        table_start= NULL;
        MSG_ERR("%ibpp not supported by yuv2rgb\n", bpp);
        //free mem?
        return -1;
    }

    for (i = 0; i < 256; i++) {
        c->table_rV[i] = table_r + entry_size * div_round (crv * (i-128), 76309);
        c->table_gU[i] = table_g + entry_size * div_round (cgu * (i-128), 76309);
        c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
        c->table_bU[i] = table_b + entry_size * div_round (cbu * (i-128), 76309);
    }

    if(c->yuvTable) free(c->yuvTable);
    c->yuvTable= table_start;
    return 0;
}
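/*
 * Minimal model (standalone sketch, all values invented) of the
 * table-driven conversion yuv2rgb_c_init_tables() sets up for the RGB()
 * macros: each per-component table pointer is pre-offset by the chroma
 * contribution for a given (U,V), so the inner loop becomes three
 * clamped loads plus adds per pixel instead of multiplies.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* toy 1-D table: clamped identity with headroom on both sides */
    uint8_t table[256 + 512];
    for (int i = 0; i < 256 + 512; i++) {
        int v = i - 256;
        table[i] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }
    uint8_t *r = table + 256 + 50;   /* +50: hypothetical red offset for one (U,V) */
    uint8_t *b = table + 256 - 30;   /* -30: hypothetical blue offset */

    int Y = 100;
    printf("R=%d B=%d\n", r[Y], b[Y]);   /* 150 and 70, clamping for free */
    return 0;
}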
modules/video_filter/swscale/yuv2rgb_altivec.c deleted 100644 → 0
/*
marc.hoffman@analog.com March 8, 2004
Altivec Acceleration for Color Space Conversion revision 0.2
convert I420 YV12 to RGB in various formats,
it rejects images that are not in 420 formats
it rejects images that don't have widths of multiples of 16
it rejects images that don't have heights of multiples of 2
reject defers to C simulation codes.
lots of optimizations to be done here
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
so we currently use max min to clip
2. the inefficient use of chroma loading needs a bit of brushing up
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
MODIFIED to calculate coeffs from currently selected color space.
MODIFIED core to be a macro which you spec the output format.
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of altivec.
ADDED altivec_yuv2packedX vertical scl + RGB converter
March 27,2004
PERFORMANCE ANALYSIS
The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
720*480*30 ~10MPS
so we have roughly 10clocks per pixel this is too high something has to be wrong.
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
guaranteed to have the input video frame it was just decompressed so
it probably resides in L1 caches. However we are creating the
output video stream this needs to use the DSTST instruction to
optimize for the cache. We couple this with the fact that we are
not going to be visiting the input buffer again so we mark it Least
Recently Used. This shaves 25% of the processor cycles off.
Now MEMCPY is the largest mips consumer in the system, probably due
to the inefficient X11 stuff.
GL libraries seem to be very slow on this machine 1.33Ghz PB running
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
a versioning issues, however i have libGL.1.2.dylib for both
machines. ((We need to figure this out now))
GL2 libraries work now with patch for RGB32
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "mangle.h"
#include "img_format.h" //FIXME try to reduce dependency of such stuff
#undef PROFILE_THE_BEAST
#undef INC_SCALING
typedef unsigned char ubyte;
typedef signed char   sbyte;
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
homogeneous vector registers x0,x1,x2 are interleaved with the
following technique:
o0 = vec_mergeh (x0,x1);
o1 = vec_perm (o0, x2, perm_rgb_0);
o2 = vec_perm (o0, x2, perm_rgb_1);
o3 = vec_mergel (x0,x1);
o4 = vec_perm (o3,o2,perm_rgb_2);
o5 = vec_perm (o3,o2,perm_rgb_3);
perm_rgb_0: o0(RG).h v1(B) --> o1*
0 1 2 3 4
rgbr|gbrg|brgb|rgbr
0010 0100 1001 0010
0102 3145 2673 894A
perm_rgb_1: o0(RG).h v1(B) --> o2
0 1 2 3 4
gbrg|brgb|bbbb|bbbb
0100 1001 1111 1111
B5CD 6EF7 89AB CDEF
perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
0 1 2 3 4
gbrg|brgb|rgbr|gbrg
1111 1111 0010 0100
89AB CDEF 0182 3945
perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
0 1 2 3 4
brgb|rgbr|gbrg|brgb
1001 0010 0100 1001
a67b 89cA BdCD eEFf
*/
static
const vector unsigned char
  perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
                                      0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
  perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
                                      0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
  perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
                                      0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
  perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
                                      0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
#define vec_merge3(x2,x1,x0,y0,y1,y2) \
do { \
typeof(x0) o0,o2,o3; \
o0 = vec_mergeh (x0,x1); \
y0 = vec_perm (o0, x2, perm_rgb_0);\
o2 = vec_perm (o0, x2, perm_rgb_1);\
o3 = vec_mergel (x0,x1); \
y1 = vec_perm (o3,o2,perm_rgb_2); \
y2 = vec_perm (o3,o2,perm_rgb_3); \
} while(0)
#define vec_mstrgb24(x0,x1,x2,ptr) \
do { \
typeof(x0) _0,_1,_2; \
vec_merge3 (x0,x1,x2,_0,_1,_2); \
vec_st (_0, 0, ptr++); \
vec_st (_1, 0, ptr++); \
vec_st (_2, 0, ptr++); \
} while (0);
#define vec_mstbgr24(x0,x1,x2,ptr) \
do { \
typeof(x0) _0,_1,_2; \
vec_merge3 (x2,x1,x0,_0,_1,_2); \
vec_st (_0, 0, ptr++); \
vec_st (_1, 0, ptr++); \
vec_st (_2, 0, ptr++); \
} while (0);
/* pack the pixels in rgb0 format
msb R
lsb 0
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
do { \
T _0,_1,_2,_3; \
_0 = vec_mergeh (x0,x1); \
_1 = vec_mergeh (x2,x3); \
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
vec_st (_2, 0*16, (T *)ptr); \
vec_st (_3, 1*16, (T *)ptr); \
_0 = vec_mergel (x0,x1); \
_1 = vec_mergel (x2,x3); \
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
vec_st (_2, 2*16, (T *)ptr); \
vec_st (_3, 3*16, (T *)ptr); \
ptr += 4; \
} while (0);
/*
| 1 0 1.4021 | | Y |
| 1 -0.3441 -0.7142 |x| Cb|
| 1 1.7718 0 | | Cr|
Y: [-128 127]
Cb/Cr : [-128 127]
typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
*/
#define vec_unh(x) \
(vector signed short) \
vec_perm(x,(typeof(x))(0),\
(vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
#define vec_unl(x) \
(vector signed short) \
vec_perm(x,(typeof(x))(0),\
(vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
#define vec_clip(x) \
vec_max (vec_min (x, (typeof(x))(255)), (typeof(x))(0))
#define vec_packclp_a(x,y) \
(vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
#define vec_packclp(x,y) \
(vector unsigned char)vec_packs \
((vector unsigned short)vec_max (x,(vector signed short) (0)), \
(vector unsigned short)vec_max (y,(vector signed short) (0)))
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
  vector signed short vx,ux,uvx;

  Y = vec_mradds (Y, c->CY, c->OY);
  U = vec_sub (U,(vector signed short)(128));
  V = vec_sub (V,(vector signed short)(128));

  //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
  ux = vec_sl (U, c->CSHIFT);
  *B = vec_mradds (ux, c->CBU, Y);

  // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
  vx = vec_sl (V, c->CSHIFT);
  *R = vec_mradds (vx, c->CRV, Y);

  // uvx = ((CGU*u) + (CGV*v))>>15;
  uvx = vec_mradds (U, c->CGU, Y);
  *G = vec_mradds (V, c->CGV, uvx);
}
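/*
 * Scalar model of the vec_mradds() that cvtyuvtoRGB() above builds on
 * (semantics paraphrased from the AltiVec programming model, so treat
 * this as an approximation): per element,
 *   mradds(a,b,c) = saturate(((a*b + 0x4000) >> 15) + c),
 * i.e. a rounded Q15 multiply followed by a saturating add.
 */
#include <stdio.h>
#include <stdint.h>

static int16_t mradds_scalar(int16_t a, int16_t b, int16_t c)
{
    int32_t t = (((int32_t)a * b + 0x4000) >> 15) + c;
    if (t >  32767) t =  32767;   /* saturate like the vector unit */
    if (t < -32768) t = -32768;
    return (int16_t)t;
}

int main(void)
{
    /* 100 * 0.5 (Q15: 16384) + 10 = 60 */
    printf("%d\n", mradds_scalar(100, 16384, 10));
    return 0;
}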
/*
------------------------------------------------------------------------------
CS converters
------------------------------------------------------------------------------
*/
#define DEFCSP420_CVT(name,out_pixels) \
static int altivec_##name (SwsContext *c, \
unsigned char **in, int *instrides, \
int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i,j; \
int instrides_scl[3]; \
vector unsigned char y0,y1; \
\
vector signed char u,v; \
\
vector signed short Y0,Y1,Y2,Y3; \
vector signed short U,V; \
vector signed short vx,ux,uvx; \
vector signed short vx0,ux0,uvx0; \
vector signed short vx1,ux1,uvx1; \
vector signed short R0,G0,B0; \
vector signed short R1,G1,B1; \
vector unsigned char R,G,B; \
\
vector unsigned char *uivP, *vivP; \
vector unsigned char align_perm; \
\
vector signed short \
lCY = c->CY, \
lOY = c->OY, \
lCRV = c->CRV, \
lCBU = c->CBU, \
lCGU = c->CGU, \
lCGV = c->CGV; \
\
vector unsigned short lCSHIFT = c->CSHIFT; \
\
ubyte *y1i = in[0]; \
ubyte *y2i = in[0]+w; \
ubyte *ui = in[1]; \
ubyte *vi = in[2]; \
\
vector unsigned char *oute \
= (vector unsigned char *) \
(oplanes[0]+srcSliceY*outstrides[0]); \
vector unsigned char *outo \
= (vector unsigned char *) \
(oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
\
\
instrides_scl[0] = instrides[0]; \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */  \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */  \
									\
									\
for (i=0;i<h/2;i++) { \
vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
\
for (j=0;j<w/16;j++) { \
\
y0 = vec_ldl (0,y1i); \
y1 = vec_ldl (0,y2i); \
uivP = (vector unsigned char *)ui; \
vivP = (vector unsigned char *)vi; \
\
align_perm = vec_lvsl (0, ui); \
u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
\
align_perm = vec_lvsl (0, vi); \
v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
\
u = (vector signed char)vec_sub (u, (vector signed char)(128)); \
v = (vector signed char)vec_sub (v, (vector signed char)(128)); \
U = vec_unpackh (u); \
V = vec_unpackh (v); \
\
\
Y0 = vec_unh (y0); \
Y1 = vec_unl (y0); \
Y2 = vec_unh (y1); \
Y3 = vec_unl (y1); \
\
Y0 = vec_mradds (Y0, lCY, lOY); \
Y1 = vec_mradds (Y1, lCY, lOY); \
Y2 = vec_mradds (Y2, lCY, lOY); \
Y3 = vec_mradds (Y3, lCY, lOY); \
\
      /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */                           \
ux = vec_sl (U, lCSHIFT); \
ux = vec_mradds (ux, lCBU, (vector signed short)(0)); \
ux0 = vec_mergeh (ux,ux); \
ux1 = vec_mergel (ux,ux); \
\
      /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */                          \
vx = vec_sl (V, lCSHIFT); \
vx = vec_mradds (vx, lCRV, (vector signed short)(0)); \
vx0 = vec_mergeh (vx,vx); \
vx1 = vec_mergel (vx,vx); \
\
      /* uvx = ((CGU*u) + (CGV*v))>>15 */                               \
uvx = vec_mradds (U, lCGU, (vector signed short)(0)); \
uvx = vec_mradds (V, lCGV, uvx); \
uvx0 = vec_mergeh (uvx,uvx); \
uvx1 = vec_mergel (uvx,uvx); \
\
R0 = vec_add (Y0,vx0); \
G0 = vec_add (Y0,uvx0); \
B0 = vec_add (Y0,ux0); \
R1 = vec_add (Y1,vx1); \
G1 = vec_add (Y1,uvx1); \
B1 = vec_add (Y1,ux1); \
\
R = vec_packclp (R0,R1); \
G = vec_packclp (G0,G1); \
B = vec_packclp (B0,B1); \
\
out_pixels(R,G,B,oute); \
\
R0 = vec_add (Y2,vx0); \
G0 = vec_add (Y2,uvx0); \
B0 = vec_add (Y2,ux0); \
R1 = vec_add (Y3,vx1); \
G1 = vec_add (Y3,uvx1); \
B1 = vec_add (Y3,ux1); \
R = vec_packclp (R0,R1); \
G = vec_packclp (G0,G1); \
B = vec_packclp (B0,B1); \
\
\
out_pixels(R,G,B,outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
\
} \
\
outo += (outstrides[0])>>4; \
oute += (outstrides[0])>>4; \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
#define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
#define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
#define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
DEFCSP420_CVT (yuv2_abgr32, out_abgr)
DEFCSP420_CVT (yuv2_bgra32, out_argb)
DEFCSP420_CVT (yuv2_rgba32, out_rgba)
DEFCSP420_CVT (yuv2_argb32, out_argb)
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
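/* A minimal scalar sketch of what the out_* store helpers above arrange,
   assuming vec_mstrgb32(T, x0, x1, x2, x3, ptr) interleaves its four
   byte-vectors as x0 x1 x2 x3 per pixel (mst32_scalar is an illustrative
   name, not part of this file): with a=R, b=G, c=B, out_abgr therefore
   stores 0,B,G,R per pixel and out_rgba stores R,G,B,0. */
static void mst32_scalar (unsigned char *dst,
                          const unsigned char *x0, const unsigned char *x1,
                          const unsigned char *x2, const unsigned char *x3,
                          int n)
{
  int k;
  for (k = 0; k < n; k++) {
    dst[4*k+0] = x0[k];   /* first argument named in the out_* macro */
    dst[4*k+1] = x1[k];
    dst[4*k+2] = x2[k];
    dst[4*k+3] = x3[k];   /* the (typeof(a))(0) slot is the zero byte */
  }
}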
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static const vector unsigned char
  demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
                                   0x10,0x04,0x10,0x04,
                                   0x10,0x08,0x10,0x08,
                                   0x10,0x0c,0x10,0x0c),
  demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
                                   0x10,0x06,0x10,0x06,
                                   0x10,0x0A,0x10,0x0A,
                                   0x10,0x0E,0x10,0x0E),
  demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
                                   0x10,0x05,0x10,0x07,
                                   0x10,0x09,0x10,0x0B,
                                   0x10,0x0D,0x10,0x0F);
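/* A scalar sketch of what vec_perm with these tables achieves on one
   16-byte uyvy block (illustrative only; demux_uyvy_scalar is not part
   of this file).  Index 0x10 selects a byte from the all-zero second
   operand, so every packed 8-bit sample becomes the low byte of a
   big-endian 16-bit lane, and each chroma sample is duplicated for the
   two luma samples it covers. */
static void demux_uyvy_scalar (const unsigned char uyvy[16],
                               signed short U[8], signed short V[8],
                               signed short Y[8])
{
  int p;
  for (p = 0; p < 4; p++) {        /* four uyvy macro-pixels per block */
    U[2*p]   = uyvy[4*p + 0];      /* demux_u: bytes 0,0,4,4,8,8,c,c   */
    U[2*p+1] = uyvy[4*p + 0];
    V[2*p]   = uyvy[4*p + 2];      /* demux_v: bytes 2,2,6,6,a,a,e,e   */
    V[2*p+1] = uyvy[4*p + 2];
    Y[2*p]   = uyvy[4*p + 1];      /* demux_y: bytes 1,3,5,7,9,b,d,f   */
    Y[2*p+1] = uyvy[4*p + 3];
  }
}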
/*
  this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
                               unsigned char **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
  int w = c->srcW;
  int h = srcSliceH;
  int i,j;
  vector unsigned char uyvy;
  vector signed short Y,U,V;
  vector signed short vx,ux,uvx;
  vector signed short R0,G0,B0,R1,G1,B1;
  vector unsigned char R,G,B;
  vector unsigned char *out;
  ubyte *img;

  img = in[0];
  out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);

  for (i=0;i<h;i++) {
    for (j=0;j<w/16;j++) {
      uyvy = vec_ld (0, img);
      U = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)(0), demux_u);
      V = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)(0), demux_v);
      Y = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)(0), demux_y);

      cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

      uyvy = vec_ld (16, img);
      U = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)(0), demux_u);
      V = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)(0), demux_v);
      Y = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)(0), demux_y);

      cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

      R = vec_packclp (R0,R1);
      G = vec_packclp (G0,G1);
      B = vec_packclp (B0,B1);

      // vec_mstbgr24 (R,G,B, out);
      out_rgba (R,G,B,out);

      img += 32;
    }
  }
  return srcSliceH;
}
/* Ok currently the acceleration routine only supports
   inputs of widths a multiple of 16
   and heights a multiple of 2

   So we just fall back to the C codes for this.
*/
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
{
  if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
    return NULL;

  /*
    and this seems not to matter too much I tried a bunch of
    videos with abnormal widths and mplayer crashes elsewhere.
    mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
    boom with X11 bad match.
  */
  if ((c->srcW & 0xf) != 0) return NULL;

  switch (c->srcFormat) {
  case IMGFMT_YVU9:
  case IMGFMT_IF09:
  case IMGFMT_YV12:
  case IMGFMT_I420:
  case IMGFMT_IYUV:
  case IMGFMT_CLPL:
  case IMGFMT_Y800:
  case IMGFMT_Y8:
  case IMGFMT_NV12:
  case IMGFMT_NV21:
    if ((c->srcH & 0x1) != 0) return NULL;

    switch (c->dstFormat) {
    case IMGFMT_RGB24:
      MSG_WARN("ALTIVEC: Color Space RGB24\n");
      return altivec_yuv2_rgb24;
    case IMGFMT_BGR24:
      MSG_WARN("ALTIVEC: Color Space BGR24\n");
      return altivec_yuv2_bgr24;
    case IMGFMT_RGB32:
      MSG_WARN("ALTIVEC: Color Space ARGB32\n");
      return altivec_yuv2_argb32;
    case IMGFMT_BGR32:
      MSG_WARN("ALTIVEC: Color Space BGRA32\n");
      // return profile_altivec_bgra32;
      return altivec_yuv2_bgra32;
    default: return NULL;
    }
    break;

  case IMGFMT_UYVY:
    switch (c->dstFormat) {
    case IMGFMT_RGB32:
      MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
      return altivec_uyvy_rgb32;
    default: return NULL;
    }
    break;
  }
  return NULL;
}
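/* Illustrative caller sketch (not part of this file): how a SwsFunc
   returned by the probe above would be invoked on a whole frame,
   assuming the generic swscale slice convention of srcSliceY = 0 and
   srcSliceH = srcH.  convert_frame_sketch is a hypothetical name. */
static void convert_frame_sketch (SwsContext *c,
                                  uint8_t *src[], int srcStride[],
                                  uint8_t *dst[], int dstStride[])
{
  SwsFunc f = yuv2rgb_init_altivec (c);       /* NULL when unsupported */
  if (f)
    f (c, src, srcStride, 0, c->srcH, dst, dstStride);
  /* else: fall back to the C code paths, as the comment above says */
}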
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
{
  vector signed short CY, CRV, CBU, CGU, CGV, OY, Y0;

  int64_t crv __attribute__ ((aligned(16))) = inv_table[0];
  int64_t cbu __attribute__ ((aligned(16))) = inv_table[1];
  int64_t cgu __attribute__ ((aligned(16))) = inv_table[2];
  int64_t cgv __attribute__ ((aligned(16))) = inv_table[3];

  int64_t cy = (1 << 16) - 1;
  int64_t oy = 0;

  short tmp __attribute__ ((aligned(16)));

  if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
    return;

  cy  = (cy  * c->contrast) >> 17;
  crv = (crv * c->contrast * c->saturation) >> 32;
  cbu = (cbu * c->contrast * c->saturation) >> 32;
  cgu = (cgu * c->contrast * c->saturation) >> 32;
  cgv = (cgv * c->contrast * c->saturation) >> 32;

  oy -= 256 * c->brightness;

  tmp = cy;          CY  = vec_lde (0, &tmp);  CY  = vec_splat (CY, 0);
  tmp = oy;          OY  = vec_lde (0, &tmp);  OY  = vec_splat (OY, 0);
  tmp = crv >> 3;    CRV = vec_lde (0, &tmp);  CRV = vec_splat (CRV, 0);
  tmp = cbu >> 3;    CBU = vec_lde (0, &tmp);  CBU = vec_splat (CBU, 0);
  tmp = -(cgu >> 1); CGU = vec_lde (0, &tmp);  CGU = vec_splat (CGU, 0);
  tmp = -(cgv >> 1); CGV = vec_lde (0, &tmp);  CGV = vec_splat (CGV, 0);

  c->CSHIFT = (vector unsigned short)(2);
  c->CY  = CY;
  c->OY  = OY;
  c->CRV = CRV;
  c->CBU = CBU;
  c->CGU = CGU;
  c->CGV = CGV;

#if 0
printf ("cy: %hvx\n", CY);
printf ("oy: %hvx\n", OY);
printf ("crv: %hvx\n", CRV);
printf ("cbu: %hvx\n", CBU);
printf ("cgv: %hvx\n", CGV);
printf ("cgu: %hvx\n", CGU);
#endif

  return;
}
void
altivec_yuv2packedX (SwsContext *c,
                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                     uint8_t *dest, int dstW, int dstY)
{
  int i,j;
  short tmp __attribute__ ((aligned(16)));
  short *p;
  short *f;
  vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
  vector signed short R0,G0,B0,R1,G1,B1;

  vector unsigned char R,G,B,pels[3];
  vector unsigned char *out,*nout;

  vector signed short   RND = (vector signed short)(1<<3);
  vector unsigned short SCL = (vector unsigned short)(4);
  unsigned long scratch[16] __attribute__ ((aligned(16)));

  vector signed short *vYCoeffsBank, *vCCoeffsBank;

  vector signed short *YCoeffs, *CCoeffs;

  vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
  vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);

  for (i=0;i<lumFilterSize*dstW;i++) {
    tmp = c->vLumFilter[i];
    p = &vYCoeffsBank[i];
    for (j=0;j<8;j++)
      p[j] = tmp;
  }

  for (i=0;i<chrFilterSize*dstW;i++) {
    tmp = c->vChrFilter[i];
    p = &vCCoeffsBank[i];
    for (j=0;j<8;j++)
      p[j] = tmp;
  }

  YCoeffs = vYCoeffsBank + dstY*lumFilterSize;
  CCoeffs = vCCoeffsBank + dstY*chrFilterSize;

  out = (vector unsigned char *)dest;

  for (i=0; i<dstW; i+=16) {
    Y0 = RND;
    Y1 = RND;
    /* extract 16 coeffs from lumSrc */
    for (j=0; j<lumFilterSize; j++) {
      X0 = vec_ld (0,  &lumSrc[j][i]);
      X1 = vec_ld (16, &lumSrc[j][i]);
      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
    }

    U = RND;
    V = RND;
    /* extract 8 coeffs from U,V */
    for (j=0; j<chrFilterSize; j++) {
      X = vec_ld (0, &chrSrc[j][i/2]);
      U = vec_mradds (X, CCoeffs[j], U);
      X = vec_ld (0, &chrSrc[j][i/2+2048]);
      V = vec_mradds (X, CCoeffs[j], V);
    }

    /* scale and clip signals */
    Y0 = vec_sra (Y0, SCL);
    Y1 = vec_sra (Y1, SCL);
    U  = vec_sra (U,  SCL);
    V  = vec_sra (V,  SCL);

    Y0 = vec_clip (Y0);
    Y1 = vec_clip (Y1);
    U  = vec_clip (U);
    V  = vec_clip (V);

    /* now we have
       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
       U= u0 u1 u2 u3 u4 u5 u6 u7     V= v0 v1 v2 v3 v4 v5 v6 v7

       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
    */

    U0 = vec_mergeh (U,U);
    V0 = vec_mergeh (V,V);

    U1 = vec_mergel (U,U);
    V1 = vec_mergel (V,V);

    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

    R = vec_packclp (R0,R1);
    G = vec_packclp (G0,G1);
    B = vec_packclp (B0,B1);

    out_rgba (R,G,B,out);
  }

  if (i < dstW) {
    i -= 16;

    Y0 = RND;
    Y1 = RND;
    /* extract 16 coeffs from lumSrc */
    for (j=0; j<lumFilterSize; j++) {
      X0 = vec_ld (0,  &lumSrc[j][i]);
      X1 = vec_ld (16, &lumSrc[j][i]);
      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
    }

    U = RND;
    V = RND;
    /* extract 8 coeffs from U,V */
    for (j=0; j<chrFilterSize; j++) {
      X = vec_ld (0, &chrSrc[j][i/2]);
      U = vec_mradds (X, CCoeffs[j], U);
      X = vec_ld (0, &chrSrc[j][i/2+2048]);
      V = vec_mradds (X, CCoeffs[j], V);
    }

    /* scale and clip signals */
    Y0 = vec_sra (Y0, SCL);
    Y1 = vec_sra (Y1, SCL);
    U  = vec_sra (U,  SCL);
    V  = vec_sra (V,  SCL);

    Y0 = vec_clip (Y0);
    Y1 = vec_clip (Y1);
    U  = vec_clip (U);
    V  = vec_clip (V);

    /* now we have
       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
       U= u0 u1 u2 u3 u4 u5 u6 u7     V= v0 v1 v2 v3 v4 v5 v6 v7

       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
    */

    U0 = vec_mergeh (U,U);
    V0 = vec_mergeh (V,V);

    U1 = vec_mergel (U,U);
    V1 = vec_mergel (V,V);

    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

    R = vec_packclp (R0,R1);
    G = vec_packclp (G0,G1);
    B = vec_packclp (B0,B1);

    nout = (vector unsigned char *)scratch;
    out_rgba (R,G,B,nout);

    memcpy (&((uint32_t *)dest)[i], scratch, (dstW-i)/4);
  }

  if (vYCoeffsBank) free (vYCoeffsBank);
  if (vCCoeffsBank) free (vCCoeffsBank);
}
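/* Scalar model of the per-pixel vertical filtering in the loops above
   (illustrative only; vfilter_one_sketch is not part of this file, and
   intermediate saturation is ignored here).  vec_mradds computes a
   rounded saturating ((x*coeff + 0x4000) >> 15) + acc, seeded with the
   RND bias and finished by the vec_sra shift and the vec_clip bound,
   shown here as an int16 clamp as an assumption. */
static short vfilter_one_sketch (int16_t **lumSrc, int16_t *lumFilter,
                                 int lumFilterSize, int i)
{
  int j;
  long acc = 1 << 3;                          /* matches RND            */
  for (j = 0; j < lumFilterSize; j++)
    acc += ((long)lumSrc[j][i] * lumFilter[j] + 0x4000) >> 15;
  acc >>= 4;                                  /* matches vec_sra by SCL */
  if (acc < -32768) acc = -32768;             /* matches the clip step  */
  if (acc >  32767) acc =  32767;
  return (short)acc;
}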
modules/video_filter/swscale/yuv2rgb_mlib.c deleted 100644 → 0
/*
* yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib
*
* Copyright (C) 2000, Håkan Hjort <d95hjort@dtek.chalmers.se>
* All Rights Reserved.
*
* This file is part of mpeg2dec, a free MPEG-2 video decoder
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <mlib_types.h>
#include <mlib_status.h>
#include <mlib_sys.h>
#include <mlib_video.h>
#include <inttypes.h>
#include <stdlib.h>
#include <assert.h>
#include "img_format.h" //FIXME try to reduce dependency of such stuff
#include "swscale.h"
static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[],
                               int srcSliceY, int srcSliceH,
                               uint8_t* dst[], int dstStride[]){
    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    assert(srcStride[1] == srcStride[2]);

    mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2],
                               c->dstW, srcSliceH, dstStride[0],
                               srcStride[0], srcStride[1]);
    return srcSliceH;
}

static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[],
                               int srcSliceY, int srcSliceH,
                               uint8_t* dst[], int dstStride[]){
    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    assert(srcStride[1] == srcStride[2]);

    mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2],
                               c->dstW, srcSliceH, dstStride[0],
                               srcStride[0], srcStride[1]);
    return srcSliceH;
}

static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[],
                              int srcSliceY, int srcSliceH,
                              uint8_t* dst[], int dstStride[]){
    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    assert(srcStride[1] == srcStride[2]);

    mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2],
                              c->dstW, srcSliceH, dstStride[0],
                              srcStride[0], srcStride[1]);
    return srcSliceH;
}

SwsFunc yuv2rgb_init_mlib(SwsContext *c)
{
    switch(c->dstFormat){
    case IMGFMT_RGB24: return mlib_YUV2RGB420_24;
    case IMGFMT_RGB32: return mlib_YUV2ARGB420_32;
    case IMGFMT_BGR32: return mlib_YUV2ABGR420_32;
    default: return NULL;
    }
}
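/* Why the 4:2:2 wrappers above double srcStride[1] and srcStride[2]
   (my reading, shown as a scalar picture; u_row_sketch is an
   illustrative name): a 4:2:0 consumer advances chroma by one stride
   per two output rows, so a doubled 4:2:2 stride makes it read every
   other chroma line of the 4:2:2 planes. */
static const uint8_t *u_row_sketch(const uint8_t *u, int stride422, int y)
{
    return u + (y >> 1) * (2 * stride422);  /* even 4:2:2 chroma rows only */
}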
modules/video_filter/swscale/yuv2rgb_template.c deleted 100644 → 0
/*
* yuv2rgb_mmx.c, Software YUV to RGB coverter with Intel MMX "technology"
*
* Copyright (C) 2000, Silicon Integrated System Corp.
* All Rights Reserved.
*
* Author: Olie Lho <ollie@sis.com.tw>
*
* This file is part of mpeg2dec, a free MPEG-2 video decoder
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
* 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
* MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
* context / deglobalize stuff by Michael Niedermayer
*/
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
#define YUV2RGB \
    /* Do the multiply part of the conversion for even and odd pixels,
       register usage:
       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
       mm6 -> Y even, mm7 -> Y odd */\
    /* convert the chroma part */\
    "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */\
    "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */\
\
    "psllw $3, %%mm0;" /* Promote precision */\
    "psllw $3, %%mm1;" /* Promote precision */\
\
    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */\
    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */\
\
    "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */\
    "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */\
\
    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */\
    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */\
\
    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
\
    "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
\
    /* convert the luma part */\
    "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
\
    "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
\
    "psllw $3, %%mm6;" /* Promote precision */\
    "psllw $3, %%mm7;" /* Promote precision */\
\
    "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
    "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
\
    "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
    "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
\
    /* Do the addition part of the conversion for even and odd pixels,
       register usage:
       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
       mm6 -> Y even, mm7 -> Y odd */\
    "movq %%mm0, %%mm3;" /* Copy Cblue */\
    "movq %%mm1, %%mm4;" /* Copy Cred */\
    "movq %%mm2, %%mm5;" /* Copy Cgreen */\
\
    "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
    "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
\
    "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
    "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
\
    "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
    "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
\
    /* Limit RGB even to 0..255 */\
    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
\
    /* Limit RGB odd to 0..255 */\
    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
\
    /* Interleave RGB even and odd */\
    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */
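/* Scalar model of one even-pixel lane of the YUV2RGB macro above
   (illustrative only; the lowercase offset/coeff parameters stand in
   for the U_OFFSET, Y_COEFF, etc. table slots addressed via %4).
   psllw $3 promotes to three extra fraction bits, psubsw/psubw apply
   the offsets, and pmulhw is a signed (a*b) >> 16. */
static void yuv2rgb_scalar_sketch(int y, int u, int v,
                                  int y_off, int u_off, int v_off,
                                  int y_coeff, int ub_coeff, int vr_coeff,
                                  int ug_coeff, int vg_coeff,
                                  int *r, int *g, int *b)
{
    int cb = (u << 3) - u_off;                     /* psllw + psubsw */
    int cr = (v << 3) - v_off;
    int yl = (((y << 3) - y_off) * y_coeff) >> 16; /* pmulhw         */
    *b = yl + ((cb * ub_coeff) >> 16);
    *r = yl + ((cr * vr_coeff) >> 16);
    *g = yl + ((cb * ug_coeff + cr * vg_coeff) >> 16);
    /* the caller then clamps to 0..255, which is what packuswb does */
}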
static inline int RENAME(yuv420_rgb16) (SwsContext *c, uint8_t* src[], int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t* dst[], int dstStride[]){
    int y, h_size;

    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    h_size= (c->dstW+7)&~7;
    if(h_size*2 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
    //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
    //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
        uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
        int index= -h_size/2;

        b5Dither= dither8[y&1];
        g6Dither= dither4[y&1];
        g5Dither= dither8[y&1];
        r5Dither= dither8[(y+1)&1];
        /* this mmx assembly code deals with a SINGLE scan line at a time,
           it converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
        "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
        "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
        "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        //".balign 16 \n\t"
        "1: \n\t"
        /* no speed difference on my p3@500 with prefetch,
         * if it is faster for anyone with -benchmark then tell me
        PREFETCH" 64(%0) \n\t"
        PREFETCH" 64(%1) \n\t"
        PREFETCH" 64(%2) \n\t"
        */
YUV2RGB

#ifdef DITHER1XBPP
        "paddusb "MANGLE(b5Dither)", %%mm0;"
        "paddusb "MANGLE(g6Dither)", %%mm2;"
        "paddusb "MANGLE(r5Dither)", %%mm1;"
#endif
        /* mask unneeded bits off */
        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
        "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

        "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
        "pxor %%mm4, %%mm4;" /* zero mm4 */

        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
        "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

        /* convert rgb24 plane to rgb16 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

        "addl $16, %1 \n\t"
        "addl $4, %0 \n\t"
        " js 1b \n\t"

        : "+r" (index), "+r" (_image)
        : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
        );
    }

    __asm__ __volatile__ (EMMS);

    return srcSliceH;
}
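/* What the pand/psrlw/psllw/por sequence above computes per pixel, as a
   scalar sketch (pack565_sketch is an illustrative name, not part of
   this file): keep the top 5-6-5 bits of each channel and pack them
   into one 16-bit word. */
static uint16_t pack565_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r & 0xF8) << 8) |  /* mmx_redmask, into bits 11-15 */
                      ((g & 0xFC) << 3) |  /* mmx_grnmask, into bits  5-10 */
                      ( b         >> 3));  /* psrlw $3,    into bits  0-4  */
}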
static inline int RENAME(yuv420_rgb15) (SwsContext *c, uint8_t* src[], int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t* dst[], int dstStride[]){
    int y, h_size;

    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    h_size= (c->dstW+7)&~7;
    if(h_size*2 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
    //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
    //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
        uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
        int index= -h_size/2;

        b5Dither= dither8[y&1];
        g6Dither= dither4[y&1];
        g5Dither= dither8[y&1];
        r5Dither= dither8[(y+1)&1];
        /* this mmx assembly code deals with a SINGLE scan line at a time,
           it converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
        "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
        "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
        "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        //".balign 16 \n\t"
        "1: \n\t"
YUV2RGB

#ifdef DITHER1XBPP
        "paddusb "MANGLE(b5Dither)", %%mm0 \n\t"
        "paddusb "MANGLE(g5Dither)", %%mm2 \n\t"
        "paddusb "MANGLE(r5Dither)", %%mm1 \n\t"
#endif
        /* mask unneeded bits off */
        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

        "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
        "psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
        "pxor %%mm4, %%mm4;" /* zero mm4 */

        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert rgb24 plane to rgb15 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
        "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

        /* convert rgb24 plane to rgb15 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

        "addl $16, %1 \n\t"
        "addl $4, %0 \n\t"
        " js 1b \n\t"

        : "+r" (index), "+r" (_image)
        : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
        );
    }

    __asm__ __volatile__ (EMMS);

    return srcSliceH;
}
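/* The 15 bpp variant of the same packing, matching the extra psrlw $1
   on the red channel above (pack555_sketch is an illustrative name,
   not part of this file). */
static uint16_t pack555_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r & 0xF8) << 7) |  /* bits 10-14 */
                      ((g & 0xF8) << 2) |  /* bits  5-9  */
                      ( b         >> 3));  /* bits  0-4  */
}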
static inline int RENAME(yuv420_rgb24) (SwsContext *c, uint8_t* src[], int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t* dst[], int dstStride[]){
    int y, h_size;

    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    h_size= (c->dstW+7)&~7;
    if(h_size*3 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
        uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
        int index= -h_size/2;

        /* this mmx assembly code deals with a SINGLE scan line at a time,
           it converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
        "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
        "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
        "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        //".balign 16 \n\t"
        "1: \n\t"
YUV2RGB
        /* mm0=B, %%mm2=G, %%mm1=R */
#ifdef HAVE_MMX2
        "movq "MANGLE(M24A)", %%mm4 \n\t"
        "movq "MANGLE(M24C)", %%mm7 \n\t"
        "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */
        "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */
        "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */

        "pand %%mm4, %%mm5 \n\t" /*    B2        B1       B0 */
        "pand %%mm4, %%mm3 \n\t" /*    G2        G1       G0 */
        "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */

        "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */
        "por %%mm5, %%mm6 \n\t"
        "por %%mm3, %%mm6 \n\t"
        MOVNTQ " %%mm6, (%1) \n\t"

        "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
        "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
        "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */
        "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */

        "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5       B4        B3    */
        "pand %%mm7, %%mm3 \n\t" /*       G4        G3       */
        "pand %%mm4, %%mm6 \n\t" /*    R4        R3       R2 */

        "por %%mm5, %%mm3 \n\t" /* B5 G4 B4     G3 B3    */
        "por %%mm3, %%mm6 \n\t"
        MOVNTQ " %%mm6, 8(%1) \n\t"

        "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */
        "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
        "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "pand %%mm7, %%mm5 \n\t" /*       B7        B6       */
        "pand %%mm4, %%mm3 \n\t" /*    G7        G6       G5 */
        "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7       R6        R5    */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        "por %%mm5, %%mm3 \n\t"
        "por %%mm3, %%mm6 \n\t"
        MOVNTQ " %%mm6, 16(%1) \n\t"
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        "pxor %%mm4, %%mm4 \n\t"

#else

        "pxor %%mm4, %%mm4 \n\t"
        "movq %%mm0, %%mm5 \n\t" /* B */
        "movq %%mm1, %%mm6 \n\t" /* R */
        "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
        "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
        "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
        "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
        "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
        "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
        "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
        "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
        "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
        "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */

        "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
        "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
        "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
        "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */

        "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
        "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
        "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
        "psllq $40, %%mm3 \n\t" /* RGB00000 3 */

        "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
        "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
        "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
        "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */

        "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
        "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
        "psllq $40, %%mm0 \n\t" /* GB000000 1 */
        "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
        MOVNTQ " %%mm7, (%1) \n\t"

        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
        "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
        "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
        "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
        MOVNTQ " %%mm6, 8(%1) \n\t"

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

        "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
        "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
        "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
        MOVNTQ " %%mm1, 16(%1) \n\t"

        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
        "pxor %%mm4, %%mm4 \n\t"
#endif

        "addl $24, %1 \n\t"
        "addl $4, %0 \n\t"
        " js 1b \n\t"

        : "+r" (index), "+r" (_image)
        : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
        );
    }

    __asm__ __volatile__ (EMMS);

    return srcSliceH;
}
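/* Both 24 bpp paths above (the pshufw/M24x MMX2 one and the
   punpck/psllq plain-MMX one) emit the same memory layout; a scalar
   sketch, assuming the register assignment mm0=B, mm2=G, mm1=R noted
   above, so the blue sample of each pixel lands first in memory
   (pack24_sketch is an illustrative name, not part of this file). */
static void pack24_sketch(uint8_t *dst, const uint8_t *r,
                          const uint8_t *g, const uint8_t *b)
{
    int k;
    for (k = 0; k < 8; k++) {      /* 8 pixels -> 24 contiguous bytes */
        dst[3*k + 0] = b[k];
        dst[3*k + 1] = g[k];
        dst[3*k + 2] = r[k];
    }
}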
static inline int RENAME(yuv420_rgb32) (SwsContext *c, uint8_t* src[], int srcStride[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t* dst[], int dstStride[]){
    int y, h_size;

    if(c->srcFormat == IMGFMT_422P){
        srcStride[1] *= 2;
        srcStride[2] *= 2;
    }

    h_size= (c->dstW+7)&~7;
    if(h_size*4 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
        uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
        int index= -h_size/2;

        /* this mmx assembly code deals with a SINGLE scan line at a time,
           it converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
        "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
        "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
        "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        //".balign 16 \n\t"
        "1: \n\t"
YUV2RGB
        /* convert RGB plane to RGB packed format,
           mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
           mm4 -> GB, mm5 -> AR pixel 4-7,
           mm6 -> GB, mm7 -> AR pixel 0-3 */
        "pxor %%mm3, %%mm3;" /* zero mm3 */
        "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
        "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */

        "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
        "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */

        "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
        "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */

        "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
        MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */

        "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
        "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */

        "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
        MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */

        "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
        "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */

        "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
        MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */

        "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
        "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */

        "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
        MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */

        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        "pxor %%mm4, %%mm4;" /* zero mm4 */
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

        "addl $32, %1 \n\t"
        "addl $4, %0 \n\t"
        " js 1b \n\t"

        : "+r" (index), "+r" (_image)
        : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
        );
    }

    __asm__ __volatile__ (EMMS);

    return srcSliceH;
}
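/* Scalar sketch of the punpcklbw/punpcklwd ladder above: each pixel
   becomes four bytes B,G,R,0 in memory, i.e. a little-endian
   0x00RRGGBB word with the alpha byte left zero (pack32_sketch is an
   illustrative name, not part of this file). */
static uint32_t pack32_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return ((uint32_t)r << 16) | ((uint32_t)g << 8) | (uint32_t)b;
}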