YUV->RGB convertor in NEON

Signed-off-by: Rafaël Carré <funman@videolan.org> (cherry picked from commit 6983b2f386678ebc9554e334b0d5622f301c0c35) Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>

YUV->RGB convertor in NEON
Signed-off-by: Rafaël Carré <funman@videolan.org> (cherry picked from commit 6983b2f386678ebc9554e334b0d5622f301c0c35) Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>
617106df · Sébastien Toque · Jean-Baptiste Kempf · 4ee436f7 · 617106df · 617106df
Commit 617106df authored Oct 04, 2011 by Sébastien Toque Committed by Jean-Baptiste Kempf Jan 27, 2012
6 changed files
--- a/modules/arm_neon/Modules.am
+++ b/modules/arm_neon/Modules.am
@@ -18,7 +18,17 @@ libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
 libchroma_yuv_neon_plugin_la_LIBADD = $(AM_LIBADD)
 libchroma_yuv_neon_plugin_la_DEPENDENCIES =

+libyuv_rgb_neon_plugin_la_SOURCES = \
+	i420_rgb.S \
+	nv21_rgb.S \
+	nv12_rgb.S \
+	yuv_rgb.c
+libyuv_rgb_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
+libyuv_rgb_neon_plugin_la_LIBADD = $(AM_LIBADD)
+libyuv_rgb_neon_plugin_la_DEPENDENCIES =
+
 libvlc_LTLIBRARIES += \
 	libaudio_format_neon_plugin.la \
 	libchroma_yuv_neon_plugin.la \
+	libyuv_rgb_neon_plugin.la \
 	$(NULL)
--- a/modules/arm_neon/chroma_neon.h
+++ b/modules/arm_neon/chroma_neon.h
@@ -64,3 +64,15 @@ void yuyv_i422_neon (struct yuv_planes *const out,
 /* UYVY to I422 conversion. */
 void uyvy_i422_neon (struct yuv_planes *const out,
                     const struct yuv_pack *const in, int width, int height);
+
+/* I420 to RGBA conversion. */
+void i420_rgb_neon (struct yuv_pack *const out,
+                    const struct yuv_planes *const in, int width, int height);
+
+/* NV21 to RGBA conversion. */
+void nv21_rgb_neon (struct yuv_pack *const out,
+                    const struct yuv_planes *const in, int width, int height);
+
+/* NV12 to RGBA conversion. */
+void nv12_rgb_neon (struct yuv_pack *const out,
+                    const struct yuv_planes *const in, int width, int height);
--- a/modules/arm_neon/i420_rgb.S
+++ b/modules/arm_neon/i420_rgb.S
+ @*****************************************************************************
+ @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+#define u	D24
+#define v	D25
+#define y1	D28
+#define y2	D29
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define red		Q9
+#define green	Q10
+#define blue	Q11
+#define lumi	Q15
+
+#define red1	D24
+#define green1	D25
+#define blue1	D26
+#define alpha1	D27
+#define red2	D28
+#define green2	D29
+#define blue2	D30
+#define alpha2	D31
+
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+	.align
+	.global i420_rgb_neon
+	.type	i420_rgb_neon, %function
+i420_rgb_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+	vmov.u8		alpha1, #255
+
+	/* init padding */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #2
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movgts	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	vld1.u8	{u}, [U,:64]!
+	vld1.u8	{v}, [V,:64]!
+
+	vmull.u8	chro_r, v, coefRV
+	vmull.u8	chro_g, u, coefGU
+	vmlal.u8	chro_g, v, coefGV
+	vmull.u8	chro_b, u, coefBU
+
+	vadd.s16	chro_r, Rc, chro_r
+	vsub.s16	chro_g, Gc, chro_g
+	vadd.s16	chro_b, Bc, chro_b
+
+	PLD	[U]
+	PLD	[V]
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	PLD	[Y1]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	PLD	[Y2]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD,	lsr #1
+	add		V,	V,	YPAD,	lsr #1
+	b		loop_row
--- a/modules/arm_neon/nv12_rgb.S
+++ b/modules/arm_neon/nv12_rgb.S
+ @*****************************************************************************
+ @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+#define u	D24
+#define v	D25
+#define y1	D28
+#define y2	D29
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define red		Q9
+#define green	Q10
+#define blue	Q11
+#define lumi	Q15
+
+#define red1	D24
+#define green1	D25
+#define blue1	D26
+#define alpha1	D27
+#define red2	D28
+#define green2	D29
+#define blue2	D30
+#define alpha2	D31
+
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+	.align
+	.global nv12_rgb_neon
+	.type	nv12_rgb_neon, %function
+nv12_rgb_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+	vmov.u8		alpha1, #255
+
+	/* init padding */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #2
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movgts	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	vld2.u8	{u,v}, [U,:128]!
+
+	vmull.u8	chro_r, v, coefRV
+	vmull.u8	chro_g, u, coefGU
+	vmlal.u8	chro_g, v, coefGV
+	vmull.u8	chro_b, u, coefBU
+
+	vadd.s16	chro_r, Rc, chro_r
+	vsub.s16	chro_g, Gc, chro_g
+	vadd.s16	chro_b, Bc, chro_b
+
+	PLD	[U]
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	PLD	[Y1]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	PLD	[Y2]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD
+	b		loop_row
--- a/modules/arm_neon/nv21_rgb.S
+++ b/modules/arm_neon/nv21_rgb.S
+ @*****************************************************************************
+ @ nv21_rgb.S : ARM NEONv1 NV21 to RGB chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.fpu neon
+	.text
+
+/* ARM */
+#define O1	r0
+#define O2	r1
+#define WIDTH	r2
+#define HEIGHT	r3
+#define Y1	r4
+#define Y2	r5
+#define U	r6
+#define V	r7
+#define YPITCH	r8
+#define OPAD	r10
+#define YPAD	r11
+#define COUNT	ip
+#define OPITCH	lr
+
+/* NEON */
+#define coefY	D0
+#define coefRV	D1
+#define coefGU	D2
+#define coefGV	D3
+#define coefBU	D4
+#define Rc	Q3
+#define Gc	Q4
+#define Bc	Q5
+
+#define u	D24
+#define v	D25
+#define y1	D28
+#define y2	D29
+
+#define chro_r	Q6
+#define chro_g	Q7
+#define chro_b	Q8
+#define red		Q9
+#define green	Q10
+#define blue	Q11
+#define lumi	Q15
+
+#define red1	D24
+#define green1	D25
+#define blue1	D26
+#define alpha1	D27
+#define red2	D28
+#define green2	D29
+#define blue2	D30
+#define alpha2	D31
+
+coefficients:
+    .short  -15872
+    .short    4992
+    .short  -18432
+
+	.align
+	.global nv21_rgb_neon
+	.type	nv21_rgb_neon, %function
+nv21_rgb_neon:
+	push		{r4-r8,r10-r11,lr}
+	vpush		{q4-q7}
+
+	/* load arguments */
+	ldmia		r0,	{O1, OPITCH}
+	ldmia		r1,	{Y1, U, V, YPITCH}
+
+	/* round the width to be a multiple of 16 */
+	ands		OPAD, WIDTH, #15
+	sub			WIDTH, WIDTH, OPAD
+	addne		WIDTH, WIDTH, #16
+
+	/* init constants (scale value by 64) */
+	vmov.u8		coefY, #74
+	vmov.u8		coefRV, #115
+	vmov.u8		coefGU, #14
+	vmov.u8		coefGV, #34
+	vmov.u8		coefBU, #135
+	adr			OPAD, coefficients
+	vld1.s16	{d6[], d7[]}, [OPAD]!
+	vld1.s16	{d8[], d9[]}, [OPAD]!
+	vld1.s16	{d10[], d11[]}, [OPAD]!
+	vmov.u8		alpha1, #255
+
+	/* init padding */
+	cmp			HEIGHT,	#0
+	sub			OPAD,	OPITCH,	WIDTH, lsl #2
+	sub			YPAD,	YPITCH,	WIDTH
+
+loop_row:
+	movgts	COUNT,	WIDTH
+	add		O2,	O1,	OPITCH
+	add		Y2,	Y1,	YPITCH
+	/* exit if all rows have been processed */
+	vpople	{q4-q7}
+	pople	{r4-r8,r10-r11,pc}
+
+loop_col:
+
+	/* Common U & V */
+
+	vld2.u8	{u,v}, [U,:128]!
+
+	vmull.u8	chro_r, u, coefRV
+	vmull.u8	chro_g, v, coefGU
+	vmlal.u8	chro_g, u, coefGV
+	vmull.u8	chro_b, v, coefBU
+
+	vadd.s16	chro_r, Rc, chro_r
+	vsub.s16	chro_g, Gc, chro_g
+	vadd.s16	chro_b, Bc, chro_b
+
+	PLD	[U]
+
+	/* Y Top Row */
+	vld2.u8	{y1,y2}, [Y1,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	PLD	[Y1]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!
+
+	/* Y Bottom Row */
+	vld2.u8	{y1,y2}, [Y2,:128]!
+
+	/* y1 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y1, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red1, red, #6
+	vqrshrun.s16	green1, green, #6
+	vqrshrun.s16	blue1, blue, #6
+
+	/* y2 : chrominance + luminance, then clamp (divide by 64) */
+	vmull.u8	lumi, y2, coefY
+	vqadd.s16	red, lumi, chro_r
+	vqadd.s16	green, lumi, chro_g
+	vqadd.s16	blue, lumi, chro_b
+	vqrshrun.s16	red2, red, #6
+	vqrshrun.s16	green2, green, #6
+	vqrshrun.s16	blue2, blue, #6
+
+	PLD	[Y2]
+
+	vmov.u8	alpha2, #255
+	vzip.u8	red1, red2
+	vzip.u8	green1, green2
+	vzip.u8	blue1, blue2
+
+	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
+	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!
+
+	/* next columns (x16) */
+	subs	COUNT,	COUNT,	#16
+	bgt		loop_col
+
+	/* next rows (x2) */
+	subs	HEIGHT,	#2
+	add		O1,	O2,	OPAD
+	add		Y1,	Y2,	YPAD
+	add		U,	U,	YPAD
+	b		loop_row
--- a/modules/arm_neon/yuv_rgb.c
+++ b/modules/arm_neon/yuv_rgb.c
+/*****************************************************************************
+ * yuv_rgb.c : ARM NEONv1 YUV to RGB32 chroma conversion for VLC
+ *****************************************************************************
+ * Copyright (C) 2011 Sébastien Toque
+ *                    Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_plugin.h>
+#include <vlc_filter.h>
+#include <vlc_cpu.h>
+#include "chroma_neon.h"
+
+static int Open (vlc_object_t *);
+
+vlc_module_begin ()
+    set_description (N_("ARM NEON video chroma YUV->RGBA"))
+    set_capability ("video filter2", 250)
+    set_callbacks (Open, NULL)
+vlc_module_end ()
+
+/*
+static int CoefY[256];
+static int CoefRV[256];
+static int CoefGU[256];
+static int CoefGV[256];
+static int CoefBU[256];
+
+// C reference version of the converter
+static void I420_RGBA_C (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    const uint8_t *const out = dst->p->p_pixels;
+    const size_t width = src->p[Y_PLANE].i_visible_pitch;
+    const size_t height = src->p[Y_PLANE].i_visible_lines;
+
+    const int ypitch = src->p[Y_PLANE].i_pitch;
+    const int uvpitch = src->p[U_PLANE].i_pitch;
+    const int dpitch = dst->p->i_pitch / dst->p->i_pixel_pitch;
+
+    for (size_t j = 0; j <  height; ++j)
+    {
+        const int y = j * ypitch;
+        const int u = (j>>1) * uvpitch;
+        const int d = j * dpitch;
+
+        for (size_t i = 0; i < width; ++i)
+        {
+            uint8_t Y = src->Y_PIXELS[y + i];
+            uint8_t U = src->U_PIXELS[u + (i>>1)];
+            uint8_t V = src->V_PIXELS[u + (i>>1)];
+
+            //coef = float * Precision + .5 (Precision=32768)
+            int R = CoefY[Y] + CoefRV[V];
+            int G = CoefY[Y] + CoefGU[U] + CoefGV[V];
+            int B = CoefY[Y] + CoefBU[U];
+
+            //rgb = (rgb+Precision/2) / Precision (Precision=32768)
+            R = R >> 15;
+            G = G >> 15;
+            B = B >> 15;
+
+            if (unlikely(R < 0)) R = 0;
+            if (unlikely(G < 0)) G = 0;
+            if (unlikely(B < 0)) B = 0;
+            if (unlikely(R > 255)) R = 255;
+            if (unlikely(G > 255)) G = 255;
+            if (unlikely(B > 255)) B = 255;
+
+            ((uint32_t*)out)[d + i] = R | (G<<8) | (B<<16) | (0xff<<24);
+        }
+    }
+}*/
+
+static void I420_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
+    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH };
+    i420_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);
+}
+static void YV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
+    struct yuv_planes in = { src->Y_PIXELS, src->V_PIXELS, src->U_PIXELS, src->Y_PITCH };
+    i420_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);
+}
+
+static void NV21_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
+    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH };
+    nv21_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);
+}
+
+static void NV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
+{
+    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
+    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH };
+    nv12_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);
+}
+
+VIDEO_FILTER_WRAPPER (I420_RGBA)
+VIDEO_FILTER_WRAPPER (YV12_RGBA)
+VIDEO_FILTER_WRAPPER (NV21_RGBA)
+VIDEO_FILTER_WRAPPER (NV12_RGBA)
+
+static int Open (vlc_object_t *obj)
+{
+    filter_t *filter = (filter_t *)obj;
+
+    if (((filter->fmt_in.video.i_width | filter->fmt_in.video.i_height) & 1)
+     || (filter->fmt_in.video.i_width != filter->fmt_out.video.i_width)
+     || (filter->fmt_in.video.i_height != filter->fmt_out.video.i_height))
+        return VLC_EGENERIC;
+
+    switch (filter->fmt_out.video.i_chroma)
+    {
+        case VLC_CODEC_RGB32:
+            switch (filter->fmt_in.video.i_chroma)
+            {
+                case VLC_CODEC_I420:
+                    filter->pf_video_filter = I420_RGBA_Filter;
+                    break;
+                case VLC_CODEC_YV12:
+                    filter->pf_video_filter = YV12_RGBA_Filter;
+                    break;
+                case VLC_CODEC_NV21:
+                    filter->pf_video_filter = NV21_RGBA_Filter;
+                    break;
+                case VLC_CODEC_NV12:
+                    filter->pf_video_filter = NV12_RGBA_Filter;
+                    break;
+                default:
+                    return VLC_EGENERIC;
+            }
+            break;
+
+        default:
+            return VLC_EGENERIC;
+    }
+
+    //precompute some values for the C version
+    /*const int coefY  = (int)(1.164 * 32768 + 0.5);
+    const int coefRV = (int)(1.793 * 32768 + 0.5);
+    const int coefGU = (int)(0.213 * 32768 + 0.5);
+    const int coefGV = (int)(0.533 * 32768 + 0.5);
+    const int coefBU = (int)(2.113 * 32768 + 0.5);
+    for (int i=0; i<256; ++i)
+    {
+        CoefY[i] = coefY * (i-16) + 16384;
+        CoefRV[i] = coefRV*(i-128);
+        CoefGU[i] = -coefGU*(i-128);
+        CoefGV[i] = -coefGV*(i-128);
+        CoefBU[i] = coefBU*(i-128);
+    }*/
+
+    msg_Dbg(filter, "%4.4s(%dx%d) to %4.4s(%dx%d)",
+            (char*)&filter->fmt_in.video.i_chroma, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height,
+            (char*)&filter->fmt_out.video.i_chroma, filter->fmt_out.video.i_width, filter->fmt_out.video.i_height);
+
+    return VLC_SUCCESS;
+}