deinterlace: rewrite ARM optimizations for 8-bits merge

- Assembler code out of line
- ARM NEON run-time detection
- Better choice of registers
- Prefetching
- ARMv6 SIMD optimizations where Advanced SIMD is not available

Scheduling is not completely optimal.

deinterlace: rewrite ARM optimizations for 8-bits merge
- Assembler code out of line
- ARM NEON run-time detection
- Better choice of registers
- Prefetching
- ARMv6 SIMD optimizations where Advanced SIMD is not available

Scheduling is not completely optimal.
95eb7971 · Rémi Denis-Courmont · f6c77658 · 95eb7971 · 95eb7971 · 95eb7971
Commit 95eb7971 authored Aug 03, 2012 by Rémi Denis-Courmont
5 changed files
--- a/modules/video_filter/Modules.am
+++ b/modules/video_filter/Modules.am
@@ -28,6 +28,9 @@ libdeinterlace_plugin_la_SOURCES = \
 	deinterlace/yadif.h deinterlace/yadif_template.h \
 	deinterlace/algo_phosphor.c deinterlace/algo_phosphor.h \
 	deinterlace/algo_ivtc.c deinterlace/algo_ivtc.h
+if HAVE_NEON
+libdeinterlace_plugin_la_SOURCES += deinterlace/merge_arm.S
+endif
 libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
 libdeinterlace_plugin_la_LIBADD = $(AM_LIBADD)
 libdeinterlace_plugin_la_DEPENDENCIES =

--- a/modules/video_filter/deinterlace/deinterlace.c
+++ b/modules/video_filter/deinterlace/deinterlace.c
@@ -656,9 +656,12 @@ int Open( vlc_object_t *p_this )
    }
    else
 #endif
-#if defined __ARM_NEON__ // FIXME: runtime detect support
+#if defined(__arm__)
    if( chroma->pixel_size == 1 && vlc_CPU_ARM_NEON() )
-        p_sys->pf_merge = MergeNEON;
+        p_sys->pf_merge = merge8_arm_neon;
+    else
+    if( chroma->pixel_size == 1 && vlc_CPU_ARMv6() )
+        p_sys->pf_merge = merge8_armv6;
    else
 #endif
    {

--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -7,7 +7,6 @@
 * Author: Sam Hocevar <sam@zoy.org>                      (generic C routine)
 *         Sigmund Augdal Helberg <sigmunau@videolan.org> (MMXEXT, 3DNow, SSE2)
 *         Eric Petit <eric.petit@lapsus.org>             (Altivec)
- *         Rémi Denis-Courmont <remi@remlab.net>          (ARM NEON)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -243,66 +242,6 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
 }
 #endif
-#ifdef __ARM_NEON__
-void MergeNEON (void *restrict out, const void *in1,
-                const void *in2, size_t n)
-{
-    uint8_t *outp = out;
-    const uint8_t *in1p = in1;
-    const uint8_t *in2p = in2;
-    size_t mis = __MIN((16 - ((uintptr_t)outp & 15)) & 15, n);
-    if (mis)
-    {
-        Merge8BitGeneric (outp, in1p, in2p, mis);
-        outp += mis;
-        in1p += mis;
-        in2p += mis;
-        n -= mis;
-    }
-    uint8_t *end = outp + (n & ~15);
-    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
-        while (outp < end)
-            asm volatile (
-                "vld1.u8  {q0-q1}, [%[in1]]!\n"
-                "vld1.u8  {q2-q3}, [%[in2]]!\n"
-                "vhadd.u8 q4, q0, q2\n"
-                "vld1.u8  {q6-q7}, [%[in1]]!\n"
-                "vhadd.u8 q5, q1, q3\n"
-                "vld1.u8  {q8-q9}, [%[in2]]!\n"
-                "vhadd.u8 q10, q6, q8\n"
-                "vhadd.u8 q11, q7, q9\n"
-                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
-                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
-                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-                  "q8", "q9", "q10", "q11", "memory");
-    else
-         while (outp < end)
-            asm volatile (
-                "vld1.u8  {q0-q1}, [%[in1],:128]!\n"
-                "vld1.u8  {q2-q3}, [%[in2],:128]!\n"
-                "vhadd.u8 q4, q0, q2\n"
-                "vld1.u8  {q6-q7}, [%[in1],:128]!\n"
-                "vhadd.u8 q5, q1, q3\n"
-                "vld1.u8  {q8-q9}, [%[in2],:128]!\n"
-                "vhadd.u8 q10, q6, q8\n"
-                "vhadd.u8 q11, q7, q9\n"
-                "vst1.u8  {q4-q5}, [%[out],:128]!\n"
-                "vst1.u8  {q10-q11}, [%[out],:128]!\n"
-                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-                  "q8", "q9", "q10", "q11", "memory");
-    n &= 15;
-    if (n)
-        Merge8BitGeneric (outp, in1p, in2p, n);
-}
-#endif
 /*****************************************************************************
 * EndMerge routines
 *****************************************************************************/

--- a/modules/video_filter/deinterlace/merge.h
+++ b/modules/video_filter/deinterlace/merge.h
@@ -158,16 +158,16 @@ void Merge8BitSSE2( void *, const void *, const void *, size_t );
 void Merge16BitSSE2( void *, const void *, const void *, size_t );
 #endif
-#if defined __ARM_NEON__
+#ifdef __arm__
 /**
 * ARM NEON routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
 */
-void MergeNEON (void *, const void *, const void *, size_t);
+void merge8_arm_neon (void *, const void *, const void *, size_t);
+/**
+ * ARMv6 SIMD routine to blend pixels from two picture lines.
+ */
+void merge8_armv6 (void *, const void *, const void *, size_t);
 #endif
 /*****************************************************************************

--- a/modules/video_filter/deinterlace/merge_arm.S
+++ b/modules/video_filter/deinterlace/merge_arm.S
+ @*****************************************************************************
+ @ merge_arm.S : ARM NEON and ARMv6 SIMD 8-bit pixel line merge
+ @*****************************************************************************
+ @ Copyright (C) 2009-2012 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+	.syntax	unified
+	.arm
+	.arch	armv6
+	.fpu	neon
+	.text
+#define	DEST	r0
+#define	SRC1	r1
+#define	SRC2	r2
+#define	SIZE	r3
+	.align 2
+	.global merge8_arm_neon
+	.type	merge8_arm_neon, %function
+	@ void merge8_arm_neon(void *dest, const void *src1,
+	@                      const void *src2, size_t size)
+	@
+	@ Stores to DEST the per-byte unsigned halving sum (VHADD.U8) of the
+	@ bytes read from SRC1 and SRC2.
+	@
+	@ In:    DEST, SRC1, SRC2, SIZE (see the defines above)
+	@ Clobb: q0-q3, q8-q11, flags; all four argument registers are advanced
+	@        or decremented. q4-q7 are not touched.
+	@
+	@ Every load and store carries the :128 qualifier, so the three
+	@ pointers have to be 16-byte aligned. The data is consumed in steps of
+	@ 64, then 32, then 16 bytes; a tail shorter than 16 bytes is left
+	@ unprocessed.
+	@ NOTE: Offset and pitch must be multiple of 16-bytes in VLC.
+merge8_arm_neon:
+	cmp	SIZE, #64
+	blo	.Lneon_tail32			@ fewer than 64 bytes: skip main loop
+
+	@ Main loop: 64 bytes per iteration, SIZE >= 64 on entry.
+.Lneon_loop64:
+	pld	[SRC1, #64]			@ prefetch hint, 64 bytes ahead
+	vld1.u8	{q0-q1}, [SRC1,:128]!
+	pld	[SRC2, #64]
+	vld1.u8	{q8-q9}, [SRC2,:128]!
+	vhadd.u8	q0, q0, q8
+	sub	SIZE, SIZE, #64
+	vld1.u8	{q2-q3}, [SRC1,:128]!
+	vhadd.u8	q1, q1, q9
+	vld1.u8	{q10-q11}, [SRC2,:128]!
+	vhadd.u8	q2, q2, q10
+	cmp	SIZE, #64			@ flags are consumed by bhs below
+	vhadd.u8	q3, q3, q11
+	vst1.u8	{q0-q1}, [DEST,:128]!
+	vst1.u8	{q2-q3}, [DEST,:128]!
+	bhs	.Lneon_loop64			@ at least 64 bytes remain
+
+	@ At most one 32-byte step.
+.Lneon_tail32:
+	cmp	SIZE, #32
+	blo	.Lneon_tail16
+	vld1.u8	{q0-q1}, [SRC1,:128]!
+	sub	SIZE, SIZE, #32
+	vld1.u8	{q8-q9}, [SRC2,:128]!
+	vhadd.u8	q0, q0, q8
+	vhadd.u8	q1, q1, q9
+	vst1.u8	{q0-q1}, [DEST,:128]!
+
+	@ At most one 16-byte step, then return.
+.Lneon_tail16:
+	cmp	SIZE, #16
+	bxlo	lr				@ fewer than 16 bytes left: done
+	vld1.u8	{q0}, [SRC1,:128]!
+	sub	SIZE, SIZE, #16
+	vld1.u8	{q8}, [SRC2,:128]!
+	vhadd.u8	q0, q0, q8
+	vst1.u8	{q0}, [DEST,:128]!
+	bx	lr
+	.align 2
+	.global merge8_armv6
+	.type	merge8_armv6, %function
+	@ void merge8_armv6(void *dest, const void *src1,
+	@                   const void *src2, size_t size)
+	@
+	@ Stores to DEST the per-byte unsigned halving sum (UHADD8) of the bytes
+	@ read from SRC1 and SRC2, 16 bytes per iteration.
+	@
+	@ In:    DEST, SRC1, SRC2, SIZE (see the defines above)
+	@ Clobb: ip, flags; all four argument registers are advanced or
+	@        decremented. r4-r9 and lr are used as scratch and are saved
+	@        and restored on the stack.
+	@
+	@ Only whole 16-byte groups are processed: a SIZE below 16 returns at
+	@ once, and a tail shorter than 16 bytes is left untouched, as in
+	@ merge8_arm_neon. The previous loop only stopped when the counter hit
+	@ exactly zero, so a SIZE of 0 or a SIZE that was not a multiple of 16
+	@ ran past the end of all three buffers.
+merge8_armv6:
+	cmp		SIZE,	#16
+	bxlo		lr		@ nothing to do, no registers to restore
+	push		{r4-r9,lr}
+1:
+	@ Invariant: SIZE >= 16 here.
+	pld		[SRC1, #64]
+	ldm		SRC1!,	{r4-r5}
+	pld		[SRC2, #64]
+	ldm		SRC2!,	{r8-r9}
+	sub		SIZE,	SIZE,	#16
+	uhadd8		r4,	r4,	r8
+	ldm		SRC1!,	{r6-r7}
+	uhadd8		r5,	r5,	r9
+	ldm		SRC2!,	{ip,lr}
+	uhadd8		r6,	r6,	ip
+	stm		DEST!,	{r4-r5}
+	@ UHADD8 and STM leave the condition flags alone, so the result of
+	@ this compare is still valid at the branch below.
+	cmp		SIZE,	#16
+	uhadd8		r7,	r7,	lr
+	stm		DEST!,	{r6-r7}
+	bhs		1b		@ another whole 16-byte group remains
+	pop		{r4-r9,pc}