NEON converter: unroll fi32->s16n conversion

The main loop now takes 3 cycles per 8 values, if I computed correctly. This is not quite the main bottleneck, but it is almost always used (since we output 16-bit PCM).

NEON converter: unroll fi32->s16n conversion
The main loop now takes 3 cycles per 8 values, if I computed correctly. This is not quite the main bottleneck, but it is almost always used (since we output 16-bit PCM).
56bbc336 · Rémi Denis-Courmont · a2671feb · 56bbc336 · 56bbc336 · 56bbc336
Commit 56bbc336 authored Sep 28, 2009 by Rémi Denis-Courmont
3 changed files
--- a/modules/audio_filter/converter/Modules.am
+++ b/modules/audio_filter/converter/Modules.am
 SOURCES_converter_fixed = fixed.c
 SOURCES_converter_float = float.c
-SOURCES_converter_neon = neon.c
+SOURCES_converter_neon = \
+	neon_s32_s16.S \
+	neon.c
 SOURCES_a52tospdif = a52tospdif.c
 SOURCES_a52tofloat32 = a52tofloat32.c
 SOURCES_dtstospdif = dtstospdif.c

--- a/modules/audio_filter/converter/neon.c
+++ b/modules/audio_filter/converter/neon.c
@@ -28,6 +28,8 @@
 #include <vlc_filter.h>
 #include <vlc_cpu.h>

+#include <assert.h>
+
 static int Open (vlc_object_t *);

 vlc_module_begin ()
@@ -129,63 +131,32 @@ static block_t *Do_F32_S32 (filter_t *filter, block_t *inbuf)
    return inbuf;
 }

+/* Conversion routines implemented in neon_s32_s16.S. Both convert nb samples
+ * from in to out; s32_s16_neon additionally requires in and out to be on a
+ * 128-bits boundary. */
+void s32_s16_neon_unaligned (int16_t *out, const int32_t *in, unsigned nb);
+void s32_s16_neon (int16_t *out, const int32_t *in, unsigned nb);
+
 /**
 * Signed 32-bits fixed point to signed 16-bits integer
+ *
+ * The block is converted in place. The converted samples start 2 bytes
+ * further into the buffer for every input sample located before the first
+ * 128-bits boundary; inbuf->p_buffer is advanced accordingly and
+ * inbuf->i_buffer is halved. The same block is returned.
 */
 static block_t *Do_S32_S16 (filter_t *filter, block_t *inbuf)
 {
-    unsigned nb_samples = inbuf->i_nb_samples
-                     * aout_FormatNbChannels (&filter->fmt_in.audio);
-    int32_t *inp = (int32_t *)inbuf->p_buffer;
-    const int32_t *endp = inp + nb_samples;
-    int16_t *outp = (int16_t *)inp;
-
-    while (nb_samples & 3)
-    {
-        const int16_t roundup = 1 << 12;
-        asm volatile (
-            "qadd r0, %[inv], %[roundup]\n"
-            "ssat %[outv], #16, r0, asr #13\n"
-            : [outv] "=r" (*outp)
-            : [inv] "r" (*inp), [roundup] "r" (roundup)
-            : "r0");
-        inp++;
-        outp++;
-        nb_samples--;
-    }
+    const int32_t *in = (int32_t *)inbuf->p_buffer;
+    int16_t *out = (int16_t *)in;
+    unsigned total = inbuf->i_nb_samples
+                   * aout_FormatNbChannels (&filter->fmt_in.audio);
+    unsigned nb;

-    if (nb_samples & 4)
-        asm volatile (
-            "vld1.s32 {q0}, [%[inp]]!\n"
-            "vrshrn.i32 d0, q0, #13\n"
-            "vst1.s16 {d0}, [%[outp]]!\n"
-            : [outp] "+r" (outp), [inp] "+r" (inp)
-            :
-            : "q0", "memory");
+    /* Number of samples (0-3) before the first 128-bits boundary */
+    nb = ((-(uintptr_t)in) & 12) >> 2;
+    if (nb > total)
+        nb = total; /* short block: never read past the last sample */
+    out += nb; /* fix up misalignment */
+    inbuf->p_buffer += 2 * nb;

-    if (nb_samples & 8)
-        asm volatile (
-            "vld1.s32 {q0-q1}, [%[inp]]!\n"
-            "vrshrn.i32 d0, q0, #13\n"
-            "vrshrn.i32 d1, q1, #13\n"
-            "vst1.s16 {q0}, [%[outp]]!\n"
-            : [outp] "+r" (outp), [inp] "+r" (inp)
-            :
-            : "q0", "q1", "memory");
+    /* Here the output lies 2 * nb bytes ahead of the input, so a forward
+     * pass would overwrite input samples that have not been read yet.
+     * Convert the leading samples last to first instead. */
+    for (unsigned i = nb; i > 0; i--)
+        s32_s16_neon_unaligned (out + i - 1, in + i - 1, 1);
+    in += nb;
+    out += nb;

-    while (inp != endp)
-        asm volatile (
-            "vld1.s32 {q0-q1}, [%[inp]]!\n"
-            "vld1.s32 {q2-q3}, [%[inp]]!\n"
-            "vrshrn.s32 d0, q0, #13\n"
-            "vrshrn.s32 d1, q1, #13\n"
-            "vrshrn.s32 d2, q2, #13\n"
-            "vrshrn.s32 d3, q3, #13\n"
-            "vst1.s16 {q0-q1}, [%[outp]]!\n"
-            : [outp] "+r" (outp), [inp] "+r" (inp)
-            :
-            : "q0", "q1", "q2", "q3", "memory");
+    /* Remaining samples; in and out now point to the same aligned address */
+    nb = total - nb;
+    if (nb > 0)
+    {
+        assert (!(((uintptr_t)in) & 15));
+        assert (!(((uintptr_t)out) & 15));

+        s32_s16_neon (out, in, nb);
+    }
    inbuf->i_buffer /= 2;
    return inbuf;
 }
--- a/modules/audio_filter/converter/neon_s32_s16.S
+++ b/modules/audio_filter/converter/neon_s32_s16.S
+ @*****************************************************************************
+ @ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+	.fpu neon
+	.text
+
+	@ Register aliases used by both routines below.
+	@ OUT, IN and N are presumably the three C arguments (out, in, nb),
+	@ passed in r0-r2 by the ARM calling convention -- TODO confirm.
+	@ BUF holds the sample being converted and HALF the rounding constant
+	@ in the one-sample-at-a-time loop.
+#define	OUT	r0
+#define	IN	r1
+#define	N	r2
+#define	BUF	r3
+#define HALF	ip
+
+	.align
+	.global s32_s16_neon
+	.type	s32_s16_neon, %function
+	@ Converts fixed-point 32-bits to signed 16-bits
+	@ Input and output must be on 128-bits boundary
+	@ Reads N samples at IN and writes the converted samples at OUT.
+	@ Groups of 8 samples, then at most one group of 4, are converted
+	@ with NEON; whatever remains is converted one sample at a time by
+	@ s32_s16_neon_unaligned, which this routine branches or falls into.
+s32_s16_neon:
+	pld		[IN]
+2:
+	cmp		N,	#8
+	@ Fewer than 8 samples: convert all of them one at a time
+	blt		s32_s16_neon_unaligned
+	vld1.s32	{q8-q9},	[IN,:128]!
+
+3:	@ Main loop
+	@ Handles up to three groups of 8 samples per iteration. Each pair
+	@ of vqrshrn narrows 8 samples to 16 bits (rounding, saturating
+	@ shift right by 13 bits). The stores are issued after later loads
+	@ and conversions rather than right after each conversion.
+	pld		[IN, #64]
+	sub		N,	#8
+	vqrshrn.s32	d16,	q8,	#13
+	vqrshrn.s32	d17,	q9,	#13
+	cmp		N,	#8
+	blt		4f
+	vld1.s32	{q10-q11},	[IN,:128]!
+	sub		N,	#8
+	vqrshrn.s32	d18,	q10,	#13
+	vqrshrn.s32	d19,	q11,	#13
+	cmp		N,	#8
+	blt		5f
+	vld1.s32	{q12-q13},	[IN,:128]!
+	sub		N,	#8
+	vqrshrn.s32	d20,	q12,	#13
+	vqrshrn.s32	d21,	q13,	#13
+	@ Store the first two converted groups (16 samples)
+	vst1.s16	{d16-d19},	[OUT,:128]!
+	cmp		N,	#8
+	blt		6f
+	@ Load the first group of the next iteration, then store the third
+	@ converted group of this one
+	vld1.s32	{q8-q9},	[IN,:128]!
+	vst1.s16	{d20-d21},	[OUT,:128]!
+	b		3b
+	@ Loop exits: store the converted samples still held in registers
+4:
+	vst1.s16	{d16-d17},	[OUT,:128]!
+	b		7f
+5:
+	vst1.s16	{d16-d19},	[OUT,:128]!
+	b		7f
+6:
+	vst1.s16	{d20-d21},	[OUT,:128]!
+7:
+	@ Fewer than 8 samples are left: convert 4 of them with NEON when
+	@ at least 4 remain
+	cmp		N,	#4
+	blt		s32_s16_neon_unaligned
+	vld1.s32	{q8},		[IN,:128]!
+	sub		N,	#4
+	vqrshrn.s32	d16,	q8,	#13
+	vst1.s16	{d16},		[OUT,:64]!
+
+	@ Fall through for last 0-3 samples
+
+	.global	s32_s16_neon_unaligned
+	.type	s32_s16_neon_unaligned, %function
+	@ Converts fixed-point 32-bits to signed 16-bits
+	@ Input must be on 32-bits boundary, output on 16-bits
+	@ Converts N samples from IN to OUT, one sample per iteration, going
+	@ from the first sample to the last. Returns at once when N is zero.
+s32_s16_neon_unaligned:
+	@ 4096 = 1 << 12, half of the 1 << 13 step removed by the shift:
+	@ adding it before shifting rounds to nearest
+	mov		HALF,	#4096
+1:
+	cmp		N,	#0
+	@ Return to the caller once every sample is converted
+	bxeq		lr
+
+	ldr		BUF,	[IN]
+	add		IN,	#4
+	add		OUT,	#2
+	@ Saturating add of the rounding constant
+	qadd		BUF,	HALF,	BUF
+	sub		N,	#1
+	@ Arithmetic shift right by 13 bits, then saturate to signed 16 bits
+	ssat		BUF,	#16,	BUF, asr #13
+	@ OUT was already advanced above, hence the -2 offset
+	strh		BUF,	[OUT, #-2]
+	b		1b