Commit 56bbc336 authored by Rémi Denis-Courmont

NEON converter: unroll fi32->s16n conversion

The main loop now burns 3 cycles per 8 values, if I computed correctly.
This is not quite the main bottleneck, but it is pretty much always used
(since we output 16-bit PCM).
parent a2671feb
SOURCES_converter_fixed = fixed.c
SOURCES_converter_float = float.c
SOURCES_converter_neon = neon.c
SOURCES_converter_neon = \
neon_s32_s16.S \
neon.c
SOURCES_a52tospdif = a52tospdif.c
SOURCES_a52tofloat32 = a52tofloat32.c
SOURCES_dtstospdif = dtstospdif.c
......
......@@ -28,6 +28,8 @@
#include <vlc_filter.h>
#include <vlc_cpu.h>
#include <assert.h>
static int Open (vlc_object_t *);
vlc_module_begin ()
......@@ -129,63 +131,32 @@ static block_t *Do_F32_S32 (filter_t *filter, block_t *inbuf)
return inbuf;
}
/* Conversion routines implemented in assembly (neon_s32_s16.S). */
/* Scalar variant: input on a 32-bits boundary, output on a 16-bits one. */
void s32_s16_neon_unaligned (int16_t *out, const int32_t *in, unsigned nb);
/* NEON variant: input and output must both be on a 128-bits boundary. */
void s32_s16_neon (int16_t *out, const int32_t *in, unsigned nb);
/**
 * Signed 32-bits fixed point to signed 16-bits integer
 *
 * The samples are converted in place: the 16-bits output is written into
 * the buffer that holds the 32-bits input. Samples located before the first
 * 128-bits boundary of the input are converted one at a time; the remainder
 * goes through the NEON routine, which needs both pointers on a 128-bits
 * boundary. To achieve that, the output is shifted forward by two bytes per
 * head sample, and inbuf->p_buffer is advanced accordingly.
 *
 * @param filter conversion filter (only the input channel count is read)
 * @param inbuf block of 32-bits samples, converted in place
 * @return inbuf, with i_buffer halved
 */
static block_t *Do_S32_S16 (filter_t *filter, block_t *inbuf)
{
    const int32_t *in = (int32_t *)inbuf->p_buffer;
    int16_t *out = (int16_t *)in;
    unsigned total = inbuf->i_nb_samples
                   * aout_FormatNbChannels (&filter->fmt_in.audio);

    /* Number of samples before the first 128-bits boundary of the input.
     * Never more than the block holds, or the remainder would underflow. */
    unsigned head = ((-(uintptr_t)in) & 12) >> 2;
    if (head > total)
        head = total;

    out += head; /* fix up misalignment */
    inbuf->p_buffer += 2 * head;

    /* out[i] starts 2 * (head - i) bytes past the start of in[i], so a
     * forward pass would overwrite input samples that are still unread.
     * Going backward, every store lands on bytes already consumed. */
    for (unsigned i = head; i-- > 0; )
        s32_s16_neon_unaligned (out + i, in + i, 1);
    in += head;
    out += head;

    total -= head;
    if (total > 0)
    {
        assert (!(((uintptr_t)in) & 15));
        assert (!(((uintptr_t)out) & 15));
        s32_s16_neon (out, in, total);
    }

    inbuf->i_buffer /= 2;
    return inbuf;
}
@*****************************************************************************
@ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion
@*****************************************************************************
@ Copyright (C) 2009 Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation; either version 2 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
@ Allow NEON instructions and emit what follows into the code section.
.fpu neon
.text
@ Register aliases. Both entry points are declared in C as
@ (int16_t *out, const int32_t *in, unsigned nb), so under the ARM
@ procedure call standard the arguments arrive in r0, r1 and r2.
@ Comments are kept on their own lines so that they do not become part
@ of the macro expansions.
@ Destination pointer (16-bit samples), advanced as samples are stored
#define OUT r0
@ Source pointer (32-bit samples), advanced as samples are loaded
#define IN r1
@ Number of samples left to convert
#define N r2
@ Scratch register holding one sample in the scalar loop
#define BUF r3
@ Rounding constant (4096) used by the scalar loop
#define HALF ip
.align
.global s32_s16_neon
.type s32_s16_neon, %function
@ Converts fixed-point 32-bits to signed 16-bits
@ Input and output must be on 128-bits boundary
@
@ Each sample is shifted right by 13 bits with rounding and saturated to
@ 16 bits (vqrshrn). Blocks of 8 samples go through the unrolled loop, one
@ block of 4 is handled after it, and the last 0-3 samples are left to
@ s32_s16_neon_unaligned, which this routine falls through into.
s32_s16_neon:
@ Prefetch hint for the start of the input
pld [IN]
2:
@ Fewer than 8 samples: let the scalar routine convert all of them
cmp N, #8
blt s32_s16_neon_unaligned
@ Preload the first 8 samples for the loop
vld1.s32 {q8-q9}, [IN,:128]!
3: @ Main loop
@ Stage 1: narrow the 8 samples held in q8-q9 into d16-d17
pld [IN, #64]
sub N, #8
vqrshrn.s32 d16, q8, #13
vqrshrn.s32 d17, q9, #13
cmp N, #8
blt 4f
@ Stage 2: load 8 more samples and narrow them into d18-d19
@ (d18-d19 alias q9, whose contents were consumed in stage 1)
vld1.s32 {q10-q11}, [IN,:128]!
sub N, #8
vqrshrn.s32 d18, q10, #13
vqrshrn.s32 d19, q11, #13
cmp N, #8
blt 5f
@ Stage 3: load 8 more samples and narrow them into d20-d21
@ (d20-d21 alias q10, consumed in stage 2), then store the 16 results
@ produced by stages 1 and 2
vld1.s32 {q12-q13}, [IN,:128]!
sub N, #8
vqrshrn.s32 d20, q12, #13
vqrshrn.s32 d21, q13, #13
vst1.s16 {d16-d19}, [OUT,:128]!
cmp N, #8
blt 6f
@ At least 8 samples remain: issue the next load before storing the
@ stage 3 results, then iterate
vld1.s32 {q8-q9}, [IN,:128]!
vst1.s16 {d20-d21}, [OUT,:128]!
b 3b
@ Loop exits: store the results that are still held in registers
4:
@ Left after stage 1: 8 results in d16-d17
vst1.s16 {d16-d17}, [OUT,:128]!
b 7f
5:
@ Left after stage 2: 16 results in d16-d19
vst1.s16 {d16-d19}, [OUT,:128]!
b 7f
6:
@ Left after stage 3: 8 results in d20-d21
vst1.s16 {d20-d21}, [OUT,:128]!
7:
@ Fewer than 8 samples remain; convert 4 at once if possible
cmp N, #4
blt s32_s16_neon_unaligned
vld1.s32 {q8}, [IN,:128]!
sub N, #4
vqrshrn.s32 d16, q8, #13
@ Only 8 bytes are written here, hence the 64-bit alignment qualifier
vst1.s16 {d16}, [OUT,:64]!
@ Fall through for last 0-3 samples
.global s32_s16_neon_unaligned
.type s32_s16_neon_unaligned, %function
@ Converts fixed-point 32-bits to signed 16-bits
@ Input must be on 32-bits boundary, output on 16-bits
@
@ Scalar loop: one sample per iteration, returns once the count reaches 0.
s32_s16_neon_unaligned:
@ 4096 is half of the 1 << 13 divisor; adding it rounds to nearest
mov HALF, #4096
1:
@ Return to the caller when no samples remain
cmp N, #0
bxeq lr
ldr BUF, [IN]
add IN, #4
add OUT, #2
@ Saturating add of the rounding constant
qadd BUF, HALF, BUF
sub N, #1
@ Shift right by 13 and saturate to the signed 16-bit range
ssat BUF, #16, BUF, asr #13
@ The output pointer was already advanced, hence the -2 offset
strh BUF, [OUT, #-2]
b 1b
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment