Commit 95eb7971 authored by Rémi Denis-Courmont

deinterlace: rewrite ARM optimizations for 8-bits merge

 - Assembler code out of line
 - ARM NEON run-time detection
 - Better choice of registers
 - Prefetching
 - ARMv6 SIMD optimizations where Advanced SIMD not available

Scheduling is not completely optimal.
parent f6c77658
......@@ -28,6 +28,9 @@ libdeinterlace_plugin_la_SOURCES = \
deinterlace/yadif.h deinterlace/yadif_template.h \
deinterlace/algo_phosphor.c deinterlace/algo_phosphor.h \
deinterlace/algo_ivtc.c deinterlace/algo_ivtc.h
if HAVE_NEON
libdeinterlace_plugin_la_SOURCES += deinterlace/merge_arm.S
endif
libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
libdeinterlace_plugin_la_LIBADD = $(AM_LIBADD)
libdeinterlace_plugin_la_DEPENDENCIES =
......
......@@ -656,9 +656,12 @@ int Open( vlc_object_t *p_this )
}
else
#endif
#if defined __ARM_NEON__ // FIXME: runtime detect support
#if defined(__arm__)
if( chroma->pixel_size == 1 && vlc_CPU_ARM_NEON() )
p_sys->pf_merge = MergeNEON;
p_sys->pf_merge = merge8_arm_neon;
else
if( chroma->pixel_size == 1 && vlc_CPU_ARMv6() )
p_sys->pf_merge = merge8_armv6;
else
#endif
{
......
......@@ -7,7 +7,6 @@
* Author: Sam Hocevar <sam@zoy.org> (generic C routine)
* Sigmund Augdal Helberg <sigmunau@videolan.org> (MMXEXT, 3DNow, SSE2)
* Eric Petit <eric.petit@lapsus.org> (Altivec)
* Rémi Denis-Courmont <remi@remlab.net> (ARM NEON)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -243,66 +242,6 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
}
#endif
#ifdef __ARM_NEON__
/**
 * ARM NEON routine to blend pixels from two picture lines.
 *
 * Each output byte is the truncating unsigned average of the matching input
 * bytes, as computed by the vhadd.u8 instruction.
 *
 * The leading bytes needed to bring the output pointer to a 16-byte boundary
 * and the trailing bytes that do not fill a whole 64-byte block are delegated
 * to Merge8BitGeneric() (defined elsewhere in this file).
 *
 * @param out Target line
 * @param in1 Source line A
 * @param in2 Source line B
 * @param n   Number of bytes to merge
 */
void MergeNEON (void *restrict out, const void *in1,
                const void *in2, size_t n)
{
    uint8_t *outp = out;
    const uint8_t *in1p = in1;
    const uint8_t *in2p = in2;

    /* Number of bytes before the output reaches a 16-byte boundary; the
     * vector stores below carry a :128 alignment hint. */
    size_t mis = __MIN((16 - ((uintptr_t)outp & 15)) & 15, n);
    if (mis)
    {
        Merge8BitGeneric (outp, in1p, in2p, mis);
        outp += mis;
        in1p += mis;
        in2p += mis;
        n -= mis;
    }

    /* One asm iteration consumes and produces 64 bytes, so the vector loop
     * must stop at a 64-byte granularity. Bounding it with (n & ~15) let the
     * last iteration run up to 48 bytes past the end of all three buffers. */
    uint8_t *end = outp + (n & ~(size_t)63);

    if ((((uintptr_t)in1p)|((uintptr_t)in2p)) & 15)
        /* At least one input is not 16-byte aligned: loads without hint */
        while (outp < end)
            asm volatile (
                "vld1.u8 {q0-q1}, [%[in1]]!\n"
                "vld1.u8 {q2-q3}, [%[in2]]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8 {q6-q7}, [%[in1]]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8 {q8-q9}, [%[in2]]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8 {q4-q5}, [%[out],:128]!\n"
                "vst1.u8 {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");
    else
        /* Both inputs are 16-byte aligned: every load carries the hint */
        while (outp < end)
            asm volatile (
                "vld1.u8 {q0-q1}, [%[in1],:128]!\n"
                "vld1.u8 {q2-q3}, [%[in2],:128]!\n"
                "vhadd.u8 q4, q0, q2\n"
                "vld1.u8 {q6-q7}, [%[in1],:128]!\n"
                "vhadd.u8 q5, q1, q3\n"
                "vld1.u8 {q8-q9}, [%[in2],:128]!\n"
                "vhadd.u8 q10, q6, q8\n"
                "vhadd.u8 q11, q7, q9\n"
                "vst1.u8 {q4-q5}, [%[out],:128]!\n"
                "vst1.u8 {q10-q11}, [%[out],:128]!\n"
                : [out] "+r" (outp), [in1] "+r" (in1p), [in2] "+r" (in2p)
                :
                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
                  "q8", "q9", "q10", "q11", "memory");

    /* Up to 63 bytes remain after the 64-byte blocks */
    n &= 63;
    if (n)
        Merge8BitGeneric (outp, in1p, in2p, n);
}
#endif
/*****************************************************************************
* EndMerge routines
*****************************************************************************/
......
......@@ -158,16 +158,16 @@ void Merge8BitSSE2( void *, const void *, const void *, size_t );
void Merge16BitSSE2( void *, const void *, const void *, size_t );
#endif
#if defined __ARM_NEON__
#ifdef __arm__
/**
* ARM NEON routine to blend pixels from two picture lines.
*
* @param _p_dest Target
* @param _p_s1 Source line A
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
void MergeNEON (void *, const void *, const void *, size_t);
void merge8_arm_neon (void *, const void *, const void *, size_t);
/**
* ARMv6 SIMD routine to blend pixels from two picture lines.
*/
void merge8_armv6 (void *, const void *, const void *, size_t);
#endif
/*****************************************************************************
......
@*****************************************************************************
@ merge_arm.S : ARM NEON and ARMv6 SIMD 8-bit merge for the deinterlace filter
@*****************************************************************************
@ Copyright (C) 2009-2012 Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify
@ it under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
.syntax unified
.arm
.arch armv6
.fpu neon
.text
#define DEST r0
#define SRC1 r1
#define SRC2 r2
#define SIZE r3
@ void merge8_arm_neon(void *dest, const void *src1, const void *src2,
@                      size_t size)
@
@ Stores to dest the per-byte unsigned halving sum (vhadd.u8, i.e. the
@ truncating average) of the bytes read from src1 and src2.
@
@ In:    r0 = dest, r1 = src1, r2 = src2, r3 = size in bytes
@ Uses:  r0-r3, q0-q3, q8-q11 and the condition flags; the stack is not used.
@ Every load and store carries a :128 alignment hint, so all three pointers
@ have to be 16-byte aligned.
@ Only whole 16-byte chunks are processed: a trailing (size % 16) bytes are
@ neither read nor written.
.align 2
.global merge8_arm_neon
.type merge8_arm_neon, %function
@ NOTE: Offsets and pitches must be multiples of 16 bytes in VLC.
merge8_arm_neon:
cmp SIZE, #64
blo 2f @ fewer than 64 bytes: skip the main loop
1: @ main loop, 64 bytes per iteration
pld [SRC1, #64] @ prefetch the next 64-byte block of each source
vld1.u8 {q0-q1}, [SRC1,:128]!
pld [SRC2, #64]
vld1.u8 {q8-q9}, [SRC2,:128]!
vhadd.u8 q0, q0, q8
sub SIZE, SIZE, #64
vld1.u8 {q2-q3}, [SRC1,:128]!
vhadd.u8 q1, q1, q9
vld1.u8 {q10-q11}, [SRC2,:128]!
vhadd.u8 q2, q2, q10
cmp SIZE, #64 @ flags consumed by the bhs below; nothing in between changes them
vhadd.u8 q3, q3, q11
vst1.u8 {q0-q1}, [DEST,:128]!
vst1.u8 {q2-q3}, [DEST,:128]!
bhs 1b @ at least 64 bytes left: iterate again
2: @ fewer than 64 bytes left: handle one 32-byte chunk if present
cmp SIZE, #32
blo 3f
vld1.u8 {q0-q1}, [SRC1,:128]!
sub SIZE, SIZE, #32
vld1.u8 {q8-q9}, [SRC2,:128]!
vhadd.u8 q0, q0, q8
vhadd.u8 q1, q1, q9
vst1.u8 {q0-q1}, [DEST,:128]!
3: @ fewer than 32 bytes left: handle one 16-byte chunk if present
cmp SIZE, #16
bxlo lr @ fewer than 16 bytes left: return, remainder is not processed
vld1.u8 {q0}, [SRC1,:128]!
sub SIZE, SIZE, #16 @ result is not read again before returning
vld1.u8 {q8}, [SRC2,:128]!
vhadd.u8 q0, q0, q8
vst1.u8 {q0}, [DEST,:128]!
bx lr
@ void merge8_armv6(void *dest, const void *src1, const void *src2,
@                   size_t size)
@
@ ARMv6 SIMD fallback: stores to dest the per-byte unsigned halving sum
@ (uhadd8, i.e. the truncating average) of the bytes read from src1 and src2,
@ 16 bytes per loop iteration.
@
@ In:    r0 = dest, r1 = src1, r2 = src2, r3 = size in bytes
@ Uses:  r0-r3, ip and the condition flags; r4-r9 and lr are saved on the
@        stack and restored on return.
@ Memory is accessed with ldm/stm, so the pointers must be word-aligned.
@ Only whole 16-byte chunks are processed: a size below 16 returns at once
@ and a trailing (size % 16) bytes are neither read nor written. The loop
@ used to terminate only when the counter hit exactly zero, so a size of 0
@ or one that is not a multiple of 16 ran far beyond the buffers.
.align 2
.global merge8_armv6
.type merge8_armv6, %function
merge8_armv6:
subs SIZE, SIZE, #16 @ SIZE = bytes left once the first chunk is done
bxlo lr @ borrow: fewer than 16 bytes, nothing to do
push {r4-r9,lr}
1:
pld [SRC1, #64]
ldm SRC1!, {r4-r5}
pld [SRC2, #64]
ldm SRC2!, {r8-r9}
subs SIZE, SIZE, #16 @ carry stays set while another whole chunk remains
uhadd8 r4, r4, r8
ldm SRC1!, {r6-r7}
uhadd8 r5, r5, r9
ldm SRC2!, {ip,lr}
uhadd8 r6, r6, ip
stm DEST!, {r4-r5}
uhadd8 r7, r7, lr
stm DEST!, {r6-r7}
bhs 1b @ flags still come from the subs above: ldm, stm and uhadd8 leave them alone
pop {r4-r9,pc}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment