Commit 3d64b908 authored by Rémi Denis-Courmont

deinterlace: ARM optimizations for 16-bits merge

parent 95eb7971
...@@ -657,11 +657,13 @@ int Open( vlc_object_t *p_this ) ...@@ -657,11 +657,13 @@ int Open( vlc_object_t *p_this )
else else
#endif #endif
#if defined(__arm__) #if defined(__arm__)
if( chroma->pixel_size == 1 && vlc_CPU_ARM_NEON() ) if( vlc_CPU_ARM_NEON() )
p_sys->pf_merge = merge8_arm_neon; p_sys->pf_merge =
(chroma->pixel_size == 1) ? merge8_arm_neon : merge16_arm_neon;
else else
if( chroma->pixel_size == 1 && vlc_CPU_ARMv6() ) if( vlc_CPU_ARMv6() )
p_sys->pf_merge = merge8_armv6; p_sys->pf_merge =
(chroma->pixel_size == 1) ? merge8_armv6 : merge16_armv6;
else else
#endif #endif
{ {
......
...@@ -163,11 +163,13 @@ void Merge16BitSSE2( void *, const void *, const void *, size_t ); ...@@ -163,11 +163,13 @@ void Merge16BitSSE2( void *, const void *, const void *, size_t );
* ARM NEON routine to blend pixels from two picture lines. * ARM NEON routine to blend pixels from two picture lines.
*/ */
void merge8_arm_neon (void *, const void *, const void *, size_t); void merge8_arm_neon (void *, const void *, const void *, size_t);
void merge16_arm_neon (void *, const void *, const void *, size_t);
/** /**
* ARMv6 SIMD routine to blend pixels from two picture lines. * ARMv6 SIMD routine to blend pixels from two picture lines.
*/ */
void merge8_armv6 (void *, const void *, const void *, size_t); void merge8_armv6 (void *, const void *, const void *, size_t);
void merge16_armv6 (void *, const void *, const void *, size_t);
#endif #endif
/***************************************************************************** /*****************************************************************************
......
...@@ -71,6 +71,47 @@ merge8_arm_neon: ...@@ -71,6 +71,47 @@ merge8_arm_neon:
vst1.u8 {q0}, [DEST,:128]! vst1.u8 {q0}, [DEST,:128]!
bx lr bx lr
@ void merge16_arm_neon(void *, const void *, const void *, size_t)
@
@ NEON blend of two lines of unsigned 16-bit samples: every output lane is
@ the halving sum (a + b) >> 1 of the matching lanes of the two sources.
@
@ DEST, SRC1, SRC2 and SIZE are register aliases defined outside this
@ excerpt; presumably they map to the four prototype arguments in order
@ (TODO confirm against the top of the file).
@
@ SIZE is a byte count. Data is consumed in 64-byte blocks, then at most
@ one 32-byte block, then at most one 16-byte block. A remainder below
@ 16 bytes is left untouched.
@ All three pointers are accessed with a :128 alignment hint, so they are
@ expected to be 16-byte aligned.
@ Registers written: q0-q3, q8-q11, the three pointers, SIZE and flags.
	.align 2
	.global merge16_arm_neon
	.type	merge16_arm_neon, %function
merge16_arm_neon:
	cmp		SIZE,	#64
	blo		.Lmerge16_neon_tail32	@ not enough for a full 64-byte block

	@ Main loop: 64 bytes (32 samples) per iteration. Loads, halving adds
	@ and the counter update are interleaved.
.Lmerge16_neon_loop64:
	pld		[SRC1, #64]
	vld1.u16	{q0-q1},	[SRC1,:128]!
	pld		[SRC2, #64]
	vld1.u16	{q8-q9},	[SRC2,:128]!
	vhadd.u16	q0,	q0,	q8
	sub		SIZE,	SIZE,	#64
	vld1.u16	{q2-q3},	[SRC1,:128]!
	vhadd.u16	q1,	q1,	q9
	vld1.u16	{q10-q11},	[SRC2,:128]!
	vhadd.u16	q2,	q2,	q10
	cmp		SIZE,	#64		@ flags consumed by the bhs below
	vhadd.u16	q3,	q3,	q11
	vst1.u16	{q0-q1},	[DEST,:128]!
	vst1.u16	{q2-q3},	[DEST,:128]!
	bhs		.Lmerge16_neon_loop64	@ another full block remains

	@ Optional single 32-byte block.
.Lmerge16_neon_tail32:
	cmp		SIZE,	#32
	blo		.Lmerge16_neon_tail16
	vld1.u16	{q0-q1},	[SRC1,:128]!
	sub		SIZE,	SIZE,	#32
	vld1.u16	{q8-q9},	[SRC2,:128]!
	vhadd.u16	q0,	q0,	q8
	vhadd.u16	q1,	q1,	q9
	vst1.u16	{q0-q1},	[DEST,:128]!

	@ Optional single 16-byte block, then return.
.Lmerge16_neon_tail16:
	cmp		SIZE,	#16
	bxlo		lr			@ under 16 bytes left: done
	vld1.u16	{q0},	[SRC1,:128]!
	sub		SIZE,	SIZE,	#16	@ SIZE is not read again in this routine
	vld1.u16	{q8},	[SRC2,:128]!
	vhadd.u16	q0,	q0,	q8
	vst1.u16	{q0},	[DEST,:128]!
	bx		lr
.align 2 .align 2
.global merge8_armv6 .global merge8_armv6
.type merge8_armv6, %function .type merge8_armv6, %function
...@@ -92,3 +133,25 @@ merge8_armv6: ...@@ -92,3 +133,25 @@ merge8_armv6:
stm DEST!, {r6-r7} stm DEST!, {r6-r7}
popeq {r4-r9,pc} popeq {r4-r9,pc}
b 1b b 1b
@ void merge16_armv6(void *, const void *, const void *, size_t)
@
@ ARMv6 SIMD blend of two lines of unsigned 16-bit samples: every output
@ halfword is the halving sum (a + b) >> 1 of the matching halfwords of
@ the two sources (UHADD16 handles two samples per 32-bit register).
@
@ DEST, SRC1, SRC2 and SIZE are register aliases defined outside this
@ excerpt; presumably they map to the four prototype arguments in order
@ (TODO confirm against the top of the file).
@
@ SIZE is a byte count. Data is processed in 16-byte blocks; a remainder
@ below 16 bytes (including a zero size) is left untouched, as in
@ merge16_arm_neon. Without the entry guard and the compare at the loop
@ end, a size that is zero or not a multiple of 16 would never reach the
@ exit condition and the loop would run past the end of the buffers.
@
@ Registers written: r4-r9 (saved and restored), ip, the three pointers,
@ SIZE and flags. lr is used as scratch inside the loop; the return
@ address is taken from the stack by the final pop into pc.
	.align 2
	.global merge16_armv6
	.type	merge16_armv6, %function
merge16_armv6:
	cmp		SIZE,	#16
	bxlo		lr			@ no complete block: nothing to do
	push		{r4-r9,lr}
1:
	pld		[SRC1, #64]
	ldm		SRC1!,	{r4-r5}
	pld		[SRC2, #64]
	ldm		SRC2!,	{r8-r9}
	sub		SIZE,	SIZE,	#16	@ one block consumed
	uhadd16		r4,	r4,	r8
	ldm		SRC1!,	{r6-r7}
	uhadd16		r5,	r5,	r9
	ldm		SRC2!,	{ip,lr}
	uhadd16		r6,	r6,	ip
	stm		DEST!,	{r4-r5}
	uhadd16		r7,	r7,	lr
	cmp		SIZE,	#16		@ flags survive the stm below
	stm		DEST!,	{r6-r7}
	bhs		1b			@ at least one more full block
	pop		{r4-r9,pc}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment