Commit 0e770b17 authored by Rémi Denis-Courmont

i420->YUYV NEON: rewrite using VZIP

This is more than twice as fast. Thanks to Måns Rullgård for the hint.
parent af4dd740
@**************************************************************************** @*****************************************************************************
@ i420_yuyv_neon.S : ARM NEONv1 I420 to YUYV chroma conversion @ i420_yuyv_neon.S : ARM NEONv1 I420 to YUYV chroma conversion
@***************************************************************************** @*****************************************************************************
@ Copyright (C) 2009 Rémi Denis-Courmont @ Copyright (C) 2009 Rémi Denis-Courmont
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
@ GNU General Public License for more details. @ GNU General Public License for more details.
@ @
@ You should have received a copy of the GNU General Public License @ You should have received a copy of the GNU General Public License
@ along with this program; if not, write to the Free Software @ along with this program; if not, write to the Free Software Foundation,
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/ @****************************************************************************/
.fpu neon .fpu neon
...@@ -32,41 +32,26 @@ ...@@ -32,41 +32,26 @@
#define END_O1 r12 #define END_O1 r12
.align .align
.global i420_uyvy_neon
.type i420_uyvy_neon, %function
i420_uyvy_neon:
adr r12, indexes+64
b i420_pack_neon
.global i420_yuyv_neon .global i420_yuyv_neon
.type i420_yuyv_neon, %function .type i420_yuyv_neon, %function
i420_yuyv_neon: i420_yuyv_neon:
adr r12, indexes
.hidden i420_pack_neon
i420_pack_neon:
push {r4-r7, lr} push {r4-r7, lr}
vld1.u8 {d24-d27}, [r12]!
ldmia r1, {Y1, U, V} ldmia r1, {Y1, U, V}
vld1.u8 {d28-d31}, [r12]
add O2, O1, PITCH, lsl #1 add O2, O1, PITCH, lsl #1
add Y2, Y1, PITCH add Y2, Y1, PITCH
1: 1:
mov END_O1, O2 mov END_O1, O2
2: 2:
vld1.u8 {d0-d1}, [Y1,:128]!
vld1.u8 {d2}, [U,:64]! vld1.u8 {d2}, [U,:64]!
vld1.u8 {d3}, [V,:64]! vld1.u8 {d3}, [V,:64]!
vld1.u8 {d4-d5}, [Y2,:128]! vzip.u8 d2, d3
vtbl.u8 d16, {d0-d3}, d24 vld1.u8 {q0}, [Y1,:128]!
vtbl.u8 d17, {d0-d3}, d25 vmov q3, q1
vtbl.u8 d18, {d0-d3}, d26 vzip.u8 q0, q1
vtbl.u8 d19, {d0-d3}, d27 vld1.u8 {q2}, [Y2,:128]!
vtbl.u8 d20, {d2-d5}, d28 vzip.u8 q2, q3
vtbl.u8 d21, {d2-d5}, d29 vst1.u8 {q0-q1}, [O1,:128]!
vtbl.u8 d22, {d2-d5}, d30 vst1.u8 {q2-q3}, [O2,:128]!
vtbl.u8 d23, {d2-d5}, d31
vst1.u8 {d16-d19}, [O1,:128]!
vst1.u8 {d20-d23}, [O2,:128]!
cmp O1, END_O1 cmp O1, END_O1
bne 2b bne 2b
...@@ -82,25 +67,37 @@ i420_pack_neon: ...@@ -82,25 +67,37 @@ i420_pack_neon:
pop {r4-r7, pc} pop {r4-r7, pc}
.hidden indexes .global i420_uyvy_neon
indexes: .type i420_uyvy_neon, %function
@ YUYV1 i420_uyvy_neon:
.byte 0x00, 0x10, 0x01, 0x18, 0x02, 0x11, 0x03, 0x19 push {r4-r7, lr}
.byte 0x04, 0x12, 0x05, 0x1A, 0x06, 0x13, 0x07, 0x1B ldmia r1, {Y1, U, V}
.byte 0x08, 0x14, 0x09, 0x1C, 0x0A, 0x15, 0x0B, 0x1D add O2, O1, PITCH, lsl #1
.byte 0x0C, 0x16, 0x0D, 0x1E, 0x0E, 0x17, 0x0F, 0x1F add Y2, Y1, PITCH
@ YUYV2 1:
.byte 0x10, 0x00, 0x11, 0x08, 0x12, 0x01, 0x13, 0x09 mov END_O1, O2
.byte 0x14, 0x02, 0x15, 0x0A, 0x16, 0x03, 0x17, 0x0B 2:
.byte 0x18, 0x04, 0x19, 0x0C, 0x1A, 0x05, 0x1B, 0x0D vld1.u8 {d0}, [U,:64]!
.byte 0x1C, 0x06, 0x1D, 0x0E, 0x1E, 0x07, 0x1F, 0x0F vld1.u8 {d1}, [V,:64]!
@ UYVY1 vzip.u8 d0, d1
.byte 0x10, 0x00, 0x18, 0x01, 0x11, 0x02, 0x19, 0x03 vld1.u8 {q1}, [Y1,:128]!
.byte 0x12, 0x04, 0x1A, 0x05, 0x13, 0x06, 0x1B, 0x07 vmov q2, q0
.byte 0x14, 0x08, 0x1C, 0x09, 0x15, 0x0A, 0x1D, 0x0B vzip.u8 q0, q1
.byte 0x16, 0x0C, 0x1E, 0x0D, 0x17, 0x0E, 0x1F, 0x0F vld1.u8 {q3}, [Y2,:128]!
@ UYVY2 vzip.u8 q2, q3
.byte 0x00, 0x10, 0x08, 0x11, 0x01, 0x12, 0x09, 0x13 vst1.u8 {q0-q1}, [O1,:128]!
.byte 0x02, 0x14, 0x0A, 0x15, 0x03, 0x16, 0x0B, 0x17 vst1.u8 {q2-q3}, [O2,:128]!
.byte 0x04, 0x18, 0x0C, 0x19, 0x05, 0x1A, 0x0D, 0x1B
.byte 0x06, 0x1C, 0x0E, 0x1D, 0x07, 0x1E, 0x0F, 0x1F cmp O1, END_O1
bne 2b
sub HEIGHT, #2
mov O1, O2
add O2, PITCH, lsl #1
mov Y1, Y2
add Y2, PITCH
cmp HEIGHT, #0
bne 1b
pop {r4-r7, pc}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment