Commit 29d7063d authored by Rémi Denis-Courmont's avatar Rémi Denis-Courmont

Clean up NEON chroma converter

 - do not assume output pitch equals (double) pixel width
 - improve function prototypes
 - handle zero-width or zero-height corner cases in ASM (totally useless)
 - use ARM condition flag (opS) as appropriate
parent 5fb50884
...@@ -9,14 +9,14 @@ libaudio_format_neon_plugin_la_CFLAGS = $(AM_CFLAGS) ...@@ -9,14 +9,14 @@ libaudio_format_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
libaudio_format_neon_plugin_la_LIBADD = $(AM_LIBADD) libaudio_format_neon_plugin_la_LIBADD = $(AM_LIBADD)
libaudio_format_neon_plugin_la_DEPENDENCIES = libaudio_format_neon_plugin_la_DEPENDENCIES =
libi420_yuy2_neon_plugin_la_SOURCES = \ libchroma_yuv_neon_plugin_la_SOURCES = \
i420_yuyv.S \ i420_yuyv.S \
i420_yuy2.c chroma_yuv.c chroma_neon.h
libi420_yuy2_neon_plugin_la_CFLAGS = $(AM_CFLAGS) libchroma_yuv_neon_plugin_la_CFLAGS = $(AM_CFLAGS)
libi420_yuy2_neon_plugin_la_LIBADD = $(AM_LIBADD) libchroma_yuv_neon_plugin_la_LIBADD = $(AM_LIBADD)
libi420_yuy2_neon_plugin_la_DEPENDENCIES = libchroma_yuv_neon_plugin_la_DEPENDENCIES =
libvlc_LTLIBRARIES += \ libvlc_LTLIBRARIES += \
libaudio_format_neon_plugin.la \ libaudio_format_neon_plugin.la \
libi420_yuy2_neon_plugin.la \ libchroma_yuv_neon_plugin.la \
$(NULL) $(NULL)
/*****************************************************************************
* chroma_neon.h
*****************************************************************************
* Copyright (C) 2011 Rémi Denis-Courmont
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/* Planes must start on a 16-byte boundary. Pitches must be multiples of 16
* bytes, even for subsampled components. */
/* Descriptor for a planar picture buffer.
 *
 * Only one pitch is stored: the pitch of the luminance plane, in bytes.
 * Chrominance pitches are not stored; they are inferred from the color
 * subsampling ratio.
 *
 * The member order (y, u, v, pitch) must not change: the NEON assembly loads
 * all four fields with a single load-multiple instruction, and the C callers
 * fill the structure with positional initializers. */
struct yuv_planes
{
    void *y;      /* start of the luminance plane */
    void *u;      /* start of the first chrominance plane */
    void *v;      /* start of the second chrominance plane */
    size_t pitch; /* luminance pitch, in bytes */
};
/* Descriptor for a packed picture buffer.
 *
 * The pitch is expressed in bytes, _not_ in pixels.
 *
 * The member order (yuv, pitch) must not change: the NEON assembly loads both
 * fields with a single load-multiple instruction, and the C callers fill the
 * structure with positional initializers. */
struct yuv_pack
{
    void *yuv;    /* start of the packed pixel data */
    size_t pitch; /* line pitch, in bytes */
};
/* I420 to YUYV conversion (implemented in NEON assembly).
 *
 * out:    packed destination buffer and its pitch in bytes.
 * in:     planar source buffer (Y, U, V planes and the luminance pitch).
 * width:  picture width; the assembly computes the output line padding as
 *         out->pitch - 2 * width and the luminance padding as
 *         in->pitch - width, so this is a count of pixels.
 * height: picture height in lines.
 *
 * The assembly consumes 16 pixels per inner iteration and two lines per outer
 * iteration, and returns without converting anything when width or height is
 * not strictly positive.
 * NOTE(review): widths that are not a multiple of 16 appear to be rounded up
 * to the next multiple by the loop counter - confirm the buffers allow it. */
void i420_yuyv_neon (struct yuv_pack *const out,
const struct yuv_planes *const in,
int width, int height);
/* I420 to UYVY conversion (implemented in NEON assembly).
 *
 * out:    packed destination buffer and its pitch in bytes.
 * in:     planar source buffer (Y, U, V planes and the luminance pitch).
 * width:  picture width; the assembly computes the output line padding as
 *         out->pitch - 2 * width and the luminance padding as
 *         in->pitch - width, so this is a count of pixels.
 * height: picture height in lines.
 *
 * The assembly consumes 16 pixels per inner iteration and two lines per outer
 * iteration, and returns without converting anything when width or height is
 * not strictly positive.
 * NOTE(review): widths that are not a multiple of 16 appear to be rounded up
 * to the next multiple by the loop counter - confirm the buffers allow it. */
void i420_uyvy_neon (struct yuv_pack *const out,
const struct yuv_planes *const in,
int width, int height);
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <vlc_plugin.h> #include <vlc_plugin.h>
#include <vlc_filter.h> #include <vlc_filter.h>
#include <vlc_cpu.h> #include <vlc_cpu.h>
#include "chroma_neon.h"
static int Open (vlc_object_t *); static int Open (vlc_object_t *);
...@@ -35,58 +36,48 @@ vlc_module_begin () ...@@ -35,58 +36,48 @@ vlc_module_begin ()
set_callbacks (Open, NULL) set_callbacks (Open, NULL)
vlc_module_end () vlc_module_end ()
void i420_yuyv_neon (uint8_t *out, const uint8_t **in, #define DEFINE_PACK(pack, pict) \
unsigned int pitch, unsigned int s_off, struct yuv_pack pack = { (pict)->Y_PIXELS, (pict)->Y_PITCH }
unsigned int height); #define DEFINE_PLANES(planes, pict) \
struct yuv_planes planes = { \
(pict)->Y_PIXELS, (pict)->U_PIXELS, (pict)->V_PIXELS, (pict)->Y_PITCH }
#define DEFINE_PLANES_SWAP(planes, pict) \
struct yuv_planes planes = { \
(pict)->Y_PIXELS, (pict)->V_PIXELS, (pict)->U_PIXELS, (pict)->Y_PITCH }
static void I420_YUYV (filter_t *filter, picture_t *src, picture_t *dst) static void I420_YUYV (filter_t *filter, picture_t *src, picture_t *dst)
{ {
uint8_t *out = dst->p->p_pixels; DEFINE_PACK(out, dst);
const uint8_t *yuv[3] = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, }; DEFINE_PLANES(in, src);
size_t height = filter->fmt_in.video.i_height; i420_yuyv_neon (&out, &in, filter->fmt_in.video.i_width,
int i_pitch = (dst->p->i_pitch >> 1) & ~0xF; filter->fmt_in.video.i_height);
int s_offset = src->p->i_pitch - i_pitch;
i420_yuyv_neon (out, yuv, i_pitch, s_offset, height);
} }
VIDEO_FILTER_WRAPPER (I420_YUYV) VIDEO_FILTER_WRAPPER (I420_YUYV)
static void YV12_YUYV (filter_t *filter, picture_t *src, picture_t *dst) static void YV12_YUYV (filter_t *filter, picture_t *src, picture_t *dst)
{ {
uint8_t *out = dst->p->p_pixels; DEFINE_PACK(out, dst);
const uint8_t *yuv[3] = { src->Y_PIXELS, src->V_PIXELS, src->U_PIXELS, }; DEFINE_PLANES_SWAP(in, src);
size_t height = filter->fmt_in.video.i_height; i420_yuyv_neon (&out, &in, filter->fmt_in.video.i_width,
int i_pitch = (dst->p->i_pitch >> 1) & ~0xF; filter->fmt_in.video.i_height);
int s_offset = src->p->i_pitch - i_pitch;
i420_yuyv_neon (out, yuv, i_pitch, s_offset, height);
} }
VIDEO_FILTER_WRAPPER (YV12_YUYV) VIDEO_FILTER_WRAPPER (YV12_YUYV)
void i420_uyvy_neon (uint8_t *out, const uint8_t **in,
uintptr_t pitch, uintptr_t s_off, uintptr_t height);
static void I420_UYVY (filter_t *filter, picture_t *src, picture_t *dst) static void I420_UYVY (filter_t *filter, picture_t *src, picture_t *dst)
{ {
uint8_t *out = dst->p->p_pixels; DEFINE_PACK(out, dst);
const uint8_t *yuv[3] = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, }; DEFINE_PLANES(in, src);
size_t height = filter->fmt_in.video.i_height; i420_uyvy_neon (&out, &in, filter->fmt_in.video.i_width,
int i_pitch = (dst->p->i_pitch >> 1) & ~0xF; filter->fmt_in.video.i_height);
int s_offset = src->p->i_pitch - i_pitch;
i420_uyvy_neon (out, yuv, i_pitch, s_offset, height);
} }
VIDEO_FILTER_WRAPPER (I420_UYVY) VIDEO_FILTER_WRAPPER (I420_UYVY)
static void YV12_UYVY (filter_t *filter, picture_t *src, picture_t *dst) static void YV12_UYVY (filter_t *filter, picture_t *src, picture_t *dst)
{ {
uint8_t *out = dst->p->p_pixels; DEFINE_PACK(out, dst);
const uint8_t *yuv[3] = { src->Y_PIXELS, src->V_PIXELS, src->U_PIXELS, }; DEFINE_PLANES_SWAP(in, src);
size_t height = filter->fmt_in.video.i_height; i420_uyvy_neon (&out, &in, filter->fmt_in.video.i_width,
int i_pitch = (dst->p->i_pitch >> 1) & ~0xF; filter->fmt_in.video.i_height);
int s_offset = src->p->i_pitch - i_pitch;
i420_uyvy_neon (out, yuv, i_pitch, s_offset, height);
} }
VIDEO_FILTER_WRAPPER (YV12_UYVY) VIDEO_FILTER_WRAPPER (YV12_UYVY)
......
@***************************************************************************** @*****************************************************************************
@ i420_yuyv_neon.S : ARM NEONv1 I420 to YUYV chroma conversion @ i420_yuyv_neon.S : ARM NEONv1 I420 to YUYV chroma conversion
@***************************************************************************** @*****************************************************************************
@ Copyright (C) 2009 Rémi Denis-Courmont @ Copyright (C) 2009-2011 Rémi Denis-Courmont
@ @
@ This program is free software; you can redistribute it and/or modify @ This program is free software; you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by @ it under the terms of the GNU General Public License as published by
...@@ -23,28 +23,33 @@ ...@@ -23,28 +23,33 @@
#define O1 r0 #define O1 r0
#define O2 r1 #define O2 r1
#define PITCH r2 #define WIDTH r2
#define S_OFF r3 #define HEIGHT r3
#define Y1 r4 #define Y1 r4
#define Y2 r5 #define Y2 r5
#define U r6 #define U r6
#define V r7 #define V r7
#define HEIGHT r8 #define YPITCH r8
#define END_O1 r12 #define OPAD r10
#define YPAD r11
#define COUNT ip
#define OPITCH lr
.align .align
.global i420_yuyv_neon .global i420_yuyv_neon
.type i420_yuyv_neon, %function .type i420_yuyv_neon, %function
i420_yuyv_neon: i420_yuyv_neon:
push {r4-r8, lr} push {r4-r8,r10-r11,lr}
ldr HEIGHT, [sp, #(4*6)] ldmia r0, {O1, OPITCH}
ldmia r1, {Y1, U, V} ldmia r1, {Y1, U, V, YPITCH}
add O2, O1, PITCH, lsl #1 cmp HEIGHT, #0
add Y2, Y1, PITCH sub OPAD, OPITCH, WIDTH, lsl #1
add Y2, S_OFF sub YPAD, YPITCH, WIDTH
1: 1:
mov END_O1, O2 movgts COUNT, WIDTH
pld [Y2] add O2, O1, OPITCH
add Y2, Y1, YPITCH
pople {r4-r8,r10-r11,pc}
2: 2:
pld [U, #64] pld [U, #64]
vld1.u8 {d2}, [U,:64]! vld1.u8 {d2}, [U,:64]!
...@@ -52,6 +57,7 @@ i420_yuyv_neon: ...@@ -52,6 +57,7 @@ i420_yuyv_neon:
vld1.u8 {d3}, [V,:64]! vld1.u8 {d3}, [V,:64]!
pld [Y1, #64] pld [Y1, #64]
vzip.u8 d2, d3 vzip.u8 d2, d3
subs COUNT, COUNT, #16
vld1.u8 {q0}, [Y1,:128]! vld1.u8 {q0}, [Y1,:128]!
pld [Y2, #64] pld [Y2, #64]
vmov q3, q1 vmov q3, q1
...@@ -60,36 +66,29 @@ i420_yuyv_neon: ...@@ -60,36 +66,29 @@ i420_yuyv_neon:
vzip.u8 q2, q3 vzip.u8 q2, q3
vst1.u8 {q0-q1}, [O1,:128]! vst1.u8 {q0-q1}, [O1,:128]!
vst1.u8 {q2-q3}, [O2,:128]! vst1.u8 {q2-q3}, [O2,:128]!
bgt 2b
cmp O1, END_O1 subs HEIGHT, #2
bne 2b add O1, O2, OPAD
add Y1, Y2, YPAD
sub HEIGHT, #2 add U, U, YPAD, lsr #1
mov O1, O2 add V, V, YPAD, lsr #1
add O2, PITCH, lsl #1 b 1b
add Y2, S_OFF
mov Y1, Y2
add Y2, PITCH
add Y2, S_OFF
add U, S_OFF, lsr #1
add V, S_OFF, lsr #1
cmp HEIGHT, #0
bne 1b
pop {r4-r8, pc}
.global i420_uyvy_neon .global i420_uyvy_neon
.type i420_uyvy_neon, %function .type i420_uyvy_neon, %function
i420_uyvy_neon: i420_uyvy_neon:
push {r4-r8, lr} push {r4-r8,r10-r11,lr}
ldr HEIGHT, [sp, #(4*6)] ldmia r0, {O1, OPITCH}
ldmia r1, {Y1, U, V} ldmia r1, {Y1, U, V, YPITCH}
add O2, O1, PITCH, lsl #1 cmp HEIGHT, #0
add Y2, Y1, PITCH sub OPAD, OPITCH, WIDTH, lsl #1
add Y2, S_OFF sub YPAD, YPITCH, WIDTH
1: 1:
mov END_O1, O2 movgts COUNT, WIDTH
add O2, O1, OPITCH
add Y2, Y1, YPITCH
pople {r4-r8,r10-r11,pc}
2: 2:
pld [U, #64] pld [U, #64]
vld1.u8 {d0}, [U,:64]! vld1.u8 {d0}, [U,:64]!
...@@ -97,6 +96,7 @@ i420_uyvy_neon: ...@@ -97,6 +96,7 @@ i420_uyvy_neon:
vld1.u8 {d1}, [V,:64]! vld1.u8 {d1}, [V,:64]!
pld [Y1, #64] pld [Y1, #64]
vzip.u8 d0, d1 vzip.u8 d0, d1
subs COUNT, COUNT, #16
vld1.u8 {q1}, [Y1,:128]! vld1.u8 {q1}, [Y1,:128]!
pld [Y2, #64] pld [Y2, #64]
vmov q2, q0 vmov q2, q0
...@@ -105,21 +105,11 @@ i420_uyvy_neon: ...@@ -105,21 +105,11 @@ i420_uyvy_neon:
vzip.u8 q2, q3 vzip.u8 q2, q3
vst1.u8 {q0-q1}, [O1,:128]! vst1.u8 {q0-q1}, [O1,:128]!
vst1.u8 {q2-q3}, [O2,:128]! vst1.u8 {q2-q3}, [O2,:128]!
bgt 2b
cmp O1, END_O1 subs HEIGHT, #2
bne 2b add O1, O2, OPAD
add Y1, Y2, YPAD
sub HEIGHT, #2 add U, U, YPAD, lsr #1
mov O1, O2 add V, V, YPAD, lsr #1
add O2, PITCH, lsl #1 b 1b
add Y2, S_OFF
mov Y1, Y2
add Y2, PITCH
add Y2, S_OFF
add U, S_OFF, lsr #1
add V, S_OFF, lsr #1
cmp HEIGHT, #0
bne 1b
pop {r4-r8, pc}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment