Commit ec0f02a4 authored by Antoine Lejeune's avatar Antoine Lejeune Committed by Rémi Denis-Courmont

Maemo: Add the swscale_nokia770 library

Its interface is close to that of swscale, and part of the
swscale module code is reused.
The library was developed by Siarhei Siamashka.
Signed-off-by: Rémi Denis-Courmont <rdenis@simphalempin.com>
parent 374571ac
...@@ -4853,12 +4853,13 @@ dnl Maemo ...@@ -4853,12 +4853,13 @@ dnl Maemo
dnl dnl
AC_ARG_ENABLE(maemo, AC_ARG_ENABLE(maemo,
[ --enable-maemo Internet tablets based on Maemo SDK (default disabled)]) [ --enable-maemo Internet tablets based on Maemo SDK (default disabled)])
if test "${enable_maemo}" != "no" if test "${enable_maemo}" = "yes"
then then
PKG_CHECK_MODULES(HILDON, [hildon-1 hildon-fm-2], [ PKG_CHECK_MODULES(HILDON, [hildon-1 hildon-fm-2], [
VLC_ADD_CFLAGS([maemo],[${HILDON_CFLAGS}]) VLC_ADD_CFLAGS([maemo],[${HILDON_CFLAGS}])
VLC_ADD_LIBS([maemo],[${HILDON_LIBS}]) VLC_ADD_LIBS([maemo],[${HILDON_LIBS}])
VLC_ADD_PLUGIN([maemo]) VLC_ADD_PLUGIN([maemo])
VLC_ADD_PLUGIN([swscale_maemo])
AC_DEFINE([BUILD_MAEMO], 1, [Define if you're using Maemo interfaces]) AC_DEFINE([BUILD_MAEMO], 1, [Define if you're using Maemo interfaces])
ALIASES="${ALIASES} mvlc" ALIASES="${ALIASES} mvlc"
], [ ], [
......
...@@ -43,6 +43,7 @@ SOURCES_chain = chain.c ...@@ -43,6 +43,7 @@ SOURCES_chain = chain.c
SOURCES_postproc = postproc.c SOURCES_postproc = postproc.c
SOURCES_swscale = swscale.c ../codec/avcodec/chroma.c SOURCES_swscale = swscale.c ../codec/avcodec/chroma.c
SOURCES_imgresample = imgresample.c ../codec/avcodec/chroma.c SOURCES_imgresample = imgresample.c ../codec/avcodec/chroma.c
SOURCES_swscale_maemo = swscale_maemo.c libswscale_nokia770/arm_jit_swscale.c libswscale_nokia770/arm_colorconv.S libswscale_nokia770/arm_jit_swscale.h libswscale_nokia770/arm_colorconv.h
SOURCES_scene = scene.c SOURCES_scene = scene.c
SOURCES_yuvp = yuvp.c SOURCES_yuvp = yuvp.c
noinst_HEADERS = filter_common.h filter_picture.h noinst_HEADERS = filter_common.h filter_picture.h
/*
* ARM assembly optimized color format conversion functions
* (YV12 -> YUY2, YV12 -> some custom YUV420 format used by
* Epson graphics chip in Nokia N800)
*
* Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
.text
/*******************************************************/
.align
.global yv12_to_yuy2_line_arm
.func yv12_to_yuy2_line_arm
/*
 * yv12_to_yuy2_line_arm: pack one line of planar Y, U and V bytes into
 * 32-bit words laid out as Y0 | U << 8 | Y1 << 16 | V << 24, i.e. two
 * pixels per output word.
 *
 * Register use on entry:
 *   r0 (DST)   - output pointer, advanced by the 32-bit stores
 *   r1 (SRC_Y) - luma bytes, two consumed per output word
 *   r2 (SRC_U) - U bytes, one consumed per output word
 *   r3 (SRC_V) - V bytes, one consumed per output word
 *   [sp]       - pixel count, loaded into ip (WIDTH)
 */
yv12_to_yuy2_line_arm:
#define DST r0
#define SRC_Y r1
#define SRC_U r2
#define SRC_V r3
#define WIDTH ip
/* Fetch the pixel count from the top of the stack; the post-index of 0 leaves sp unchanged */
ldr ip, [sp], #0
stmfd sp!, {r4-r8, r10, lr}
#define TMP1 r8
#define TMP2 r10
#define TMP3 lr
/* Round the pixel count down to an even number */
bic WIDTH, #1
/* Skip the unrolled loop when fewer than 8 pixels are requested */
subs WIDTH, #8
blt 2f
1:
/* Unrolled loop: 8 pixels are packed into r4-r7 and stored as 16 bytes */
ldrb r4, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
ldrb TMP3, [SRC_V], #1
add r4, r4, TMP1, lsl #8
add r4, r4, TMP2, lsl #16
add r4, r4, TMP3, lsl #24
ldrb r5, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
ldrb TMP3, [SRC_V], #1
add r5, r5, TMP1, lsl #8
add r5, r5, TMP2, lsl #16
add r5, r5, TMP3, lsl #24
ldrb r6, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
ldrb TMP3, [SRC_V], #1
add r6, r6, TMP1, lsl #8
add r6, r6, TMP2, lsl #16
add r6, r6, TMP3, lsl #24
ldrb r7, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
ldrb TMP3, [SRC_V], #1
add r7, r7, TMP1, lsl #8
add r7, r7, TMP2, lsl #16
add r7, r7, TMP3, lsl #24
stmia DST!, {r4-r7}
subs WIDTH, WIDTH, #8
bge 1b
2:
/* Undo the bias of 8; WIDTH now holds the number of pixels still to convert */
adds WIDTH, WIDTH, #8
ble 4f
3:
/* Tail loop: one output word (2 pixels) per iteration */
ldrb r4, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
ldrb TMP3, [SRC_V], #1
add r4, r4, TMP1, lsl #8
add r4, r4, TMP2, lsl #16
add r4, r4, TMP3, lsl #24
str r4, [DST], #4
subs WIDTH, WIDTH, #2
bgt 3b
4:
/* Restore the saved registers; loading the saved lr into pc returns to the caller */
ldmfd sp!, {r4-r8, r10, pc}
#undef DST
#undef SRC_Y
#undef SRC_U
#undef SRC_V
#undef WIDTH
#undef TMP1
#undef TMP2
#undef TMP3
.endfunc
/*******************************************************/
/*
 * Register aliases shared by every function generated from the template
 * below: r0 is the output pointer, r1 the luma pointer, r2 the chroma
 * pointer and r3 the pixel count.
 */
#define DST r0
#define SRC_Y r1
#define SRC_U r2
#define WIDTH r3
#define TMP1 r10
#define TMP2 r11
#define TMP3 lr
/*
 * YUV420_function_template: emits one global line conversion function.
 *   function_name - symbol to define
 *   USE_PLD       - when non-zero, pld cache preload hints are emitted
 *   USE_ARMV6     - when non-zero, the ldrh/rev16 based macroblock
 *                   variants are used instead of the byte-load ones
 *
 * Per 8 pixels the byte-load variants write 12 bytes in the order
 *   Y0 C0 C1 Y1 | Y3 Y2 Y4 C2 | C3 Y5 Y7 Y6
 * where Yn are luma bytes and Cn are bytes from the chroma pointer.
 */
.macro YUV420_function_template function_name, USE_PLD, USE_ARMV6
.align
.global \function_name
.func \function_name
\function_name:
/* Read information about 4 pixels, convert them to YUV420 and store into 6 bytes using 16-bit writes */
/* Output byte order of this macro: Y0 C0 C1 Y1 Y3 Y2 */
.macro CONVERT_4_PIXELS_MACROBLOCK
ldrb r4, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb r5, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
ldrb r6, [SRC_Y, #1]
ldrb TMP3, [SRC_Y], #2
add r4, r4, TMP1, lsl #8
add r5, r5, TMP2, lsl #8
add r6, r6, TMP3, lsl #8
strh r4, [DST], #2
strh r5, [DST], #2
strh r6, [DST], #2
.endm
.if \USE_ARMV6
/*
 * ARMv6 variants. Each macro finishes the word held in DST_REG1 and ends
 * with rev16, which swaps the two bytes inside each halfword.
 *   FLAG1 == 0 - the macro loads its own inputs (DST_REG1, TMP1, TMP2)
 *   FLAG2 == 1 - the macro also loads DST_REG2, TMP1 and TMP2
 * NOTE(review): the FLAG2 loads look like the inputs of the macroblock
 * that follows (software pipelining), which is why most invocations pass
 * FLAG1 = 1 - confirm before reordering any invocation.
 */
.macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG1, DST_REG2, FLAG1, FLAG2, PLD_FLAG
.if \FLAG1 == 0
ldrb \DST_REG1, [SRC_U], #1
ldrh TMP1, [SRC_Y], #2
ldrb TMP2, [SRC_U], #1
.endif
.if \FLAG2 == 1
ldrh \DST_REG2, [SRC_Y], #2
.endif
.if \PLD_FLAG == 1
pld [SRC_Y, #48]
.endif
add \DST_REG1, \DST_REG1, TMP1, lsl #8
add \DST_REG1, \DST_REG1, TMP2, lsl #24
.if \FLAG2 == 1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
.endif
rev16 \DST_REG1, \DST_REG1
.endm
.macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
.if \FLAG1 == 0
ldrh \DST_REG1, [SRC_Y], #2
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_Y], #1
.endif
.if \FLAG2 == 1
ldrb \DST_REG2, [SRC_Y], #1
.endif
add \DST_REG1, \DST_REG1, TMP1, lsl #16
add \DST_REG1, \DST_REG1, TMP2, lsl #24
.if \FLAG2 == 1
ldrb TMP1, [SRC_U], #1
ldrh TMP2, [SRC_Y], #2
.endif
rev16 \DST_REG1, \DST_REG1
.endm
.macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
.if \FLAG1 == 0
ldrb \DST_REG1, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrh TMP2, [SRC_Y], #2
.endif
.if \FLAG2 == 1
ldrb \DST_REG2, [SRC_U], #1
.endif
add \DST_REG1, \DST_REG1, TMP1, lsl #8
add \DST_REG1, \DST_REG1, TMP2, lsl #16
.if \FLAG2 == 1
ldrh TMP1, [SRC_Y], #2
ldrb TMP2, [SRC_U], #1
.endif
rev16 \DST_REG1, \DST_REG1
.endm
.else
/* Byte-load variants: only the first macro argument (and PLD_FLAG) is used */
/* Prepare the first 32-bit output value for 8 pixels macroblock */
/* Resulting byte order: Y0 C0 C1 Y1 */
.macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG, DUMMY1, DUMMY2, DUMMY3, PLD_FLAG
ldrb \DST_REG, [SRC_Y], #1
ldrb TMP1, [SRC_U], #1
ldrb TMP2, [SRC_U], #1
ldrb TMP3, [SRC_Y], #1
.if \USE_PLD && (\PLD_FLAG == 1)
pld [SRC_Y, #48]
.endif
add \DST_REG, \DST_REG, TMP1, lsl #8
add \DST_REG, \DST_REG, TMP2, lsl #16
add \DST_REG, \DST_REG, TMP3, lsl #24
.endm
/* Prepare the second 32-bit output value for 8 pixels macroblock */
/* Resulting byte order: Y3 Y2 Y4 C2 */
.macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
ldrb \DST_REG, [SRC_Y, #1]
ldrb TMP1, [SRC_Y], #2
ldrb TMP2, [SRC_Y], #1
ldrb TMP3, [SRC_U], #1
add \DST_REG, \DST_REG, TMP1, lsl #8
add \DST_REG, \DST_REG, TMP2, lsl #16
add \DST_REG, \DST_REG, TMP3, lsl #24
.endm
/* Prepare the third 32-bit output value for 8 pixels macroblock */
/* Resulting byte order: C3 Y5 Y7 Y6 */
.macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
ldrb \DST_REG, [SRC_U], #1
ldrb TMP1, [SRC_Y], #1
ldrb TMP2, [SRC_Y, #1]
ldrb TMP3, [SRC_Y], #2
add \DST_REG, \DST_REG, TMP1, lsl #8
add \DST_REG, \DST_REG, TMP2, lsl #16
add \DST_REG, \DST_REG, TMP3, lsl #24
.endm
.endif
.if \USE_PLD
pld [SRC_Y]
.endif
stmfd sp!, {r4-r8, r10-r11, lr}
/* Destination buffer should be at least 16-bit aligned, image width should be multiple of 4 */
/* Both requirements are enforced by clearing the low bits */
bic DST, #1
bic WIDTH, #3
/* Ensure 32-bit alignment of the destination buffer */
/* When DST is only 16-bit aligned, one 4 pixel block (6 bytes) is written first */
tst DST, #2
beq 1f
subs WIDTH, #4
blt 6f
CONVERT_4_PIXELS_MACROBLOCK
1:
/* WIDTH is kept biased by -32 while the 32 pixel loop runs */
subs WIDTH, #32
blt 3f
2: /* Convert 32 pixels per loop iteration */
CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 1, 1 /* Also do cache preload for SRC_Y */
CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_3 r7, r8, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_1 r8, r5, 1, 1, 0
stmia DST!, {r4, r6, r7, r8}
/* The flags set here are the ones tested by the bge at the end of the loop body */
subs WIDTH, #32
CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_1 r7, r8, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_2 r8, r4, 1, 1, 0
stmia DST!, {r5, r6, r7, r8}
.if \USE_PLD
/* Do cache preload for SRC_U */
pld [SRC_U, #48]
.endif
CONVERT_8_PIXELS_MACROBLOCK_3 r4, r6, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_1 r6, r7, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_2 r7, r8, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_3 r8, r4, 1, 0, 0
stmia DST!, {r4, r6, r7, r8}
bge 2b
3:
/* Remove the -32 bias; nothing is left to do when WIDTH is not positive */
adds WIDTH, WIDTH, #32
ble 6f
subs WIDTH, WIDTH, #8
blt 5f
4: /* Convert remaining pixels processing them 8 per iteration */
CONVERT_8_PIXELS_MACROBLOCK_1 r4, r5, 0, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 0, 0
stmia DST!, {r4-r6}
subs WIDTH, WIDTH, #8
bge 4b
5: /* Convert the last 4 pixels if needed */
adds WIDTH, WIDTH, #8
ble 6f
CONVERT_4_PIXELS_MACROBLOCK
subs WIDTH, #4
/* NOTE(review): WIDTH is a multiple of 4 below 8 at this point, so this branch is not expected to be taken */
bgt 4b
6: /* Restore all registers and return */
ldmfd sp!, {r4-r8, r10-r11, pc}
.purgem CONVERT_4_PIXELS_MACROBLOCK
.purgem CONVERT_8_PIXELS_MACROBLOCK_1
.purgem CONVERT_8_PIXELS_MACROBLOCK_2
.purgem CONVERT_8_PIXELS_MACROBLOCK_3
#undef DST
#undef SRC_Y
#undef SRC_U
#undef WIDTH
#undef TMP1
#undef TMP2
#undef TMP3
.endfunc
.endm
/* Instantiate the generic, preload-enabled and ARMv6 flavours of the converter */
YUV420_function_template yv12_to_yuv420_line_arm, 0, 0
YUV420_function_template yv12_to_yuv420_line_armv5, 1, 0
YUV420_function_template yv12_to_yuv420_line_armv6, 1, 1
/*
* ARM assembly optimized color format conversion functions
* (YV12 -> YUY2, YV12 -> some custom YUV420 format used by
* Epson graphics chip in Nokia N800)
*
* Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
/* Prototypes of the line converters implemented in ARM assembly. */
#ifndef __ARM_COLORCONV_H__
#define __ARM_COLORCONV_H__
#include <stdint.h>
/**
* Convert a line of pixels from YV12 to YUY2 color format
* @param dst - destination buffer for YUY2 pixel data, it should be 32-bit aligned
* @param src_y - pointer to Y plane
* @param src_u - pointer to U plane
* @param src_v - pointer to V plane
* @param w - number of pixels to convert (should be multiple of 2)
*
* NOTE(review): the assembly implementation reads src_y one byte at a time,
* so the uint16_t pointee type looks stricter than needed - confirm before
* relying on it for alignment.
*/
void yv12_to_yuy2_line_arm(uint32_t *dst, const uint16_t *src_y, const uint8_t *src_u, const uint8_t *src_v, int w);
/**
* Convert a line of pixels from YV12 to YUV420 color format
* (generic variant: byte loads, no cache preload hints)
* @param dst - destination buffer for YUV420 pixel data, it should be at least 16-bit aligned
* @param src_y - pointer to Y plane
* @param src_c - pointer to chroma plane (U for even lines, V for odd lines)
* @param w - number of pixels to convert (should be multiple of 4)
*/
void yv12_to_yuv420_line_arm(uint16_t *dst, const uint8_t *src_y, const uint8_t *src_c, int w);
/**
* Convert a line of pixels from YV12 to YUV420 color format
* (variant that additionally issues pld cache preload hints)
* @param dst - destination buffer for YUV420 pixel data, it should be at least 16-bit aligned
* @param src_y - pointer to Y plane
* @param src_c - pointer to chroma plane (U for even lines, V for odd lines)
* @param w - number of pixels to convert (should be multiple of 4)
*/
void yv12_to_yuv420_line_armv5(uint16_t *dst, const uint8_t *src_y, const uint8_t *src_c, int w);
/**
* Convert a line of pixels from YV12 to YUV420 color format
* (ARMv6 variant: uses 16-bit loads from the Y plane and rev16)
* @param dst - destination buffer for YUV420 pixel data, it should be at least 16-bit aligned
* @param src_y - pointer to Y plane, it should be 16-bit aligned
* @param src_c - pointer to chroma plane (U for even lines, V for odd lines)
* @param w - number of pixels to convert (should be multiple of 4)
*/
void yv12_to_yuv420_line_armv6(uint16_t *dst, const uint16_t *src_y, const uint8_t *src_c, int w);
#endif
/*
* Fast JIT powered scaler for ARM
*
* Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <string.h>
#include "arm_jit_swscale.h"
#include "arm_colorconv.h"
/* Size of cpu instructions cache, we should never exceed it in generated code */
#define INSTRUCTIONS_CACHE_SIZE 32768
/* Supported output formats */
#define FMT_OMAPFB_YUV422 1
#define FMT_OMAPFB_YUV420 2
extern void __clear_cache (char *beg, char *end);
/*
* API is similar to API from ffmpeg libswscale
*
* Scaler state created by sws_arm_jit_create_scaler_internal() and
* released by sws_arm_jit_free().
*/
typedef struct SwsContextArmJit {
/* Output format, one of the FMT_OMAPFB_* constants */
int fmt;
/* Source picture size in pixels */
int source_w;
int source_h;
/* Destination picture size in pixels */
int target_w;
int target_h;
/* mmap()ed buffer of INSTRUCTIONS_CACHE_SIZE bytes holding the generated line scaler */
uint32_t *codebuffer;
/* One entry per destination row: index of the source row to read */
int *linebuffer;
/* Non-zero selects yv12_to_yuv420_line_armv6 on the vertical-only path */
int armv6_is_supported;
} SwsContextArmJit;
//#define JIT_DEBUG
#define INTERPOLATE_COPY_FIRST 0
#define INTERPOLATE_AVERAGE_1_3 1
#define INTERPOLATE_AVERAGE_2_2 2
#define INTERPOLATE_AVERAGE_3_1 3
/**
 * Get the two source pixels nearest to a destination pixel.
 *
 * The centre of destination pixel x maps to source coordinate
 * (x + 0.5) * orig_w / dest_w; pixel centres sit at k + 0.5. Everything is
 * computed with exact integer arithmetic:
 *     position = ((2 * x + 1) * orig_w - dest_w) / (2 * dest_w)
 * whose integer part is the left neighbour and whose fractional part is the
 * distance from that neighbour's centre.
 *
 * Compared with a floating-point evaluation, only exact ties at the 0.25,
 * 0.5 and 0.75 thresholds can resolve differently, and a zero dest_w no
 * longer leads to an undefined double-to-int conversion.
 *
 * @param quality - 2 or more allows averaging of the two neighbours
 * @param orig_w  - number of source pixels
 * @param dest_w  - number of destination pixels
 * @param x       - destination pixel index
 * @param p1      - receives the index of the first source pixel
 * @param p2      - receives the index of the second source pixel
 *                  (equal to *p1 unless averaging is requested)
 * @return one of the INTERPOLATE_* constants describing how *p1 and *p2
 *         are to be combined
 */
static inline int get_pix(int quality, int orig_w, int dest_w, int x, int *p1, int *p2)
{
    /* Degenerate sizes: there is nothing meaningful to sample, use pixel 0 */
    if (orig_w <= 0 || dest_w <= 0) {
        *p1 = *p2 = 0;
        return INTERPOLATE_COPY_FIRST;
    }

    const int64_t den = 2 * (int64_t)dest_w;
    const int64_t num = (2 * (int64_t)x + 1) * orig_w - dest_w;

    // Special boundary cases
    if (num < 0) {
        /* Left of the first source pixel centre */
        *p1 = *p2 = 0;
        return INTERPOLATE_COPY_FIRST;
    }

    const int64_t rem = num % den;               /* dist == rem / den, in [0, 1) */
    const int64_t pix1 = num / den;              /* floor(position) */
    const int64_t pix2 = pix1 + (rem != 0);      /* ceil(position) */

    if (pix2 >= orig_w) {
        /* Right of the last source pixel centre */
        *p1 = *p2 = orig_w - 1;
        return INTERPOLATE_COPY_FIRST;
    }

#if 0
    /* Disabled finer weighting: 0.125 < dist < 0.375 and 0.625 < dist < 0.875 */
    if (quality >= 3) {
        if (8 * rem > den && 8 * rem < 3 * den) {
            *p1 = (int)pix1;
            *p2 = (int)pix2;
            return INTERPOLATE_AVERAGE_3_1;
        }
        if (8 * rem > 5 * den && 8 * rem < 7 * den) {
            *p1 = (int)pix1;
            *p2 = (int)pix2;
            return INTERPOLATE_AVERAGE_1_3;
        }
    }
#endif

    /* 0.25 < dist < 0.75: the sample lies between both centres, average them */
    if (quality >= 2 && 4 * rem > den && 4 * rem < 3 * den) {
        *p1 = (int)pix1;
        *p2 = (int)pix2;
        return INTERPOLATE_AVERAGE_2_2;
    }

    /* Otherwise copy whichever neighbour is closer (dist < 0.5 picks the left one) */
    *p1 = *p2 = (int)((2 * rem < den) ? pix1 : pix2);
    return INTERPOLATE_COPY_FIRST;
}
/**
 * Append "ldrb r<dstreg>, [r<basereg>, #<offset>]" to the code buffer.
 *
 * The offset is OR-ed into the low bits of the instruction word without
 * masking, so values outside 0..4095 would spill into other fields.
 *
 * @return pointer just past the emitted instruction
 */
static uint32_t *generate_arm_cmd_ldrb_r_r_offs(uint32_t *cmdbuffer, int dstreg, int basereg, int offset)
{
    uint32_t insn = 0xE5D00000;

#ifdef JIT_DEBUG
    printf("ldrb r%d, [r%d, #%d]\n", dstreg, basereg, offset);
#endif
    insn |= (basereg << 16);
    insn |= (dstreg << 12);
    insn |= (offset);
    cmdbuffer[0] = insn;
    return cmdbuffer + 1;
}
/**
 * Append "add r<dstreg>, r<r1>, r<r2>, lsl #<r2_shift>" to the code buffer.
 *
 * @return pointer just past the emitted instruction
 */
static uint32_t *generate_arm_cmd_add_r_r_r_lsl(uint32_t *cmdbuffer, int dstreg, int r1, int r2, int r2_shift)
{
    uint32_t insn = 0xE0800000;

#ifdef JIT_DEBUG
    printf("add r%d, r%d, r%d, lsl #%d\n", dstreg, r1, r2, r2_shift);
#endif
    insn |= (r1 << 16);
    insn |= (dstreg << 12);
    insn |= (r2_shift << 7);
    insn |= (r2);
    cmdbuffer[0] = insn;
    return cmdbuffer + 1;
}
/**
 * Append "mov r<dstreg>, r<r>, lsr #<shift>" to the code buffer.
 *
 * @return pointer just past the emitted instruction
 */
static uint32_t *generate_arm_cmd_mov_r_r_lsr(uint32_t *cmdbuffer, int dstreg, int r, int shift)
{
    uint32_t insn = 0xE1A00020;

#ifdef JIT_DEBUG
    printf("mov r%d, r%d, lsr #%d\n", dstreg, r, shift);
#endif
    insn |= (dstreg << 12);
    insn |= (shift << 7);
    insn |= (r);
    cmdbuffer[0] = insn;
    return cmdbuffer + 1;
}
/**
 * Emit the code that produces one 32-bit word of scaled output.
 *
 * The word is made of four byte lanes (lowest byte first). Each lane reads
 * from the buffer whose address is held in buf<N>reg; size<N> is the
 * subsampling factor of that buffer and offs<N> the destination position
 * of the lane. The emitted code first performs all loads, then combines
 * the lanes into r4 and finally stores r4 through r3 with post-increment.
 *
 * @param p - where to append the generated instructions
 * @param quality - scaling quality level
 * @param orig_w - source line width
 * @param dest_w - destination line width
 * @param buf1reg - register that holds a pointer to the buffer with data for the first output byte
 * @param buf2reg - register that holds a pointer to the buffer with data for the second output byte
 * @param buf3reg - register that holds a pointer to the buffer with data for the third output byte
 * @param buf4reg - register that holds a pointer to the buffer with data for the fourth output byte
 * @return pointer just past the last emitted instruction
 */
static uint32_t *generate_32bit_scaled_data_write(
    uint32_t *p,
    int quality, int orig_w, int dest_w,
    int buf1reg, int size1, int offs1,
    int buf2reg, int size2, int offs2,
    int buf3reg, int size3, int offs3,
    int buf4reg, int size4, int offs4)
{
    enum { LANES = 4, RESULT_REG = 4, ACC_REG = 14 };
    /* First of the two scratch registers reserved for each lane */
    static const int lane_reg[LANES] = { 5, 7, 9, 11 };
    const int base_reg[LANES] = { buf1reg, buf2reg, buf3reg, buf4reg };
    const int subsample[LANES] = { size1, size2, size3, size4 };
    const int position[LANES] = { offs1, offs2, offs3, offs4 };
    int mode[LANES];
    int lane;

    // First stage: perform data loading
    for (lane = 0; lane < LANES; lane++) {
        const int div = subsample[lane];
        int idx_a, idx_b;

        mode[lane] = get_pix(quality, orig_w / div, dest_w / div, position[lane] / div, &idx_a, &idx_b);
        if (lane == 0 && mode[lane] == INTERPOLATE_COPY_FIRST) {
            // The lowest byte needs no interpolation: load it straight
            // into the result register
            p = generate_arm_cmd_ldrb_r_r_offs(p, RESULT_REG, base_reg[lane], idx_a);
            continue;
        }
        p = generate_arm_cmd_ldrb_r_r_offs(p, lane_reg[lane], base_reg[lane], idx_a);
        if (mode[lane] != INTERPOLATE_COPY_FIRST)
            p = generate_arm_cmd_ldrb_r_r_offs(p, lane_reg[lane] + 1, base_reg[lane], idx_b);
    }

    // Second stage: perform data shuffling
    for (lane = 0; lane < LANES; lane++) {
        const int first = lane_reg[lane];
        const int shift = 8 * lane;

        if (lane == 0) {
            // Lowest byte: the average (if any) becomes the initial result
            if (mode[lane] == INTERPOLATE_AVERAGE_2_2) {
                p = generate_arm_cmd_add_r_r_r_lsl(p, ACC_REG, first, first + 1, 0);
                p = generate_arm_cmd_mov_r_r_lsr(p, RESULT_REG, ACC_REG, 1);
            }
            continue;
        }
        switch (mode[lane]) {
        case INTERPOLATE_COPY_FIRST:
            p = generate_arm_cmd_add_r_r_r_lsl(p, RESULT_REG, RESULT_REG, first, shift);
            break;
        case INTERPOLATE_AVERAGE_2_2:
            p = generate_arm_cmd_add_r_r_r_lsl(p, ACC_REG, first, first + 1, 0);
            p = generate_arm_cmd_mov_r_r_lsr(p, ACC_REG, ACC_REG, 1);
            p = generate_arm_cmd_add_r_r_r_lsl(p, RESULT_REG, RESULT_REG, ACC_REG, shift);
            break;
        default:
            break;
        }
    }

    // Third stage: store data and advance output buffer pointer
    *p = 0xE4834004; // str r4, [r3], #4
    return p + 1;
}
/**
 * Generate a line scaler that converts planar YUV 4:2:0 input into packed
 * 32-bit Y/U/Y/V words, two destination pixels per word.
 *
 * Scaler code should assume:
 * r0 - y plane
 * r1 - u plane
 * r2 - v plane
 * r3 - destination buffer
 * r4 - result for storage into output buffer
 * r5, r6 - source data for y1 calculation
 * r7, r8 - source data for u calculation
 * r9, r10 - source data for y2 calculation
 * r11, r12 - source data for v calculation
 * r14 (lr) - accumulator
 *
 * Generation stops as soon as the next output word might not fit into the
 * buffer any more; the buffer is still terminated with the return
 * instruction so that it never contains an unterminated function.
 *
 * @param cmdbuffer - buffer for dynamically generated code
 * @param maxcmdcount - capacity of cmdbuffer, in instructions
 * @param orig_w - source line width in pixels
 * @param dest_w - destination line width in pixels
 * @param quality - scaling quality level
 * @return - number of instructions generated, or -1 if the code for the
 *           whole line does not fit into maxcmdcount instructions
 */
static int generate_yuv420p_to_yuyv422_line_scaler(uint32_t *cmdbuffer, int maxcmdcount, int orig_w, int dest_w, int quality)
{
    /* Worst case for one output word: 8 loads, 11 combine instructions, 1 store */
    enum { MAX_INSNS_PER_WORD = 20, EPILOGUE_INSNS = 1 };
    uint32_t *p = cmdbuffer;
    int truncated = 0;
    int i, cmdcount;

    /* Prologue and epilogue alone need two slots */
    if (maxcmdcount < 2)
        return -1;

    *p++ = 0xE92D4FF0; // stmfd sp!, {r4-r11, lr} @ save all registers
    // Process a pair of destination pixels per loop iteration (it should result in 32-bit value write)
    for (i = 0; i < dest_w; i += 2) {
        if ((p - cmdbuffer) + MAX_INSNS_PER_WORD + EPILOGUE_INSNS > maxcmdcount) {
            truncated = 1;
            break;
        }
        p = generate_32bit_scaled_data_write(
            p, quality, orig_w, dest_w,
            0, 1, i + 0,
            1, 2, i,
            0, 1, i + 1,
            2, 2, i);
    }
    *p++ = 0xE8BD8FF0; // ldmfd sp!, {r4-r11, pc} @ restore all registers and return
    cmdcount = p - cmdbuffer;
#ifdef JIT_DEBUG
    printf("@ number of instructions = %d\n", cmdcount);
    FILE *f = fopen("cmdbuf.bin", "w+");
    if (f) {
        fwrite(cmdbuffer, 1, INSTRUCTIONS_CACHE_SIZE, f);
        fclose(f);
    }
#endif
    return truncated ? -1 : cmdcount;
}
/**
 * Generate a line scaler that converts planar YUV 4:2:0 input into the
 * packed YUV420 layout, 8 destination pixels (three 32-bit words) per step.
 *
 * The generated code expects the y plane pointer in r0, the chroma plane
 * pointer in r1 and the destination pointer in r3. Destination pixels
 * beyond the last multiple of 8 are not generated.
 *
 * Generation stops as soon as the next 8-pixel group might not fit into the
 * buffer any more; the buffer is still terminated with the return
 * instruction so that it never contains an unterminated function.
 *
 * @param cmdbuffer - buffer for dynamically generated code
 * @param maxcmdcount - capacity of cmdbuffer, in instructions
 * @param orig_w - source line width in pixels
 * @param dest_w - destination line width in pixels
 * @param quality - scaling quality level
 * @return - number of instructions generated, or -1 if the code for the
 *           whole line does not fit into maxcmdcount instructions
 */
static int generate_yuv420p_to_yuv420_line_scaler(uint32_t *cmdbuffer, int maxcmdcount, int orig_w, int dest_w, int quality)
{
    enum {
        Y_PLANE_REG = 0,          /* r0 holds the y plane pointer */
        C_PLANE_REG = 1,          /* r1 holds the chroma plane pointer */
        MAX_INSNS_PER_WORD = 20,  /* 8 loads, 11 combine instructions, 1 store */
        WORDS_PER_GROUP = 3,
        EPILOGUE_INSNS = 1
    };
    uint32_t *p = cmdbuffer;
    int truncated = 0;
    int i = 0, cmdcount;

    /* Prologue and epilogue alone need two slots */
    if (maxcmdcount < 2)
        return -1;

    *p++ = 0xE92D4FF0; // stmfd sp!, {r4-r11, lr} @ save all registers
    while (i + 8 <= dest_w) {
        if ((p - cmdbuffer) + WORDS_PER_GROUP * MAX_INSNS_PER_WORD + EPILOGUE_INSNS > maxcmdcount) {
            truncated = 1;
            break;
        }
        /* Output bytes: Y0 C0 C1 Y1 */
        p = generate_32bit_scaled_data_write(
            p, quality, orig_w, dest_w,
            Y_PLANE_REG, 1, i + 0 * 1,
            C_PLANE_REG, 2, i + 0 * 2,
            C_PLANE_REG, 2, i + 1 * 2,
            Y_PLANE_REG, 1, i + 1 * 1);
        /* Output bytes: Y3 Y2 Y4 C2 */
        p = generate_32bit_scaled_data_write(
            p, quality, orig_w, dest_w,
            Y_PLANE_REG, 1, i + 3 * 1,
            Y_PLANE_REG, 1, i + 2 * 1,
            Y_PLANE_REG, 1, i + 4 * 1,
            C_PLANE_REG, 2, i + 2 * 2);
        /* Output bytes: C3 Y5 Y7 Y6 */
        p = generate_32bit_scaled_data_write(
            p, quality, orig_w, dest_w,
            C_PLANE_REG, 2, i + 3 * 2,
            Y_PLANE_REG, 1, i + 5 * 1,
            Y_PLANE_REG, 1, i + 7 * 1,
            Y_PLANE_REG, 1, i + 6 * 1);
        i += 8;
    }
    *p++ = 0xE8BD8FF0; // ldmfd sp!, {r4-r11, pc} @ restore all registers and return
    cmdcount = p - cmdbuffer;
#ifdef JIT_DEBUG
    printf("@ number of instructions = %d\n", cmdcount);
    FILE *f = fopen("cmdbuf.bin", "w+");
    if (f) {
        fwrite(cmdbuffer, 1, INSTRUCTIONS_CACHE_SIZE, f);
        fclose(f);
    }
#endif
    return truncated ? -1 : cmdcount;
}
/******************************************************************************/
/**
 * Create a scaler context: generate the horizontal line scaler into an
 * executable buffer and precompute, for every destination row, the source
 * row it is taken from.
 *
 * @param source_w, source_h - source picture size in pixels
 * @param target_w, target_h - destination picture size in pixels
 * @param quality - scaling quality level passed to the code generator
 * @param fmt - FMT_OMAPFB_YUV422 or FMT_OMAPFB_YUV420
 * @return a new context to be released with sws_arm_jit_free(), or NULL if
 *         the format is unsupported, a size is not positive, code
 *         generation reports a failure, or an allocation fails
 */
static struct SwsContextArmJit *sws_arm_jit_create_scaler_internal(int source_w, int source_h, int target_w, int target_h, int quality, int fmt)
{
    uint32_t *code;
    int *linebuffer = NULL;
    SwsContextArmJit *context = NULL;
    int cmdcount, i, p1, p2;

    /* Reject bad requests before acquiring anything, so nothing can leak */
    if (fmt != FMT_OMAPFB_YUV422 && fmt != FMT_OMAPFB_YUV420)
        return NULL;
    if (source_w <= 0 || source_h <= 0 || target_w <= 0 || target_h <= 0)
        return NULL;

    code = mmap(0, INSTRUCTIONS_CACHE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (code == MAP_FAILED)
        return NULL;

    if (fmt == FMT_OMAPFB_YUV422)
        cmdcount = generate_yuv420p_to_yuyv422_line_scaler(code, INSTRUCTIONS_CACHE_SIZE / 4, source_w, target_w, quality);
    else
        cmdcount = generate_yuv420p_to_yuv420_line_scaler(code, INSTRUCTIONS_CACHE_SIZE / 4, source_w, target_w, quality);
    /* A negative count is treated as a code generation failure */
    if (cmdcount < 0)
        goto fail;

    linebuffer = malloc((size_t)target_h * sizeof *linebuffer);
    context = calloc(1, sizeof *context);
    if (!linebuffer || !context)
        goto fail;

    /* Vertical scaling is nearest-neighbour: remember one source row per destination row */
    for (i = 0; i < target_h; i++) {
        get_pix(1, source_h, target_h, i, &p1, &p2);
        linebuffer[i] = p1;
    }

    /* Make the freshly written instructions visible to instruction fetch */
    __clear_cache((char *)code, (char *)code + INSTRUCTIONS_CACHE_SIZE);

    context->source_w = source_w;
    context->source_h = source_h;
    context->target_w = target_w;
    context->target_h = target_h;
    context->codebuffer = code;
    context->linebuffer = linebuffer;
    context->fmt = fmt;
    context->armv6_is_supported = 0;
    return context;

fail:
    free(context);
    free(linebuffer);
    munmap(code, INSTRUCTIONS_CACHE_SIZE);
    return NULL;
}
/**
 * Create a scaler that produces the FMT_OMAPFB_YUV422 output format.
 *
 * @return new context, or NULL on failure
 */
struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv422_scaler(int source_w, int source_h, int target_w, int target_h, int quality)
{
    const int output_format = FMT_OMAPFB_YUV422;

    return sws_arm_jit_create_scaler_internal(source_w, source_h,
                                              target_w, target_h,
                                              quality, output_format);
}
/**
 * Create a scaler that produces the FMT_OMAPFB_YUV420 output format.
 *
 * @return new context, or NULL on failure
 */
struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv420_scaler(int source_w, int source_h, int target_w, int target_h, int quality)
{
    const int output_format = FMT_OMAPFB_YUV420;

    return sws_arm_jit_create_scaler_internal(source_w, source_h,
                                              target_w, target_h,
                                              quality, output_format);
}
/**
 * Create a FMT_OMAPFB_YUV420 scaler whose context is flagged as running on
 * an ARMv6 capable CPU.
 *
 * @return new context, or NULL on failure
 */
struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv420_scaler_armv6(int source_w, int source_h, int target_w, int target_h, int quality)
{
    struct SwsContextArmJit *scaler;

    scaler = sws_arm_jit_create_scaler_internal(source_w, source_h,
                                                target_w, target_h,
                                                quality, FMT_OMAPFB_YUV420);
    if (scaler == NULL)
        return NULL;

    scaler->armv6_is_supported = 1;
    return scaler;
}
/**
 * Release a scaler context together with its code buffer and row table.
 * Passing NULL is allowed and does nothing.
 */
void sws_arm_jit_free(SwsContextArmJit *context)
{
    if (context != NULL) {
        munmap(context->codebuffer, INSTRUCTIONS_CACHE_SIZE);
        free(context->linebuffer);
        free(context);
    }
}
/**
 * Convert a picture whose width does not change: every destination row is
 * produced from one source row (taken from context->linebuffer) by the
 * assembly line converters; the generated code is not used.
 *
 * The converters are called directly instead of through a function pointer
 * of a different type, which would be undefined behaviour.
 *
 * @param context - scaler context
 * @param src, srcStride - source plane pointers and their pitches in bytes
 * @param dst, dstStride - destination plane pointers and pitches; only
 *                         plane 0 is written
 * @return 1 on success, 0 if the context format is not supported
 */
static int sws_arm_jit_vscaleonly_internal(SwsContextArmJit *context, uint8_t* src[], int srcStride[], uint8_t* dst[], int dstStride[])
{
    int i;

    if (context->fmt == FMT_OMAPFB_YUV420) {
        for (i = 0; i < context->target_h; i++) {
            const int j = context->linebuffer[i];
            /* Odd destination rows take their chroma from plane 2, even rows from plane 1 */
            const int chroma = (i & 1) ? 2 : 1;
            uint16_t *out = (uint16_t *)(dst[0] + i * dstStride[0]);
            const uint8_t *src_y = src[0] + j * srcStride[0];
            const uint8_t *src_c = src[chroma] + (j / 2) * srcStride[chroma];

            /* The ARMv6 routine asks for a 16-bit aligned Y pointer; fall
             * back to the byte-wise routine for odd addresses */
            if (context->armv6_is_supported && ((uintptr_t)src_y & 1) == 0)
                yv12_to_yuv420_line_armv6(out, (const uint16_t *)src_y, src_c, context->target_w);
            else
                yv12_to_yuv420_line_arm(out, src_y, src_c, context->target_w);
        }
        return 1;
    }

    if (context->fmt == FMT_OMAPFB_YUV422) {
        for (i = 0; i < context->target_h; i++) {
            const int j = context->linebuffer[i];

            yv12_to_yuy2_line_arm(
                (uint32_t *)(dst[0] + i * dstStride[0]),
                (const uint16_t *)(src[0] + j * srcStride[0]),
                src[1] + (j / 2) * srcStride[1],
                src[2] + (j / 2) * srcStride[2],
                context->target_w);
        }
        return 1;
    }

    return 0;
}
/**
 * Scale a whole picture. When the width is unchanged the work is handed to
 * sws_arm_jit_vscaleonly_internal(); otherwise the generated line scaler in
 * context->codebuffer is invoked once per destination row.
 *
 * @return 1 on success, 0 if the context format is not supported
 */
static int sws_arm_jit_scale_internal(SwsContextArmJit *context, uint8_t* src[], int srcStride[], uint8_t* dst[], int dstStride[])
{
    typedef void (*line_scaler_fn)(uint8_t *y, uint8_t *u, uint8_t *v, uint8_t *out);
    const line_scaler_fn run_line = (line_scaler_fn)context->codebuffer;
    int row, src_row;

    if (context->source_w == context->target_w)
        return sws_arm_jit_vscaleonly_internal(context, src, srcStride, dst, dstStride);

    switch (context->fmt) {
    case FMT_OMAPFB_YUV422:
        for (row = 0; row < context->target_h; row++) {
            src_row = context->linebuffer[row];
            run_line(
                src[0] + src_row * srcStride[0],
                src[1] + (src_row / 2) * srcStride[1],
                src[2] + (src_row / 2) * srcStride[2],
                dst[0] + row * dstStride[0]);
        }
        return 1;

    case FMT_OMAPFB_YUV420:
        for (row = 0; row < context->target_h; row++) {
            uint8_t *chroma;

            src_row = context->linebuffer[row];
            /* Odd destination rows read plane 2, even rows read plane 1 */
            if (row & 1)
                chroma = src[2] + (src_row / 2) * srcStride[2];
            else
                chroma = src[1] + (src_row / 2) * srcStride[1];
            run_line(
                src[0] + src_row * srcStride[0],
                chroma,
                0,
                dst[0] + row * dstStride[0]);
        }
        return 1;

    default:
        return 0;
    }
}
/**
 * Scale a picture. Only whole-picture requests are handled: y must be 0
 * and h must equal the source height of the context.
 *
 * @return 1 on success, 0 for a slice request or an unsupported format
 */
int sws_arm_jit_scale(SwsContextArmJit *context, uint8_t* src[], int srcStride[], int y, int h, uint8_t* dst[], int dstStride[])
{
    // Slices are not supported yet
    const int whole_picture = (y == 0) && (h == context->source_h);

    if (!whole_picture)
        return 0;
    return sws_arm_jit_scale_internal(context, src, srcStride, dst, dstStride);
}
/*
* Fast JIT powered scaler for ARM
*
* Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
/* Public interface of the JIT scaler; the context type is opaque to callers. */
#ifndef ARM_JIT_SWSCALE_H
#define ARM_JIT_SWSCALE_H
#include <stdint.h>
struct SwsContextArmJit;
/* Create a scaler producing the OMAPFB YUV422 output format; returns NULL on failure. */
struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv422_scaler(int source_w, int source_h, int target_w, int target_h, int quality);
/* Create a scaler producing the OMAPFB YUV420 output format; returns NULL on failure. */
struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv420_scaler(int source_w, int source_h, int target_w, int target_h, int quality);
/* Same as the YUV420 scaler, with the context flagged for the ARMv6 line converter. */
struct SwsContextArmJit *sws_arm_jit_create_omapfb_yuv420_scaler_armv6(int source_w, int source_h, int target_w, int target_h, int quality);
/*
 * Scale a whole picture. Returns 0 unless y is 0 and h equals the source
 * height given at creation (slices are not supported); returns 1 on success.
 */
int sws_arm_jit_scale(struct SwsContextArmJit *context, uint8_t* src[], int srcStride[], int y, int h, uint8_t* dst[], int dstStride[]);
/* Release a context; a NULL context is ignored. */
void sws_arm_jit_free(struct SwsContextArmJit *context);
#endif
/*****************************************************************************
* swscale_maemo.c: scaling and chroma conversion using libswscale_nokia770
*****************************************************************************
* Copyright (C) 1999-2008 the VideoLAN team
* $Id$
*
* Authors: Antoine Lejeune <phytos@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <vlc_common.h>
#include <vlc_plugin.h>
#include <vlc_vout.h>
#include <vlc_filter.h>
#include "libswscale_nokia770/arm_jit_swscale.h"
#include "libswscale_nokia770/arm_colorconv.h"
/****************************************************************************
* Local prototypes
****************************************************************************/
static int OpenScaler( vlc_object_t * );
static void CloseScaler( vlc_object_t * );
static picture_t *Filter( filter_t *, picture_t * );
static int Init( filter_t * );
/*****************************************************************************
* Module descriptor
*****************************************************************************/
/* Declares a "video filter2" capability with score 1000 in the video
 * filter subcategory, opened and closed through OpenScaler/CloseScaler. */
vlc_module_begin();
set_description( N_("Video scaling filter") );
set_capability( "video filter2", 1000 );
set_category( CAT_VIDEO );
set_subcategory( SUBCAT_VIDEO_VFILTER );
set_callbacks( OpenScaler, CloseScaler );
vlc_module_end();
/*****************************************************************************
* filter_sys_t : filter descriptor
*****************************************************************************/
struct filter_sys_t
{
/* JIT scaler for the current formats; NULL until Init() has created one */
struct SwsContextArmJit *ctx;
/* Copies of the filter formats the scaler in ctx was created for */
es_format_t fmt_in;
es_format_t fmt_out;
};
/*****************************************************************************
* OpenScaler: probe the filter and return score
*****************************************************************************/
/* Allocate the private state, install the filter callback and build the
 * scaler for the formats currently set on the filter. */
static int OpenScaler( vlc_object_t *p_this )
{
    filter_t *p_filter = (filter_t*)p_this;
    filter_sys_t *p_sys = (filter_sys_t *)malloc(sizeof(filter_sys_t));

    /* Allocate the memory needed to store the filter's private structure */
    p_filter->p_sys = p_sys;
    if( p_sys == NULL )
        return VLC_ENOMEM;

    /* Misc init */
    p_sys->ctx = NULL;
    p_filter->pf_video_filter = Filter;
    es_format_Init( &p_sys->fmt_in, 0, 0 );
    es_format_Init( &p_sys->fmt_out, 0, 0 );

    if( Init( p_filter ) != 0 )
    {
        free( p_sys );
        return VLC_EGENERIC;
    }

    msg_Dbg( p_filter, "%ix%i chroma: %4.4s -> %ix%i chroma: %4.4s",
             p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
             (char *)&p_filter->fmt_in.video.i_chroma,
             p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
             (char *)&p_filter->fmt_out.video.i_chroma );
    return VLC_SUCCESS;
}
/*****************************************************************************
* CloseFilter: clean up the filter
*****************************************************************************/
/* Release the scaler context (if one was created) and the private state. */
static void CloseScaler( vlc_object_t *p_this )
{
    filter_sys_t *p_sys = ((filter_t*)p_this)->p_sys;

    if( p_sys->ctx != NULL )
    {
        sws_arm_jit_free( p_sys->ctx );
    }
    free( p_sys );
}
/*****************************************************************************
* Helpers
*****************************************************************************/
/* Two video formats are considered similar when chroma, width and height
 * all match. */
static bool IsFmtSimilar( const video_format_t *p_fmt1, const video_format_t *p_fmt2 )
{
    if( p_fmt1->i_chroma != p_fmt2->i_chroma )
        return false;
    if( p_fmt1->i_width != p_fmt2->i_width )
        return false;
    return p_fmt1->i_height == p_fmt2->i_height;
}
/* (Re)create the scaler when the filter formats differ from the ones the
 * current scaler was built for.
 *
 * The cached formats are compared through their video member: passing the
 * es_format_t itself where a video_format_t is expected makes the
 * comparison look at unrelated fields, so the scaler would be rebuilt for
 * (almost) every picture.
 *
 * Returns VLC_SUCCESS when a matching scaler is available, VLC_EGENERIC
 * when the chroma pair is unsupported or the scaler cannot be created. */
static int Init( filter_t *p_filter )
{
    filter_sys_t *p_sys = p_filter->p_sys;
    const video_format_t *p_fmt_in = &p_filter->fmt_in.video;
    const video_format_t *p_fmt_out = &p_filter->fmt_out.video;

    /* Nothing to do while the existing scaler still matches */
    if( p_sys->ctx &&
        IsFmtSimilar( p_fmt_in, &p_sys->fmt_in.video ) &&
        IsFmtSimilar( p_fmt_out, &p_sys->fmt_out.video ) )
    {
        return VLC_SUCCESS;
    }

    /* Only planar 4:2:0 input and Y420 output are handled */
    if( ( p_fmt_in->i_chroma != VLC_FOURCC('I','4','2','0') &&
          p_fmt_in->i_chroma != VLC_FOURCC('I','Y','U','V') &&
          p_fmt_in->i_chroma != VLC_FOURCC('Y','V','1','2') ) ||
        p_fmt_out->i_chroma != VLC_FOURCC('Y','4','2','0') )
    {
        msg_Err( p_filter, "format not supported" );
        return VLC_EGENERIC;
    }

    if( p_sys->ctx )
    {
        sws_arm_jit_free( p_sys->ctx );
        p_sys->ctx = NULL;
    }

    /* NOTE(review): the ARMv6 flavour is requested unconditionally; there
     * is no CPU capability check here */
    p_sys->ctx =
        sws_arm_jit_create_omapfb_yuv420_scaler_armv6(
            p_fmt_in->i_width,
            p_fmt_in->i_height,
            p_fmt_out->i_width,
            p_fmt_out->i_height, 2 );
    if( !p_sys->ctx )
    {
        msg_Err( p_filter, "could not init SwScaler" );
        return VLC_EGENERIC;
    }

    /* Remember which formats the scaler was built for */
    p_sys->fmt_in = p_filter->fmt_in;
    p_sys->fmt_out = p_filter->fmt_out;
    return VLC_SUCCESS;
}
/****************************************************************************
* Filter: the whole thing
****************************************************************************
* This function is called just after the thread is launched.
****************************************************************************/
/* Convert and scale one picture.
 *
 * The input picture is owned by this function: it is released on every
 * path, including the failure paths that return NULL. Returns the newly
 * allocated output picture, or NULL if the scaler cannot be set up or no
 * output picture is available. */
static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
{
    filter_sys_t *p_sys = p_filter->p_sys;
    /* Unused plane slots stay NULL/0 instead of holding stack garbage */
    uint8_t *src[3] = { NULL, NULL, NULL }; int src_stride[3] = { 0, 0, 0 };
    uint8_t *dst[3] = { NULL, NULL, NULL }; int dst_stride[3] = { 0, 0, 0 };
    picture_t *p_pic_dst;
    int i_plane;

    /* Check if format properties changed */
    if( Init( p_filter ) != VLC_SUCCESS )
    {
        picture_Release( p_pic );
        return NULL;
    }

    /* Request output picture */
    p_pic_dst = p_filter->pf_vout_buffer_new( p_filter );
    if( !p_pic_dst )
    {
        msg_Warn( p_filter, "can't get output picture" );
        picture_Release( p_pic );
        return NULL;
    }

    for( i_plane = 0; i_plane < __MIN(3, p_pic->i_planes); i_plane++ )
    {
        src[i_plane] = p_pic->p[i_plane].p_pixels;
        src_stride[i_plane] = p_pic->p[i_plane].i_pitch;
    }
    /* The destination has its own plane count; do not reuse the source one */
    for( i_plane = 0; i_plane < __MIN(3, p_pic_dst->i_planes); i_plane++ )
    {
        dst[i_plane] = p_pic_dst->p[i_plane].p_pixels;
        dst_stride[i_plane] = p_pic_dst->p[i_plane].i_pitch;
    }

    if( !sws_arm_jit_scale( p_sys->ctx, src, src_stride, 0,
                            p_filter->fmt_in.video.i_height, dst, dst_stride ) )
        msg_Warn( p_filter, "scaling failed" );

    picture_CopyProperties( p_pic_dst, p_pic );
    picture_Release( p_pic );
    return p_pic_dst;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment