Commit d9354df3 authored by gpoirier

Use the shorter vec_"type" typedefs instead of the overly long vector "type" spellings.

Part 1 of the H.264 8x8 luma interpolation patch for AltiVec, contributed by
Mauricio Alvarez % lokifo A gmail P com %
Original thread:
Date: Jun 26, 2007 8:15 PM
Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@10090 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent 29734a56
...@@ -186,32 +186,32 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -186,32 +186,32 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
((8 - x) * (y)), ((8 - x) * (y)),
((x) * (y))}; ((x) * (y))};
register int i; register int i;
vector unsigned char fperm; vec_u8_t fperm;
const vector signed int vABCD = vec_ld(0, ABCD); const vec_s32_t vABCD = vec_ld(0, ABCD);
const vector signed short vA = vec_splat((vector signed short)vABCD, 1); const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
const vector signed short vB = vec_splat((vector signed short)vABCD, 3); const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
const vector signed short vC = vec_splat((vector signed short)vABCD, 5); const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
const vector signed short vD = vec_splat((vector signed short)vABCD, 7); const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
const vector signed int vzero = vec_splat_s32(0); LOAD_ZERO;
const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
const vector unsigned short v6us = vec_splat_u16(6); const vec_u16_t v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vector unsigned char vsrc0uc, vsrc1uc; vec_u8_t vsrc0uc, vsrc1uc;
vector signed short vsrc0ssH, vsrc1ssH; vec_s16_t vsrc0ssH, vsrc1ssH;
vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
vector signed short vsrc2ssH, vsrc3ssH, psum; vec_s16_t vsrc2ssH, vsrc3ssH, psum;
vector unsigned char vdst, ppsum, fsum; vec_u8_t vdst, ppsum, fsum;
if (((unsigned long)dst) % 16 == 0) { if (((unsigned long)dst) % 16 == 0) {
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17, 0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F); 0x0C, 0x0D, 0x0E, 0x0F);
} else { } else {
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B, 0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F); 0x1C, 0x1D, 0x1E, 0x1F);
...@@ -230,10 +230,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -230,10 +230,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
(vector unsigned char)vsrc0uc); vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
...@@ -244,10 +242,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -244,10 +242,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
(vector unsigned char)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
...@@ -257,7 +253,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -257,7 +253,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sra(psum, v6us); psum = vec_sra(psum, v6us);
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_packsu(psum, psum); ppsum = (vec_u8_t)vec_packsu(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm); fsum = vec_perm(vdst, ppsum, fperm);
vec_st(fsum, 0, dst); vec_st(fsum, 0, dst);
...@@ -269,7 +265,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -269,7 +265,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
src += stride; src += stride;
} }
} else { } else {
vector unsigned char vsrcDuc; vec_u8_t vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(stride + 16, src);
...@@ -280,10 +276,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -280,10 +276,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
(vector unsigned char)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
...@@ -293,7 +287,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride ...@@ -293,7 +287,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sr(psum, v6us); psum = vec_sr(psum, v6us);
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_pack(psum, psum); ppsum = (vec_u8_t)vec_pack(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm); fsum = vec_perm(vdst, ppsum, fperm);
vec_st(fsum, 0, dst); vec_st(fsum, 0, dst);
...@@ -312,7 +306,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, ...@@ -312,7 +306,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h) int src_stride1, int h)
{ {
int i; int i;
vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2); mask_ = vec_lvsl(0, src2);
...@@ -354,7 +348,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, ...@@ -354,7 +348,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h) int src_stride1, int h)
{ {
int i; int i;
vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2); mask_ = vec_lvsl(0, src2);
...@@ -567,8 +561,7 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { ...@@ -567,8 +561,7 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
const vec_u16_t twov = vec_splat_u16(2); const vec_u16_t twov = vec_splat_u16(2);
const vec_u16_t sixv = vec_splat_u16(6); const vec_u16_t sixv = vec_splat_u16(6);
const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0, const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
-1,-1,-1,-1,-1,-1,-1,-1);
LOAD_ZERO; LOAD_ZERO;
dct[0] += 32; // rounding for the >>6 at the end dct[0] += 32; // rounding for the >>6 at the end
...@@ -601,10 +594,10 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { ...@@ -601,10 +594,10 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
} }
#define transpose4x16(r0, r1, r2, r3) { \ #define transpose4x16(r0, r1, r2, r3) { \
register vector unsigned char r4; \ register vec_u8_t r4; \
register vector unsigned char r5; \ register vec_u8_t r5; \
register vector unsigned char r6; \ register vec_u8_t r6; \
register vector unsigned char r7; \ register vec_u8_t r7; \
\ \
r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
...@@ -618,8 +611,8 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { ...@@ -618,8 +611,8 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
} }
static inline void write16x4(uint8_t *dst, int dst_stride, static inline void write16x4(uint8_t *dst, int dst_stride,
register vector unsigned char r0, register vector unsigned char r1, register vec_u8_t r0, register vec_u8_t r1,
register vector unsigned char r2, register vector unsigned char r3) { register vec_u8_t r2, register vec_u8_t r3) {
DECLARE_ALIGNED_16(unsigned char, result[64]); DECLARE_ALIGNED_16(unsigned char, result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4; int int_dst_stride = dst_stride/4;
...@@ -651,16 +644,16 @@ static inline void write16x4(uint8_t *dst, int dst_stride, ...@@ -651,16 +644,16 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
\todo FIXME: see if we can't spare some vec_lvsl() by factorizing them \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
out of unaligned_load() */ out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
register vector unsigned char r0 = unaligned_load(0, src);\ register vec_u8_t r0 = unaligned_load(0, src); \
register vector unsigned char r1 = unaligned_load( src_stride, src);\ register vec_u8_t r1 = unaligned_load( src_stride, src); \
register vector unsigned char r2 = unaligned_load(2* src_stride, src);\ register vec_u8_t r2 = unaligned_load(2* src_stride, src); \
register vector unsigned char r3 = unaligned_load(3* src_stride, src);\ register vec_u8_t r3 = unaligned_load(3* src_stride, src); \
register vector unsigned char r4 = unaligned_load(4* src_stride, src);\ register vec_u8_t r4 = unaligned_load(4* src_stride, src); \
register vector unsigned char r5 = unaligned_load(5* src_stride, src);\ register vec_u8_t r5 = unaligned_load(5* src_stride, src); \
register vector unsigned char r6 = unaligned_load(6* src_stride, src);\ register vec_u8_t r6 = unaligned_load(6* src_stride, src); \
register vector unsigned char r7 = unaligned_load(7* src_stride, src);\ register vec_u8_t r7 = unaligned_load(7* src_stride, src); \
register vector unsigned char r14 = unaligned_load(14*src_stride, src);\ register vec_u8_t r14 = unaligned_load(14*src_stride, src); \
register vector unsigned char r15 = unaligned_load(15*src_stride, src);\ register vec_u8_t r15 = unaligned_load(15*src_stride, src); \
\ \
r8 = unaligned_load( 8*src_stride, src); \ r8 = unaligned_load( 8*src_stride, src); \
r9 = unaligned_load( 9*src_stride, src); \ r9 = unaligned_load( 9*src_stride, src); \
...@@ -710,26 +703,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride, ...@@ -710,26 +703,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
} }
// out: o = |x-y| < a // out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x, static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
register vector unsigned char y, register vec_u8_t y,
register vector unsigned char a) { register vec_u8_t a) {
register vector unsigned char diff = vec_subs(x, y); register vec_u8_t diff = vec_subs(x, y);
register vector unsigned char diffneg = vec_subs(y, x); register vec_u8_t diffneg = vec_subs(y, x);
register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */ register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
o = (vector unsigned char)vec_cmplt(o, a); o = (vec_u8_t)vec_cmplt(o, a);
return o; return o;
} }
static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0, static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
register vector unsigned char p1, register vec_u8_t p1,
register vector unsigned char q0, register vec_u8_t q0,
register vector unsigned char q1, register vec_u8_t q1,
register vector unsigned char alpha, register vec_u8_t alpha,
register vector unsigned char beta) { register vec_u8_t beta) {
register vector unsigned char mask; register vec_u8_t mask;
register vector unsigned char tempmask; register vec_u8_t tempmask;
mask = diff_lt_altivec(p0, q0, alpha); mask = diff_lt_altivec(p0, q0, alpha);
tempmask = diff_lt_altivec(p1, p0, beta); tempmask = diff_lt_altivec(p1, p0, beta);
...@@ -741,19 +734,19 @@ static inline vector unsigned char h264_deblock_mask ( register vector unsigned ...@@ -741,19 +734,19 @@ static inline vector unsigned char h264_deblock_mask ( register vector unsigned
} }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
register vector unsigned char p1, register vec_u8_t p1,
register vector unsigned char p2, register vec_u8_t p2,
register vector unsigned char q0, register vec_u8_t q0,
register vector unsigned char tc0) { register vec_u8_t tc0) {
register vector unsigned char average = vec_avg(p0, q0); register vec_u8_t average = vec_avg(p0, q0);
register vector unsigned char temp; register vec_u8_t temp;
register vector unsigned char uncliped; register vec_u8_t uncliped;
register vector unsigned char ones; register vec_u8_t ones;
register vector unsigned char max; register vec_u8_t max;
register vector unsigned char min; register vec_u8_t min;
register vector unsigned char newp1; register vec_u8_t newp1;
temp = vec_xor(average, p2); temp = vec_xor(average, p2);
average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
...@@ -769,16 +762,16 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char ...@@ -769,16 +762,16 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
\ \
const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
\ \
register vector unsigned char pq0bit = vec_xor(p0,q0); \ register vec_u8_t pq0bit = vec_xor(p0,q0); \
register vector unsigned char q1minus; \ register vec_u8_t q1minus; \
register vector unsigned char p0minus; \ register vec_u8_t p0minus; \
register vector unsigned char stage1; \ register vec_u8_t stage1; \
register vector unsigned char stage2; \ register vec_u8_t stage2; \
register vector unsigned char vec160; \ register vec_u8_t vec160; \
register vector unsigned char delta; \ register vec_u8_t delta; \
register vector unsigned char deltaneg; \ register vec_u8_t deltaneg; \
\ \
q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
...@@ -801,16 +794,16 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char ...@@ -801,16 +794,16 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
DECLARE_ALIGNED_16(unsigned char, temp[16]); \ DECLARE_ALIGNED_16(unsigned char, temp[16]); \
register vector unsigned char alphavec; \ register vec_u8_t alphavec; \
register vector unsigned char betavec; \ register vec_u8_t betavec; \
register vector unsigned char mask; \ register vec_u8_t mask; \
register vector unsigned char p1mask; \ register vec_u8_t p1mask; \
register vector unsigned char q1mask; \ register vec_u8_t q1mask; \
register vector signed char tc0vec; \ register vector signed char tc0vec; \
register vector unsigned char finaltc0; \ register vec_u8_t finaltc0; \
register vector unsigned char tc0masked; \ register vec_u8_t tc0masked; \
register vector unsigned char newp1; \ register vec_u8_t newp1; \
register vector unsigned char newq1; \ register vec_u8_t newq1; \
\ \
temp[0] = alpha; \ temp[0] = alpha; \
temp[1] = beta; \ temp[1] = beta; \
...@@ -824,18 +817,18 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char ...@@ -824,18 +817,18 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char
tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \
tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \
mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \ finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \
\ \
p1mask = diff_lt_altivec(p2, p0, betavec); \ p1mask = diff_lt_altivec(p2, p0, betavec); \
p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \
tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec); \ tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \
finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
/*end if*/ \ /*end if*/ \
\ \
q1mask = diff_lt_altivec(q2, q0, betavec); \ q1mask = diff_lt_altivec(q2, q0, betavec); \
q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec); \ tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \
finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
/*end if*/ \ /*end if*/ \
...@@ -848,12 +841,12 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char ...@@ -848,12 +841,12 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
register vector unsigned char p2 = vec_ld(-3*stride, pix); register vec_u8_t p2 = vec_ld(-3*stride, pix);
register vector unsigned char p1 = vec_ld(-2*stride, pix); register vec_u8_t p1 = vec_ld(-2*stride, pix);
register vector unsigned char p0 = vec_ld(-1*stride, pix); register vec_u8_t p0 = vec_ld(-1*stride, pix);
register vector unsigned char q0 = vec_ld(0, pix); register vec_u8_t q0 = vec_ld(0, pix);
register vector unsigned char q1 = vec_ld(stride, pix); register vec_u8_t q1 = vec_ld(stride, pix);
register vector unsigned char q2 = vec_ld(2*stride, pix); register vec_u8_t q2 = vec_ld(2*stride, pix);
h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
vec_st(p1, -2*stride, pix); vec_st(p1, -2*stride, pix);
vec_st(p0, -1*stride, pix); vec_st(p0, -1*stride, pix);
...@@ -864,7 +857,7 @@ static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, ...@@ -864,7 +857,7 @@ static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
register vector unsigned char line0, line1, line2, line3, line4, line5; register vec_u8_t line0, line1, line2, line3, line4, line5;
if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
return; return;
readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
......
...@@ -27,34 +27,34 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -27,34 +27,34 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
((8 - x) * (y)), ((8 - x) * (y)),
((x) * (y))}; ((x) * (y))};
register int i; register int i;
vector unsigned char fperm; vec_u8_t fperm;
const vector signed int vABCD = vec_ld(0, ABCD); const vec_s32_t vABCD = vec_ld(0, ABCD);
const vector signed short vA = vec_splat((vector signed short)vABCD, 1); const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
const vector signed short vB = vec_splat((vector signed short)vABCD, 3); const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
const vector signed short vC = vec_splat((vector signed short)vABCD, 5); const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
const vector signed short vD = vec_splat((vector signed short)vABCD, 7); const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
const vector signed int vzero = vec_splat_s32(0); LOAD_ZERO;
const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vector unsigned short v6us = vec_splat_u16(6); const vec_u16_t v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vector unsigned char vsrc0uc, vsrc1uc; vec_u8_t vsrc0uc, vsrc1uc;
vector signed short vsrc0ssH, vsrc1ssH; vec_s16_t vsrc0ssH, vsrc1ssH;
vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
vector signed short vsrc2ssH, vsrc3ssH, psum; vec_s16_t vsrc2ssH, vsrc3ssH, psum;
vector unsigned char vdst, ppsum, vfdst, fsum; vec_u8_t vdst, ppsum, vfdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
if (((unsigned long)dst) % 16 == 0) { if (((unsigned long)dst) % 16 == 0) {
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17, 0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F); 0x0C, 0x0D, 0x0E, 0x0F);
} else { } else {
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B, 0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F); 0x1C, 0x1D, 0x1E, 0x1F);
...@@ -73,10 +73,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -73,10 +73,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
else else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
(vector unsigned char)vsrc0uc); vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
...@@ -87,10 +85,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -87,10 +85,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
(vector unsigned char)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
...@@ -100,7 +96,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -100,7 +96,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
psum = vec_sra(psum, v6us); psum = vec_sra(psum, v6us);
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_packsu(psum, psum); ppsum = (vec_u8_t)vec_packsu(psum, psum);
vfdst = vec_perm(vdst, ppsum, fperm); vfdst = vec_perm(vdst, ppsum, fperm);
OP_U8_ALTIVEC(fsum, vfdst, vdst); OP_U8_ALTIVEC(fsum, vfdst, vdst);
...@@ -114,7 +110,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -114,7 +110,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
src += stride; src += stride;
} }
} else { } else {
vector unsigned char vsrcDuc; vec_u8_t vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(stride + 16, src);
...@@ -125,10 +121,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -125,10 +121,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
else else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
(vector unsigned char)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
...@@ -138,7 +132,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in ...@@ -138,7 +132,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
psum = vec_sr(psum, v6us); psum = vec_sr(psum, v6us);
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_pack(psum, psum); ppsum = (vec_u8_t)vec_pack(psum, psum);
vfdst = vec_perm(vdst, ppsum, fperm); vfdst = vec_perm(vdst, ppsum, fperm);
OP_U8_ALTIVEC(fsum, vfdst, vdst); OP_U8_ALTIVEC(fsum, vfdst, vdst);
...@@ -160,44 +154,39 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -160,44 +154,39 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
register int i; register int i;
const vector signed int vzero = vec_splat_s32(0); LOAD_ZERO;
const vector unsigned char permM2 = vec_lvsl(-2, src); const vec_u8_t permM2 = vec_lvsl(-2, src);
const vector unsigned char permM1 = vec_lvsl(-1, src); const vec_u8_t permM1 = vec_lvsl(-1, src);
const vector unsigned char permP0 = vec_lvsl(+0, src); const vec_u8_t permP0 = vec_lvsl(+0, src);
const vector unsigned char permP1 = vec_lvsl(+1, src); const vec_u8_t permP1 = vec_lvsl(+1, src);
const vector unsigned char permP2 = vec_lvsl(+2, src); const vec_u8_t permP2 = vec_lvsl(+2, src);
const vector unsigned char permP3 = vec_lvsl(+3, src); const vec_u8_t permP3 = vec_lvsl(+3, src);
const vector signed short v5ss = vec_splat_s16(5); const vec_s16_t v5ss = vec_splat_s16(5);
const vector unsigned short v5us = vec_splat_u16(5); const vec_u16_t v5us = vec_splat_u16(5);
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vector unsigned char dstperm = vec_lvsr(0, dst); const vec_u8_t dstperm = vec_lvsr(0, dst);
const vector unsigned char neg1 = const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
(const vector unsigned char) vec_splat_s8(-1); const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
const vector unsigned char dstmask = vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vec_perm((const vector unsigned char)vzero,
neg1, dstperm);
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
register int align = ((((unsigned long)src) - 2) % 16); register int align = ((((unsigned long)src) - 2) % 16);
vector signed short srcP0A, srcP0B, srcP1A, srcP1B, vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B, srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B, srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB; psumA, psumB, sumA, sumB;
vector unsigned char sum, dst1, dst2, vdst, fsum, vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2;
rsum, fdst1, fdst2;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
for (i = 0 ; i < 16 ; i ++) { for (i = 0 ; i < 16 ; i ++) {
vector unsigned char srcR1 = vec_ld(-2, src); vec_u8_t srcR1 = vec_ld(-2, src);
vector unsigned char srcR2 = vec_ld(14, src); vec_u8_t srcR2 = vec_ld(14, src);
switch (align) { switch (align) {
default: { default: {
...@@ -217,7 +206,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -217,7 +206,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = srcR2; srcP3 = srcR2;
} break; } break;
case 12: { case 12: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
...@@ -226,7 +215,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -226,7 +215,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 13: { case 13: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
...@@ -235,7 +224,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -235,7 +224,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 14: { case 14: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2; srcP0 = srcR2;
...@@ -244,7 +233,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -244,7 +233,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 15: { case 15: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2; srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0); srcP0 = vec_perm(srcR2, srcR3, permP0);
...@@ -254,32 +243,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -254,32 +243,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
} break; } break;
} }
srcP0A = (vector signed short) srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
vec_mergeh((vector unsigned char)vzero, srcP0); srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
srcP0B = (vector signed short) srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
vec_mergel((vector unsigned char)vzero, srcP0); srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
srcP1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1); srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
srcP1B = (vector signed short) srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
vec_mergel((vector unsigned char)vzero, srcP1); srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
srcP2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2); srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
srcP2B = (vector signed short) srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
vec_mergel((vector unsigned char)vzero, srcP2); srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
srcP3A = (vector signed short) srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);
srcM1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
srcM1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
srcM2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
srcM2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
sum1A = vec_adds(srcP0A, srcP1A); sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B); sum1B = vec_adds(srcP0B, srcP1B);
...@@ -291,8 +268,8 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -291,8 +268,8 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A); pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B); pp3B = vec_add(sum3B, pp1B);
...@@ -330,67 +307,56 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -330,67 +307,56 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
register int i; register int i;
const vector signed int vzero = vec_splat_s32(0); LOAD_ZERO;
const vector unsigned char perm = vec_lvsl(0, src); const vec_u8_t perm = vec_lvsl(0, src);
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector unsigned short v5us = vec_splat_u16(5); const vec_u16_t v5us = vec_splat_u16(5);
const vector signed short v5ss = vec_splat_s16(5); const vec_s16_t v5ss = vec_splat_s16(5);
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vector unsigned char dstperm = vec_lvsr(0, dst); const vec_u8_t dstperm = vec_lvsr(0, dst);
const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
uint8_t *srcbis = src - (srcStride * 2); uint8_t *srcbis = src - (srcStride * 2);
const vector unsigned char srcM2a = vec_ld(0, srcbis); const vec_u8_t srcM2a = vec_ld(0, srcbis);
const vector unsigned char srcM2b = vec_ld(16, srcbis); const vec_u8_t srcM2b = vec_ld(16, srcbis);
const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcM1b = vec_ld(16, srcbis); const vec_u8_t srcM1b = vec_ld(16, srcbis);
const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP0b = vec_ld(16, srcbis); const vec_u8_t srcP0b = vec_ld(16, srcbis);
const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP1b = vec_ld(16, srcbis); const vec_u8_t srcP1b = vec_ld(16, srcbis);
const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride); const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP2b = vec_ld(16, srcbis); const vec_u8_t srcP2b = vec_ld(16, srcbis);
const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
// srcbis += srcStride; // srcbis += srcStride;
vector signed short srcM2ssA = (vector signed short) vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
vec_mergeh((vector unsigned char)vzero, srcM2); vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
vector signed short srcM2ssB = (vector signed short) vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
vec_mergel((vector unsigned char)vzero, srcM2); vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
vector signed short srcM1ssA = (vector signed short) vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
vec_mergeh((vector unsigned char)vzero, srcM1); vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
vector signed short srcM1ssB = (vector signed short) vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
vec_mergel((vector unsigned char)vzero, srcM1); vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
vector signed short srcP0ssA = (vector signed short) vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
vec_mergeh((vector unsigned char)vzero, srcP0); vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
vector signed short srcP0ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP0); vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
vector signed short srcP1ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1);
vector signed short srcP1ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP1);
vector signed short srcP2ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2);
vector signed short srcP2ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP2);
vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB, psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB, srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3;
srcP3a, srcP3b, srcP3;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
...@@ -398,10 +364,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -398,10 +364,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3a = vec_ld(0, srcbis += srcStride); srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis); srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm); srcP3 = vec_perm(srcP3a, srcP3b, perm);
srcP3ssA = (vector signed short) srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
vec_mergeh((vector unsigned char)vzero, srcP3); srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
srcP3ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);
// srcbis += srcStride; // srcbis += srcStride;
sum1A = vec_adds(srcP0ssA, srcP1ssA); sum1A = vec_adds(srcP0ssA, srcP1ssA);
...@@ -425,8 +389,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -425,8 +389,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A); pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B); pp3B = vec_add(sum3B, pp1B);
...@@ -461,58 +425,56 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i ...@@ -461,58 +425,56 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i; register int i;
const vector signed int vzero = vec_splat_s32(0); LOAD_ZERO;
const vector unsigned char permM2 = vec_lvsl(-2, src); const vec_u8_t permM2 = vec_lvsl(-2, src);
const vector unsigned char permM1 = vec_lvsl(-1, src); const vec_u8_t permM1 = vec_lvsl(-1, src);
const vector unsigned char permP0 = vec_lvsl(+0, src); const vec_u8_t permP0 = vec_lvsl(+0, src);
const vector unsigned char permP1 = vec_lvsl(+1, src); const vec_u8_t permP1 = vec_lvsl(+1, src);
const vector unsigned char permP2 = vec_lvsl(+2, src); const vec_u8_t permP2 = vec_lvsl(+2, src);
const vector unsigned char permP3 = vec_lvsl(+3, src); const vec_u8_t permP3 = vec_lvsl(+3, src);
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector unsigned int v10ui = vec_splat_u32(10); const vec_u32_t v10ui = vec_splat_u32(10);
const vector signed short v5ss = vec_splat_s16(5); const vec_s16_t v5ss = vec_splat_s16(5);
const vector signed short v1ss = vec_splat_s16(1); const vec_s16_t v1ss = vec_splat_s16(1);
const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
register int align = ((((unsigned long)src) - 2) % 16); register int align = ((((unsigned long)src) - 2) % 16);
const vector unsigned char neg1 = (const vector unsigned char) const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
vec_splat_s8(-1);
vector signed short srcP0A, srcP0B, srcP1A, srcP1B, vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B, srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B, srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB; pp1A, pp1B, pp2A, pp2B, psumA, psumB;
const vector unsigned char dstperm = vec_lvsr(0, dst); const vec_u8_t dstperm = vec_lvsr(0, dst);
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
const vector unsigned char mperm = (const vector unsigned char) const vec_u8_t mperm = (const vec_u8_t)
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
int16_t *tmpbis = tmp; int16_t *tmpbis = tmp;
vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB; tmpP2ssA, tmpP2ssB;
vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo; ssumAe, ssumAo, ssumBe, ssumBo;
vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2;
rsum, fdst1, fdst2; vec_s16_t ssume, ssumo;
vector signed short ssume, ssumo;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
src -= (2 * srcStride); src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) { for (i = 0 ; i < 21 ; i ++) {
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vector unsigned char srcR1 = vec_ld(-2, src); vec_u8_t srcR1 = vec_ld(-2, src);
vector unsigned char srcR2 = vec_ld(14, src); vec_u8_t srcR2 = vec_ld(14, src);
switch (align) { switch (align) {
default: { default: {
...@@ -532,7 +494,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -532,7 +494,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = srcR2; srcP3 = srcR2;
} break; } break;
case 12: { case 12: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
...@@ -541,7 +503,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -541,7 +503,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 13: { case 13: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
...@@ -550,7 +512,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -550,7 +512,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 14: { case 14: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2; srcP0 = srcR2;
...@@ -559,7 +521,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -559,7 +521,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 15: { case 15: {
vector unsigned char srcR3 = vec_ld(30, src); vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2; srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0); srcP0 = vec_perm(srcR2, srcR3, permP0);
...@@ -569,32 +531,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -569,32 +531,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
} break; } break;
} }
srcP0A = (vector signed short) srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
vec_mergeh((vector unsigned char)vzero, srcP0); srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
srcP0B = (vector signed short) srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
vec_mergel((vector unsigned char)vzero, srcP0); srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
srcP1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1); srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
srcP1B = (vector signed short) srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
vec_mergel((vector unsigned char)vzero, srcP1); srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
srcP2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2); srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
srcP2B = (vector signed short) srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
vec_mergel((vector unsigned char)vzero, srcP2); srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
srcP3A = (vector signed short) srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);
srcM1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
srcM1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
srcM2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
srcM2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
sum1A = vec_adds(srcP0A, srcP1A); sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B); sum1B = vec_adds(srcP0B, srcP1B);
...@@ -606,8 +556,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -606,8 +556,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp1A = vec_mladd(sum1A, v20ss, sum3A); pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B); pp1B = vec_mladd(sum1B, v20ss, sum3B);
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
psumA = vec_sub(pp1A, pp2A); psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B); psumB = vec_sub(pp1B, pp2B);
...@@ -636,15 +586,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -636,15 +586,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmpbis += tmpStride; tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) { for (i = 0 ; i < 16 ; i++) {
const vector signed short tmpP3ssA = vec_ld(0, tmpbis); const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
const vector signed short tmpP3ssB = vec_ld(16, tmpbis); const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride; tmpbis += tmpStride;
...@@ -669,9 +619,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, ...@@ -669,9 +619,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp2Be = vec_mule(sum2B, v5ss); pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss); pp2Bo = vec_mulo(sum2B, v5ss);
pp3Ae = vec_sra((vector signed int)sum3A, v16ui); pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss); pp3Ao = vec_mulo(sum3A, v1ss);
pp3Be = vec_sra((vector signed int)sum3B, v16ui); pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss); pp3Bo = vec_mulo(sum3B, v1ss);
pp1cAe = vec_add(pp1Ae, v512si); pp1cAe = vec_add(pp1Ae, v512si);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment