Commit c435c416 authored by diego

cosmetics: Make libavcodec/ppc/dsputil_altivec.c conform to style guidelines.

This includes indentation changes, comment reformatting, consistent brace
placement and some prettyprinting.


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@14318 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent 33297bf0
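Several of the comments reflowed below describe the same underlying trick: AltiVec has no unaligned vector load, so the code brackets the wanted bytes with two aligned vec_ld calls and merges them through a vec_perm mask built by vec_lvsl. A minimal stand-alone sketch of that idiom, with an illustrative helper name that does not appear in the patch:

#include <altivec.h>

/* Illustrative helper, not part of this patch: load 16 bytes from a
 * possibly unaligned address using only aligned loads. */
static vector unsigned char load_unaligned16(const unsigned char *p)
{
    /* vec_ld ignores the low 4 address bits, so these two aligned loads
     * bracket the 16 bytes starting at p (offset 15 keeps the second load
     * inside the same 16-byte block when p is already aligned). */
    vector unsigned char hi = vec_ld(0, p);
    vector unsigned char lo = vec_ld(15, p);
    /* vec_lvsl turns the misalignment of p into a permute mask that
     * shifts the wanted bytes into place. */
    return vec_perm(hi, lo, vec_lvsl(0, p));
}

The offset 15 on the second load is why put_pixels16_altivec below loads at offsets 0 and 15 when it wants exactly 16 bytes, while code that reads a 17th byte (the pix2iv/pix3iv variants) loads at offset 16.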
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -39,12 +39,10 @@ int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
     sad = (vector unsigned int)vec_splat_u32(0);
-    for(i=0;i<h;i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
-        */
+           pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
@@ -88,24 +86,20 @@ int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
     sad = (vector unsigned int)vec_splat_u32(0);
-    /*
-       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
        iteration becomes pix2 in the next iteration. We can use this
        fact to avoid a potentially expensive unaligned read, each
        time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
-      Split the pixel vectors into shorts
-    */
+      Split the pixel vectors into shorts */
     tv = (vector unsigned char *) &pix2[0];
     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
-    for(i=0;i<h;i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix3v: pix3[0]-pix3[15]
-        */
+           pix3v: pix3[0]-pix3[15] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
@@ -154,15 +148,13 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
-    /*
-       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
        iteration becomes pix2 in the next iteration. We can use this
        fact to avoid a potentially expensive unaligned read, as well
        as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
-      Split the pixel vectors into shorts
-    */
+      Split the pixel vectors into shorts */
     tv = (vector unsigned char *) &pix2[0];
     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
@@ -176,12 +168,10 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     t1 = vec_add(pix2hv, pix2ihv);
     t2 = vec_add(pix2lv, pix2ilv);
-    for(i=0;i<h;i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
-        */
+           pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
@@ -191,13 +181,11 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
         tv = (vector unsigned char *) &pix3[1];
         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
-        /*
-           Note that AltiVec does have vec_avg, but this works on vector pairs
+        /* Note that AltiVec does have vec_avg, but this works on vector pairs
            and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
            would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
            Instead, we have to split the pixel vectors into vectors of shorts,
-           and do the averaging by hand.
-        */
+           and do the averaging by hand. */
         /* Split the pixel vectors into shorts */
         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
@@ -248,7 +236,7 @@ int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     sad = (vector unsigned int)vec_splat_u32(0);
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2 */
         perm1 = vec_lvsl(0, pix1);
         pix1v = (vector unsigned char *) pix1;
@@ -291,7 +279,7 @@ int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2
            Since we're reading 16 pixels, and actually only want 8,
            mask out the last 8 pixels. The 0s don't change the sum. */
@@ -373,7 +361,7 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2
            Since we're reading 16 pixels, and actually only want 8,
            mask out the last 8 pixels. The 0s don't change the sum. */
@@ -384,10 +372,8 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
+        /* Since we want to use unsigned chars, we can take advantage
+           of the fact that abs(a-b)^2 = (a-b)^2. */
         /* Calculate abs differences vector */
         t3 = vec_max(t1, t2);
@@ -426,7 +412,7 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     sum = (vector unsigned int)vec_splat_u32(0);
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2 */
         perm1 = vec_lvsl(0, pix1);
         pix1v = (vector unsigned char *) pix1;
@@ -435,10 +421,8 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
+        /* Since we want to use unsigned chars, we can take advantage
+           of the fact that abs(a-b)^2 = (a-b)^2. */
         /* Calculate abs differences vector */
         t3 = vec_max(t1, t2);
@@ -500,8 +484,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
     vector signed short shorts;
-    for(i=0;i<8;i++)
-    {
+    for (i = 0; i < 8; i++) {
         // Read potentially unaligned pixels.
         // We're reading 16 pixels, and actually only want 8,
         // but we simply ignore the extras.
@@ -527,8 +510,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
     vector signed short shorts1, shorts2;
-    for(i=0;i<4;i++)
-    {
+    for (i = 0; i < 4; i++) {
         // Read potentially unaligned pixels
         // We're reading 16 pixels, and actually only want 8,
         // but we simply ignore the extras.
@@ -596,16 +578,14 @@ void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
     register vector unsigned char vdst, vsrc;
     /* dst and src are 16 bytes-aligned (guaranteed) */
-    for(i = 0 ; (i + 15) < w ; i+=16)
-    {
+    for (i = 0 ; (i + 15) < w ; i+=16) {
         vdst = vec_ld(i, (unsigned char*)dst);
         vsrc = vec_ld(i, (unsigned char*)src);
         vdst = vec_add(vsrc, vdst);
         vec_st(vdst, i, (unsigned char*)dst);
     }
     /* if w is not a multiple of 16 */
-    for (; (i < w) ; i++)
-    {
+    for (; (i < w) ; i++) {
         dst[i] = src[i];
     }
 }
@@ -632,7 +612,7 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
 #if 0
-    for(i=0; i<h; i++) {
+    for (i = 0; i < h; i++) {
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         vec_st(vec_perm(pixelsv1, pixelsv2, perm),
@@ -641,7 +621,7 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
         block +=line_size;
     }
 #else
-    for(i=0; i<h; i+=4) {
+    for (i = 0; i < h; i += 4) {
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(15, (unsigned char*)pixels);
         pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
@@ -676,7 +656,7 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
-    for(i=0; i<h; i++) {
+    for (i = 0; i < h; i++) {
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         blockv = vec_ld(0, block);
@@ -700,10 +680,8 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
     for (i = 0; i < h; i++) {
-        /*
-           block is 8 bytes-aligned, so we're either in the
-           left block (16 bytes-aligned) or in the right block (not)
-        */
+        /* block is 8 bytes-aligned, so we're either in the
+           left block (16 bytes-aligned) or in the right block (not) */
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
@@ -711,12 +689,9 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
-        if (rightside)
-        {
+        if (rightside) {
             pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
-        }
-        else
-        {
+        } else {
             pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
         }
@@ -736,25 +711,18 @@ void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2,
-        pixelsavg;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
@@ -771,12 +739,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -789,12 +754,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
         pixelssum1 = vec_add(pixelssum2, vctwo);
         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
-        if (rightside)
-        {
+        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        }
-        else
-        {
+        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
         }
@@ -812,13 +774,9 @@ void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2,
-        pixelsavg;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
@@ -826,12 +784,9 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
@@ -848,12 +803,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -866,12 +818,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
         pixelssum1 = vec_add(pixelssum2, vcone);
         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
-        if (rightside)
-        {
+        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        }
-        else
-        {
+        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
         }
@@ -889,13 +838,10 @@ void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3,
-        pixelssum3, pixelssum4, temp4;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
@@ -904,12 +850,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv3 = vec_mergel(vczero, pixelsv1);
@@ -929,12 +872,9 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -971,13 +911,10 @@ void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3,
-        pixelssum3, pixelssum4, temp4;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
@@ -987,12 +924,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv3 = vec_mergel(vczero, pixelsv1);
@@ -1012,12 +946,9 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
@@ -1088,11 +1019,9 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
         dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
         /* promote the unsigned chars to signed shorts */ \
         /* we're in the 8x8 function, we only care for the first 8 */ \
-        srcV = \
-            (vector signed short)vec_mergeh((vector signed char)vzero, \
+        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)srcO); \
-        dstV = \
-            (vector signed short)vec_mergeh((vector signed char)vzero, \
+        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)dstO); \
         /* subtractions inside the first butterfly */ \
         but0 = vec_sub(srcV, dstV); \
@@ -1159,25 +1088,22 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
 }
 /*
-  16x8 works with 16 elements ; it allows to avoid replicating
-  loads, and give the compiler more rooms for scheduling.
-  It's only used from inside hadamard8_diff16_altivec.
-  Unfortunately, it seems gcc-3.3 is a bit dumb, and
-  the compiled code has a LOT of spill code, it seems
-  gcc (unlike xlc) cannot keep everything in registers
-  by itself. The following code include hand-made
-  registers allocation. It's not clean, but on
-  a 7450 the resulting code is much faster (best case
-  fall from 700+ cycles to 550).
-
-  xlc doesn't add spill code, but it doesn't know how to
-  schedule for the 7450, and its code isn't much faster than
-  gcc-3.3 on the 7450 (but uses 25% less instructions...)
-
-  On the 970, the hand-made RA is still a win (around 690
-  vs. around 780), but xlc goes to around 660 on the
-  regular C code...
+16x8 works with 16 elements; it allows to avoid replicating loads, and
+give the compiler more rooms for scheduling. It's only used from
+inside hadamard8_diff16_altivec.
+
+Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT
+of spill code, it seems gcc (unlike xlc) cannot keep everything in registers
+by itself. The following code include hand-made registers allocation. It's not
+clean, but on a 7450 the resulting code is much faster (best case fall from
+700+ cycles to 550).
+
+xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
+and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less
+instructions...)
+
+On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
+xlc goes to around 660 on the regular C code...
 */
 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
@@ -1255,17 +1181,13 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
         dst2 = vec_ld((stride * i) + 16, dst); \
         dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
         /* promote the unsigned chars to signed shorts */ \
-        srcV = \
-            (vector signed short)vec_mergeh((vector signed char)vzero, \
+        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)srcO); \
-        dstV = \
-            (vector signed short)vec_mergeh((vector signed char)vzero, \
+        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
             (vector signed char)dstO); \
-        srcW = \
-            (vector signed short)vec_mergel((vector signed char)vzero, \
+        srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
             (vector signed char)srcO); \
-        dstW = \
-            (vector signed short)vec_mergel((vector signed char)vzero, \
+        dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
             (vector signed char)dstO); \
         /* subtractions inside the first butterfly */ \
         but0 = vec_sub(srcV, dstV); \
@@ -1401,7 +1323,7 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
     vector bool int t0, t1;
     const vector unsigned int v_31 = //XXX
         vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
-    for(i=0; i<blocksize; i+=4) {
+    for (i = 0; i < blocksize; i += 4) {
         m = vec_ld(0, mag+i);
         a = vec_ld(0, ang+i);
         t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
@@ -1452,8 +1374,7 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
         } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
...
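The comment kept in sad16_xy2_altivec is worth unpacking: vec_avg rounds up on every pairwise step, so avg(avg(a,b),avg(c,d)) yields avg(3,0,0,1) = 2 when the true four-sample average is 1. The code therefore widens the bytes to shorts and computes (a+b+c+d+2)>>2 by hand. A hedged sketch of that computation for the high eight bytes of four input vectors; the helper name is illustrative, not from the file:

#include <altivec.h>

/* Illustrative helper: unbiased average of four byte vectors,
 * high eight bytes only, using the split-to-shorts technique. */
static vector unsigned char avg4_hi(vector unsigned char a, vector unsigned char b,
                                    vector unsigned char c, vector unsigned char d)
{
    const vector unsigned char  vzero = vec_splat_u8(0);
    const vector unsigned short vctwo = vec_splat_u16(2);

    /* zero-extend the high halves to 16-bit lanes so the sum cannot overflow */
    vector unsigned short ah = (vector unsigned short) vec_mergeh(vzero, a);
    vector unsigned short bh = (vector unsigned short) vec_mergeh(vzero, b);
    vector unsigned short ch = (vector unsigned short) vec_mergeh(vzero, c);
    vector unsigned short dh = (vector unsigned short) vec_mergeh(vzero, d);

    /* (a + b + c + d + 2) >> 2: e.g. (3+0+0+1+2)>>2 = 1, where nested
     * vec_avg calls would give 2 */
    vector unsigned short sum = vec_add(vec_add(ah, bh), vec_add(ch, dh));
    sum = vec_sr(vec_add(sum, vctwo), vctwo);

    /* pack the eight 16-bit results back down to bytes */
    return vec_packsu(sum, sum);
}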
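The sse8/sse16 comment about abs(a-b)^2 = (a-b)^2 likewise compresses a neat sequence: for unsigned bytes, |a-b| is max(a,b) - min(a,b) with no widening, and vec_msum then squares and accumulates sixteen byte differences into four 32-bit lanes in one instruction. A sketch under those assumptions (the helper name is hypothetical):

#include <altivec.h>

/* Hypothetical helper: fold the sum of squared differences of two byte
 * vectors into a vector of four 32-bit partial sums. */
static vector unsigned int ssd_accumulate(vector unsigned char t1,
                                          vector unsigned char t2,
                                          vector unsigned int sum)
{
    /* |a-b| for unsigned bytes: max(a,b) - min(a,b), never underflows */
    vector unsigned char t3 = vec_max(t1, t2);
    vector unsigned char t4 = vec_min(t1, t2);
    vector unsigned char t5 = vec_sub(t3, t4);
    /* square each |a-b| and add groups of four products into the
     * word-sized accumulators */
    return vec_msum(t5, t5, sum);
}

A final vec_sums plus a scalar extraction, as the functions in this file do, reduces the four partial sums to the SSE value.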