Commit c435c416 authored by diego

cosmetics: Make libavcodec/ppc/dsputil_altivec.c conform to style guidelines.

This includes indentation changes, comment reformatting, consistent brace
placement and some prettyprinting.


git-svn-id: file:///var/local/repositories/ffmpeg/trunk@14318 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent 33297bf0
@@ -39,12 +39,10 @@ int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
     sad = (vector unsigned int)vec_splat_u32(0);
 
-    for(i=0;i<h;i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
-        */
+           pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
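
Note: the tv[0]/tv[1] + vec_lvsl + vec_perm sequence in this hunk is the
standard AltiVec idiom for loading 16 bytes from a possibly unaligned
address. A minimal sketch of the idiom, assuming <altivec.h> and a PowerPC
target built with -maltivec (the helper name is illustrative, not from the
patch):

    #include <altivec.h>

    /* vec_ld silently rounds the address down to a 16-byte boundary, so
       two loads are needed to cover p[0..15] when p is unaligned;
       vec_lvsl(0, p) builds the permute mask that shifts the wanted
       bytes into place. */
    static vector unsigned char load_unaligned(const unsigned char *p)
    {
        vector unsigned char hi = vec_ld( 0, p);  /* block containing p[0]  */
        vector unsigned char lo = vec_ld(15, p);  /* block containing p[15] */
        return vec_perm(hi, lo, vec_lvsl(0, p));
    }
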
@@ -88,24 +86,20 @@ int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
     s = 0;
     sad = (vector unsigned int)vec_splat_u32(0);
 
-    /*
-       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
        iteration becomes pix2 in the next iteration. We can use this
        fact to avoid a potentially expensive unaligned read, each
        time around the loop.
        Read unaligned pixels into our vectors. The vectors are as follows:
        pix2v: pix2[0]-pix2[15]
-       Split the pixel vectors into shorts
-    */
+       Split the pixel vectors into shorts */
     tv = (vector unsigned char *) &pix2[0];
     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 
-    for(i=0;i<h;i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix3v: pix3[0]-pix3[15]
-        */
+           pix3v: pix3[0]-pix3[15] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
@@ -154,34 +148,30 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int
     s = 0;
 
-    /*
-       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
+    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
        iteration becomes pix2 in the next iteration. We can use this
        fact to avoid a potentially expensive unaligned read, as well
        as some splitting, and vector addition each time around the loop.
        Read unaligned pixels into our vectors. The vectors are as follows:
        pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
-       Split the pixel vectors into shorts
-    */
+       Split the pixel vectors into shorts */
     tv = (vector unsigned char *) &pix2[0];
     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
     tv = (vector unsigned char *) &pix2[1];
     pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
 
     pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
     pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
     pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
     pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
     t1 = vec_add(pix2hv, pix2ihv);
     t2 = vec_add(pix2lv, pix2ilv);
 
-    for(i=0;i<h;i++) {
-        /*
-           Read unaligned pixels into our vectors. The vectors are as follows:
+    for (i = 0; i < h; i++) {
+        /* Read unaligned pixels into our vectors. The vectors are as follows:
            pix1v: pix1[0]-pix1[15]
-           pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
-        */
+           pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */
         tv = (vector unsigned char *) pix1;
         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
@@ -191,17 +181,15 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int
         tv = (vector unsigned char *) &pix3[1];
         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
 
-        /*
-           Note that AltiVec does have vec_avg, but this works on vector pairs
+        /* Note that AltiVec does have vec_avg, but this works on vector pairs
            and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
            would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
            Instead, we have to split the pixel vectors into vectors of shorts,
-           and do the averaging by hand.
-        */
+           and do the averaging by hand. */
 
         /* Split the pixel vectors into shorts */
         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
         pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
         pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
         pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
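
Note: the rounding claim in the comment above is easy to check in plain
integer arithmetic; a minimal standalone sketch (helper names are
illustrative):

    #include <stdio.h>

    /* vec_avg rounds up: avg(a,b) = (a + b + 1) >> 1 */
    static unsigned avg(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

    int main(void)
    {
        unsigned chained = avg(avg(3, 0), avg(0, 1)); /* avg(2,1) = 2 */
        unsigned direct  = (3 + 0 + 0 + 1 + 2) >> 2;  /* = 1          */
        printf("%u vs %u\n", chained, direct);        /* prints "2 vs 1" */
        return 0;
    }

The chained version accumulates round-up bias at every stage, which is why
the code widens to shorts and averages by hand instead.
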
@@ -248,7 +236,7 @@ int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     sad = (vector unsigned int)vec_splat_u32(0);
 
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2 */
         perm1 = vec_lvsl(0, pix1);
         pix1v = (vector unsigned char *) pix1;
@@ -291,7 +279,7 @@ int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2
            Since we're reading 16 pixels, and actually only want 8,
            mask out the last 8 pixels. The 0s don't change the sum. */
@@ -373,7 +361,7 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2
            Since we're reading 16 pixels, and actually only want 8,
            mask out the last 8 pixels. The 0s don't change the sum. */
@@ -384,10 +372,8 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 
-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
+        /* Since we want to use unsigned chars, we can take advantage
+           of the fact that abs(a-b)^2 = (a-b)^2. */
 
         /* Calculate abs differences vector */
         t3 = vec_max(t1, t2);
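
Note: since the pixels are unsigned, |a-b| can be formed without branches
as max(a,b) - min(a,b), and squaring makes the sign irrelevant, which is
what the abs(a-b)^2 = (a-b)^2 remark exploits. A scalar sketch of the same
sum-of-squared-differences computation (the helper is illustrative; in the
file, the vec_max above is presumably paired with vec_min, vec_sub and
vec_msum):

    #include <stdint.h>

    static uint32_t sse_scalar(const uint8_t *a, const uint8_t *b, int n)
    {
        uint32_t sum = 0;
        for (int i = 0; i < n; i++) {
            uint8_t hi = a[i] > b[i] ? a[i] : b[i]; /* vec_max        */
            uint8_t lo = a[i] > b[i] ? b[i] : a[i]; /* vec_min        */
            uint8_t d  = hi - lo;                   /* |a - b|        */
            sum += (uint32_t)d * d;                 /* accumulate d^2 */
        }
        return sum;
    }
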
@@ -426,7 +412,7 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     sum = (vector unsigned int)vec_splat_u32(0);
 
-    for(i=0;i<h;i++) {
+    for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2 */
         perm1 = vec_lvsl(0, pix1);
         pix1v = (vector unsigned char *) pix1;
@@ -435,10 +421,8 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 
-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
+        /* Since we want to use unsigned chars, we can take advantage
+           of the fact that abs(a-b)^2 = (a-b)^2. */
 
         /* Calculate abs differences vector */
         t3 = vec_max(t1, t2);
@@ -500,8 +484,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line
     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
     vector signed short shorts;
 
-    for(i=0;i<8;i++)
-    {
+    for (i = 0; i < 8; i++) {
         // Read potentially unaligned pixels.
         // We're reading 16 pixels, and actually only want 8,
         // but we simply ignore the extras.
@@ -527,8 +510,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
     vector signed short shorts1, shorts2;
 
-    for(i=0;i<4;i++)
-    {
+    for (i = 0; i < 4; i++) {
         // Read potentially unaligned pixels
         // We're reading 16 pixels, and actually only want 8,
         // but we simply ignore the extras.
@@ -596,17 +578,15 @@ void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
     register vector unsigned char vdst, vsrc;
 
     /* dst and src are 16 bytes-aligned (guaranteed) */
-    for(i = 0 ; (i + 15) < w ; i+=16)
-    {
+    for (i = 0 ; (i + 15) < w ; i+=16) {
         vdst = vec_ld(i, (unsigned char*)dst);
         vsrc = vec_ld(i, (unsigned char*)src);
         vdst = vec_add(vsrc, vdst);
         vec_st(vdst, i, (unsigned char*)dst);
     }
 
     /* if w is not a multiple of 16 */
-    for (; (i < w) ; i++)
-    {
+    for (; (i < w) ; i++) {
         dst[i] = src[i];
     }
 }
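
Note: add_bytes_altivec shows the usual vector-loop-plus-scalar-tail
structure: whole 16-byte blocks go through vec_ld/vec_add/vec_st, and any
remainder is handled one element at a time. The same control flow in plain
C, mirroring the loops above (illustrative only):

    #include <stdint.h>

    static void add_bytes_ref(uint8_t *dst, const uint8_t *src, int w)
    {
        int i;
        /* whole 16-byte blocks, one vec_add each in the AltiVec version */
        for (i = 0; (i + 15) < w; i += 16)
            for (int j = 0; j < 16; j++)
                dst[i + j] = dst[i + j] + src[i + j];
        /* if w is not a multiple of 16 */
        for (; i < w; i++)
            dst[i] = src[i];
    }
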
@@ -632,34 +612,34 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
 #if 0
-    for(i=0; i<h; i++) {
+    for (i = 0; i < h; i++) {
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         vec_st(vec_perm(pixelsv1, pixelsv2, perm),
                0, (unsigned char*)block);
         pixels+=line_size;
         block +=line_size;
     }
 #else
-    for(i=0; i<h; i+=4) {
+    for (i = 0; i < h; i += 4) {
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(15, (unsigned char*)pixels);
         pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
         pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
         pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
         pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
         pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
         pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
         vec_st(vec_perm(pixelsv1, pixelsv2, perm),
                0, (unsigned char*)block);
         vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
                line_size, (unsigned char*)block);
         vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
                line_size_2, (unsigned char*)block);
         vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
                line_size_3, (unsigned char*)block);
         pixels+=line_size_4;
         block +=line_size_4;
     }
 #endif
 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
@@ -676,15 +656,15 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
-    for(i=0; i<h; i++) {
+    for (i = 0; i < h; i++) {
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         blockv = vec_ld(0, block);
         pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
         blockv = vec_avg(blockv,pixelsv);
         vec_st(blockv, 0, (unsigned char*)block);
         pixels+=line_size;
         block +=line_size;
     }
 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
@@ -700,32 +680,27 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
     for (i = 0; i < h; i++) {
-        /*
-           block is 8 bytes-aligned, so we're either in the
-           left block (16 bytes-aligned) or in the right block (not)
-        */
+        /* block is 8 bytes-aligned, so we're either in the
+           left block (16 bytes-aligned) or in the right block (not) */
         int rightside = ((unsigned long)block & 0x0000000F);
 
         blockv = vec_ld(0, block);
         pixelsv1 = vec_ld(0, (unsigned char*)pixels);
         pixelsv2 = vec_ld(16, (unsigned char*)pixels);
         pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
 
-        if (rightside)
-        {
+        if (rightside) {
             pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
-        }
-        else
-        {
+        } else {
             pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
         }
 
         blockv = vec_avg(blockv, pixelsv);
 
         vec_st(blockv, 0, block);
 
         pixels += line_size;
         block += line_size;
     }
 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
@@ -735,74 +710,61 @@ POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2,
-        pixelsavg;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
 
 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
     for (i = 0; i < h ; i++) {
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
 
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
         temp3 = vec_sra(temp3, vctwo);
         pixelssum1 = vec_add(pixelssum2, vctwo);
         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
 
-        if (rightside)
-        {
+        if (rightside) {
             blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        }
-        else
-        {
+        } else {
             blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
         }
 
         vec_st(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
     }
 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 }
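
Note: put_pixels8_xy2_altivec computes the rounded four-pixel half-pel
average (a + b + c + d + 2) >> 2; adding vctwo before the vec_sra gives
round-to-nearest, while the no_rnd variant below adds vcone (1) instead.
The pixelssum1 = vec_add(pixelssum2, vctwo) line carries the current row's
horizontal sum into the next iteration, so each input row is only read
once. A scalar sketch of one output row (names are illustrative):

    #include <stdint.h>

    static void put_pixels8_xy2_row(uint8_t *dst, const uint8_t *cur,
                                    const uint8_t *next)
    {
        for (int x = 0; x < 8; x++)
            dst[x] = (cur[x] + cur[x + 1] + next[x] + next[x + 1] + 2) >> 2;
    }
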
@@ -811,75 +773,62 @@ POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2,
-        pixelsavg;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vcone);
 
 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
     for (i = 0; i < h ; i++) {
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
 
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
         temp3 = vec_sra(temp3, vctwo);
         pixelssum1 = vec_add(pixelssum2, vcone);
         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
 
-        if (rightside)
-        {
+        if (rightside) {
             blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
-        }
-        else
-        {
+        } else {
             blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
         }
 
         vec_st(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
     }
 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 }
@@ -888,80 +837,71 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3,
-        pixelssum3, pixelssum4, temp4;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vctwo);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
 
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
 
         pixelsv3 = vec_mergel(vczero, pixelsv1);
         pixelsv4 = vec_mergel(vczero, pixelsv2);
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
 
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp4 = vec_add(pixelssum3, pixelssum4);
         temp4 = vec_sra(temp4, vctwo);
         temp3 = vec_add(pixelssum1, pixelssum2);
         temp3 = vec_sra(temp3, vctwo);
 
         pixelssum3 = vec_add(pixelssum4, vctwo);
         pixelssum1 = vec_add(pixelssum2, vctwo);
 
         blockv = vec_packsu(temp3, temp4);
 
         vec_st(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
     }
 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
 }
@@ -970,81 +910,72 @@ POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 {
 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
     register int i;
-    register vector unsigned char
-        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char
-        blockv, temp1, temp2;
-    register vector unsigned short
-        pixelssum1, pixelssum2, temp3,
-        pixelssum3, pixelssum4, temp4;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
-    {
+    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
         pixelsv2 = temp2;
-    }
-    else
-    {
+    } else {
         pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
     }
     pixelsv3 = vec_mergel(vczero, pixelsv1);
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vcone);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vcone);
 
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
-        }
-        else
-        {
+        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
         }
 
         pixelsv3 = vec_mergel(vczero, pixelsv1);
         pixelsv4 = vec_mergel(vczero, pixelsv2);
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
 
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp4 = vec_add(pixelssum3, pixelssum4);
         temp4 = vec_sra(temp4, vctwo);
         temp3 = vec_add(pixelssum1, pixelssum2);
         temp3 = vec_sra(temp3, vctwo);
 
         pixelssum3 = vec_add(pixelssum4, vcone);
         pixelssum1 = vec_add(pixelssum2, vcone);
 
         blockv = vec_packsu(temp3, temp4);
 
         vec_st(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
     }
 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
 }
@@ -1057,7 +988,7 @@ POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
     register vector signed short temp0, temp1, temp2, temp3, temp4,
                                  temp5, temp6, temp7;
 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
-{
+    {
     register const vector signed short vprod1 =(const vector signed short)
         AVV( 1,-1, 1,-1, 1,-1, 1,-1);
     register const vector signed short vprod2 =(const vector signed short)
@@ -1074,34 +1005,32 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
     AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
 #define ONEITERBUTTERFLY(i, res) \
     { \
     register vector unsigned char src1, src2, srcO; \
     register vector unsigned char dst1, dst2, dstO; \
     register vector signed short srcV, dstV; \
     register vector signed short but0, but1, but2, op1, op2, op3; \
     src1 = vec_ld(stride * i, src); \
     src2 = vec_ld((stride * i) + 15, src); \
     srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
     dst1 = vec_ld(stride * i, dst); \
     dst2 = vec_ld((stride * i) + 15, dst); \
     dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
     /* promote the unsigned chars to signed shorts */ \
     /* we're in the 8x8 function, we only care for the first 8 */ \
-    srcV = \
-        (vector signed short)vec_mergeh((vector signed char)vzero, \
-        (vector signed char)srcO); \
-    dstV = \
-        (vector signed short)vec_mergeh((vector signed char)vzero, \
-        (vector signed char)dstO); \
+    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
+           (vector signed char)srcO); \
+    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
+           (vector signed char)dstO); \
     /* subtractions inside the first butterfly */ \
     but0 = vec_sub(srcV, dstV); \
     op1 = vec_perm(but0, but0, perm1); \
     but1 = vec_mladd(but0, vprod1, op1); \
     op2 = vec_perm(but1, but1, perm2); \
     but2 = vec_mladd(but1, vprod2, op2); \
     op3 = vec_perm(but2, but2, perm3); \
     res = vec_mladd(but2, vprod3, op3); \
     }
 ONEITERBUTTERFLY(0, temp0);
 ONEITERBUTTERFLY(1, temp1);
@@ -1111,9 +1040,9 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
     ONEITERBUTTERFLY(5, temp5);
     ONEITERBUTTERFLY(6, temp6);
     ONEITERBUTTERFLY(7, temp7);
-}
+    }
 #undef ONEITERBUTTERFLY
-{
+    {
     register vector signed int vsum;
     register vector signed short line0 = vec_add(temp0, temp1);
     register vector signed short line1 = vec_sub(temp0, temp1);
@@ -1153,31 +1082,28 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
     vsum = vec_sums(vsum, (vector signed int)vzero);
     vsum = vec_splat(vsum, 3);
     vec_ste(vsum, 0, &sum);
-}
+    }
 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
     return sum;
 }
 
 /*
-16x8 works with 16 elements ; it allows to avoid replicating
-loads, and give the compiler more rooms for scheduling.
-It's only used from inside hadamard8_diff16_altivec.
-Unfortunately, it seems gcc-3.3 is a bit dumb, and
-the compiled code has a LOT of spill code, it seems
-gcc (unlike xlc) cannot keep everything in registers
-by itself. The following code include hand-made
-registers allocation. It's not clean, but on
-a 7450 the resulting code is much faster (best case
-fall from 700+ cycles to 550).
-
-xlc doesn't add spill code, but it doesn't know how to
-schedule for the 7450, and its code isn't much faster than
-gcc-3.3 on the 7450 (but uses 25% less instructions...)
-
-On the 970, the hand-made RA is still a win (around 690
-vs. around 780), but xlc goes to around 660 on the
-regular C code...
+16x8 works with 16 elements; it allows to avoid replicating loads, and
+give the compiler more rooms for scheduling. It's only used from
+inside hadamard8_diff16_altivec.
+
+Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT
+of spill code, it seems gcc (unlike xlc) cannot keep everything in registers
+by itself. The following code include hand-made registers allocation. It's not
+clean, but on a 7450 the resulting code is much faster (best case fall from
+700+ cycles to 550).
+
+xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
+and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less
+instructions...)
+
+On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
+xlc goes to around 660 on the regular C code...
 */
 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
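
Note: each vec_mladd(x, vprod, vec_perm(x, x, perm)) in ONEITERBUTTERFLY is
one butterfly stage of an 8-point Hadamard transform: the permute lines
every element up with its partner, and the ±1 multiplier turns one lane of
each pair into a sum and the other into a difference. A scalar sketch of
the three stages (hypothetical helper; pair distance 1, 2, then 4):

    static void hadamard8_scalar(short v[8])
    {
        for (int k = 1; k <= 4; k <<= 1) {  /* three butterfly stages */
            short t[8];
            for (int i = 0; i < 8; i++)
                t[i] = (i & k) ? (short)(v[i ^ k] - v[i])  /* difference */
                               : (short)(v[i] + v[i ^ k]); /* sum        */
            for (int i = 0; i < 8; i++)
                v[i] = t[i];
        }
    }
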
@@ -1202,7 +1128,7 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
                                   temp7S REG_v(v15);
     register const vector unsigned char vzero REG_v(v31)=
         (const vector unsigned char)vec_splat_u8(0);
-{
+    {
     register const vector signed short vprod1 REG_v(v16)=
         (const vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
     register const vector signed short vprod2 REG_v(v17)=
@@ -1222,66 +1148,62 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
     AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
 #define ONEITERBUTTERFLY(i, res1, res2) \
     { \
     register vector unsigned char src1 REG_v(v22), \
                                   src2 REG_v(v23), \
                                   dst1 REG_v(v24), \
                                   dst2 REG_v(v25), \
                                   srcO REG_v(v22), \
                                   dstO REG_v(v23); \
     \
     register vector signed short srcV REG_v(v24), \
                                  dstV REG_v(v25), \
                                  srcW REG_v(v26), \
                                  dstW REG_v(v27), \
                                  but0 REG_v(v28), \
                                  but0S REG_v(v29), \
                                  op1 REG_v(v30), \
                                  but1 REG_v(v22), \
                                  op1S REG_v(v23), \
                                  but1S REG_v(v24), \
                                  op2 REG_v(v25), \
                                  but2 REG_v(v26), \
                                  op2S REG_v(v27), \
                                  but2S REG_v(v28), \
                                  op3 REG_v(v29), \
                                  op3S REG_v(v30); \
     \
     src1 = vec_ld(stride * i, src); \
     src2 = vec_ld((stride * i) + 16, src); \
     srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
     dst1 = vec_ld(stride * i, dst); \
     dst2 = vec_ld((stride * i) + 16, dst); \
     dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
     /* promote the unsigned chars to signed shorts */ \
-    srcV = \
-        (vector signed short)vec_mergeh((vector signed char)vzero, \
-        (vector signed char)srcO); \
-    dstV = \
-        (vector signed short)vec_mergeh((vector signed char)vzero, \
-        (vector signed char)dstO); \
-    srcW = \
-        (vector signed short)vec_mergel((vector signed char)vzero, \
-        (vector signed char)srcO); \
-    dstW = \
-        (vector signed short)vec_mergel((vector signed char)vzero, \
-        (vector signed char)dstO); \
+    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
+           (vector signed char)srcO); \
+    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
+           (vector signed char)dstO); \
+    srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
+           (vector signed char)srcO); \
+    dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
+           (vector signed char)dstO); \
     /* subtractions inside the first butterfly */ \
     but0 = vec_sub(srcV, dstV); \
     but0S = vec_sub(srcW, dstW); \
     op1 = vec_perm(but0, but0, perm1); \
     but1 = vec_mladd(but0, vprod1, op1); \
     op1S = vec_perm(but0S, but0S, perm1); \
     but1S = vec_mladd(but0S, vprod1, op1S); \
     op2 = vec_perm(but1, but1, perm2); \
     but2 = vec_mladd(but1, vprod2, op2); \
     op2S = vec_perm(but1S, but1S, perm2); \
     but2S = vec_mladd(but1S, vprod2, op2S); \
     op3 = vec_perm(but2, but2, perm3); \
     res1 = vec_mladd(but2, vprod3, op3); \
     op3S = vec_perm(but2S, but2S, perm3); \
     res2 = vec_mladd(but2S, vprod3, op3S); \
     }
 ONEITERBUTTERFLY(0, temp0, temp0S);
 ONEITERBUTTERFLY(1, temp1, temp1S);
@@ -1375,8 +1297,8 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
     vsum = vec_sums(vsum, (vector signed int)vzero);
     vsum = vec_splat(vsum, 3);
     vec_ste(vsum, 0, &sum);
-}
+    }
     return sum;
 }
 
 int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
@@ -1401,7 +1323,7 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
     vector bool int t0, t1;
     const vector unsigned int v_31 = //XXX
         vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
-    for(i=0; i<blocksize; i+=4) {
+    for (i = 0; i < blocksize; i += 4) {
         m = vec_ld(0, mag+i);
         a = vec_ld(0, ang+i);
         t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
@@ -1452,8 +1374,7 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
-        {
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
             pixelsv2 = temp2;
         } else {
             pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
...