Commit ffe23fbf authored by mru

ARM: NEON optimised vector_fmul_window

git-svn-id: file:///var/local/repositories/ffmpeg/trunk@16868 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
parent ae4a75c1
...@@ -147,6 +147,9 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, ...@@ -147,6 +147,9 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
const uint8_t nnzc[6*8]); const uint8_t nnzc[6*8]);
void ff_vector_fmul_neon(float *dst, const float *src, int len); void ff_vector_fmul_neon(float *dst, const float *src, int len);
void ff_vector_fmul_window_neon(float *dst, const float *src0,
const float *src1, const float *win,
float add_bias, int len);
void ff_float_to_int16_neon(int16_t *, const float *, long); void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
...@@ -245,6 +248,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) ...@@ -245,6 +248,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->h264_idct_add8 = ff_h264_idct_add8_neon; c->h264_idct_add8 = ff_h264_idct_add8_neon;
c->vector_fmul = ff_vector_fmul_neon; c->vector_fmul = ff_vector_fmul_neon;
c->vector_fmul_window = ff_vector_fmul_window_neon;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = ff_float_to_int16_neon; c->float_to_int16 = ff_float_to_int16_neon;
......
...@@ -649,3 +649,53 @@ function ff_vector_fmul_neon, export=1 ...@@ -649,3 +649,53 @@ function ff_vector_fmul_neon, export=1
3: vst1.64 {d16-d19},[r3,:128]! 3: vst1.64 {d16-d19},[r3,:128]!
bx lr bx lr
.endfunc .endfunc
@ ff_vector_fmul_window_neon
@
@ Multiplies two float vectors by a window, adds a bias, and writes 2*len
@ output floats, filling dst from both ends towards the middle.
@
@ C prototype (as declared alongside this routine):
@   void ff_vector_fmul_window_neon(float *dst, const float *src0,
@                                   const float *src1, const float *win,
@                                   float add_bias, int len);
@
@ Arguments as this code reads them:
@   r0 = dst, r1 = src0, r2 = src1, r3 = win
@   [sp]     = add_bias (first stack word on entry)
@   [sp, #4] = len      (read as [sp, #16] after the 12-byte push)
@ NOTE(review): this layout presumes the float argument is passed like an
@ integer word (soft-float style calling convention) -- confirm for the
@ target ABI.
@
@ Result, for k = 0 .. len-1:
@   dst[k]         = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k] + add_bias
@   dst[2*len-1-k] = src0[k]*win[k] + src1[len-1-k]*win[2*len-1-k] + add_bias
@
@ Requirements visible in the code:
@   - Every vector load/store carries a :128 alignment qualifier, so all
@     addresses touched must be 16-byte aligned.
@   - The counter drops by 4 per pass and the loop leaves only when it hits
@     exactly zero, and the first four loads are unconditional, so len is
@     expected to be a positive multiple of 4.
@
@ Registers written: r0-r3, ip, lr, q0-q3, q8-q12, flags.
@ r4 and r5 are saved on entry and restored on exit.
function ff_vector_fmul_window_neon, export=1
@ q8 = add_bias replicated into all four lanes (read before sp moves).
vld1.32 {d16[],d17[]}, [sp,:32]
push {r4,r5,lr}
@ lr = len; the push moved sp down by 12, hence offset 16 instead of 4.
ldr lr, [sp, #16]
@ r5 = len - 2, used as a scaled offset for the three "end" pointers:
@   r2 = src1 + 4*len - 16  (last four floats of src1)
@   r4 = win  + 8*len - 16  (last four floats of the 2*len window)
@   ip = dst  + 8*len - 16  (last four floats of the 2*len output)
sub r2, r2, #8
sub r5, lr, #2
add r2, r2, r5, lsl #2
add r4, r3, r5, lsl #3
add ip, r0, r5, lsl #3
@ r5 is now the post-index stride for the pointers that walk backwards.
mov r5, #-16
@ Preload the first pass:
@   q0 = src0 (forward), q1 = src1 (backward),
@   q2 = win  (forward), q3 = win  (backward).
vld1.64 {d0,d1}, [r1,:128]!
vld1.64 {d2,d3}, [r2,:128], r5
vld1.64 {d4,d5}, [r3,:128]!
vld1.64 {d6,d7}, [r4,:128], r5
@ Main loop: four floats from each end per pass. The flags set by subs are
@ consumed by the beq below (nothing in between touches them).
1: subs lr, lr, #4
@ q11 accumulates the back-half outputs, q10 the front-half outputs.
@ Both start from the bias.
vmov q11, q8
@ back  += src0 * win(forward)
vmla.f32 d22, d0, d4
vmov q10, q8
vmla.f32 d23, d1, d5
@ Swap the two floats inside each d register of q3 and q1. Combined with
@ the crossed d-register pairing below (d0 with d7, d1 with d6, ...) this
@ lines the backward-read data up element by element in reverse order.
vrev64.32 q3, q3
@ front += src0 * win(backward, reversed)
vmla.f32 d20, d0, d7
vrev64.32 q1, q1
vmla.f32 d21, d1, d6
@ Last group: finish at 2: without loading past the ends of the inputs.
beq 2f
@ Remaining multiply-accumulates for this pass, interleaved with the loads
@ for the next pass.
@ back  += src1(reversed) * win(backward, reversed)
vmla.f32 d22, d3, d7
@ q0 is no longer needed by this pass, so it is reloaded directly.
vld1.64 {d0,d1}, [r1,:128]!
vmla.f32 d23, d2, d6
@ Next src1 goes to q9 because q1 (d2/d3) is still read by the vmls below.
vld1.64 {d18,d19},[r2,:128], r5
@ front -= src1(reversed) * win(forward)
vmls.f32 d20, d3, d4
@ Next forward window goes to q12 because q2 (d4/d5) is still read below.
vld1.64 {d24,d25},[r3,:128]!
vmls.f32 d21, d2, d5
@ q3 has had its last use in this pass, so it is reloaded directly.
vld1.64 {d6,d7}, [r4,:128], r5
@ Move the staged loads into place for the next pass.
vmov q1, q9
@ q11 holds the back-half results in descending index order: reverse all
@ four lanes (swap within each half, then swap the halves) so they are
@ stored in ascending memory order.
vrev64.32 q11, q11
vmov q2, q12
vswp d22, d23
@ Front results go forward through dst, back results go backward.
vst1.64 {d20,d21},[r0,:128]!
vst1.64 {d22,d23},[ip,:128], r5
b 1b
@ Tail: same arithmetic and stores as the loop body for the final group,
@ with no further loads.
2: vmla.f32 d22, d3, d7
vmla.f32 d23, d2, d6
vmls.f32 d20, d3, d4
vmls.f32 d21, d2, d5
vrev64.32 q11, q11
vswp d22, d23
vst1.64 {d20,d21},[r0,:128]!
vst1.64 {d22,d23},[ip,:128], r5
@ Restore r4/r5 and return by popping the saved lr into pc.
pop {r4,r5,pc}
.endfunc
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment