/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "avcodec.h"
#include "snow.h"
#include "x86_cpu.h"

/**
 * Horizontal inverse lifting for one line of the snow wavelet, SSE2 version.
 * Performs the four lifting steps (Lift 0..3) of the integer 9/7 transform
 * in place on b[0..width-1] and then interleaves the low-pass half with the
 * high-pass results from the temp buffer back into b.
 * Scalar tails are handled by the snow_*_lead_out() helpers from snow.h.
 */
void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
    const int w2= (width+1)>>1;
    // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
    IDWTELEM temp_buf[(width>>1) + 4];
    IDWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) >> 2);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
        // The savings in code and time are well worth having to store this value and
        // calculate b[0] correctly afterwards.

        i = 0;
        // xmm7 = rounding constant (0x0004 in every 16-bit lane) built without a load.
        asm volatile(
            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
            "psllw        $15, %%xmm7         \n\t"
            "psrlw        $13, %%xmm7         \n\t"
        ::);
        for(; i<w_l-15; i+=16){
            asm volatile(
                "movdqu   (%1), %%xmm1        \n\t"
                "movdqu 16(%1), %%xmm5        \n\t"
                "movdqu  2(%1), %%xmm2        \n\t"
                "movdqu 18(%1), %%xmm6        \n\t"
                "paddw  %%xmm1, %%xmm2        \n\t"
                "paddw  %%xmm5, %%xmm6        \n\t"
                "movdqa %%xmm2, %%xmm0        \n\t"
                "movdqa %%xmm6, %%xmm4        \n\t"
                "paddw  %%xmm2, %%xmm2        \n\t"
                "paddw  %%xmm6, %%xmm6        \n\t"
                "paddw  %%xmm0, %%xmm2        \n\t"
                "paddw  %%xmm4, %%xmm6        \n\t"
                "paddw  %%xmm7, %%xmm2        \n\t"
                "paddw  %%xmm7, %%xmm6        \n\t"
                "psraw      $3, %%xmm2        \n\t"
                "psraw      $3, %%xmm6        \n\t"
                "movdqa   (%0), %%xmm0        \n\t"
                "movdqa 16(%0), %%xmm4        \n\t"
                "psubw  %%xmm2, %%xmm0        \n\t"
                "psubw  %%xmm6, %%xmm4        \n\t"
                "movdqa %%xmm0, (%0)          \n\t"
                "movdqa %%xmm4, 16(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        // Redo the first element correctly (see note on b_0 above).
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        // Scalar prologue until dst is 32-byte aligned for the movdqa stores.
        for(; (((long)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-15; i+=16){
            asm volatile(
                "movdqu   (%1), %%xmm1        \n\t"
                "movdqu 16(%1), %%xmm5        \n\t"
                "movdqu  2(%1), %%xmm2        \n\t"
                "movdqu 18(%1), %%xmm6        \n\t"
                "paddw  %%xmm1, %%xmm2        \n\t"
                "paddw  %%xmm5, %%xmm6        \n\t"
                "movdqa   (%0), %%xmm0        \n\t"
                "movdqa 16(%0), %%xmm4        \n\t"
                "psubw  %%xmm2, %%xmm0        \n\t"
                "psubw  %%xmm6, %%xmm4        \n\t"
                "movdqa %%xmm0, (%0)          \n\t"
                "movdqa %%xmm4, 16(%0)        \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;
        IDWTELEM b_0 = b[0];

        i = 0;
        // Reuse xmm7 from Lift 0: 4 << 1 == 8, the rounding bias for this step.
        asm volatile(
            "psllw          $1, %%xmm7        \n\t"
        ::);
        for(; i<w_l-15; i+=16){
            asm volatile(
                "movdqu   (%1), %%xmm1        \n\t"
                "movdqu 16(%1), %%xmm5        \n\t"
                "movdqu  2(%1), %%xmm0        \n\t"
                "movdqu 18(%1), %%xmm4        \n\t" //FIXME try aligned reads and shifts
                "paddw  %%xmm1, %%xmm0        \n\t"
                "paddw  %%xmm5, %%xmm4        \n\t"
                "paddw  %%xmm7, %%xmm0        \n\t"
                "paddw  %%xmm7, %%xmm4        \n\t"
                "movdqa   (%0), %%xmm1        \n\t"
                "movdqa 16(%0), %%xmm5        \n\t"
                "psraw      $2, %%xmm0        \n\t"
                "psraw      $2, %%xmm4        \n\t"
                "paddw  %%xmm1, %%xmm0        \n\t"
                "paddw  %%xmm5, %%xmm4        \n\t"
                "psraw      $2, %%xmm0        \n\t"
                "psraw      $2, %%xmm4        \n\t"
                "paddw  %%xmm1, %%xmm0        \n\t"
                "paddw  %%xmm5, %%xmm4        \n\t"
                "movdqa %%xmm0, (%0)          \n\t"
                "movdqa %%xmm4, 16(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        // Redo the first element, which the vector loop computed from a stale b[0].
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;

        i = 0;
        // Scalar prologue until temp is 32-byte aligned for the movdqa stores.
        for(; (((long)&temp[i]) & 0x1F) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            asm volatile(
                "movdqu  2(%1), %%xmm2        \n\t"
                "movdqu 18(%1), %%xmm6        \n\t"
                "paddw    (%1), %%xmm2        \n\t"
                "paddw  16(%1), %%xmm6        \n\t"
                "movdqu   (%0), %%xmm0        \n\t"
                "movdqu 16(%0), %%xmm4        \n\t"
                "paddw  %%xmm2, %%xmm0        \n\t"
                "paddw  %%xmm6, %%xmm4        \n\t"
                "psraw      $1, %%xmm2        \n\t"
                "psraw      $1, %%xmm6        \n\t"
                "paddw  %%xmm0, %%xmm2        \n\t"
                "paddw  %%xmm4, %%xmm6        \n\t"
                "movdqa %%xmm2, (%2)          \n\t"
                "movdqa %%xmm6, 16(%2)        \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    { // Interleave the low-pass (b) and high-pass (temp) halves back into b.
        snow_interleave_line_header(&i, width, b, temp);

        // Scalar tail until i reaches a 64-element boundary for the SSE2 loop.
        for (; (i & 0x3E) != 0x3E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=62; i>=0; i-=64){
            asm volatile(
                "movdqa      (%1), %%xmm0       \n\t"
                "movdqa    16(%1), %%xmm2       \n\t"
                "movdqa    32(%1), %%xmm4       \n\t"
                "movdqa    48(%1), %%xmm6       \n\t"
                "movdqa      (%1), %%xmm1       \n\t"
                "movdqa    16(%1), %%xmm3       \n\t"
                "movdqa    32(%1), %%xmm5       \n\t"
                "movdqa    48(%1), %%xmm7       \n\t"
                "punpcklwd   (%2), %%xmm0       \n\t"
                "punpcklwd 16(%2), %%xmm2       \n\t"
                "punpcklwd 32(%2), %%xmm4       \n\t"
                "punpcklwd 48(%2), %%xmm6       \n\t"
                "movdqa    %%xmm0, (%0)         \n\t"
                "movdqa    %%xmm2, 32(%0)       \n\t"
                "movdqa    %%xmm4, 64(%0)       \n\t"
                "movdqa    %%xmm6, 96(%0)       \n\t"
                "punpckhwd   (%2), %%xmm1       \n\t"
                "punpckhwd 16(%2), %%xmm3       \n\t"
                "punpckhwd 32(%2), %%xmm5       \n\t"
                "punpckhwd 48(%2), %%xmm7       \n\t"
                "movdqa    %%xmm1, 16(%0)       \n\t"
                "movdqa    %%xmm3, 48(%0)       \n\t"
                "movdqa    %%xmm5, 80(%0)       \n\t"
                "movdqa    %%xmm7, 112(%0)      \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                 : "memory"
               );
        }
    }
}

216
void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
corey's avatar
corey committed
217
    const int w2= (width+1)>>1;
218
    IDWTELEM temp[width >> 1];
corey's avatar
corey committed
219 220 221 222 223
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
224
        IDWTELEM * const ref = b + w2 - 1;
corey's avatar
corey committed
225 226 227 228

        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        asm volatile(
229 230 231
            "pcmpeqw    %%mm7, %%mm7         \n\t"
            "psllw        $15, %%mm7         \n\t"
            "psrlw        $13, %%mm7         \n\t"
corey's avatar
corey committed
232
           ::);
233
        for(; i<w_l-7; i+=8){
corey's avatar
corey committed
234 235 236
            asm volatile(
                "movq     (%1), %%mm2        \n\t"
                "movq    8(%1), %%mm6        \n\t"
237 238
                "paddw   2(%1), %%mm2        \n\t"
                "paddw  10(%1), %%mm6        \n\t"
corey's avatar
corey committed
239 240
                "movq    %%mm2, %%mm0        \n\t"
                "movq    %%mm6, %%mm4        \n\t"
241 242 243 244 245 246 247 248
                "paddw   %%mm2, %%mm2        \n\t"
                "paddw   %%mm6, %%mm6        \n\t"
                "paddw   %%mm0, %%mm2        \n\t"
                "paddw   %%mm4, %%mm6        \n\t"
                "paddw   %%mm7, %%mm2        \n\t"
                "paddw   %%mm7, %%mm6        \n\t"
                "psraw      $3, %%mm2        \n\t"
                "psraw      $3, %%mm6        \n\t"
corey's avatar
corey committed
249 250
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
251 252
                "psubw   %%mm2, %%mm0        \n\t"
                "psubw   %%mm6, %%mm4        \n\t"
corey's avatar
corey committed
253 254 255 256 257 258 259 260 261 262
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }

    { // Lift 1
263
        IDWTELEM * const dst = b+w2;
corey's avatar
corey committed
264 265

        i = 0;
266
        for(; i<w_r-7; i+=8){
corey's avatar
corey committed
267 268 269
            asm volatile(
                "movq     (%1), %%mm2        \n\t"
                "movq    8(%1), %%mm6        \n\t"
270 271
                "paddw   2(%1), %%mm2        \n\t"
                "paddw  10(%1), %%mm6        \n\t"
corey's avatar
corey committed
272 273
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
274 275
                "psubw   %%mm2, %%mm0        \n\t"
                "psubw   %%mm6, %%mm4        \n\t"
corey's avatar
corey committed
276 277 278 279 280 281 282 283 284 285
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
286
        IDWTELEM * const ref = b+w2 - 1;
corey's avatar
corey committed
287 288

        i = 1;
289
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
corey's avatar
corey committed
290
        asm volatile(
291
            "psllw          $1, %%mm7        \n\t"
corey's avatar
corey committed
292
           ::);
293
        for(; i<w_l-7; i+=8){
corey's avatar
corey committed
294 295 296
            asm volatile(
                "movq     (%1), %%mm0        \n\t"
                "movq    8(%1), %%mm4        \n\t"
297 298 299 300 301 302
                "paddw   2(%1), %%mm0        \n\t"
                "paddw  10(%1), %%mm4        \n\t"
                "paddw   %%mm7, %%mm0        \n\t"
                "paddw   %%mm7, %%mm4        \n\t"
                "psraw      $2, %%mm0        \n\t"
                "psraw      $2, %%mm4        \n\t"
303 304
                "movq     (%0), %%mm1        \n\t"
                "movq    8(%0), %%mm5        \n\t"
305 306 307 308 309 310
                "paddw   %%mm1, %%mm0        \n\t"
                "paddw   %%mm5, %%mm4        \n\t"
                "psraw      $2, %%mm0        \n\t"
                "psraw      $2, %%mm4        \n\t"
                "paddw   %%mm1, %%mm0        \n\t"
                "paddw   %%mm5, %%mm4        \n\t"
corey's avatar
corey committed
311 312 313 314 315 316 317 318 319 320
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }

    { // Lift 3
321
        IDWTELEM * const src = b+w2;
corey's avatar
corey committed
322 323
        i = 0;

324
        for(; i<w_r-7; i+=8){
corey's avatar
corey committed
325
            asm volatile(
326 327 328 329
                "movq    2(%1), %%mm2        \n\t"
                "movq   10(%1), %%mm6        \n\t"
                "paddw    (%1), %%mm2        \n\t"
                "paddw   8(%1), %%mm6        \n\t"
330 331
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
332 333 334 335 336 337
                "paddw   %%mm2, %%mm0        \n\t"
                "paddw   %%mm6, %%mm4        \n\t"
                "psraw      $1, %%mm2        \n\t"
                "psraw      $1, %%mm6        \n\t"
                "paddw   %%mm0, %%mm2        \n\t"
                "paddw   %%mm4, %%mm6        \n\t"
corey's avatar
corey committed
338 339 340 341 342 343
                "movq    %%mm2, (%2)         \n\t"
                "movq    %%mm6, 8(%2)        \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                 : "memory"
               );
        }
344
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
corey's avatar
corey committed
345 346 347 348 349
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

350
        for (; (i & 0x1E) != 0x1E; i-=2){
corey's avatar
corey committed
351 352 353
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
354
        for (i-=30; i>=0; i-=32){
corey's avatar
corey committed
355 356 357 358 359 360 361 362 363
            asm volatile(
                "movq        (%1), %%mm0       \n\t"
                "movq       8(%1), %%mm2       \n\t"
                "movq      16(%1), %%mm4       \n\t"
                "movq      24(%1), %%mm6       \n\t"
                "movq        (%1), %%mm1       \n\t"
                "movq       8(%1), %%mm3       \n\t"
                "movq      16(%1), %%mm5       \n\t"
                "movq      24(%1), %%mm7       \n\t"
364 365 366 367
                "punpcklwd   (%2), %%mm0       \n\t"
                "punpcklwd  8(%2), %%mm2       \n\t"
                "punpcklwd 16(%2), %%mm4       \n\t"
                "punpcklwd 24(%2), %%mm6       \n\t"
corey's avatar
corey committed
368 369 370 371
                "movq       %%mm0, (%0)        \n\t"
                "movq       %%mm2, 16(%0)      \n\t"
                "movq       %%mm4, 32(%0)      \n\t"
                "movq       %%mm6, 48(%0)      \n\t"
372 373 374 375
                "punpckhwd   (%2), %%mm1       \n\t"
                "punpckhwd  8(%2), %%mm3       \n\t"
                "punpckhwd 16(%2), %%mm5       \n\t"
                "punpckhwd 24(%2), %%mm7       \n\t"
corey's avatar
corey committed
376 377 378 379 380 381 382 383 384 385 386
                "movq       %%mm1, 8(%0)       \n\t"
                "movq       %%mm3, 24(%0)      \n\t"
                "movq       %%mm5, 40(%0)      \n\t"
                "movq       %%mm7, 56(%0)      \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                 : "memory"
               );
        }
    }
}

/*
 * Asm-text building blocks for the SSE2 vertical compose loop below.
 * Each macro expands to instruction text operating on four xmm registers,
 * i.e. 32 IDWTELEM (16-bit) elements per step.  The memory operands index
 * through REG_d (the loop counter) scaled by 2 bytes per element.
 */
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",2), %%"t0"      \n\t"\
        ""op" 16(%%"r",%%"REG_d",2), %%"t1"    \n\t"\
        ""op" 32(%%"r",%%"REG_d",2), %%"t2"    \n\t"\
        ""op" 48(%%"r",%%"REG_d",2), %%"t3"    \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubw %%"s0", %%"t0" \n\t"\
        "psubw %%"s1", %%"t1" \n\t"\
        "psubw %%"s2", %%"t2" \n\t"\
        "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", (%%"w",%%"REG_d",2)      \n\t"\
        "movdqa %%"s1", 16(%%"w",%%"REG_d",2)    \n\t"\
        "movdqa %%"s2", 32(%%"w",%%"REG_d",2)    \n\t"\
        "movdqa %%"s3", 48(%%"w",%%"REG_d",2)    \n\t"

#define snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddw %%"s0", %%"t0" \n\t"\
        "paddw %%"s1", %%"t1" \n\t"\
        "paddw %%"s2", %%"t2" \n\t"\
        "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"

429
void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
430 431
    long i = width;

432
    while(i & 0x1F)
433 434 435 436 437 438 439 440 441 442 443 444 445
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }

         asm volatile (
        "jmp 2f                                      \n\t"
        "1:                                          \n\t"

        "mov %6, %%"REG_a"                           \n\t"
446
        "mov %4, %%"REG_S"                           \n\t"
447

448
        snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
449 450
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
451
        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
452 453 454
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
455
        "psllw $15, %%xmm1                           \n\t"
456
        "psrlw $14, %%xmm1                           \n\t"
457 458 459
        "mov %5, %%"REG_a"                           \n\t"

        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
460
        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
461 462 463 464
        snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
        "mov %3, %%"REG_c"                           \n\t"
465
        snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
466 467
        snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
468
        snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
469 470
        "mov %2, %%"REG_a"                           \n\t"
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
471 472
        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
473 474

        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
475 476
        "psllw $15, %%xmm1                           \n\t"
        "psrlw $14, %%xmm1                           \n\t"
477
        "mov %1, %%"REG_S"                           \n\t"
478 479

        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
480
        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
481 482
        snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
483
        snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6")
484 485
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
486
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
487 488 489 490
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")

        "2:                                          \n\t"
491
        "sub $32, %%"REG_d"                          \n\t"
492 493 494 495
        "jge 1b                                      \n\t"
        :"+d"(i)
        :
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
496
        "%"REG_a"","%"REG_S"","%"REG_c"");
497 498 499
}

/*
 * MMX counterparts of the vertical-compose building blocks: same structure,
 * but 8-byte mm registers (16 elements per 4-register step instead of 32).
 * The pure register-to-register macros simply reuse the SSE2 text since the
 * instruction mnemonics are identical.
 */
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",2), %%"t0"   \n\t"\
        ""op" 8(%%"r",%%"REG_d",2), %%"t1"  \n\t"\
        ""op" 16(%%"r",%%"REG_d",2), %%"t2" \n\t"\
        ""op" 24(%%"r",%%"REG_d",2), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", (%%"w",%%"REG_d",2)   \n\t"\
        "movq %%"s1", 8(%%"w",%%"REG_d",2)  \n\t"\
        "movq %%"s2", 16(%%"w",%%"REG_d",2) \n\t"\
        "movq %%"s3", 24(%%"w",%%"REG_d",2) \n\t"

#define snow_vertical_compose_mmx_sra(n,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"

532
void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
533
    long i = width;
534
    while(i & 15)
535 536 537 538 539 540 541 542 543 544 545 546 547
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }

    asm volatile(
        "jmp 2f                                      \n\t"
        "1:                                          \n\t"

        "mov %6, %%"REG_a"                           \n\t"
548
        "mov %4, %%"REG_S"                           \n\t"
549

550
        snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
551 552
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
553
        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
554 555
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")

556 557
        "pcmpeqw %%mm1, %%mm1                        \n\t"
        "psllw $15, %%mm1                            \n\t"
558
        "psrlw $14, %%mm1                            \n\t"
559 560 561
        "mov %5, %%"REG_a"                           \n\t"

        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
562
        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
563 564 565 566
        snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
        "mov %3, %%"REG_c"                           \n\t"
567
        snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
568 569
        snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
570
        snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
571 572
        "mov %2, %%"REG_a"                           \n\t"
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
573 574
        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
575

576 577 578
        "pcmpeqw %%mm1, %%mm1                        \n\t"
        "psllw $15, %%mm1                            \n\t"
        "psrlw $14, %%mm1                            \n\t"
579
        "mov %1, %%"REG_S"                           \n\t"
580 581

        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
582
        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
583 584
        snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
585
        snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6")
586 587
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
588
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
589 590 591 592
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")

        "2:                                          \n\t"
593
        "sub $16, %%"REG_d"                          \n\t"
594 595 596 597
        "jge 1b                                      \n\t"
        :"+d"(i)
        :
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
598
        "%"REG_a"","%"REG_S"","%"REG_c"");
599
}
/*
 * Building blocks for the inner_add_yblock SSE2 routines below.  These macros
 * are textually pasted together inside one asm statement at each expansion
 * site, so they reference names from that site (sb, src_y, dst8, obmc, block,
 * b_h, src_stride, src_x) rather than taking them as C arguments.
 * _start/_accum multiply 8-bit source rows by OBMC weights into 16-bit lanes;
 * _end_* advance all pointers one (16-wide) or two (8-wide) rows and loop.
 */
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    long tmp;\
    asm volatile(\
             "mov  %7, %%"REG_c"             \n\t"\
             "mov  %6, %2                    \n\t"\
             "mov  %4, %%"REG_S"             \n\t"\
             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
             "psllw $15, %%xmm3              \n\t"\
             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
             "1:                             \n\t"\
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
             "add %3, %%"REG_D"              \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
             "add $32, %%"REG_S"             \n\t"\
             "add %%"REG_c", %0              \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
             "jnz 1b                         \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
             "sal $1, %%"REG_c"              \n\t"\
             "add $"PTR_SIZE"*2, %1          \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "sar $1, %%"REG_c"              \n\t"\
             "sub $2, %2                     \n\t"\
             snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
             "add $"PTR_SIZE"*1, %1          \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "dec %2                         \n\t"\
             snow_inner_add_yblock_sse2_end_common2

static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")

             "mov %0, %%"REG_d"              \n\t"
             "movdqa (%%"REG_D"), %%xmm0     \n\t"
             "movdqa %%xmm1, %%xmm2          \n\t"

             "punpckhwd %%xmm7, %%xmm1       \n\t"
             "punpcklwd %%xmm7, %%xmm2       \n\t"
             "paddd %%xmm2, %%xmm0           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
             "paddd %%xmm1, %%xmm2           \n\t"
             "paddd %%xmm3, %%xmm0           \n\t"
             "paddd %%xmm3, %%xmm2           \n\t"

             "mov %1, %%"REG_D"              \n\t"
             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
704
             "add %3, %%"REG_D"              \n\t"
705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729

             "movdqa (%%"REG_D"), %%xmm4     \n\t"
             "movdqa %%xmm5, %%xmm6          \n\t"
             "punpckhwd %%xmm7, %%xmm5       \n\t"
             "punpcklwd %%xmm7, %%xmm6       \n\t"
             "paddd %%xmm6, %%xmm4           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
             "paddd %%xmm5, %%xmm6           \n\t"
             "paddd %%xmm3, %%xmm4           \n\t"
             "paddd %%xmm3, %%xmm6           \n\t"

             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
             "packssdw %%xmm2, %%xmm0        \n\t"
             "packuswb %%xmm7, %%xmm0        \n\t"
             "movq %%xmm0, (%%"REG_d")       \n\t"

             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
             "packssdw %%xmm6, %%xmm4        \n\t"
             "packuswb %%xmm7, %%xmm4        \n\t"
             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
}

static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
731 732 733 734 735 736 737 738
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")

             "mov %0, %%"REG_d"              \n\t"
739 740 741 742 743 744 745 746 747 748 749
             "psrlw $4, %%xmm1               \n\t"
             "psrlw $4, %%xmm5               \n\t"
             "paddw   (%%"REG_D"), %%xmm1    \n\t"
             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
             "paddw %%xmm3, %%xmm1           \n\t"
             "paddw %%xmm3, %%xmm5           \n\t"
             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
             "packuswb %%xmm5, %%xmm1        \n\t"

             "movdqu %%xmm1, (%%"REG_d")       \n\t"
750 751 752 753 754

snow_inner_add_yblock_sse2_end_16
}

#define snow_inner_add_yblock_mmx_header \
755
    IDWTELEM * * dst_array = sb->line + src_y;\
756
    long tmp;\
757
    asm volatile(\
758 759 760
             "mov  %7, %%"REG_c"             \n\t"\
             "mov  %6, %2                    \n\t"\
             "mov  %4, %%"REG_S"             \n\t"\
761 762
             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
             "pcmpeqd %%mm3, %%mm3           \n\t"\
763 764
             "psllw $15, %%mm3               \n\t"\
             "psrlw $12, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
765 766 767
             "1:                             \n\t"\
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
768
             "add %3, %%"REG_D"              \n\t"
769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
             "punpcklbw %%mm7, %%mm0       \n\t"\
             "punpcklbw %%mm7, %%mm4       \n\t"\
             "pmullw %%mm0, %%"out_reg1"    \n\t"\
             "pmullw %%mm4, %%"out_reg2"    \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
             "paddusw %%mm2, %%mm1         \n\t"\
             "paddusw %%mm6, %%mm5         \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
             "mov %0, %%"REG_d"              \n\t"\
790 791 792 793 794 795 796 797 798 799
             "psrlw $4, %%mm1                \n\t"\
             "psrlw $4, %%mm5                \n\t"\
             "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
             "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
             "paddw %%mm3, %%mm1             \n\t"\
             "paddw %%mm3, %%mm5             \n\t"\
             "psraw $4, %%mm1                \n\t"\
             "psraw $4, %%mm5                \n\t"\
             "packuswb %%mm5, %%mm1          \n\t"\
             "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
800 801 802 803 804 805 806 807 808

#define snow_inner_add_yblock_mmx_end(s_step)\
             "add $"s_step", %%"REG_S"             \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"\
             "add $"PTR_SIZE"*1, %1          \n\t"\
             "add %%"REG_c", %0              \n\t"\
809
             "dec %2                         \n\t"\
810
             "jnz 1b                         \n\t"\
811
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
812
             :\
813
             "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
814
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
815

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
817 818 819 820 821 822 823 824 825 826
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
828 829 830 831 832 833 834 835 836 837 838 839
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")

snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
840
snow_inner_add_yblock_mmx_mix("16", "8")
841 842 843
snow_inner_add_yblock_mmx_end("32")
}

void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
845 846 847 848 849 850 851 852 853 854 855 856 857
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){

    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
859 860 861 862 863 864 865 866
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}