;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
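; scan8[i] maps a 4x4 block index (luma 0-15, then the chroma blocks) to its
; position in the non_zero_count cache, which is laid out with a stride of 8;
; the add16/add8 loops below use it to index the nnzc[] argument.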
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
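; IDCT4_ADD: 4x4 inverse transform + add.  One IDCT4_1D pass over the rows, a
; 4x4 transpose, the +32 rounding bias added to the first row (it reaches
; every output through the second pass), a second IDCT4_1D pass, then
; STORE_DIFFx2 shifts right by 6 and adds the result to dst with saturation.
; Each 1-D pass computes the same butterfly as the scalar C idct:
;     z0 = b0 + b2            z2 = (b1 >> 1) - b3
;     z1 = b0 - b2            z3 = b1 + (b3 >> 1)
;     out = { z0+z3, z1+z2, z1-z2, z0-z3 }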
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8_mmx, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
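; One pass of the 8-point transform.  On entry m1-m3 and m5-m7 hold input
; rows 1,2,3,5,6,7; rows 0 and 4 are taken from the memory operands %1/%2
; because the MMX version only has eight registers to work with.  The final
; SWAP leaves the eight outputs in m0-m7 in natural order.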
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
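; The MMX 8x8 idct works on one 4-column half of the block at a time:
; IDCT8_ADD_MMX_START runs the first 1-D pass on a half, transposes it and
; stores it to a scratch buffer; IDCT8_ADD_MMX_END runs the second pass on
; the scratch data and adds the result (>>6) to dst.  The callers fold the
; +32 rounding bias into the DC coefficient beforehand (add word [rN], 32).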
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8_mmx, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
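; The SSE2 version keeps the whole 8x8 block in xmm registers: first pass,
; full 8x8 transpose, +32 bias, second pass, then per-row STORE_DIFF.  On
; x86-64 two extra registers (m8/m9) hold the rows that the 32-bit path has
; to spill to the block buffer instead.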
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%ifndef ARCH_X86_64
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%ifndef ARCH_X86_64
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMX2_INIT 2-3
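; Computes dc = (block[0] + 32) >> 6 and broadcasts it as packed bytes:
; m0 holds max(dc, 0) and m1 holds max(-dc, 0), so a saturating add of m0
; followed by a saturating subtract of m1 adds the signed dc with clipping.
; The 2-argument form loads the DC itself; the 3-argument form expects it
; already sign-extended in %3.  The scratch register is left holding
; 3*stride for DC_ADD_MMX2_OP.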
%if %0 == 2
    movsx        %1, word [%1]
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
%else
    add          %3, 32
    sar          %3, 6
    movd         m0, %3d
    lea          %3, [%2*3]
%endif
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

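; DC_ADD_MMX2_OP adds the broadcast DC to four rows of pixels: %1 is the
; load/store mnemonic (movh/mova/movq), %2 the dst pointer, %3 the stride
; and %4 must hold 3*stride (set up by DC_ADD_MMX2_INIT).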
%macro DC_ADD_MMX2_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET

; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
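; Loops over the 16 luma 4x4 blocks: block i is skipped unless
; nnzc[scan8[i]] is non-zero, otherwise IDCT4_ADD is run at
; dst + block_offset[i], with the coefficients at block + i*32.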
cglobal h264_idct_add16_8_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
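; Same idea for the four 8x8 blocks: step i by 4 and the coefficient pointer
; by 128 bytes, and run the two-half MMX 8x8 idct on each block whose nnzc
; entry is non-zero.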
cglobal h264_idct8_add4_8_mmx, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
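; Same loop as the MMX version, with a fast path: an nnzc value of 1 marks a
; block where only the DC coefficient can be non-zero, so the much cheaper
; DC-only add is used for it (after checking that the DC really is non-zero).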
cglobal h264_idct_add16_8_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
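; Intra variant: the DC coefficient is transformed separately for intra
; macroblocks, so a block has to be processed if either nnzc[scan8[i]] or
; the stored DC (block[0]) is non-zero, hence the extra `or r6w, word [r2]`.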
cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    add     dst_reg, r0
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
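; SSE2 version of the 8x8 loop.  The DC-only fast path works on mm registers,
; so it temporarily switches to INIT_MMX and back to INIT_XMM around it.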
cglobal h264_idct8_add4_8_sse2, 5, 7, 10
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    add     dst_reg, r0
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov    dst_regd, dword [r1+r5*4]
    add     dst_reg, r0
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX
h264_idct_add8_mmx_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
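; Chroma version: dst is an array of plane pointers (uint8_t **dest).  The
; coefficient pointer is advanced to the chroma blocks (block + 512), the
; first plane's four 4x4 blocks are handled by the _plane helper, then the
; pointer array index and block index are advanced and the second plane is
; done the same way.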
cglobal h264_idct_add8_8_mmx, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmx2_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_mmx2, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
    mov          r5, 32
    add          r2, 384
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET

INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
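; DC-only add for two horizontally adjacent 4x4 blocks at once: the two DCs
; are read from [r2] and [r2+32], rounded with (dc + 32) >> 6, and added
; with clipping to an 8x4 pixel area at r0.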
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]          ;  0 0 X D
    punpcklwd    m0, [r2+32]          ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
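; Transforms two horizontally adjacent 4x4 blocks at once: each xmm register
; holds one row of both blocks (movq + movhps), so a single IDCT4_1D pass and
; TRANSPOSE2x4x4W handle both, and the result is added to an 8x4 area at r0.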
x264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor  m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
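; %2 is the byte offset of scan8[2*%1] inside nnzc: loading a 16-bit word
; there tests the nnz flags of two adjacent blocks at once, and [r1+%1*8]
; is block_offset[2*%1], the dst offset of that pair.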
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add         r0, r10
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov        r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r10]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 == 1
    add         r2, 384+64
%elif %1 < 3
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_sse2, 5, 7, 8
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
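; 4x4 Hadamard transform (WALSH4_1D) over rows and columns, then each
; coefficient is dequantized as (x * qmul + 128) >> 8 and scattered into the
; DC position of the corresponding luma block (every 32 bytes of output),
; matching the scalar C reference.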

%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
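; Interleaves each coefficient with a 1 from pw_1 so that pmaddwd against
; t3d (qmul in the low word, 128 in the high word) computes
; coeff*qmul + 128 in one step; psrad by %3 then finishes the >> 8.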
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endmacro

%macro STORE_WORDS_MMX 5
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endmacro

%macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS_MMX m0,  0,  1,  4,  5
    STORE_WORDS_MMX m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS_MMX m2,  8,  9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro

%macro STORE_WORDS_SSE 9
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%endmacro

%macro DEQUANT_STORE_SSE2 1
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS_SSE xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS_SSE xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%endmacro

%macro IDCT_DC_DEQUANT 2
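; %1 = function name suffix, %2 = number of xmm registers used.  qmul values
; above 32767 would overflow the 16-bit multiply, so .big_qmul pre-shifts
; qmul down by up to 7 bits and reduces the final right shift to compensate.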
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%ifdef WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
%ifidn %1,mmx
    DEQUANT_STORE_MMX 8
%else
    DEQUANT_STORE_SSE2 8
%endif
    RET
.big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
%ifidn %1,mmx
    movd        m6, t1d
    DEQUANT_STORE_MMX m6
%else
    movd      xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro

INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7