diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index c91dd8eb69b4238a7a0b90bc65cdbf5d1986d097..747c6456666b6525486d2770f1cc7906f9945fe6 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -31,6 +31,8 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+cextern pb_80
+
 SECTION_TEXT
 
 %macro SCALARPRODUCT 0
@@ -573,3 +575,62 @@ CLEAR_BLOCKS 0
 INIT_XMM sse
 %define ZERO xorps
 CLEAR_BLOCKS 1
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                                  int line_size)
+;--------------------------------------------------------------------------
+
+; Pack four rows of eight int16 coefficients, read from byte offset %1 of
+; blockq, to bytes with signed saturation, add m0 to every byte and store
+; the result as four 8-byte rows starting at pixelsq.
+; Expects m0 = [pb_80] and lsize3q = lsizeq * 3.
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+    mova     m1, [blockq+mmsize*0+%1]
+    mova     m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m3, [blockq+mmsize*4+%1]
+    mova     m4, [blockq+mmsize*6+%1]
+%endif
+    packsswb m1, [blockq+mmsize*1+%1]
+    packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packsswb m3, [blockq+mmsize*5+%1]
+    packsswb m4, [blockq+mmsize*7+%1]
+%endif
+    paddb    m1, m0
+    paddb    m2, m0
+%if mmsize == 8
+    paddb    m3, m0
+    paddb    m4, m0
+    movq     [pixelsq+lsizeq*0], m1
+    movq     [pixelsq+lsizeq*1], m2
+    movq     [pixelsq+lsizeq*2], m3
+    movq     [pixelsq+lsize3q ], m4
+%else
+    movq     [pixelsq+lsizeq*0], m1 ; low  half of m1: first row
+    movhps   [pixelsq+lsizeq*1], m1 ; high half of m1: second row
+    movq     [pixelsq+lsizeq*2], m2 ; low  half of m2: third row
+    movhps   [pixelsq+lsize3q ], m2 ; high half of m2: fourth row
+%endif
+%endmacro
+
+; %1: XMM register count forwarded to cglobal
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+    ; line_size is a 32-bit int in the C prototype; on x86-64 the upper half
+    ; of its register is not guaranteed to be sign-extended, so widen it
+    ; before it is used in 64-bit address arithmetic.
+    movsxdifnidn lsizeq, lsized
+    mova     m0, [pb_80]
+    lea      lsize3q, [lsizeq*3]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+    lea      pixelsq, [pixelsq+lsizeq*4]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 4461ae464f6814dea3d17909cdc55b6334f37fc9..e274e671d743477aa302b7260cf71554a8650460 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -530,7 +530,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
 {
 #if HAVE_MMX_INLINE
     c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
-    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
 
     if (!high_bit_depth) {
@@ -550,6 +549,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->clear_blocks = ff_clear_blocks_mmx;
     }
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
 #endif /* HAVE_MMX_EXTERNAL */
 }
 
@@ -627,6 +627,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
     }
     c->bswap_buf = ff_bswap32_buf_sse2;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index a9c584d88a8d1d69dc855a079f8ff0a9a65d5979..fa77a5c93821d644715b553b028b2f6a0dbd6be6 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -94,42 +94,6 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
         : "memory");
 }
 
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-          NAMED_CONSTRAINTS_ADD(ff_pb_80)
-        : "memory");
-}
-
 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                int line_size)
 {
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 6a50a09e5d9d2a94ff2a81c2a0c4ece5c89abfa1..1f4711dd2df0602ca61bc156cbc69bb2e2d5a851 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -37,6 +37,8 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                int line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                       int line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                       int line_size);
 
 void ff_clear_block_mmx(int16_t *block);
 void ff_clear_block_sse(int16_t *block);