提交 9f00b1cb 编写于 作者: Daniel Kang 提交者: Luca Barbato

dsputilenc: x86: Convert pixel inline asm to yasm

Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
上级 c7df1532
......@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
paddd m7, m1
movd eax, m7 ; return value
RET
INIT_MMX mmx
; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
; Zero-extends 8 rows of 8 pixel bytes into 16-bit words and stores the
; resulting 128 bytes to block.
; r0 = block, r1 = pixels, r2 = line_size, r3 = negative byte offset into block
cglobal get_pixels, 3,4
movsxdifnidn r2, r2d
; Bias r0 to the end of the 128-byte output and let r3 run from -128 up
; to 0, so the sign flag left by "add r3, 32" doubles as the loop test.
add r0, 128
mov r3, -128
; m7 = 0, interleaved below as the high byte of every output word
pxor m7, m7
.loop:
; Two source rows are handled per iteration (rows at r1 and r1+line_size).
mova m0, [r1]
mova m2, [r1+r2]
mova m1, m0
mova m3, m2
; low/high four bytes of each row -> four words each
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
mova [r0+r3+ 0], m0
mova [r0+r3+ 8], m1
mova [r0+r3+16], m2
mova [r0+r3+24], m3
; advance the source by two rows and the output by 32 bytes
lea r1, [r1+r2*2]
add r3, 32
; keep looping while the output offset is still negative (4 iterations)
js .loop
REP_RET
INIT_XMM sse2
; SSE2 variant of get_pixels: fully unrolled, four rows per half.
; Each row of pixel bytes is zero-extended to words and written as one
; 16-byte store, filling block[0x00..0x7f].
; r0 = block, r1 = pixels, r2 = line_size, r3 = 3 * line_size
cglobal get_pixels, 3, 4
movsxdifnidn r2, r2d
; r3 = 3*line_size, used to address the fourth row of each group
lea r3, [r2*3]
; m4 = 0, interleaved as the high byte of every output word
pxor m4, m4
; rows 0-3
movh m0, [r1]
movh m1, [r1+r2]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
; step the source pointer to row 4
lea r1, [r1+r2*4]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0], m0
mova [r0+0x10], m1
mova [r0+0x20], m2
mova [r0+0x30], m3
; rows 4-7
movh m0, [r1]
movh m1, [r1+r2*1]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0+0x40], m0
mova [r0+0x50], m1
mova [r0+0x60], m2
mova [r0+0x70], m3
RET
INIT_MMX mmx
; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
; Stores s1[i] - s2[i] as 16-bit words for 8 rows of 8 pixel bytes
; (128 output bytes in total).
; r0 = block, r1 = s1, r2 = s2, r3 = stride, r4 = negative byte offset into block
cglobal diff_pixels, 4,5
movsxdifnidn r3, r3d
; m7 = 0, interleaved as the high byte when widening bytes to words
pxor m7, m7
; Bias r0 to the end of the output; r4 counts from -128 up to 0 so the
; zero flag left by "add r4, 16" terminates the loop.
add r0, 128
mov r4, -128
.loop:
; one row from each source per iteration
mova m0, [r1]
mova m2, [r2]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
; word-wise difference s1 - s2
psubw m0, m2
psubw m1, m3
mova [r0+r4+0], m0
mova [r0+r4+8], m1
add r1, r3
add r2, r3
add r4, 16
; loop until the output offset reaches zero (8 iterations)
jne .loop
REP_RET
INIT_MMX mmx
; pix_sum16_mmx(uint8_t * pix, int line_size)
; Sums the pixel bytes of 16-byte-wide rows and returns the total,
; masked to 16 bits, in eax.
; r0 = pix biased by 16*line_size, r1 = line_size, r2 = negative row offset
cglobal pix_sum16, 2, 3
movsxdifnidn r1, r1d
; r2 = -16 * line_size; r0 is moved forward by the same amount so that
; [r0+r2] initially addresses the first row.
mov r2, r1
neg r2
shl r2, 4
sub r0, r2
; m7 = 0 for byte->word widening, m6 = four running word sums
pxor m7, m7
pxor m6, m6
.loop:
; each 8-byte half of the row is loaded twice: one copy for the low
; four bytes, one for the high four bytes
mova m0, [r0+r2+0]
mova m1, [r0+r2+0]
mova m2, [r0+r2+8]
mova m3, [r0+r2+8]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m1, m0
paddw m3, m2
paddw m3, m1
paddw m6, m3
; next row; loop while the offset is still negative
add r2, r1
js .loop
; horizontal add of the four word lanes of m6 into its lowest word
mova m5, m6
psrlq m6, 32
paddw m6, m5
mova m5, m6
psrlq m6, 16
paddw m6, m5
movd eax, m6
; only the low word holds the sum; discard the rest
and eax, 0xffff
RET
INIT_MMX mmx
; int pix_norm1_mmx(uint8_t *pix, int line_size)
; Returns in eax the sum of the squared pixel values over 16 rows of
; 16 bytes each.
; r0 = pix (advanced by line_size per row), r1 = line_size, r2 = row counter
; Only r0-r2 are referenced, so three general-purpose registers are
; requested; asking for a fourth reserved a register that was never used.
cglobal pix_norm1, 2, 3
movsxdifnidn r1, r1d
mov r2, 16
; m0 = 0 for byte->word widening, m7 = two running dword sums
pxor m0, m0
pxor m7, m7
.loop:
mova m2, [r0+0]
mova m3, [r0+8]
; split each 8-byte load into low and high groups of four words
mova m1, m2
punpckhbw m1, m0
punpcklbw m2, m0
mova m4, m3
punpckhbw m3, m0
punpcklbw m4, m0
; pmaddwd of a register with itself squares each word and adds
; adjacent pairs, giving two dword partial sums per register
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
paddd m7, m2
add r0, r1
paddd m7, m4
dec r2
jne .loop
; fold the high dword of the accumulator onto the low dword
mova m1, m7
psrlq m7, 32
paddd m1, m7
movd eax, m1
RET
......@@ -30,181 +30,14 @@
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"
/* Prototypes for the ff_-prefixed pixel routines; they are assigned to the
 * DSPContext function pointers in ff_dsputilenc_init_mmx() under HAVE_YASM. */
void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM
/**
 * Inline-asm MMX version of get_pixels: zero-extends 8 rows of 8 pixel
 * bytes to 16-bit words and stores them to block.
 *
 * Operands: %0 = pixels (advanced inside the loop), %1 = block+64 (end of
 * the 128-byte output), %2 = line_size, %3 = 2*line_size. REG_a is the
 * output byte offset and runs from -128 up to 0.
 */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
__asm__ volatile(
"mov $-128, %%"REG_a" \n\t"
"pxor %%mm7, %%mm7 \n\t"
".p2align 4 \n\t"
"1: \n\t"
/* load two consecutive rows */
"movq (%0), %%mm0 \n\t"
"movq (%0, %2), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
/* widen bytes to words using the zero register mm7 */
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"movq %%mm0, (%1, %%"REG_a") \n\t"
"movq %%mm1, 8(%1, %%"REG_a") \n\t"
"movq %%mm2, 16(%1, %%"REG_a") \n\t"
"movq %%mm3, 24(%1, %%"REG_a") \n\t"
/* advance the source by two rows and the output by 32 bytes */
"add %3, %0 \n\t"
"add $32, %%"REG_a" \n\t"
/* loop while the output offset is still negative */
"js 1b \n\t"
: "+r" (pixels)
: "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
: "%"REG_a
);
}
/**
 * Inline-asm SSE2 version of get_pixels, fully unrolled: eight rows of
 * 8 pixel bytes are zero-extended to words and written with one movdqa
 * per row to block[0..127 bytes].
 *
 * Operands: %0 = pixels (advanced by four rows midway), %1 = block,
 * %2 = line_size, %3 = 3*line_size.
 */
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
__asm__ volatile(
"pxor %%xmm4, %%xmm4 \n\t"
/* rows 0-3 */
"movq (%0), %%xmm0 \n\t"
"movq (%0, %2), %%xmm1 \n\t"
"movq (%0, %2,2), %%xmm2 \n\t"
"movq (%0, %3), %%xmm3 \n\t"
"lea (%0,%2,4), %0 \n\t"
"punpcklbw %%xmm4, %%xmm0 \n\t"
"punpcklbw %%xmm4, %%xmm1 \n\t"
"punpcklbw %%xmm4, %%xmm2 \n\t"
"punpcklbw %%xmm4, %%xmm3 \n\t"
"movdqa %%xmm0, (%1) \n\t"
"movdqa %%xmm1, 16(%1) \n\t"
"movdqa %%xmm2, 32(%1) \n\t"
"movdqa %%xmm3, 48(%1) \n\t"
/* rows 4-7 */
"movq (%0), %%xmm0 \n\t"
"movq (%0, %2), %%xmm1 \n\t"
"movq (%0, %2,2), %%xmm2 \n\t"
"movq (%0, %3), %%xmm3 \n\t"
"punpcklbw %%xmm4, %%xmm0 \n\t"
"punpcklbw %%xmm4, %%xmm1 \n\t"
"punpcklbw %%xmm4, %%xmm2 \n\t"
"punpcklbw %%xmm4, %%xmm3 \n\t"
"movdqa %%xmm0, 64(%1) \n\t"
"movdqa %%xmm1, 80(%1) \n\t"
"movdqa %%xmm2, 96(%1) \n\t"
"movdqa %%xmm3, 112(%1) \n\t"
: "+r" (pixels)
: "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
);
}
/**
 * Inline-asm MMX version of diff_pixels: for 8 rows of 8 pixel bytes,
 * stores s1[i] - s2[i] as 16-bit words to block.
 *
 * Operands: %0 = s1, %1 = s2 (both advanced by stride per row),
 * %2 = block+64 (end of the 128-byte output), %3 = stride. REG_a is the
 * output byte offset and runs from -128 up to 0.
 */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0), %%mm0 \n\t"
"movq (%1), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
/* widen bytes to words using the zero register mm7 */
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
/* word-wise s1 - s2 */
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"movq %%mm0, (%2, %%"REG_a") \n\t"
"movq %%mm1, 8(%2, %%"REG_a") \n\t"
"add %3, %0 \n\t"
"add %3, %1 \n\t"
"add $16, %%"REG_a" \n\t"
/* loop until the output offset reaches zero */
"jnz 1b \n\t"
: "+r" (s1), "+r" (s2)
: "r" (block+64), "r" ((x86_reg)stride)
: "%"REG_a
);
}
/**
 * Inline-asm MMX version of pix_sum16: sums the pixel bytes of h = 16 rows
 * of 16 bytes and returns the total masked to 16 bits.
 *
 * Operands: %0 = sum (result), %1 = index (starts at -line_size*h and is
 * stepped by line_size), %2 = pix - index (base such that %2+%1 starts at
 * pix), %3 = line_size.
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
const int h=16;
int sum;
x86_reg index= -line_size*h;
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm6, %%mm6 \n\t"
"1: \n\t"
/* each 8-byte half of the row is loaded twice: low and high unpack */
"movq (%2, %1), %%mm0 \n\t"
"movq (%2, %1), %%mm1 \n\t"
"movq 8(%2, %1), %%mm2 \n\t"
"movq 8(%2, %1), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"paddw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm6 \n\t"
"add %3, %1 \n\t"
/* loop while the row offset is still negative */
" js 1b \n\t"
/* horizontal add of the four word lanes of mm6 */
"movq %%mm6, %%mm5 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"movq %%mm6, %%mm5 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm5, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
/* only the low word holds the sum */
"andl $0xFFFF, %0 \n\t"
: "=&r" (sum), "+r" (index)
: "r" (pix - index), "r" ((x86_reg)line_size)
);
return sum;
}
/**
 * Inline-asm MMX version of pix_norm1: returns the sum of the squared
 * pixel values over 16 rows of 16 bytes.
 *
 * Operands: %0 = pix (advanced by line_size per row), %1 = tmp (result),
 * %2 = line_size. ecx is the row counter and is declared as clobbered.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
int tmp;
__asm__ volatile (
"movl $16,%%ecx\n"
"pxor %%mm0,%%mm0\n"
"pxor %%mm7,%%mm7\n"
"1:\n"
"movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
"movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
"pmaddwd %%mm3,%%mm3\n"
"pmaddwd %%mm4,%%mm4\n"
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
pix2^2+pix3^2+pix6^2+pix7^2) */
"paddd %%mm3,%%mm4\n"
"paddd %%mm2,%%mm7\n"
"add %2, %0\n"
"paddd %%mm4,%%mm7\n"
"dec %%ecx\n"
"jnz 1b\n"
"movq %%mm7,%%mm1\n"
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%1\n"
: "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
return tmp;
}
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
__asm__ volatile (
......@@ -1111,10 +944,23 @@ hadamard_func(ssse3)
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_YASM
if (EXTERNAL_MMX(mm_flags)) {
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_mmx;
c->diff_pixels = ff_diff_pixels_mmx;
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_SSE2(mm_flags))
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_sse2;
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_MMX) {
const int dct_algo = avctx->dct_algo;
if (avctx->bits_per_raw_sample <= 8 &&
......@@ -1128,15 +974,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
if (bit_depth <= 8)
c->get_pixels = get_pixels_mmx;
c->diff_pixels = diff_pixels_mmx;
c->pix_sum = pix_sum16_mmx;
c->diff_bytes= diff_bytes_mmx;
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
c->pix_norm1 = pix_norm1_mmx;
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
c->vsad[4]= vsad_intra16_mmx;
......@@ -1166,8 +1007,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
if(mm_flags & AV_CPU_FLAG_SSE2){
if (bit_depth <= 8)
c->get_pixels = get_pixels_sse2;
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册