diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 9b9a3ca846fe84095e1b71efc2bab763746b4021..fbe2c684e1f0249340bc4b168a6581d29346467a 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -45,6 +45,7 @@ void j_rev_dct (DCTELEM *data); void ff_fdct_mmx(DCTELEM *block); void ff_fdct_mmx2(DCTELEM *block); +void ff_fdct_sse2(DCTELEM *block); /* encoding scans */ extern const uint8_t ff_alternate_horizontal_scan[64]; diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 4ff75f763e3d5b6630ce3080b85662f4279a70d5..6c6f108372c93c2a7a70e391669df227a8dc183d 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -2032,7 +2032,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #ifdef CONFIG_ENCODERS if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - if(mm_flags & MM_MMXEXT){ + if(mm_flags & MM_SSE2){ + c->fdct = ff_fdct_sse2; + }else if(mm_flags & MM_MMXEXT){ c->fdct = ff_fdct_mmx2; }else{ c->fdct = ff_fdct_mmx; diff --git a/libavcodec/i386/fdct_mmx.c b/libavcodec/i386/fdct_mmx.c index 2f0f82b009b8cd1dbf2dad4417124d3979073f04..8771607730e5353a42c0dd3ef768cff1f36e5bcb 100644 --- a/libavcodec/i386/fdct_mmx.c +++ b/libavcodec/i386/fdct_mmx.c @@ -2,11 +2,16 @@ * MMX optimized forward DCT * The gcc porting is Copyright (c) 2001 Fabrice Bellard. * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer + * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. * * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT * * Intel Application Note AP-922 - fast, precise implementation of DCT * http://developer.intel.com/vtune/cbts/appnotes.htm + * + * Also of inspiration: + * a page about fdct at http://www.geocities.com/ssavekar/dct.htm + * Skal's fdct at http://skal.planet-d.net/coding/dct.html */ #include "../common.h" #include "mmx.h" @@ -27,10 +32,8 @@ #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy #define SHIFT_FRW_COL BITS_FRW_ACC #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) -//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) -//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1) -#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) +//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) //concatenated table, for forward DCT transformation static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { @@ -38,17 +41,17 @@ static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5 }; -static const int16_t cos_4_16[4] ATTR_ALIGN(8) = { - -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5 -}; static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = { 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 }; -static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; +static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; + static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; +static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; + static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table 16384, 16384, -8867, -21407, 16384, 16384, 21407, 8867, @@ -123,6 +126,133 @@ static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff 6270, 26722, 6270, -17855, }; +static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table +#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ + C4, C4, C5, C7, C2, C6, C3, -C7, \ + -C4, C4, C7, C3, C6, -C2, C7, -C5, \ + C4, -C4, C5, -C1, C2, -C6, C3, -C1, +// c1..c7 * cos(pi/4) * 2^15 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 +}; static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) { @@ -203,6 +333,69 @@ static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) movq_r2m(mm3, *(out + offset + 7 * 8)); } + +static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) +{ + asm volatile( + ".macro FDCT_ROW_SSE2_H1 i t \n\t" + "movq \\i(%0), %%xmm2 \n\t" + "movq \\i+8(%0), %%xmm0 \n\t" + "movdqa \\t+32(%1), %%xmm3 \n\t" + "movdqa \\t+48(%1), %%xmm7 \n\t" + "movdqa \\t(%1), %%xmm4 \n\t" + "movdqa \\t+16(%1), %%xmm5 \n\t" + ".endm \n\t" + ".macro FDCT_ROW_SSE2_H2 i t \n\t" + "movq \\i(%0), %%xmm2 \n\t" + "movq \\i+8(%0), %%xmm0 \n\t" + "movdqa \\t+32(%1), %%xmm3 \n\t" + "movdqa \\t+48(%1), %%xmm7 \n\t" + ".endm \n\t" + ".macro FDCT_ROW_SSE2 i \n\t" + "movq %%xmm2, %%xmm1 \n\t" + "pshuflw $27, %%xmm0, %%xmm0 \n\t" + "paddsw %%xmm0, %%xmm1 \n\t" + "psubsw %%xmm0, %%xmm2 \n\t" + "punpckldq %%xmm2, %%xmm1 \n\t" + "pshufd $78, %%xmm1, %%xmm2 \n\t" + "pmaddwd %%xmm2, %%xmm3 \n\t" + "pmaddwd %%xmm1, %%xmm7 \n\t" + "pmaddwd %%xmm5, %%xmm2 \n\t" + "pmaddwd %%xmm4, %%xmm1 \n\t" + "paddd %%xmm7, %%xmm3 \n\t" + "paddd %%xmm2, %%xmm1 \n\t" + "paddd %%xmm6, %%xmm3 \n\t" + "paddd %%xmm6, %%xmm1 \n\t" + "psrad %3, %%xmm3 \n\t" + "psrad %3, %%xmm1 \n\t" + "packssdw %%xmm3, %%xmm1 \n\t" + "movdqa %%xmm1, \\i(%4) \n\t" + ".endm \n\t" + "movdqa (%2), %%xmm6 \n\t" + "FDCT_ROW_SSE2_H1 0 0 \n\t" + "FDCT_ROW_SSE2 0 \n\t" + "FDCT_ROW_SSE2_H2 64 0 \n\t" + "FDCT_ROW_SSE2 64 \n\t" + + "FDCT_ROW_SSE2_H1 16 64 \n\t" + "FDCT_ROW_SSE2 16 \n\t" + "FDCT_ROW_SSE2_H2 112 64 \n\t" + "FDCT_ROW_SSE2 112 \n\t" + + "FDCT_ROW_SSE2_H1 32 128 \n\t" + "FDCT_ROW_SSE2 32 \n\t" + "FDCT_ROW_SSE2_H2 96 128 \n\t" + "FDCT_ROW_SSE2 96 \n\t" + + "FDCT_ROW_SSE2_H1 48 192 \n\t" + "FDCT_ROW_SSE2 48 \n\t" + "FDCT_ROW_SSE2_H2 80 192 \n\t" + "FDCT_ROW_SSE2 80 \n\t" + : + : "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + ); +} + static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) { pshufw_m2r(*(in + 4), mm5, 0x1B); @@ -341,3 +534,18 @@ void ff_fdct_mmx2(int16_t *block) out += 8; } } + +void ff_fdct_sse2(int16_t *block) +{ + int64_t align_tmp[16] ATTR_ALIGN(8); + int16_t * const block_tmp= (int16_t*)align_tmp; + int16_t *block1; + int i; + + block1 = block_tmp; + fdct_col(block, block1, 0); + fdct_col(block, block1, 4); + + fdct_row_sse2(block1, block); +} + diff --git a/libavcodec/i386/mpegvideo_mmx.c b/libavcodec/i386/mpegvideo_mmx.c index daffd1d8d6568394eede69c8e3832849a196d239..1c0e9f5ae3cfd3ef1b6aeb2a4d936b0c297872cc 100644 --- a/libavcodec/i386/mpegvideo_mmx.c +++ b/libavcodec/i386/mpegvideo_mmx.c @@ -683,6 +683,12 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ #define RENAMEl(a) a ## _mmx2 #include "mpegvideo_mmx_template.c" +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSE2 +#define RENAMEl(a) a ## _sse2 +#include "mpegvideo_mmx_template.c" + void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { @@ -704,7 +710,9 @@ void MPV_common_init_mmx(MpegEncContext *s) } if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - if(mm_flags & MM_MMXEXT){ + if(mm_flags & MM_SSE2){ + s->dct_quantize= dct_quantize_SSE2; + } else if(mm_flags & MM_MMXEXT){ s->dct_quantize= dct_quantize_MMX2; } else { s->dct_quantize= dct_quantize_MMX;