SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)

Originally committed as revision 2729 to svn://svn.ffmpeg.org/ffmpeg/trunk

SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
Originally committed as revision 2729 to svn://svn.ffmpeg.org/ffmpeg/trunk
8fd19ab2 · Michael Niedermayer · 5a603607 · 8fd19ab2 · 8fd19ab2 · 8fd19ab2
4 changed files
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -45,6 +45,7 @@ void j_rev_dct (DCTELEM *data);

 void ff_fdct_mmx(DCTELEM *block);
 void ff_fdct_mmx2(DCTELEM *block);
+void ff_fdct_sse2(DCTELEM *block);

 /* encoding scans */
 extern const uint8_t ff_alternate_horizontal_scan[64];

--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2032,7 +2032,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)

 #ifdef CONFIG_ENCODERS
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
-            if(mm_flags & MM_MMXEXT){
+            if(mm_flags & MM_SSE2){
+                c->fdct = ff_fdct_sse2;
+	    }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;

--- a/libavcodec/i386/fdct_mmx.c
+++ b/libavcodec/i386/fdct_mmx.c
@@ -2,11 +2,16 @@
 * MMX optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
 *
 * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 * 
 *  Intel Application Note AP-922 - fast, precise implementation of DCT
 *        http://developer.intel.com/vtune/cbts/appnotes.htm
+ *
+ * Also of inspiration:
+ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
+ * Skal's fdct at http://skal.planet-d.net/coding/dct.html
 */
 #include "../common.h"
 #include "mmx.h"
@@ -27,10 +32,8 @@
 #define BITS_FRW_ACC	3 //; 2 or 3 for accuracy
 #define SHIFT_FRW_COL	BITS_FRW_ACC
 #define SHIFT_FRW_ROW	(BITS_FRW_ACC + 17 - 3)
-//#define RND_FRW_ROW		(262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
 #define RND_FRW_ROW		(1 << (SHIFT_FRW_ROW-1))
-//#define RND_FRW_COL		(2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
-#define RND_FRW_COL		(1 << (SHIFT_FRW_COL-1))
+//#define RND_FRW_COL		(1 << (SHIFT_FRW_COL-1))

 //concatenated table, for forward DCT transformation
 static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
@@ -38,17 +41,17 @@ static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
    27146, 27146, 27146, 27146,		// tg * (2<<16) + 0.5
    -21746, -21746, -21746, -21746,	// tg * (2<<16) + 0.5
 };
-static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
-    -19195, -19195, -19195, -19195,	//cos * (2<<16) + 0.5
-};

 static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
    23170, 23170, 23170, 23170,	//cos * (2<<15) + 0.5
 };

-static const long long  fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+
 static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };

+/* one rounding term per 32-bit lane of xmm6; int32_t keeps this at 16 bytes even where long is 64 bits wide */
+static const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
  16384,   16384,   -8867,  -21407, 
  16384,   16384,   21407,    8867, 
@@ -123,6 +126,133 @@ static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff
   6270,   26722,    6270,  -17855, 
 };

+static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table, eight sets of 32 words (64 bytes) each
+#define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
+                   C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
+                  -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
+                   C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1, 
+// first set (byte offset 0; rows 0 and 4 in fdct_row_sse2): ck * cos(pi/4) * 2^15, with ck = cos(k*pi/16)
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+// second set (byte offset 64; rows 1 and 7 in fdct_row_sse2): ck * cos(pi/16) * 2^15
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+// third set (byte offset 128; rows 2 and 6 in fdct_row_sse2): ck * cos(2*pi/16) * 2^15
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+// fourth set (byte offset 192; rows 3 and 5 in fdct_row_sse2): ck * cos(3*pi/16) * 2^15
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+// fifth set: same constants as the first; NOTE(review): fdct_row_sse2 only uses table offsets 0..192, sets 5..8 look unused - confirm
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+// sixth set: same constants as the fourth
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+// seventh set: same constants as the third
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+// eighth set: same constants as the second
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+};

 static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
 {
@@ -203,6 +333,69 @@ static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
    movq_r2m(mm3, *(out + offset + 7 * 8));
 }

+/* Row pass of the forward DCT: transforms the eight 16-byte rows at in and stores the results at out.
+ * in is only read with 8-byte movq loads; out is written with movdqa, so it has to be 16-byte aligned.
+ * Rows are handled in pairs (0/4, 1/7, 2/6, 3/5) that use the same 64-byte coefficient set. */
+static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
+{
+    asm volatile( /* NOTE(review): .macro names are global to the assembler; a second expansion in one file would redefine them */
+        ".macro FDCT_ROW_SSE2_H1 i t   \n\t" /* load the row at byte offset i and all four table vectors at offset t */
+        "movq      \\i(%0), %%xmm2     \n\t" /* xmm2 = words 0..3 of the row */
+        "movq      \\i+8(%0), %%xmm0   \n\t" /* xmm0 = words 4..7 of the row */
+        "movdqa    \\t+32(%1), %%xmm3  \n\t"
+        "movdqa    \\t+48(%1), %%xmm7  \n\t"
+        "movdqa    \\t(%1), %%xmm4     \n\t"
+        "movdqa    \\t+16(%1), %%xmm5  \n\t"
+        ".endm                         \n\t"
+        ".macro FDCT_ROW_SSE2_H2 i t   \n\t" /* as H1, but xmm4/xmm5 are still valid from the preceding H1 */
+        "movq      \\i(%0), %%xmm2     \n\t"
+        "movq      \\i+8(%0), %%xmm0   \n\t"
+        "movdqa    \\t+32(%1), %%xmm3  \n\t" /* xmm3 and xmm7 are overwritten by FDCT_ROW_SSE2, */
+        "movdqa    \\t+48(%1), %%xmm7  \n\t" /* so they are reloaded for every row */
+        ".endm                         \n\t"
+        ".macro FDCT_ROW_SSE2 i        \n\t" /* transform the loaded row and store 16 bytes at out + i */
+        "movq      %%xmm2, %%xmm1      \n\t"
+        "pshuflw   $27, %%xmm0, %%xmm0 \n\t" /* reverse the order of words 4..7 */
+        "paddsw    %%xmm0, %%xmm1      \n\t" /* xmm1 = x[k] + x[7-k], saturating */
+        "psubsw    %%xmm0, %%xmm2      \n\t" /* xmm2 = x[k] - x[7-k], saturating */
+        "punpckldq %%xmm2, %%xmm1      \n\t" /* interleave the sum and difference dwords */
+        "pshufd    $78, %%xmm1, %%xmm2 \n\t" /* xmm2 = xmm1 with its 64-bit halves swapped */
+        "pmaddwd   %%xmm2, %%xmm3      \n\t"
+        "pmaddwd   %%xmm1, %%xmm7      \n\t"
+        "pmaddwd   %%xmm5, %%xmm2      \n\t"
+        "pmaddwd   %%xmm4, %%xmm1      \n\t"
+        "paddd     %%xmm7, %%xmm3      \n\t"
+        "paddd     %%xmm2, %%xmm1      \n\t"
+        "paddd     %%xmm6, %%xmm3      \n\t" /* add the rounding term ... */
+        "paddd     %%xmm6, %%xmm1      \n\t"
+        "psrad     %3, %%xmm3          \n\t" /* ... and shift right by SHIFT_FRW_ROW */
+        "psrad     %3, %%xmm1          \n\t"
+        "packssdw  %%xmm3, %%xmm1      \n\t" /* pack to eight 16-bit results with signed saturation */
+        "movdqa    %%xmm1, \\i(%4)     \n\t"
+        ".endm                         \n\t"
+        "movdqa    (%2), %%xmm6        \n\t" /* xmm6 = rounding constants, kept for all rows */
+        "FDCT_ROW_SSE2_H1 0 0 \n\t"          /* row 0, table offset 0 */
+        "FDCT_ROW_SSE2 0 \n\t"
+        "FDCT_ROW_SSE2_H2 64 0 \n\t"         /* row 4, same table */
+        "FDCT_ROW_SSE2 64 \n\t"
+        "FDCT_ROW_SSE2_H1 16 64 \n\t"        /* row 1, table offset 64 */
+        "FDCT_ROW_SSE2 16 \n\t"
+        "FDCT_ROW_SSE2_H2 112 64 \n\t"       /* row 7, same table */
+        "FDCT_ROW_SSE2 112 \n\t"
+        "FDCT_ROW_SSE2_H1 32 128 \n\t"       /* row 2, table offset 128 */
+        "FDCT_ROW_SSE2 32 \n\t"
+        "FDCT_ROW_SSE2_H2 96 128 \n\t"       /* row 6, same table */
+        "FDCT_ROW_SSE2 96 \n\t"
+        "FDCT_ROW_SSE2_H1 48 192 \n\t"       /* row 3, table offset 192 */
+        "FDCT_ROW_SSE2 48 \n\t"
+        "FDCT_ROW_SSE2_H2 80 192 \n\t"       /* row 5, same table */
+        "FDCT_ROW_SSE2 80 \n\t"
+        : /* no outputs: results go to memory through the out pointer */
+        : "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+        : "memory" /* in[] is read and out[] written behind the compiler's back; xmm0-xmm7 are clobbered too but not declared */
+    );
+}
+
 static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
 { 
    pshufw_m2r(*(in + 4), mm5, 0x1B);
@@ -341,3 +534,18 @@ void ff_fdct_mmx2(int16_t *block)
        out += 8;
    }
 }
+
+/* Forward DCT of a 64-coefficient block, in place: column pass into a scratch buffer, SSE2 row pass back into block. */
+void ff_fdct_sse2(int16_t *block)
+{
+    int64_t align_tmp[16] ATTR_ALIGN(8); /* 128 bytes of scratch, i.e. 64 int16_t */
+    int16_t * const block_tmp= (int16_t*)align_tmp;
+
+    /* column pass, called once for offset 0 and once for offset 4 */
+    fdct_col(block, block_tmp, 0);
+    fdct_col(block, block_tmp, 4);
+
+    /* row pass stores with movdqa, so block has to be 16-byte aligned */
+    fdct_row_sse2(block_tmp, block);
+}
+
--- a/libavcodec/i386/mpegvideo_mmx.c
+++ b/libavcodec/i386/mpegvideo_mmx.c
@@ -683,6 +683,12 @@ static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
 #define RENAMEl(a) a ## _mmx2
 #include "mpegvideo_mmx_template.c"

+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _SSE2 /* another instantiation of the template, this time with an _SSE2 name suffix */
+#define RENAMEl(a) a ## _sse2
+#include "mpegvideo_mmx_template.c" /* NOTE(review): presumably where dct_quantize_SSE2 gets defined - confirm */
+
 void MPV_common_init_mmx(MpegEncContext *s)
 {
    if (mm_flags & MM_MMX) {
@@ -704,7 +710,9 @@ void MPV_common_init_mmx(MpegEncContext *s)
 	}

        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
-            if(mm_flags & MM_MMXEXT){
+            if(mm_flags & MM_SSE2){
+                s->dct_quantize= dct_quantize_SSE2;
+            } else if(mm_flags & MM_MMXEXT){
                s->dct_quantize= dct_quantize_MMX2;
            } else {
                s->dct_quantize= dct_quantize_MMX;