mmx, mmx2, 3dnow optimized 24to32

Originally committed as revision 2512 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

mmx, mmx2, 3dnow optimized 24to32
Originally committed as revision 2512 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
96b956cc · Nick Kurshev · 49a0c6ee · 96b956cc · 96b956cc
Hide whitespace changes
Inline / Side-by-side

Showing 2 changed files with 72 additions and 4 deletions

postproc/rgb2rgb.c postproc/rgb2rgb.c +36 -2

postproc/rgb2rgb_template.c postproc/rgb2rgb_template.c +36 -2

No files found.
--- a/postproc/rgb2rgb.c
+++ b/postproc/rgb2rgb.c
@@ -3,6 +3,26 @@
 #include "rgb2rgb.h"
 #include "mmx.h"

+#ifdef HAVE_3DNOW
+#define PREFETCH "prefetch"
+#define PREFETCHW "prefetchw"
+#elif HAVE_MMX2
+#define PREFETCH "prefetchnta"
+#define PREFETCHW "prefetcht0"
+#endif
+
+#ifdef HAVE_3DNOW
+#define EMMS "femms"
+#else
+#define EMMS "emms"
+#endif
+
+#ifdef HAVE_MMX2
+#define MOVNTQ "movntq"
+#else
+#define MOVNTQ "movq"
+#endif
+
 void rgb24to32(uint8_t *src,uint8_t *dst,uint32_t src_size)
 {
  uint8_t *dest = dst;
@@ -14,10 +34,23 @@ void rgb24to32(uint8_t *src,uint8_t *dst,uint32_t src_size)
 #endif
  end = s + src_size;
 #ifdef HAVE_MMX
+#ifdef PREFETCH
+  __asm __volatile(
+    PREFETCH" %0\n\t"
+    PREFETCH" 64%0\n\t"
+    PREFETCHW" %1\n\t"
+    PREFETCHW" 64%1\n\t"::"m"(*s),"m"(*dest):"memory");
+#endif
  mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
  __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
+#ifdef PREFETCH
+    __asm __volatile(
+	PREFETCH" 128%0\n\t"
+	PREFETCHW" 128%1"
+	::"m"(*s),"m"(*dest):"memory");
+#endif
    __asm __volatile(
 	"movd	%1, %%mm0\n\t"
 	"movd	3%1, %%mm1\n\t"
@@ -27,14 +60,15 @@ void rgb24to32(uint8_t *src,uint8_t *dst,uint32_t src_size)
 	"punpckldq %%mm3, %%mm2\n\t"
 	"pand	%%mm7, %%mm0\n\t"
 	"pand	%%mm7, %%mm2\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm2, 8%0"
+	MOVNTQ"	%%mm0, %0\n\t"
+	MOVNTQ"	%%mm2, 8%0"
 	:"=m"(*dest)
 	:"m"(*s)
 	:"memory");
    dest += 16;
    s += 12;
  }
+  __asm __volatile(EMMS:::"memory");
 #endif
  while(s < end)
  {

--- a/postproc/rgb2rgb_template.c
+++ b/postproc/rgb2rgb_template.c
@@ -3,6 +3,26 @@
 #include "rgb2rgb.h"
 #include "mmx.h"

+#ifdef HAVE_3DNOW
+#define PREFETCH "prefetch"
+#define PREFETCHW "prefetchw"
+#elif HAVE_MMX2
+#define PREFETCH "prefetchnta"
+#define PREFETCHW "prefetcht0"
+#endif
+
+#ifdef HAVE_3DNOW
+#define EMMS "femms"
+#else
+#define EMMS "emms"
+#endif
+
+#ifdef HAVE_MMX2
+#define MOVNTQ "movntq"
+#else
+#define MOVNTQ "movq"
+#endif
+
 void rgb24to32(uint8_t *src,uint8_t *dst,uint32_t src_size)
 {
  uint8_t *dest = dst;
@@ -14,10 +34,23 @@ void rgb24to32(uint8_t *src,uint8_t *dst,uint32_t src_size)
 #endif
  end = s + src_size;
 #ifdef HAVE_MMX
+#ifdef PREFETCH
+  __asm __volatile(
+    PREFETCH" %0\n\t"
+    PREFETCH" 64%0\n\t"
+    PREFETCHW" %1\n\t"
+    PREFETCHW" 64%1\n\t"::"m"(*s),"m"(*dest):"memory");
+#endif
  mm_end = (uint8_t*)((((unsigned long)end)/16)*16);
  __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
+#ifdef PREFETCH
+    __asm __volatile(
+	PREFETCH" 128%0\n\t"
+	PREFETCHW" 128%1"
+	::"m"(*s),"m"(*dest):"memory");
+#endif
    __asm __volatile(
 	"movd	%1, %%mm0\n\t"
 	"movd	3%1, %%mm1\n\t"
@@ -27,14 +60,15 @@ void rgb24to32(uint8_t *src,uint8_t *dst,uint32_t src_size)
 	"punpckldq %%mm3, %%mm2\n\t"
 	"pand	%%mm7, %%mm0\n\t"
 	"pand	%%mm7, %%mm2\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm2, 8%0"
+	MOVNTQ"	%%mm0, %0\n\t"
+	MOVNTQ"	%%mm2, 8%0"
 	:"=m"(*dest)
 	:"m"(*s)
 	:"memory");
    dest += 16;
    s += 12;
  }
+  __asm __volatile(EMMS:::"memory");
 #endif
  while(s < end)
  {