From c09dc46524f868e3359bcef09fd1b56a1ad2cba1 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Tue, 20 Nov 2001 17:47:52 +0000 Subject: [PATCH] cleanup precopy fewer lines from src to dst if possible speedup (due to cleanup of blockcopy) Originally committed as revision 3032 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc --- postproc/postprocess.c | 799 ++------------------------------ postproc/postprocess_template.c | 799 ++------------------------------ 2 files changed, 96 insertions(+), 1502 deletions(-) diff --git a/postproc/postprocess.c b/postproc/postprocess.c index 6ac10ff40b..353e0da4a0 100644 --- a/postproc/postprocess.c +++ b/postproc/postprocess.c @@ -62,6 +62,7 @@ border remover optimize c versions try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks smart blur +commandline option for the deblock thresholds ... */ @@ -858,212 +859,6 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) #endif } -/** - * Experimental Filter 1 (Horizontal) - * will not damage linear gradients - * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter - * can only smooth blocks at the expected locations (it cant smooth them if they did move) - * MMX2 version does correct clipping C version doesnt - * not identical with the vertical one - */ -static inline void horizX1Filter(uint8_t *src, int stride, int QP) -{ - int y; - static uint64_t *lut= NULL; - if(lut==NULL) - { - int i; - lut= (uint64_t*)memalign(8, 256*8); - for(i=0; i<256; i++) - { - int v= i < 128 ? 2*i : 2*(i-256); -/* -//Simulate 112242211 9-Tap filter - uint64_t a= (v/16) & 0xFF; - uint64_t b= (v/8) & 0xFF; - uint64_t c= (v/4) & 0xFF; - uint64_t d= (3*v/8) & 0xFF; -*/ -//Simulate piecewise linear interpolation - uint64_t a= (v/16) & 0xFF; - uint64_t b= (v*3/16) & 0xFF; - uint64_t c= (v*5/16) & 0xFF; - uint64_t d= (7*v/16) & 0xFF; - uint64_t A= (0x100 - a)&0xFF; - uint64_t B= (0x100 - b)&0xFF; - uint64_t C= (0x100 - c)&0xFF; - uint64_t D= (0x100 - c)&0xFF; - - lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | - (D<<24) | (C<<16) | (B<<8) | (A); - //lut[i] = (v<<32) | (v<<24); - } - } - -#if 0 - asm volatile( - "pxor %%mm7, %%mm7 \n\t" // 0 -// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ebx \n\t" - - "movq b80, %%mm6 \n\t" - "movd pQPb, %%mm5 \n\t" // QP - "movq %%mm5, %%mm4 \n\t" - "paddusb %%mm5, %%mm5 \n\t" // 2QP - "paddusb %%mm5, %%mm4 \n\t" // 3QP - "pxor %%mm5, %%mm5 \n\t" // 0 - "psubb %%mm4, %%mm5 \n\t" // -3QP - "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP - "psllq $24, %%mm5 \n\t" - -// 0 1 2 3 4 5 6 7 8 9 -// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - -#define HX1old(a) \ - "movd " #a ", %%mm0 \n\t"\ - "movd 4" #a ", %%mm1 \n\t"\ - "punpckldq %%mm1, %%mm0 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm0, %%mm2 \n\t"\ - "psrlq $8, %%mm1 \n\t"\ - "psubusb %%mm1, %%mm2 \n\t"\ - "psubusb %%mm0, %%mm1 \n\t"\ - "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ - PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ - "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ - "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ - "paddb %%mm5, %%mm1 \n\t"\ - "psubusb %%mm5, %%mm1 \n\t"\ - PAVGB(%%mm7, %%mm1)\ - "pxor %%mm2, %%mm1 \n\t"\ - "psubb %%mm2, %%mm1 \n\t"\ - "psrlq $24, %%mm1 \n\t"\ - "movd %%mm1, %%ecx \n\t"\ - "paddb %%mm6, %%mm0 \n\t"\ - "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ - "paddb %%mm6, %%mm0 \n\t"\ - "movq %%mm0, " #a " \n\t"\ - -/* -HX1old((%0)) -HX1old((%%eax)) -HX1old((%%eax, %1)) -HX1old((%%eax, %1, 2)) -HX1old((%0, %1, 4)) -HX1old((%%ebx)) -HX1old((%%ebx, %1)) -HX1old((%%ebx, %1, 2)) -*/ - -//FIXME add some comments, its unreadable ... -#define HX1b(a, c, b, d) \ - "movd " #a ", %%mm0 \n\t"\ - "movd 4" #a ", %%mm1 \n\t"\ - "punpckldq %%mm1, %%mm0 \n\t"\ - "movd " #b ", %%mm4 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm0, %%mm2 \n\t"\ - "psrlq $8, %%mm1 \n\t"\ - "movd 4" #b ", %%mm3 \n\t"\ - "psubusb %%mm1, %%mm2 \n\t"\ - "psubusb %%mm0, %%mm1 \n\t"\ - "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "punpckldq %%mm3, %%mm4 \n\t"\ - "movq %%mm1, %%mm3 \n\t"\ - "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ - PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ - "paddb %%mm6, %%mm0 \n\t"\ - "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ - "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ - "movq %%mm4, %%mm3 \n\t"\ - "paddb %%mm5, %%mm1 \n\t"\ - "psubusb %%mm5, %%mm1 \n\t"\ - "psrlq $8, %%mm3 \n\t"\ - PAVGB(%%mm7, %%mm1)\ - "pxor %%mm2, %%mm1 \n\t"\ - "psubb %%mm2, %%mm1 \n\t"\ - "movq %%mm4, %%mm2 \n\t"\ - "psrlq $24, %%mm1 \n\t"\ - "psubusb %%mm3, %%mm2 \n\t"\ - "movd %%mm1, %%ecx \n\t"\ - "psubusb %%mm4, %%mm3 \n\t"\ - "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ - "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\ - "paddb %%mm6, %%mm0 \n\t"\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "movq %%mm3, %%mm1 \n\t"\ - "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\ - "movq %%mm0, " #a " \n\t"\ - PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ - "paddb %%mm6, %%mm4 \n\t"\ - "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ - "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ - "paddb %%mm5, %%mm3 \n\t"\ - "psubusb %%mm5, %%mm3 \n\t"\ - PAVGB(%%mm7, %%mm3)\ - "pxor %%mm2, %%mm3 \n\t"\ - "psubb %%mm2, %%mm3 \n\t"\ - "psrlq $24, %%mm3 \n\t"\ - "movd " #c ", %%mm0 \n\t"\ - "movd 4" #c ", %%mm1 \n\t"\ - "punpckldq %%mm1, %%mm0 \n\t"\ - "paddb %%mm6, %%mm0 \n\t"\ - "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ - "paddb %%mm6, %%mm0 \n\t"\ - "movq %%mm0, " #c " \n\t"\ - "movd %%mm3, %%ecx \n\t"\ - "movd " #d ", %%mm0 \n\t"\ - "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\ - "movd 4" #d ", %%mm1 \n\t"\ - "paddb %%mm6, %%mm4 \n\t"\ - "punpckldq %%mm1, %%mm0 \n\t"\ - "movq %%mm4, " #b " \n\t"\ - "paddb %%mm6, %%mm0 \n\t"\ - "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ - "paddb %%mm6, %%mm0 \n\t"\ - "movq %%mm0, " #d " \n\t"\ - -HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) -HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) - - - : - : "r" (src), "r" (stride), "r" (lut) - : "%eax", "%ebx", "%ecx" - ); -#else - -//FIXME (has little in common with the mmx2 version) - for(y=0; y 0) - { -// printf("\nasm:%d c:%d\n", asmEq, numEq); - for(int y=0; y<8; y++) - { - for(int x=0; x<8; x++) - { - printf("%d ", src[x + y*stride]); - } - printf("\n"); - } - } -*/ -// printf("%d\n", numEq); - return numEq > hFlatnessThreshold; -} - -static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) -{ - if(abs(src[0] - src[7]) > 2*QP) return 0; - - return 1; -} - -static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) -{ -#if 0 - asm volatile( - "leal (%0, %1), %%ecx \n\t" - "leal (%%ecx, %1, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 8 9 -// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "pxor %%mm7, %%mm7 \n\t" - "movq bm00001000, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" // QP - "movq %%mm5, %%mm4 \n\t" - "paddusb %%mm5, %%mm5 \n\t" // 2QP - "paddusb %%mm5, %%mm4 \n\t" // 3QP - "psllq $24, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // 0 - "psubb %%mm4, %%mm5 \n\t" // -QP - "leal tempBlock, %%eax \n\t" - -//FIXME? "unroll by 2" and mix -#ifdef HAVE_MMX2 -#define HDF(src, dst) \ - "movq " #src "(%%eax), %%mm0 \n\t"\ - "movq " #src "(%%eax), %%mm1 \n\t"\ - "movq " #src "(%%eax), %%mm2 \n\t"\ - "psrlq $8, %%mm1 \n\t"\ - "psubusb %%mm1, %%mm2 \n\t"\ - "psubusb %%mm0, %%mm1 \n\t"\ - "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ - "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\ - "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ - "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ - "paddb %%mm5, %%mm1 \n\t"\ - "psubusb %%mm5, %%mm1 \n\t"\ - "psrlw $2, %%mm1 \n\t"\ - "pxor %%mm2, %%mm1 \n\t"\ - "psubb %%mm2, %%mm1 \n\t"\ - "pand %%mm6, %%mm1 \n\t"\ - "psubb %%mm1, %%mm0 \n\t"\ - "psllq $8, %%mm1 \n\t"\ - "paddb %%mm1, %%mm0 \n\t"\ - "movd %%mm0, " #dst" \n\t"\ - "psrlq $32, %%mm0 \n\t"\ - "movd %%mm0, 4" #dst" \n\t" -#else -#define HDF(src, dst)\ - "movq " #src "(%%eax), %%mm0 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm0, %%mm2 \n\t"\ - "psrlq $8, %%mm1 \n\t"\ - "psubusb %%mm1, %%mm2 \n\t"\ - "psubusb %%mm0, %%mm1 \n\t"\ - "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "movq %%mm1, %%mm3 \n\t"\ - "psllq $32, %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ - "psubusb %%mm1, %%mm4 \n\t"\ - "psubb %%mm4, %%mm3 \n\t"\ - "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ - "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\ - "paddb %%mm5, %%mm1 \n\t"\ - "psubusb %%mm5, %%mm1 \n\t"\ - "psrlw $2, %%mm1 \n\t"\ - "pxor %%mm2, %%mm1 \n\t"\ - "psubb %%mm2, %%mm1 \n\t"\ - "pand %%mm6, %%mm1 \n\t"\ - "psubb %%mm1, %%mm0 \n\t"\ - "psllq $8, %%mm1 \n\t"\ - "paddb %%mm1, %%mm0 \n\t"\ - "movd %%mm0, " #dst " \n\t"\ - "psrlq $32, %%mm0 \n\t"\ - "movd %%mm0, 4" #dst " \n\t" -#endif - HDF(0,(%0)) - HDF(8,(%%ecx)) - HDF(16,(%%ecx, %1)) - HDF(24,(%%ecx, %1, 2)) - HDF(32,(%0, %1, 4)) - HDF(40,(%%ebx)) - HDF(48,(%%ebx, %1)) - HDF(56,(%%ebx, %1, 2)) - : - : "r" (dst), "r" (stride), "r" (QP) - : "%eax", "%ebx", "%ecx" - ); -#else - int y; - for(y=0; y> 6; - d*= SIGN(-middleEnergy); - - if(q>0) - { - d= d<0 ? 0 : d; - d= d>q ? q : d; - } - else - { - d= d>0 ? 0 : d; - d= d>4; - dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; - dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; - dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; - dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; - dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; - dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; - dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; - - dst+= stride; - } -#endif -} - static inline void dering(uint8_t src[], int stride, int QP) { #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -3533,8 +2834,6 @@ void postprocess(unsigned char * src[], int src_stride, vertical_size >>= 1; src_stride >>= 1; dst_stride >>= 1; -// mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER | -// MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER); if(1) { @@ -3638,7 +2937,7 @@ int getPpModeForQuality(int quality){ * levelFix == 0 -> dont touch the brighness & contrast */ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, - int numLines, int levelFix) + int levelFix) { #ifndef HAVE_MMX int i; @@ -3695,7 +2994,7 @@ SCALED_CPY : "%eax", "%ebx" ); #else - for(i=0; i>2) + "r" (dstStride) : "%eax", "%ebx" ); #else - for(i=0; imaxTmpNoise[2]; #endif + if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; + else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; + else if( (mode & V_DEBLOCK) + || (mode & LINEAR_IPOL_DEINT_FILTER) + || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; + else if(mode & V_X1_FILTER) copyAhead=11; + else if(mode & V_RK1_FILTER) copyAhead=10; + else if(mode & DERING) copyAhead=9; + else copyAhead=8; + + copyAhead-= 8; + if(tempDst==NULL) { tempDst= (uint8_t*)memalign(8, 1024*24); @@ -3897,12 +3206,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri { #ifdef HAVE_MMX2 -/* - prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); -*/ /* prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); @@ -3914,7 +3217,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "movl %4, %%eax \n\t" "shrl $2, %%eax \n\t" "andl $6, %%eax \n\t" - "addl $8, %%eax \n\t" + "addl %5, %%eax \n\t" "movl %%eax, %%ebx \n\t" "imul %1, %%eax \n\t" "imul %3, %%ebx \n\t" @@ -3925,7 +3228,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "prefetchnta 32(%%eax, %0) \n\t" "prefetcht0 32(%%ebx, %2) \n\t" :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), - "m" (x) + "m" (x), "m" (copyAhead) : "%eax", "%ebx" ); @@ -3938,8 +3241,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri */ #endif - blockCopy(dstBlock + dstStride*8, dstStride, - srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); + blockCopy(dstBlock + dstStride*copyAhead, dstStride, + srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); if(mode & LINEAR_IPOL_DEINT_FILTER) deInterlaceInterpolateLinear(dstBlock, dstStride); @@ -3955,7 +3258,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri dstBlock+=8; srcBlock+=8; } - memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); + memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); } for(y=0; y= height) { int i; - /* copy from line 8 to 15 of src, these will be copied with + /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with blockcopy to dst later */ - memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, - srcStride*MAX(height-y-8, 0) ); + memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, + srcStride*MAX(height-y-copyAhead, 0) ); - /* duplicate last line of src to fill the void upto line 15 */ - for(i=MAX(height-y, 8); i<=15; i++) + /* duplicate last line of src to fill the void upto line (copyAhead+7) */ + for(i=MAX(height-y, 8); i>3)&3) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); -*/ /* prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); @@ -4057,7 +3354,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "movl %4, %%eax \n\t" "shrl $2, %%eax \n\t" "andl $6, %%eax \n\t" - "addl $8, %%eax \n\t" + "addl %5, %%eax \n\t" "movl %%eax, %%ebx \n\t" "imul %1, %%eax \n\t" "imul %3, %%ebx \n\t" @@ -4068,7 +3365,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "prefetchnta 32(%%eax, %0) \n\t" "prefetcht0 32(%%ebx, %2) \n\t" :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), - "m" (x) + "m" (x), "m" (copyAhead) : "%eax", "%ebx" ); @@ -4100,8 +3397,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri } #endif - blockCopy(dstBlock + dstStride*8, dstStride, - srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); + blockCopy(dstBlock + dstStride*copyAhead, dstStride, + srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); if(mode & LINEAR_IPOL_DEINT_FILTER) deInterlaceInterpolateLinear(dstBlock, dstStride); @@ -4160,7 +3457,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri vertX1Filter(tempBlock1, 16, QP); else if(mode & H_DEBLOCK) { - if( isVertDC(tempBlock1, 16)) + if( isVertDC(tempBlock1, 16) ) { if(isVertMinMaxOk(tempBlock1, 16, QP)) doVertLowPass(tempBlock1, 16, QP); @@ -4252,14 +3549,14 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri /* for(x=0; x 0) - { -// printf("\nasm:%d c:%d\n", asmEq, numEq); - for(int y=0; y<8; y++) - { - for(int x=0; x<8; x++) - { - printf("%d ", src[x + y*stride]); - } - printf("\n"); - } - } -*/ -// printf("%d\n", numEq); - return numEq > hFlatnessThreshold; -} - -static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) -{ - if(abs(src[0] - src[7]) > 2*QP) return 0; - - return 1; -} - -static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) -{ -#if 0 - asm volatile( - "leal (%0, %1), %%ecx \n\t" - "leal (%%ecx, %1, 4), %%ebx \n\t" -// 0 1 2 3 4 5 6 7 8 9 -// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 - "pxor %%mm7, %%mm7 \n\t" - "movq bm00001000, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" // QP - "movq %%mm5, %%mm4 \n\t" - "paddusb %%mm5, %%mm5 \n\t" // 2QP - "paddusb %%mm5, %%mm4 \n\t" // 3QP - "psllq $24, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // 0 - "psubb %%mm4, %%mm5 \n\t" // -QP - "leal tempBlock, %%eax \n\t" - -//FIXME? "unroll by 2" and mix -#ifdef HAVE_MMX2 -#define HDF(src, dst) \ - "movq " #src "(%%eax), %%mm0 \n\t"\ - "movq " #src "(%%eax), %%mm1 \n\t"\ - "movq " #src "(%%eax), %%mm2 \n\t"\ - "psrlq $8, %%mm1 \n\t"\ - "psubusb %%mm1, %%mm2 \n\t"\ - "psubusb %%mm0, %%mm1 \n\t"\ - "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ - "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\ - "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ - "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ - "paddb %%mm5, %%mm1 \n\t"\ - "psubusb %%mm5, %%mm1 \n\t"\ - "psrlw $2, %%mm1 \n\t"\ - "pxor %%mm2, %%mm1 \n\t"\ - "psubb %%mm2, %%mm1 \n\t"\ - "pand %%mm6, %%mm1 \n\t"\ - "psubb %%mm1, %%mm0 \n\t"\ - "psllq $8, %%mm1 \n\t"\ - "paddb %%mm1, %%mm0 \n\t"\ - "movd %%mm0, " #dst" \n\t"\ - "psrlq $32, %%mm0 \n\t"\ - "movd %%mm0, 4" #dst" \n\t" -#else -#define HDF(src, dst)\ - "movq " #src "(%%eax), %%mm0 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm0, %%mm2 \n\t"\ - "psrlq $8, %%mm1 \n\t"\ - "psubusb %%mm1, %%mm2 \n\t"\ - "psubusb %%mm0, %%mm1 \n\t"\ - "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ - "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ - "movq %%mm1, %%mm3 \n\t"\ - "psllq $32, %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ - "psubusb %%mm1, %%mm4 \n\t"\ - "psubb %%mm4, %%mm3 \n\t"\ - "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ - "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\ - "paddb %%mm5, %%mm1 \n\t"\ - "psubusb %%mm5, %%mm1 \n\t"\ - "psrlw $2, %%mm1 \n\t"\ - "pxor %%mm2, %%mm1 \n\t"\ - "psubb %%mm2, %%mm1 \n\t"\ - "pand %%mm6, %%mm1 \n\t"\ - "psubb %%mm1, %%mm0 \n\t"\ - "psllq $8, %%mm1 \n\t"\ - "paddb %%mm1, %%mm0 \n\t"\ - "movd %%mm0, " #dst " \n\t"\ - "psrlq $32, %%mm0 \n\t"\ - "movd %%mm0, 4" #dst " \n\t" -#endif - HDF(0,(%0)) - HDF(8,(%%ecx)) - HDF(16,(%%ecx, %1)) - HDF(24,(%%ecx, %1, 2)) - HDF(32,(%0, %1, 4)) - HDF(40,(%%ebx)) - HDF(48,(%%ebx, %1)) - HDF(56,(%%ebx, %1, 2)) - : - : "r" (dst), "r" (stride), "r" (QP) - : "%eax", "%ebx", "%ecx" - ); -#else - int y; - for(y=0; y> 6; - d*= SIGN(-middleEnergy); - - if(q>0) - { - d= d<0 ? 0 : d; - d= d>q ? q : d; - } - else - { - d= d>0 ? 0 : d; - d= d>4; - dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; - dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; - dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; - dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; - dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; - dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; - dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; - - dst+= stride; - } -#endif -} - static inline void dering(uint8_t src[], int stride, int QP) { #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -3533,8 +2834,6 @@ void postprocess(unsigned char * src[], int src_stride, vertical_size >>= 1; src_stride >>= 1; dst_stride >>= 1; -// mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER | -// MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER); if(1) { @@ -3638,7 +2937,7 @@ int getPpModeForQuality(int quality){ * levelFix == 0 -> dont touch the brighness & contrast */ static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, - int numLines, int levelFix) + int levelFix) { #ifndef HAVE_MMX int i; @@ -3695,7 +2994,7 @@ SCALED_CPY : "%eax", "%ebx" ); #else - for(i=0; i>2) + "r" (dstStride) : "%eax", "%ebx" ); #else - for(i=0; imaxTmpNoise[2]; #endif + if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; + else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; + else if( (mode & V_DEBLOCK) + || (mode & LINEAR_IPOL_DEINT_FILTER) + || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; + else if(mode & V_X1_FILTER) copyAhead=11; + else if(mode & V_RK1_FILTER) copyAhead=10; + else if(mode & DERING) copyAhead=9; + else copyAhead=8; + + copyAhead-= 8; + if(tempDst==NULL) { tempDst= (uint8_t*)memalign(8, 1024*24); @@ -3897,12 +3206,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri { #ifdef HAVE_MMX2 -/* - prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); -*/ /* prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); @@ -3914,7 +3217,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "movl %4, %%eax \n\t" "shrl $2, %%eax \n\t" "andl $6, %%eax \n\t" - "addl $8, %%eax \n\t" + "addl %5, %%eax \n\t" "movl %%eax, %%ebx \n\t" "imul %1, %%eax \n\t" "imul %3, %%ebx \n\t" @@ -3925,7 +3228,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "prefetchnta 32(%%eax, %0) \n\t" "prefetcht0 32(%%ebx, %2) \n\t" :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), - "m" (x) + "m" (x), "m" (copyAhead) : "%eax", "%ebx" ); @@ -3938,8 +3241,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri */ #endif - blockCopy(dstBlock + dstStride*8, dstStride, - srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); + blockCopy(dstBlock + dstStride*copyAhead, dstStride, + srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); if(mode & LINEAR_IPOL_DEINT_FILTER) deInterlaceInterpolateLinear(dstBlock, dstStride); @@ -3955,7 +3258,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri dstBlock+=8; srcBlock+=8; } - memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); + memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); } for(y=0; y= height) { int i; - /* copy from line 8 to 15 of src, these will be copied with + /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with blockcopy to dst later */ - memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, - srcStride*MAX(height-y-8, 0) ); + memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, + srcStride*MAX(height-y-copyAhead, 0) ); - /* duplicate last line of src to fill the void upto line 15 */ - for(i=MAX(height-y, 8); i<=15; i++) + /* duplicate last line of src to fill the void upto line (copyAhead+7) */ + for(i=MAX(height-y, 8); i>3)&3) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); -*/ /* prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); @@ -4057,7 +3354,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "movl %4, %%eax \n\t" "shrl $2, %%eax \n\t" "andl $6, %%eax \n\t" - "addl $8, %%eax \n\t" + "addl %5, %%eax \n\t" "movl %%eax, %%ebx \n\t" "imul %1, %%eax \n\t" "imul %3, %%ebx \n\t" @@ -4068,7 +3365,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri "prefetchnta 32(%%eax, %0) \n\t" "prefetcht0 32(%%ebx, %2) \n\t" :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), - "m" (x) + "m" (x), "m" (copyAhead) : "%eax", "%ebx" ); @@ -4100,8 +3397,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri } #endif - blockCopy(dstBlock + dstStride*8, dstStride, - srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); + blockCopy(dstBlock + dstStride*copyAhead, dstStride, + srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); if(mode & LINEAR_IPOL_DEINT_FILTER) deInterlaceInterpolateLinear(dstBlock, dstStride); @@ -4160,7 +3457,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri vertX1Filter(tempBlock1, 16, QP); else if(mode & H_DEBLOCK) { - if( isVertDC(tempBlock1, 16)) + if( isVertDC(tempBlock1, 16) ) { if(isVertMinMaxOk(tempBlock1, 16, QP)) doVertLowPass(tempBlock1, 16, QP); @@ -4252,14 +3549,14 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri /* for(x=0; x