qpel in mmx2/3dnow

qpel refinement quality parameter. Originally committed as revision 1393 to svn://svn.ffmpeg.org/ffmpeg/trunk

qpel in mmx2/3dnow
qpel refinement quality parameter. Originally committed as revision 1393 to svn://svn.ffmpeg.org/ffmpeg/trunk
826f429a · Michael Niedermayer · 70ac76c0 · 826f429a · 826f429a · 826f429a
11 changed files
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -5,8 +5,8 @@

 #define LIBAVCODEC_VERSION_INT 0x000406
 #define LIBAVCODEC_VERSION     "0.4.6"
-#define LIBAVCODEC_BUILD       4651
-#define LIBAVCODEC_BUILD_STR   "4651"
+#define LIBAVCODEC_BUILD       4652
+#define LIBAVCODEC_BUILD_STR   "4652"

 enum CodecID {
    CODEC_ID_NONE, 
@@ -909,7 +909,7 @@ typedef struct AVCodecContext {
     * decoding: unused
     */
    int me_pre_cmp;
-    
+
    /**
     * ME pre pass diamond size & shape
     * encoding: set by user.
@@ -917,6 +917,13 @@ typedef struct AVCodecContext {
     */
    int pre_dia_size;

+    /**
+     * subpel ME quality
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int me_subpel_quality;
+
 } AVCodecContext;

 typedef struct AVCodec {

--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -781,6 +781,7 @@ static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStr
    }
 }

+
 #define QPEL_MC(r, OPNAME, RND, OP) \
 static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
@@ -830,6 +831,7 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStrid
 static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
+    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
@@ -853,9 +855,10 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStri
    }\
 }\
 \
-static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
+static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
+    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
@@ -1046,21 +1049,21 @@ static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
 }\
 \
 static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
 }\
 \
 static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1070,8 +1073,8 @@ static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1081,8 +1084,8 @@ static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1092,8 +1095,8 @@ static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1103,22 +1106,22 @@ static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1128,8 +1131,8 @@ static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1139,14 +1142,14 @@ static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 }

 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)

--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -102,6 +102,7 @@ typedef struct DSPContext {
    me_cmp_func quant_psnr[2];
    int (*hadamard8_abs )(uint8_t *src, int stride, int mean);

+    me_cmp_func me_pre_cmp[11];
    me_cmp_func me_cmp[11];
    me_cmp_func me_sub_cmp[11];
    me_cmp_func mb_cmp[11];

--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
--- a/libavcodec/i386/dsputil_mmx_avg.h
+++ b/libavcodec/i386/dsputil_mmx_avg.h
@@ -53,6 +53,38 @@ static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size
 	:"%eax", "memory");
 }

+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(src1 row, src2 row), 8 bytes per row; 4 rows per loop pass, so h should be a positive multiple of 4 */
+    __asm __volatile( /* operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride */
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* rows 0 and 1 of src1 -> mm0, mm1 */
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t" /* PAVGB is a string macro from the including file (presumably pavgb/pavgusb - confirm) */
+	PAVGB" 8(%2), %%mm1		\n\t" /* src2 is read densely: consecutive rows are 8 bytes apart */
+	"movq	%%mm0, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* rows 2 and 3, same pattern */
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGB" 16(%2), %%mm0		\n\t"
+	PAVGB" 24(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+        "addl	$32, %2			\n\t" /* four 8-byte src2 rows consumed */
+	"subl	$4, %0			\n\t" /* four rows done; loop ends only when h reaches exactly 0 */
+	"jnz	1b			\n\t"
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");
+}
+
 static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
    __asm __volatile(
@@ -92,6 +124,34 @@ static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
 	:"r" (line_size)
 	:"%eax", "memory");
 }
+
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(src1 row, src2 row), 16 bytes per row; 2 rows per loop pass, so h should be a positive multiple of 2 */
+    __asm __volatile( /* operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride */
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* left and right 8-byte halves of the src1 row */
+	"movq	8(%1), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t" /* PAVGB is a string macro from the including file (presumably pavgb/pavgusb - confirm) */
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* second row of this pass */
+	"movq	8(%1), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGB" 16(%2), %%mm0		\n\t" /* src2 is read densely: consecutive rows are 16 bytes apart */
+	PAVGB" 24(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"addl	%5, %3			\n\t"
+        "addl	$32, %2			\n\t" /* two 16-byte src2 rows consumed */
+	"subl	$2, %0			\n\t" /* two rows done; loop ends only when h reaches exactly 0 */
+	"jnz	1b			\n\t"
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");
+}
 
 /* GL: this function does incorrect rounding if overflow */
 static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)

--- a/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/libavcodec/i386/dsputil_mmx_rnd.h
@@ -54,6 +54,42 @@ static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
 	:"eax", "memory");
 }

+static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = byte average of src1 row and src2 row, 8 bytes per row; 4 rows per loop pass, so h should be a positive multiple of 4 */
+    MOVQ_BFE(mm6); /* presets mm6 before the loop; presumably the constant PAVGBP relies on - confirm against the macro definitions */
+    __asm __volatile( /* operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride */
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* row 0: src1 -> mm0, src2 -> mm1 */
+	"movq	(%2), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm2		\n\t" /* row 1: src1 -> mm2, src2 -> mm3 */
+	"movq	8(%2), %%mm3		\n\t" /* src2 is read densely: consecutive rows are 8 bytes apart */
+	"addl	%4, %1			\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5) /* two averages at once; results are the 3rd and 6th operands (mm4, mm5), which are stored below */
+	"movq	%%mm4, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm5, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* rows 2 and 3, same pattern */
+	"movq	16(%2), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm2		\n\t"
+	"movq	24(%2), %%mm3		\n\t"
+	"addl	%4, %1			\n\t"
+	"addl	$32, %2			\n\t" /* four 8-byte src2 rows consumed */
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm5, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"subl	$4, %0			\n\t" /* four rows done; loop ends only when h reaches exactly 0 */
+	"jnz	1b			\n\t"
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");
+}
+
 static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
    MOVQ_BFE(mm6);
@@ -90,7 +126,7 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
 	"movq	9(%1, %3), %%mm3	\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, 8(%2)		\n\t"
-	"movq	%%mm5, 8(%2, %3)		\n\t"
+	"movq	%%mm5, 8(%2, %3)	\n\t"
 	"addl	%%eax, %1		\n\t"
 	"addl	%%eax, %2		\n\t"
 	"subl	$4, %0			\n\t"
@@ -100,6 +136,38 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
 	:"eax", "memory");
 }

+static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = byte average of src1 row and src2 row, 16 bytes per row; 2 rows per loop pass, so h should be a positive multiple of 2 */
+    MOVQ_BFE(mm6); /* presets mm6 before the loop; presumably the constant PAVGBP relies on - confirm against the macro definitions */
+    __asm __volatile( /* operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride */
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* left half: src1 -> mm0, src2 -> mm1 */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	8(%1), %%mm2		\n\t" /* right half: src1 -> mm2, src2 -> mm3 */
+	"movq	8(%2), %%mm3		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5) /* two averages at once; results are the 3rd and 6th operands (mm4, mm5), which are stored below */
+	"movq	%%mm4, (%3)		\n\t"
+	"movq	%%mm5, 8(%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* second row of this pass */
+	"movq	16(%2), %%mm1		\n\t" /* src2 is read densely: consecutive rows are 16 bytes apart */
+	"movq	8(%1), %%mm2		\n\t"
+	"movq	24(%2), %%mm3		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%3)		\n\t"
+	"movq	%%mm5, 8(%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"addl	$32, %2			\n\t" /* two 16-byte src2 rows consumed */
+	"subl	$2, %0			\n\t" /* two rows done; loop ends only when h reaches exactly 0 */
+	"jnz	1b			\n\t"
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");
+}
+
 static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
    MOVQ_BFE(mm6);
@@ -195,6 +263,124 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
 	:"eax", "memory");
 }

+static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ /* dst = (src1 + src2[0] + src2[64] + src2[136] + mm6) >> 2 per pixel, 8 pixels per row, one row per loop pass */
+    MOVQ_ZERO(mm7); /* mm7 is the zero source for the byte->word unpacks below */
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm __volatile( /* operands: %0=dst, %1=src1, %2=src2, %3=h, %4=stride */
+	".balign 8      		\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* low 4 pixels: src1 plus three 8-byte loads from src2 at offsets 0, 64, 136 */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t"
+	"movq	136(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" /* widen low bytes to 16-bit lanes */
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" /* add the rounding term from mm6 */
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t" /* mm3 = sum of the four inputs + rounding term */
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	(%1), %%mm0		\n\t" /* high 4 pixels: same loads, unpacked with punpckhbw */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t"
+	"movq	136(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" /* repack low (mm3) and high (mm4) words into 8 bytes */
+	"movq	%%mm3, (%0)		\n\t"
+        "addl	%4, %0			\n\t" /* dst and src1 share the same stride */
+        "addl	%4, %1			\n\t"
+        "addl	$8, %2			\n\t" /* src2 rows are 8 bytes apart */
+        "decl	%3			\n\t"
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
+static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ /* dst = (src1 + src2[0] + src2[256] + src2[528] + mm6) >> 2 per pixel, 16 pixels per row, one row per loop pass */
+    MOVQ_ZERO(mm7); /* mm7 is the zero source for the byte->word unpacks below */
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm __volatile( /* operands: %0=dst, %1=src1, %2=src2, %3=h, %4=stride */
+	".balign 8      		\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* left 8 pixels, low half: src1 plus src2 at offsets 0, 256, 528 */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t"
+	"movq	528(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" /* widen low bytes to 16-bit lanes */
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" /* add the rounding term from mm6 */
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	(%1), %%mm0		\n\t" /* left 8 pixels, high half */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t"
+	"movq	528(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" /* repack low (mm3) and high (mm4) words into 8 bytes */
+	"movq	%%mm3, (%0)		\n\t"
+	"movq	8(%1), %%mm0		\n\t" /* right 8 pixels: every offset shifted by 8 */
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	8(%1), %%mm0		\n\t"
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t"
+	"movq	%%mm3, 8(%0)		\n\t"
+        "addl	%4, %0			\n\t" /* dst and src1 share the same stride */
+        "addl	%4, %1			\n\t"
+        "addl	$16, %2			\n\t" /* src2 rows are 16 bytes apart */
+        "decl	%3			\n\t"
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
 // avg_pixels
 // in case more speed is needed - unroling would certainly help
 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
@@ -259,6 +445,27 @@ static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
    } while (--h);
 }

+static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(dst row, PAVGB(src1 row, src2 row)), 8 bytes per row, one row per pass of the C loop */
+    MOVQ_BFE(mm6); /* mm6 is passed to PAVGB as its 4th operand below */
+    JUMPALIGN();
+    do {
+	__asm __volatile( /* operands: %0=*dst (read and written), %1=*src1, %2=*src2 */
+	    "movq  %1, %%mm0		\n\t"
+	    "movq  %2, %%mm1		\n\t"
+	    "movq  %0, %%mm3		\n\t"
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) /* mm2 = average of the two sources (3rd operand is the result, as the next line consumes it) */
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) /* mm0 = average of that with the current dst */
+	    "movq  %%mm0, %0		\n\t"
+	    :"+m"(*dst)
+	    :"m"(*src1), "m"(*src2)
+	    :"memory");
+	dst += dstStride;
+        src1 += src1Stride;
+        src2 += 8; /* src2 is read densely: consecutive rows are 8 bytes apart */
+    } while (--h); /* do/while: h is expected to be at least 1 on entry */
+}
+
 static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
    MOVQ_BFE(mm6);
@@ -285,6 +492,33 @@ static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
    } while (--h);
 }

+static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ /* dst row = PAVGB(dst row, PAVGB(src1 row, src2 row)), 16 bytes per row, one row per pass of the C loop */
+    MOVQ_BFE(mm6); /* mm6 is passed to PAVGB as its 4th operand below */
+    JUMPALIGN();
+    do {
+	__asm __volatile( /* operands: %0=*dst (read and written), %1=*src1, %2=*src2 */
+	    "movq  %1, %%mm0		\n\t" /* left 8 bytes */
+	    "movq  %2, %%mm1		\n\t"
+	    "movq  %0, %%mm3		\n\t"
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) /* mm2 = average of the two sources */
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) /* mm0 = average of that with the current dst */
+	    "movq  %%mm0, %0		\n\t"
+	    "movq  8%1, %%mm0		\n\t" /* right 8 bytes: "8" is textually prefixed to the memory operand - NOTE(review): only valid if the operand is emitted with no displacement of its own; confirm */
+	    "movq  8%2, %%mm1		\n\t"
+	    "movq  8%0, %%mm3		\n\t"
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+	    "movq  %%mm0, 8%0		\n\t"
+	    :"+m"(*dst)
+	    :"m"(*src1), "m"(*src2)
+	    :"memory");
+	dst += dstStride;
+        src1 += src1Stride;
+        src2 += 16; /* src2 is read densely: consecutive rows are 16 bytes apart */
+    } while (--h); /* do/while: h is expected to be at least 1 on entry */
+}
+
 static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
    MOVQ_BFE(mm6);
@@ -399,6 +633,133 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
 	:"eax", "memory");
 }

+static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ /* dst = PAVGB(dst, (src1 + src2[0] + src2[64] + src2[136] + mm6) >> 2) per pixel, 8 pixels per row, one row per loop pass */
+    MOVQ_ZERO(mm7); /* mm7 is the zero source for the byte->word unpacks below */
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    MOVQ_BFE(mm5); /* mm5 is passed to PAVGB as its 4th operand below */
+    __asm __volatile( /* operands: %0=dst, %1=src1, %2=src2, %3=h, %4=stride */
+	".balign 8      		\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* low 4 pixels: src1 plus three 8-byte loads from src2 at offsets 0, 64, 136 */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t"
+	"movq	136(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" /* widen low bytes to 16-bit lanes */
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" /* add the rounding term from mm6 */
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	(%1), %%mm0		\n\t" /* high 4 pixels: same loads, unpacked with punpckhbw */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t"
+	"movq	136(%2), %%mm4		\n\t" /* must index src2 (%2) like the low half; %4 is the integer stride, not a pointer */
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" /* mm3 = the 4-way average, repacked to 8 bytes */
+	"movq	(%0), %%mm4		\n\t" /* current dst row */
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) /* mm0 = average of mm3 with dst; result is the 3rd operand, as in DEF(avg, pixels8_l2) */
+	"movq	%%mm0, (%0)		\n\t" /* store the averaged value (mm0); mm3 would discard the blend with dst */
+        "addl	%4, %0			\n\t" /* dst and src1 share the same stride */
+        "addl	%4, %1			\n\t"
+        "addl	$8, %2			\n\t" /* src2 rows are 8 bytes apart */
+        "decl	%3			\n\t"
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
+static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ /* dst = PAVGB(dst, (src1 + src2[0] + src2[256] + src2[528] + mm6) >> 2) per pixel, 16 pixels per row, one row per loop pass */
+    MOVQ_ZERO(mm7); /* mm7 is the zero source for the byte->word unpacks below */
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    MOVQ_BFE(mm5); /* mm5 is passed to PAVGB as its 4th operand below */
+    __asm __volatile( /* operands: %0=dst, %1=src1, %2=src2, %3=h, %4=stride */
+	".balign 8      		\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* left 8 pixels, low half: src1 plus src2 at offsets 0, 256, 528 */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t"
+	"movq	528(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" /* widen low bytes to 16-bit lanes */
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" /* add the rounding term from mm6 */
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	(%1), %%mm0		\n\t" /* left 8 pixels, high half */
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t"
+	"movq	528(%2), %%mm4		\n\t" /* must index src2 (%2) like the low half; %4 is the integer stride, not a pointer */
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" /* mm3 = the 4-way average of the left 8 pixels */
+	"movq	(%0), %%mm4		\n\t" /* current dst, left half */
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) /* mm0 = average of mm3 with dst; result is the 3rd operand, as in DEF(avg, pixels16_l2) */
+	"movq	%%mm0, (%0)		\n\t" /* store the averaged value (mm0); mm3 would discard the blend with dst */
+	"movq	8(%1), %%mm0		\n\t" /* right 8 pixels: every offset shifted by 8 */
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	8(%1), %%mm0		\n\t"
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm4		\n\t" /* src2 (%2) again, not the stride operand */
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t"
+	"movq	8(%0), %%mm4		\n\t" /* current dst, right half */
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
+	"movq	%%mm0, 8(%0)		\n\t" /* store the averaged value (mm0), not the un-blended mm3 */
+        "addl	%4, %0			\n\t" /* dst and src1 share the same stride */
+        "addl	%4, %1			\n\t"
+        "addl	$16, %2			\n\t" /* src2 rows are 16 bytes apart */
+        "decl	%3			\n\t"
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
+
 //FIXME optimize
 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);

--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -311,6 +311,7 @@ static inline int get_penalty_factor(MpegEncContext *s, int type){
 }

 void ff_init_me(MpegEncContext *s){
+    set_cmp(s, s->dsp.me_pre_cmp, s->avctx->me_pre_cmp);
    set_cmp(s, s->dsp.me_cmp, s->avctx->me_cmp);
    set_cmp(s, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
    set_cmp(s, s->dsp.mb_cmp, s->avctx->mb_cmp);
@@ -336,6 +337,12 @@ void ff_init_me(MpegEncContext *s){
        s->me.motion_search[0]= simple_epzs_motion_search;
        s->me.motion_search[1]= simple_epzs_motion_search4;
    }
+    
+    if(s->avctx->me_pre_cmp&FF_CMP_CHROMA){
+        s->me.pre_motion_search= simple_chroma_epzs_motion_search;
+    }else{
+        s->me.pre_motion_search= simple_epzs_motion_search;
+    }
 }
      
 static int pix_dev(UINT8 * pix, int line_size, int mean)
@@ -1037,7 +1044,7 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
    
    assert(s->quarter_sample==0 || s->quarter_sample==1);

-    s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
+    s->me.pre_penalty_factor    = get_penalty_factor(s, s->avctx->me_pre_cmp);

    get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
    rel_xmin= xmin - mb_x*16;
@@ -1072,8 +1079,8 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
        pred_x = P_MEDIAN[0];
        pred_y = P_MEDIAN[1];
    }
-    dmin = s->me.motion_search[0](s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
-                                  &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);
+    dmin = s->me.pre_motion_search(s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                   &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);

    s->p_mv_table[xy][0] = mx<<shift;
    s->p_mv_table[xy][1] = my<<shift;

--- a/libavcodec/motion_est_template.c
+++ b/libavcodec/motion_est_template.c
@@ -268,6 +268,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
    const int my = *my_ptr;   
    const int penalty_factor= s->me.sub_penalty_factor;
    const int map_generation= s->me.map_generation;
+    const int subpel_quality= s->avctx->me_subpel_quality;
    uint32_t *map= s->me.map;
    me_cmp_func cmp, chroma_cmp;
    me_cmp_func cmp_sub, chroma_cmp_sub;
@@ -309,7 +310,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
        
        memset(best, 64, sizeof(int)*8);
 #if 1
-        if(s->avctx->dia_size>=2){        
+        if(s->me.dia_size>=2){        
            const int tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
            const int bl= score_map[(index+(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
            const int tr= score_map[(index-(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
@@ -388,24 +389,34 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
                }
            }            
        }
-        for(i=0; i<8; i++){
+        for(i=0; i<subpel_quality; i++){
            nx= best_pos[i][0];
            ny= best_pos[i][1];
            CHECK_QUARTER_MV(nx&3, ny&3, nx>>2, ny>>2)
        }
+
 #if 0
-            nx= FFMAX(4*mx - bx, bx - 4*mx);
-            ny= FFMAX(4*my - by, by - 4*my);
+            const int tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            const int bl= score_map[(index+(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            const int tr= score_map[(index-(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
+            const int br= score_map[(index+(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
+//            if(l < r && l < t && l < b && l < tl && l < bl && l < tr && l < br && bl < tl){
+            if(tl<br){
+
+//            nx= FFMAX(4*mx - bx, bx - 4*mx);
+//            ny= FFMAX(4*my - by, by - 4*my);
            
-            static int stats[4][4];
-            stats[nx][ny]++;
-            if(256*256*256*64 % (stats[0][0]+1) ==0){
-                for(i=0; i<16; i++){
-                    if((i&3)==0) printf("\n");
+            static int stats[7][7], count;
+            count++;
+            stats[4*mx - bx + 3][4*my - by + 3]++;
+            if(256*256*256*64 % count ==0){
+                for(i=0; i<49; i++){
+                    if((i%7)==0) printf("\n");
                    printf("%6d ", stats[0][i]);
                }
                printf("\n");
            }
+            }
 #endif
 #else

@@ -659,7 +670,7 @@ static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int
 {
    me_cmp_func cmp, chroma_cmp;
    Minima minima[MAX_SAB_SIZE];
-    const int minima_count= ABS(s->avctx->dia_size);
+    const int minima_count= ABS(s->me.dia_size);
    int i, j;
    LOAD_COMMON(s->mb_x*16, s->mb_y*16);
    
@@ -744,7 +755,7 @@ static inline int RENAME(var_diamond_search)(MpegEncContext * s, int *best, int
    cmp= s->dsp.me_cmp[size];
    chroma_cmp= s->dsp.me_cmp[size+1];

-    for(dia_size=1; dia_size<=s->avctx->dia_size; dia_size++){
+    for(dia_size=1; dia_size<=s->me.dia_size; dia_size++){
        int dir, start, end;
        const int x= best[0];
        const int y= best[1];
@@ -893,15 +904,15 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s, int block,
    }

 //check(best[0],best[1],0, b0)
-    if(s->avctx->dia_size==-1)
+    if(s->me.dia_size==-1)
        dmin= RENAME(funny_diamond_search)(s, best, dmin, ref_picture,
                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
 				   shift, map, map_generation, size, mv_penalty);
-    else if(s->avctx->dia_size<-1)
+    else if(s->me.dia_size<-1)
        dmin= RENAME(sab_diamond_search)(s, best, dmin, ref_picture,
                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
 				   shift, map, map_generation, size, mv_penalty);
-    else if(s->avctx->dia_size<2)
+    else if(s->me.dia_size<2)
        dmin= RENAME(small_diamond_search)(s, best, dmin, ref_picture,
                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
 				   shift, map, map_generation, size, mv_penalty);
@@ -969,15 +980,15 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s, int block,
                        (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
    }

-    if(s->avctx->dia_size==-1)
+    if(s->me.dia_size==-1)
        dmin= RENAME(funny_diamond_search)(s, best, dmin, ref_picture,
                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
 				   shift, map, map_generation, size, mv_penalty);
-    else if(s->avctx->dia_size<-1)
+    else if(s->me.dia_size<-1)
        dmin= RENAME(sab_diamond_search)(s, best, dmin, ref_picture,
                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
 				   shift, map, map_generation, size, mv_penalty);
-    else if(s->avctx->dia_size<2)
+    else if(s->me.dia_size<2)
        dmin= RENAME(small_diamond_search)(s, best, dmin, ref_picture,
                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
 				   shift, map, map_generation, size, mv_penalty);

--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -2786,12 +2786,12 @@ static void encode_picture(MpegEncContext *s, int picture_number)
        else if(s->pict_type!=B_TYPE)
            s->no_rounding ^= 1;          
    }
-
    /* Estimate motion for every MB */
    if(s->pict_type != I_TYPE){
        if(s->pict_type != B_TYPE){
            if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
                s->me.pre_pass=1;
+                s->me.dia_size= s->avctx->pre_dia_size;

                for(mb_y=s->mb_height-1; mb_y >=0 ; mb_y--) {
                    for(mb_x=s->mb_width-1; mb_x >=0 ; mb_x--) {
@@ -2804,6 +2804,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
            }
        }

+        s->me.dia_size= s->avctx->dia_size;
        for(mb_y=0; mb_y < s->mb_height; mb_y++) {
            s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
            s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
@@ -2816,7 +2817,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                s->block_index[1]+=2;
                s->block_index[2]+=2;
                s->block_index[3]+=2;
-
+                
                /* compute motion vector & mb_type and store in context */
                if(s->pict_type==B_TYPE)
                    ff_estimate_b_frame_motion(s, mb_x, mb_y);

--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -139,9 +139,11 @@ typedef struct MotionEstContext{
    uint32_t *map;                     /* map to avoid duplicate evaluations */
    uint32_t *score_map;               /* map to store the scores */
    int map_generation;  
+    int pre_penalty_factor;
    int penalty_factor;
    int sub_penalty_factor;
    int pre_pass;                      /* = 1 for the pre pass */
+    int dia_size;
    UINT16 (*mv_penalty)[MAX_MV*2+1];  /* amount of bits needed to encode a MV */
    int (*sub_motion_search)(struct MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
@@ -153,6 +155,11 @@ typedef struct MotionEstContext{
                             int P[10][2], int pred_x, int pred_y,
                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
                             int ref_mv_scale, uint16_t * const mv_penalty);
+    int (*pre_motion_search)(struct MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, uint16_t * const mv_penalty);
 }MotionEstContext;

 typedef struct MpegEncContext {

--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -234,6 +234,7 @@ void avcodec_get_context_defaults(AVCodecContext *s){
    s->me_method= ME_EPZS;
    s->get_buffer= avcodec_default_get_buffer;
    s->release_buffer= avcodec_default_release_buffer;
+    s->me_subpel_quality=8;
 }

 /**