some MMX optimizations for the CAVS decoder

Originally committed as revision 5846 to svn://svn.ffmpeg.org/ffmpeg/trunk

some MMX optimizations for the CAVS decoder
Originally committed as revision 5846 to svn://svn.ffmpeg.org/ffmpeg/trunk
595e7bd9 · Stefan Gehrer · 09be55df · 595e7bd9 · 595e7bd9 · 595e7bd9
6 changed files
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -288,6 +288,9 @@ OBJS += i386/fdct_mmx.o i386/cputest.o \
 ifeq ($(CONFIG_GPL),yes)
 OBJS += i386/idct_mmx_xvid.o
 endif
+ifeq ($(CONFIG_CAVS_DECODER),yes)
+OBJS += i386/cavsdsp_mmx.o
+endif
 ifeq ($(TARGET_BUILTIN_VECTOR),yes)
 i386/fft_sse.o: CFLAGS+= -msse
 depend: CFLAGS+= -msse

--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1194,6 +1194,7 @@ typedef struct AVCodecContext {
 #define FF_IDCT_VP3          12
 #define FF_IDCT_IPP          13
 #define FF_IDCT_XVIDMMX      14
+#define FF_IDCT_CAVS         15

    /**
     * slice count.

--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -78,6 +78,7 @@ typedef struct {
    int qp;
    int qp_fixed;
    int cbp;
+    ScanTable scantable;

    /** intra prediction is done with un-deblocked samples
     they are saved here before deblocking the MB  */
@@ -97,6 +98,7 @@ typedef struct {
    int scale_den[2];  ///< for scaling neighbouring MVs

    int got_keyframe;
+    DCTELEM *block;
 } AVSContext;

 /*****************************************************************************
@@ -649,10 +651,9 @@ static int decode_residual_block(AVSContext *h, GetBitContext *gb,
    int dqm = dequant_mul[qp];
    int dqs = dequant_shift[qp];
    int dqa = 1 << (dqs - 1);
-    const uint8_t *scantab = ff_zigzag_direct;
-    DCTELEM block[64];
+    const uint8_t *scantab = h->scantable.permutated;
+    DCTELEM *block = h->block;

-    memset(block,0,64*sizeof(DCTELEM));
    for(i=0;i<65;i++) {
        level_code = get_ue_code(gb,r->golomb_order);
        if(level_code >= ESCAPE_CODE) {
@@ -1135,8 +1136,10 @@ static int decode_pic(AVSContext *h) {
    enum mb_t mb_type;

    if (!s->context_initialized) {
+        s->avctx->idct_algo = FF_IDCT_CAVS;
        if (MPV_common_init(s) < 0)
            return -1;
+        ff_init_scantable(s->dsp.idct_permutation,&h->scantable,ff_zigzag_direct);
    }
    get_bits(&s->gb,16);//bbv_dwlay
    if(h->stc == PIC_PB_START_CODE) {
@@ -1281,6 +1284,7 @@ static void init_top_lines(AVSContext *h) {
    /* alloc space for co-located MVs and types */
    h->col_mv       = av_malloc( h->mb_width*h->mb_height*4*sizeof(vector_t));
    h->col_type_base = av_malloc(h->mb_width*h->mb_height);
+    h->block        = av_mallocz(64*sizeof(DCTELEM));
 }

 static int decode_seq_header(AVSContext *h) {
@@ -1478,6 +1482,7 @@ static int cavs_decode_end(AVCodecContext * avctx) {
    av_free(h->top_border_v);
    av_free(h->col_mv);
    av_free(h->col_type_base);
+    av_free(h->block);
    return 0;
 }


--- a/libavcodec/cavsdsp.c
+++ b/libavcodec/cavsdsp.c
@@ -246,6 +246,7 @@ static void cavs_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride) {
        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b1 - b5) >> 7)];
        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b4) >> 7)];
    }
+    memset(block,0,64*sizeof(DCTELEM));
 }

 /*****************************************************************************

--- a/libavcodec/i386/cavsdsp_mmx.c
+++ b/libavcodec/i386/cavsdsp_mmx.c
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2622,6 +2622,22 @@ PREFETCH(prefetch_3dnow, prefetch)

 #include "h264dsp_mmx.c"

+/* AVS specific */
+void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    put_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    avg_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    put_pixels16_mmx(dst, src, stride, 16);
+}
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    avg_pixels16_mmx(dst, src, stride, 16);
+}
+
 /* external functions, from idct_mmx.c */
 void ff_mmx_idct(DCTELEM *block);
 void ff_mmxext_idct(DCTELEM *block);
@@ -2779,6 +2795,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
 #endif
+            }else if(idct_algo==FF_IDCT_CAVS){
+                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
 #ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & MM_MMXEXT){
@@ -3012,6 +3030,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

+#ifdef CONFIG_CAVS_DECODER
+            ff_cavsdsp_init_mmx2(c, avctx);
+#endif
+
 #ifdef CONFIG_ENCODERS
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
 #endif //CONFIG_ENCODERS