From 735acf567c3699933d03b27179fecd8f2b917ccd Mon Sep 17 00:00:00 2001
From: David Conrad <lessen42@gmail.com>
Date: Fri, 12 Feb 2010 22:01:38 +0000
Subject: [PATCH] Don't pre-calculate first_pixel

3.6% faster on Elephants_Dream_HD-q7-aq7.ogg on my penryn

Originally committed as revision 21781 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/vp3.c | 101 ++++++++++++-----------------------------------
 1 file changed, 26 insertions(+), 75 deletions(-)

diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 50af08aa52..692aec529b 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -53,9 +53,6 @@ typedef struct Coeff {
 //FIXME split things out into their own arrays
 typedef struct Vp3Fragment {
     Coeff *next_coeff;
-    /* address of first pixel taking into account which plane the fragment
-     * lives on as well as the plane stride */
-    int first_pixel;
     /* this is the macroblock that the fragment belongs to */
     uint16_t macroblock;
     uint8_t coding_method;
@@ -163,6 +160,7 @@ typedef struct Vp3DecodeContext {
     Coeff *coeffs;
     Coeff *next_coeff;
     int fragment_start[3];
+    int data_offset[3];
 
     ScanTable scantable;
 
@@ -178,7 +176,6 @@ typedef struct Vp3DecodeContext {
      * which of the fragments are coded */
     int *coded_fragment_list;
     int coded_fragment_list_index;
-    int pixel_addresses_initialized;
 
     /* track which fragments have already been decoded; called 'fast'
      * because this data structure avoids having to iterate through every
@@ -1401,6 +1398,7 @@ static void apply_loop_filter(Vp3DecodeContext *s, int plane, int ystart, int ye
     int stride          = s->current_frame.linesize[plane];
     uint8_t *plane_data = s->current_frame.data    [plane];
     if (!s->flipped_image) stride = -stride;
+    plane_data += s->data_offset[plane] + 8*ystart*stride;
 
     for (y = ystart; y < yend; y++) {
 
@@ -1414,14 +1412,14 @@ static void apply_loop_filter(Vp3DecodeContext *s, int plane, int ystart, int ye
                 /* do not perform left edge filter for left columns frags */
                 if (x > 0) {
                     s->dsp.vp3_h_loop_filter(
-                        plane_data + s->all_fragments[fragment].first_pixel,
+                        plane_data + 8*x,
                         stride, bounding_values);
                 }
 
                 /* do not perform top edge filter for top row fragments */
                 if (y > 0) {
                     s->dsp.vp3_v_loop_filter(
-                        plane_data + s->all_fragments[fragment].first_pixel,
+                        plane_data + 8*x,
                         stride, bounding_values);
                 }
 
@@ -1431,7 +1429,7 @@ static void apply_loop_filter(Vp3DecodeContext *s, int plane, int ystart, int ye
                 if ((x < width - 1) &&
                     (s->all_fragments[fragment + 1].coding_method == MODE_COPY)) {
                     s->dsp.vp3_h_loop_filter(
-                        plane_data + s->all_fragments[fragment + 1].first_pixel,
+                        plane_data + 8*x + 8,
                         stride, bounding_values);
                 }
 
@@ -1441,13 +1439,14 @@ static void apply_loop_filter(Vp3DecodeContext *s, int plane, int ystart, int ye
                 if ((y < height - 1) &&
                     (s->all_fragments[fragment + width].coding_method == MODE_COPY)) {
                     s->dsp.vp3_v_loop_filter(
-                        plane_data + s->all_fragments[fragment + width].first_pixel,
+                        plane_data + 8*x + 8*stride,
                         stride, bounding_values);
                 }
             }
 
             fragment++;
         }
+        plane_data += 8*stride;
     }
 }
 
@@ -1501,9 +1500,9 @@ static void render_slice(Vp3DecodeContext *s, int slice)
         return;
 
     for (plane = 0; plane < 3; plane++) {
-        uint8_t *output_plane = s->current_frame.data    [plane];
-        uint8_t *  last_plane = s->   last_frame.data    [plane];
-        uint8_t *golden_plane = s-> golden_frame.data    [plane];
+        uint8_t *output_plane = s->current_frame.data    [plane] + s->data_offset[plane];
+        uint8_t *  last_plane = s->   last_frame.data    [plane] + s->data_offset[plane];
+        uint8_t *golden_plane = s-> golden_frame.data    [plane] + s->data_offset[plane];
         int stride            = s->current_frame.linesize[plane];
         int plane_width       = s->width  >> !!plane;
         int plane_height      = s->height >> !!plane;
@@ -1522,6 +1521,7 @@ static void render_slice(Vp3DecodeContext *s, int slice)
 
             /* for each fragment in a row... */
             for (x = 0; x < plane_width; x += 8, i++) {
+                int first_pixel = y*stride + x;
 
                 if ((i < 0) || (i >= s->fragment_count)) {
                     av_log(s->avctx, AV_LOG_ERROR, "  vp3:render_slice(): bad fragment number (%d)\n", i);
@@ -1538,7 +1538,7 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                     else
                         motion_source= last_plane;
 
-                    motion_source += s->all_fragments[i].first_pixel;
+                    motion_source += first_pixel;
                     motion_halfpel_index = 0;
 
                     /* sort out the motion vector if this fragment is coded
@@ -1584,12 +1584,12 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                            put_no_rnd_pixels_tab is better optimzed */
                         if(motion_halfpel_index != 3){
                             s->dsp.put_no_rnd_pixels_tab[1][motion_halfpel_index](
-                                output_plane + s->all_fragments[i].first_pixel,
+                                output_plane + first_pixel,
                                 motion_source, stride, 8);
                         }else{
                             int d= (motion_x ^ motion_y)>>31; // d is 0 if motion_x and _y have the same sign, else -1
                             s->dsp.put_no_rnd_pixels_l2[1](
-                                output_plane + s->all_fragments[i].first_pixel,
+                                output_plane + first_pixel,
                                 motion_source - d,
                                 motion_source + stride + 1 + d,
                                 stride, 8);
@@ -1622,12 +1622,12 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                         if(s->avctx->idct_algo!=FF_IDCT_VP3)
                             block[0] += 128<<3;
                         s->dsp.idct_put(
-                            output_plane + s->all_fragments[i].first_pixel,
+                            output_plane + first_pixel,
                             stride,
                             block);
                     } else {
                         s->dsp.idct_add(
-                            output_plane + s->all_fragments[i].first_pixel,
+                            output_plane + first_pixel,
                             stride,
                             block);
                     }
@@ -1635,8 +1635,8 @@ static void render_slice(Vp3DecodeContext *s, int slice)
 
                     /* copy directly from the previous frame */
                     s->dsp.put_pixels_tab[1][0](
-                        output_plane + s->all_fragments[i].first_pixel,
-                        last_plane + s->all_fragments[i].first_pixel,
+                        output_plane + first_pixel,
+                        last_plane + first_pixel,
                         stride, 8);
 
                 }
@@ -1661,54 +1661,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
         vp3_draw_horiz_band(s, 16*slice);
 }
 
-/*
- * This function computes the first pixel addresses for each fragment.
- * This function needs to be invoked after the first frame is allocated
- * so that it has access to the plane strides.
- */
-static void vp3_calculate_pixel_addresses(Vp3DecodeContext *s)
-{
-#define Y_INITIAL(chroma_shift)  s->flipped_image ? 1  : s->fragment_height >> chroma_shift
-#define Y_FINISHED(chroma_shift) s->flipped_image ? y <= s->fragment_height >> chroma_shift : y > 0
-
-    int i, x, y;
-    const int y_inc = s->flipped_image ? 1 : -1;
-
-    /* figure out the first pixel addresses for each of the fragments */
-    /* Y plane */
-    i = 0;
-    for (y = Y_INITIAL(0); Y_FINISHED(0); y += y_inc) {
-        for (x = 0; x < s->fragment_width; x++) {
-            s->all_fragments[i++].first_pixel =
-                s->golden_frame.linesize[0] * y * FRAGMENT_PIXELS -
-                    s->golden_frame.linesize[0] +
-                    x * FRAGMENT_PIXELS;
-        }
-    }
-
-    /* U plane */
-    i = s->fragment_start[1];
-    for (y = Y_INITIAL(1); Y_FINISHED(1); y += y_inc) {
-        for (x = 0; x < s->fragment_width / 2; x++) {
-            s->all_fragments[i++].first_pixel =
-                s->golden_frame.linesize[1] * y * FRAGMENT_PIXELS -
-                    s->golden_frame.linesize[1] +
-                    x * FRAGMENT_PIXELS;
-        }
-    }
-
-    /* V plane */
-    i = s->fragment_start[2];
-    for (y = Y_INITIAL(1); Y_FINISHED(1); y += y_inc) {
-        for (x = 0; x < s->fragment_width / 2; x++) {
-            s->all_fragments[i++].first_pixel =
-                s->golden_frame.linesize[2] * y * FRAGMENT_PIXELS -
-                    s->golden_frame.linesize[2] +
-                    x * FRAGMENT_PIXELS;
-        }
-    }
-}
-
 /*
  * This is the ffmpeg/libavcodec API init function.
  */
@@ -1775,7 +1727,6 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     s->coeffs = av_malloc(s->fragment_count * sizeof(Coeff) * 65);
     s->coded_fragment_list = av_malloc(s->fragment_count * sizeof(int));
     s->fast_fragment_list = av_malloc(s->fragment_count * sizeof(int));
-    s->pixel_addresses_initialized = 0;
     if (!s->superblock_coding || !s->all_fragments || !s->coeff_counts ||
         !s->coeffs || !s->coded_fragment_list || !s->fast_fragment_list) {
         vp3_decode_end(avctx);
@@ -1996,17 +1947,10 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
         /* golden frame is also the current frame */
         s->current_frame= s->golden_frame;
-
-        /* time to figure out pixel addresses? */
-        if (!s->pixel_addresses_initialized)
-        {
-            vp3_calculate_pixel_addresses(s);
-            s->pixel_addresses_initialized = 1;
-        }
     } else {
         /* allocate a new current frame */
         s->current_frame.reference = 3;
-        if (!s->pixel_addresses_initialized) {
+        if (!s->golden_frame.data[0]) {
             av_log(s->avctx, AV_LOG_ERROR, "vp3: first frame not a keyframe\n");
             return -1;
         }
@@ -2042,6 +1986,13 @@ static int vp3_decode_frame(AVCodecContext *avctx,
         return -1;
     }
 
+    for (i = 0; i < 3; i++) {
+        if (s->flipped_image)
+            s->data_offset[i] = 0;
+        else
+            s->data_offset[i] = ((s->height>>!!i)-1) * s->current_frame.linesize[i];
+    }
+
     s->last_slice_end = 0;
     for (i = 0; i < s->macroblock_height; i++)
         render_slice(s, i);
-- 
GitLab