dsputil_ppc.c 11.3 KB
Newer Older
1 2 3
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
4
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 6 7 8 9 10 11 12 13 14 15 16 17
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 20
 */

21 22
#include "../dsputil.h"

23 24
#include "dsputil_ppc.h"

25 26 27 28
#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"
#endif

29
/* DCT routines that dsputil_init_ppc() installs as c->fdct, c->idct_put
   and c->idct_add; they are not defined in the visible part of this file. */
void fdct_altivec(int16_t *block);
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

L
Luca Barbato 已提交
33 34 35 36 37 38 39 40 41 42 43
/* Snow wavelet routines that dsputil_init_ppc() installs as
   c->horizontal_compose97i, c->vertical_compose97i and c->inner_add_yblock;
   they are not defined in the visible part of this file. */
void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width);
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
                                         DWTELEM *b3, DWTELEM *b4, DWTELEM *b5,
                                         int width);
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t **block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer *sb, int add,
                                      uint8_t *dst8);

44 45
/* Global CPU-feature flags. In HAVE_ALTIVEC builds dsputil_init_ppc() ORs in
   MM_ALTIVEC when has_altivec() reports AltiVec; otherwise this stays 0. */
int mm_flags = 0;

46 47 48
/*
 * Report the SIMD extensions usable at run time.
 *
 * Returns MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 * has_altivec() is non-zero, and 0 in every other case.
 */
int mm_support(void)
{
#ifdef HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}

57 58
#ifdef POWERPC_PERFORMANCE_REPORT
/* Counter storage indexed [PMC][function][statistic]. It is initialised in
   the AltiVec path of dsputil_init_ppc() and printed by
   powerpc_display_perf_report(). */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
/* Plain const char pointers: the entries are string literals and are
   printed with "%s". */
static const char * const perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "hadamard8_diff8x8_altivec",
  "hadamard8_diff16_altivec",
  "avg_pixels8_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc",
  "put_h264_chroma_mc8_altivec",
  "avg_h264_chroma_mc8_altivec",
  "put_h264_qpel16_h_lowpass_altivec",
  "avg_h264_qpel16_h_lowpass_altivec",
  "put_h264_qpel16_v_lowpass_altivec",
  "avg_h264_qpel16_v_lowpass_altivec",
  "put_h264_qpel16_hv_lowpass_altivec",
  "avg_h264_qpel16_hv_lowpass_altivec",
  ""
};
#include <stdio.h>
#endif

92
#ifdef POWERPC_PERFORMANCE_REPORT
/*
 * Log the collected performance counters through av_log().
 *
 * For every (function, PMC) pair whose sample count is non-zero, prints
 * the minimum, maximum, average (sum / count) and the sample count.
 */
void powerpc_display_perf_report(void)
{
    int i, j;

    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for (i = 0; i < powerpc_perf_total; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
            const unsigned long long samples = perfdata[j][i][powerpc_data_num];

            /* nothing recorded for this pair; also keeps the average
               below from dividing by zero */
            if (samples == (unsigned long long)0)
                continue;

            av_log(NULL, AV_LOG_INFO,
                   " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                   perfname[i],
                   j + 1,
                   perfdata[j][i][powerpc_data_min],
                   perfdata[j][i][powerpc_data_max],
                   (double)perfdata[j][i][powerpc_data_sum] / (double)samples,
                   samples);
        }
    }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
115 116 117 118 119 120 121 122 123 124 125 126 127

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  Update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes. So the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/*
 * Zero sizeof(DCTELEM)*6*64 bytes starting at 'blocks', using the 'dcbz'
 * instruction for every full 32-byte chunk.
 *
 * Only bit 4 of the address is examined, so the buffer is presumably
 * expected to be at least 16-byte aligned -- TODO confirm against callers.
 * When the buffer starts 16 bytes into a 32-byte chunk, the first and the
 * last 16 bytes are cleared with memset() and dcbz handles the rest.
 *
 * The head/tail used to be cleared with four 'unsigned long' stores, which
 * is only 16 bytes when long is 4 bytes wide; with an 8-byte long the tail
 * indices 188..191 land far beyond the end of the buffer. Clearing by byte
 * count is independent of the width of long.
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if (misal) {
        /* head: the 16 bytes before the first 32-byte boundary */
        memset(blocks, 0, 16);
        i += 16;
    }
    /* "-31" stops the loop before a chunk that would run past the end */
    for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
#ifndef __MWERKS__
        asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
#else
        __dcbz( blocks, i );
#endif
    }
    if (misal) {
        /* tail: the final 16 bytes, which the loop above left untouched */
        memset((char *)blocks + sizeof(DCTELEM)*6*64 - 16, 0, 16);
    }
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

175 176 177 178 179
/* Same job as clear_blocks_dcbz32_ppc, for the case where dcbzl clears a
   whole 128-byte cache line, i.e. the PPC970 aka G5.
   Zeroes sizeof(DCTELEM)*6*64 bytes starting at 'blocks'. When NO_DCBZL is
   defined the function is a plain memset(). */
#ifndef NO_DCBZL
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    register int misal = ((unsigned long)blocks & 0x0000007f);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
    if (misal) {
        // we could probably also optimize this case,
        // but there's not much point as the machines
        // aren't available yet (2003-06-26)
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    } else {
        /* buffer starts on a 128-byte boundary: one dcbzl per 128 bytes */
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
            asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
        }
    }
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif

#ifndef NO_DCBZL
208
/* check dcbz report how many bytes are set to 0 by dcbz */
209 210 211 212 213
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
long check_dcbzl_effect(void)
214
{
215
  register char *fakedata = (char*)av_malloc(1024);
216 217 218 219 220
  register char *fakedata_middle;
  register long zero = 0;
  register long i = 0;
  long count = 0;

221
  if (!fakedata)
222 223 224 225 226 227 228 229
  {
    return 0L;
  }

  fakedata_middle = (fakedata + 512);

  memset(fakedata, 0xFF, 1024);

230 231 232
  /* below the constraint "b" seems to mean "Address base register"
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
233 234 235 236 237 238 239

  for (i = 0; i < 1024 ; i ++)
  {
    if (fakedata[i] == (char)0)
      count++;
  }

240
  av_free(fakedata);
241

242 243
  return count;
}
244 245 246 247 248 249
#else
long check_dcbzl_effect(void)
{
  return 0;
}
#endif
250

251 252 253

/* Not defined in the visible part of this file; dsputil_init_ppc() calls it
   in HAVE_ALTIVEC builds, before it checks has_altivec(). */
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);

254
/*
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * c:     context whose function pointers are overridden; entries not
 *        assigned here are left as they were.
 * avctx: read for dct_algo, idct_algo and lowres, and passed on to
 *        dsputil_h264_init_ppc().
 *
 * Also ORs MM_ALTIVEC into the global mm_flags when AltiVec is present.
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether Altivec is available or not
    const long zeroed_bytes = check_dcbzl_effect();

    if (zeroed_bytes == 32)
        c->clear_blocks = clear_blocks_dcbz32_ppc;
    else if (zeroed_bytes == 128)
        c->clear_blocks = clear_blocks_dcbz128_ppc;
    /* any other result: c->clear_blocks is left untouched */

#ifdef HAVE_ALTIVEC
    dsputil_h264_init_ppc(c, avctx);

    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        // Altivec specific optimisations
        c->pix_abs[0][0] = sad16_altivec;
        c->pix_abs[0][1] = sad16_x2_altivec;
        c->pix_abs[0][2] = sad16_y2_altivec;
        c->pix_abs[0][3] = sad16_xy2_altivec;
        c->pix_abs[1][0] = sad8_altivec;
        c->sad[0] = sad16_altivec;
        c->sad[1] = sad8_altivec;

        c->pix_norm1 = pix_norm1_altivec;
        c->sse[0] = sse16_altivec;
        c->sse[1] = sse8_altivec;
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
// next one disabled as it's untested.
#if 0
        c->add_bytes= add_bytes_altivec;
#endif /* 0 */

        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        /* the two functions do the same thing, so use the same code */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;

        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;

        c->gmc1 = gmc1_altivec;

#ifdef CONFIG_DARWIN // ATM gcc-3.3 and gcc-3.4 fail to compile these in linux...
        c->hadamard8_diff[0] = hadamard8_diff16_altivec;
        c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#endif

        c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
        c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
        c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;

#ifdef CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC) {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        /* the AltiVec IDCT is only installed for full-resolution decoding */
        if (avctx->lowres == 0 &&
            (avctx->idct_algo == FF_IDCT_AUTO ||
             avctx->idct_algo == FF_IDCT_ALTIVEC)) {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
            c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        }

#ifdef POWERPC_PERFORMANCE_REPORT
        {
            /* reset every counter: min starts at the largest value so the
               first sample replaces it, the others start at zero */
            int i, j;
            for (i = 0; i < powerpc_perf_total; i++) {
                for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
                    perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                    perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
                }
            }
        }
#endif /* POWERPC_PERFORMANCE_REPORT */
    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}