dsputil_ppc.c 10.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

20 21
#include "../dsputil.h"

22 23
#include "dsputil_ppc.h"

24 25 26 27
#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"
#endif

28 29 30
/* AltiVec IDCT entry points. They are only declared here (defined in another
   translation unit) and are installed as c->idct_put / c->idct_add by
   dsputil_init_ppc() below. */
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

31 32
/* Global capability bitmask. Starts at 0; dsputil_init_ppc() ORs in
   MM_ALTIVEC when has_altivec() reports true. */
int mm_flags = 0;

33 34 35 36 37 38 39 40 41 42 43
/*
 * Report the SIMD capabilities usable on this machine.
 *
 * Returns MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 * has_altivec() reports true; returns 0 in every other case.
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}

44 45
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/* Per-function counter statistics: first index is the function slot
   (0 .. powerpc_perf_total-1), second index is one of
   powerpc_data_min / _max / _sum / _num. Reset in dsputil_init_ppc() and
   printed by powerpc_display_perf_report().
   NOTE(review): presumably updated by the POWERPC_TBL_* macros from
   dsputil_ppc.h -- confirm there. */
unsigned long long perfdata[powerpc_perf_total][powerpc_data_total];
46
/* list below must match enum in dsputil_ppc.h */
47 48 49 50 51 52 53 54 55 56
/*
 * Human-readable name for each performance-counter slot, printed with "%s"
 * by powerpc_display_perf_report().
 *
 * The order of the entries must match the enum in dsputil_ppc.h, because the
 * table is indexed with the same function index as the perfdata arrays.
 *
 * Declared as const char (not unsigned char) so the string-literal
 * initialisers and the "%s" conversion agree on the pointer type, and const
 * so the table cannot be modified by accident.
 */
static const char *const perfname[] = {
  "fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
};
#ifdef POWERPC_PERF_USE_PMC
64 65
/* Same layout as perfdata, one table each for the values reported under the
   "(pmc2)" and "(pmc3)" labels by powerpc_display_perf_report(). Only
   present when POWERPC_PERF_USE_PMC is defined. */
unsigned long long perfdata_pmc2[powerpc_perf_total][powerpc_data_total];
unsigned long long perfdata_pmc3[powerpc_perf_total][powerpc_data_total];
66 67 68 69 70 71 72 73 74
#endif
#include <stdio.h>
#endif

#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/*
 * Print the collected performance counters to stderr.
 *
 * A header line states which counter source the build uses (Time Base
 * register, or PMC registers when POWERPC_PERF_USE_PMC is defined). Then, for
 * every function slot whose sample count is non-zero, the minimum, maximum,
 * average (sum / count) and the sample count are printed. With
 * POWERPC_PERF_USE_PMC the pmc2 and pmc3 tables are reported the same way.
 */
void powerpc_display_perf_report(void)
{
  int fn;

#ifdef POWERPC_PERF_USE_PMC
  fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
#else /* !POWERPC_PERF_USE_PMC */
  fprintf(stderr, "PowerPC performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
#endif /* POWERPC_PERF_USE_PMC */

  for (fn = 0; fn < powerpc_perf_total; ++fn)
  {
    const unsigned long long *pmc1 = perfdata[fn];
#ifdef POWERPC_PERF_USE_PMC
    const unsigned long long *pmc2 = perfdata_pmc2[fn];
    const unsigned long long *pmc3 = perfdata_pmc3[fn];
#endif /* POWERPC_PERF_USE_PMC */

    /* A zero sample count means the slot was never hit; skipping it also
       avoids dividing by zero when computing the average. */
    if (pmc1[powerpc_data_num] != 0ULL)
    {
      double avg = (double)pmc1[powerpc_data_sum] /
                   (double)pmc1[powerpc_data_num];

      fprintf(stderr, " Function \"%s\" (pmc1):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
              perfname[fn],
              pmc1[powerpc_data_min],
              pmc1[powerpc_data_max],
              avg,
              pmc1[powerpc_data_num]);
    }
#ifdef POWERPC_PERF_USE_PMC
    if (pmc2[powerpc_data_num] != 0ULL)
    {
      double avg = (double)pmc2[powerpc_data_sum] /
                   (double)pmc2[powerpc_data_num];

      fprintf(stderr, " Function \"%s\" (pmc2):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
              perfname[fn],
              pmc2[powerpc_data_min],
              pmc2[powerpc_data_max],
              avg,
              pmc2[powerpc_data_num]);
    }
    if (pmc3[powerpc_data_num] != 0ULL)
    {
      double avg = (double)pmc3[powerpc_data_sum] /
                   (double)pmc3[powerpc_data_num];

      fprintf(stderr, " Function \"%s\" (pmc3):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
              perfname[fn],
              pmc3[powerpc_data_min],
              pmc3[powerpc_data_max],
              avg,
              pmc3[powerpc_data_num]);
    }
#endif /* POWERPC_PERF_USE_PMC */
  }
}
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes. So the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/*
 * Zero six 64-coefficient DCT blocks (sizeof(DCTELEM)*6*64 bytes) using the
 * 'dcbz' instruction, on the assumption that one dcbz clears exactly one
 * 32-byte aligned region.
 *
 * blocks: start of the coefficient buffer.
 *
 * Three cases are handled:
 *  - blocks is 32-byte aligned: the whole buffer is cleared with dcbz.
 *  - blocks is 16- but not 32-byte aligned: the first and last 16 bytes are
 *    cleared with memset and only the 32-byte lines lying entirely inside
 *    the buffer are cleared with dcbz.
 *  - blocks is not even 16-byte aligned: dcbz cannot be used without
 *    touching neighbouring memory, so the buffer is cleared with memset.
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    const unsigned long total = sizeof(DCTELEM)*6*64;
    char *base = (char *)blocks;
    /* non-zero when the buffer starts in the upper half of a 32-byte line */
    register int misal = ((unsigned long)blocks & 0x00000010);
    register unsigned long i = 0;
POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if ((unsigned long)blocks & 0x0000000f) {
      /* dcbz works on whole aligned lines; with an arbitrary start address
         the head and tail pieces are not 16 bytes, so use the plain path. */
      memset(blocks, 0, total);
    } else {
      if (misal) {
        /* leading half line: byte-wise clear, independent of sizeof(long) */
        memset(base, 0, 16);
        i += 16;
      }
      /* Only issue dcbz for lines that end inside the buffer. With a
         misaligned start, a bound of "i < total" would run one extra dcbz
         at offset total-16 and zero 16 bytes past the end of the buffer. */
      for ( ; i + 32 <= total ; i += 32) {
        asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
      }
      if (misal) {
        /* trailing half line */
        memset(base + total - 16, 0, 16);
      }
    }
POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
/* Same as above, for processors where dcbzl clears a whole 128-byte
   cache line, i.e. the PPC970 (aka G5). */
/*
 * Zero six 64-coefficient DCT blocks (sizeof(DCTELEM)*6*64 bytes).
 *
 * When the build allows 'dcbzl' (NO_DCBZL not defined) and blocks is
 * 128-byte aligned, one dcbzl is issued per 128 bytes. A buffer that is not
 * 128-byte aligned is cleared with memset instead. When NO_DCBZL is defined
 * the function is a plain memset and is not instrumented.
 */
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
#ifndef NO_DCBZL
POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    register int offset = 0;
POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
    if (((unsigned long)blocks & 0x0000007f) != 0) {
        /* Unaligned start: this case has not been optimised, fall back to
           the generic clear. */
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    } else {
        while (offset < sizeof(DCTELEM)*6*64) {
            asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (offset) : "memory");
            offset += 128;
        }
    }
POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
#else /* NO_DCBZL */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif /* NO_DCBZL */
}

#ifndef NO_DCBZL
199
/* check dcbz report how many bytes are set to 0 by dcbz */
200 201 202 203 204
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
long check_dcbzl_effect(void)
205
{
206
  register char *fakedata = (char*)av_malloc(1024);
207 208 209 210 211
  register char *fakedata_middle;
  register long zero = 0;
  register long i = 0;
  long count = 0;

212
  if (!fakedata)
213 214 215 216 217 218 219 220
  {
    return 0L;
  }

  fakedata_middle = (fakedata + 512);

  memset(fakedata, 0xFF, 1024);

221 222 223
  /* below the constraint "b" seems to mean "Address base register"
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
224 225 226 227 228 229 230

  for (i = 0; i < 1024 ; i ++)
  {
    if (fakedata[i] == (char)0)
      count++;
  }

231
  av_free(fakedata);
232 233 234
  
  return count;
}
235 236 237 238 239 240
#else
long check_dcbzl_effect(void)
{
  return 0;
}
#endif
241

242
/*
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * c:     context whose function pointers are overridden.
 * avctx: codec context; only avctx->idct_algo is read, to decide whether
 *        the AltiVec IDCT may be installed.
 *
 * Steps:
 *  1. With POWERPC_TBL_PERFORMANCE_REPORT, reset every performance-counter
 *     slot (min = all ones, max = sum = num = 0).
 *  2. Choose clear_blocks from the result of check_dcbzl_effect(): 32 and
 *     128 select the matching dcbz routine, any other value leaves
 *     c->clear_blocks untouched.
 *  3. With HAVE_ALTIVEC and has_altivec() true, set MM_ALTIVEC in mm_flags
 *     and install the AltiVec routines.
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether Altivec is available or not

#ifdef POWERPC_TBL_PERFORMANCE_REPORT
    /* The counters are reset here, outside the AltiVec branch, because the
       clear_blocks_dcbz*_ppc routines installed below are instrumented as
       well and are used whether or not AltiVec is present. */
    {
        int i;
        for (i = 0 ; i < powerpc_perf_total ; i++)
        {
            perfdata[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
            perfdata[i][powerpc_data_max] = 0;
            perfdata[i][powerpc_data_sum] = 0;
            perfdata[i][powerpc_data_num] = 0;
#ifdef POWERPC_PERF_USE_PMC
            perfdata_pmc2[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
            perfdata_pmc2[i][powerpc_data_max] = 0;
            perfdata_pmc2[i][powerpc_data_sum] = 0;
            perfdata_pmc2[i][powerpc_data_num] = 0;
            perfdata_pmc3[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
            perfdata_pmc3[i][powerpc_data_max] = 0;
            perfdata_pmc3[i][powerpc_data_sum] = 0;
            perfdata_pmc3[i][powerpc_data_num] = 0;
#endif /* POWERPC_PERF_USE_PMC */
        }
    }
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */

    /* Pick the block-clearing routine that matches the number of bytes one
       dcbzl zeroes on this machine. */
    switch (check_dcbzl_effect()) {
    case 32:
        c->clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:
        c->clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:
        break;
    }

#if HAVE_ALTIVEC
    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        // Altivec specific optimisations
        c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
        c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
        c->pix_abs16x16 = pix_abs16x16_altivec;
        c->pix_abs8x8 = pix_abs8x8_altivec;
        c->sad[0]= sad16x16_altivec;
        c->sad[1]= sad8x8_altivec;
        c->pix_norm1 = pix_norm1_altivec;
        c->sse[1]= sse8_altivec;
        c->sse[0]= sse16_altivec;
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
// next one disabled as it's untested.
#if 0
        c->add_bytes= add_bytes_altivec;
#endif /* 0 */
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
// next one disabled as it's untested.
#if 0
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
#endif /* 0 */
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

        c->gmc1 = gmc1_altivec;

        /* The AltiVec IDCT is installed only when the caller left the
           choice open or asked for it explicitly. */
        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC))
        {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
            c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        }
    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}