dsputil_ppc.c 9.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

20 21
#include "../dsputil.h"

22 23
#include "dsputil_ppc.h"

24 25 26 27
#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"
#endif

28 29 30
/* AltiVec IDCT routines defined outside this file; dsputil_init_ppc()
   installs them as the idct_put / idct_add callbacks of the DSPContext. */
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

31 32
/* Capability flag word; dsputil_init_ppc() ORs in MM_ALTIVEC when
   has_altivec() reports that AltiVec is usable. */
int mm_flags = 0;

33 34 35 36 37 38 39 40 41 42 43
/**
 * Report which multimedia extensions can be used.
 *
 * @return MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 *         has_altivec() returns non-zero, 0 otherwise.
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}

44 45
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/* Performance counters: one row per instrumented function, indexed in the
   second dimension by powerpc_data_min / _max / _sum / _num. */
unsigned long long perfdata[powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
/* Plain (const) char strings: the names are string literals and are only
   ever handed to a "%s" conversion in powerpc_display_perf_report(). */
static const char *const perfname[] = {
  "fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
};
#ifdef POWERPC_PERF_USE_PMC
/* Second set of counters, reported under the "pmc2" label. */
unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total];
#endif
#include <stdio.h>
#endif

#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/*
 * Dump the collected performance counters to stderr.
 *
 * A header line states which register the values come from, then every
 * function whose call count is non-zero gets its min, max, average
 * (sum / count) and call count printed. When POWERPC_PERF_USE_PMC is
 * defined the same is done for the perfdata_miss table.
 */
void powerpc_display_perf_report(void)
{
  int fn;

#ifdef POWERPC_PERF_USE_PMC
  fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
#else /* !POWERPC_PERF_USE_PMC */
  fprintf(stderr, "PowerPC performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
#endif /* POWERPC_PERF_USE_PMC */

  for (fn = 0; fn < powerpc_perf_total; fn++)
  {
    const unsigned long long calls = perfdata[fn][powerpc_data_num];
#ifdef POWERPC_PERF_USE_PMC
    const unsigned long long miss_calls = perfdata_miss[fn][powerpc_data_num];
#endif /* POWERPC_PERF_USE_PMC */

    /* functions that were never called are left out of the report */
    if (calls != 0)
    {
      const double avg = (double)perfdata[fn][powerpc_data_sum] / (double)calls;

      fprintf(stderr, " Function \"%s\" (pmc1):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
              perfname[fn],
              perfdata[fn][powerpc_data_min],
              perfdata[fn][powerpc_data_max],
              avg,
              calls);
    }
#ifdef POWERPC_PERF_USE_PMC
    if (miss_calls != 0)
    {
      const double miss_avg = (double)perfdata_miss[fn][powerpc_data_sum] / (double)miss_calls;

      fprintf(stderr, " Function \"%s\" (pmc2):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
              perfname[fn],
              perfdata_miss[fn][powerpc_data_min],
              perfdata_miss[fn][powerpc_data_max],
              miss_avg,
              miss_calls);
    }
#endif /* POWERPC_PERF_USE_PMC */
  }
}
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately, all processors used by Apple up to
  at least the 7450 (aka second-generation G4)
  use a 32-byte cache line.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  Update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it now always clears
  32 bytes, so the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  See <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/**
 * Zero sizeof(DCTELEM)*6*64 bytes starting at blocks, using 'dcbz' under
 * the assumption that one dcbz clears exactly one 32-byte line.
 *
 * Only bit 4 of the address is examined, so blocks is expected to be at
 * least 16-byte aligned (NOTE(review): confirm against the callers).
 * When blocks starts in the upper half of a 32-byte line, the first and
 * the last 16 bytes are cleared with ordinary stores and dcbz is used
 * only for the lines that lie entirely inside the buffer.
 *
 * @param blocks start of the coefficient buffer to clear
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if (misal) {
        /* leading half line; memset keeps this at 16 bytes regardless of
           the width of 'long' on the target */
        memset(blocks, 0, 16);
        i += 16;
    }
    /* The "- 31" stops the loop before a line that would reach past the
       end of the buffer (this happens when the start is misaligned). */
    for ( ; i < (int)(sizeof(DCTELEM)*6*64) - 31 ; i += 32) {
        /* dcbz rA,rB reads rA == r0 as the literal value 0, so the base
           operand must use the "b" constraint, which never picks r0 */
        asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        /* trailing half line, not covered by the loop above */
        memset((char *)blocks + sizeof(DCTELEM)*6*64 - 16, 0, 16);
    }
POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
/* Same purpose as clear_blocks_dcbz32_ppc, for processors where dcbzl
   clears a whole 128-byte cache line, i.e. the PPC970 aka G5.

   Zeroes sizeof(DCTELEM)*6*64 bytes starting at blocks. When blocks is not
   128-byte aligned, or when the file is built with NO_DCBZL, a plain
   memset() is used instead of dcbzl. */
#ifndef NO_DCBZL
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    register int misal = ((unsigned long)blocks & 0x0000007f);
    register int i = 0;
POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
    if (misal) {
        // we could probably also optimize this case,
        // but there's not much point as the machines
        // aren't available yet (2003-06-26)
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    }
    else {
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
            /* dcbzl rA,rB reads rA == r0 as the literal value 0, so the
               base operand uses the "b" constraint, which never picks r0 */
            asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
        }
    }
POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif

#ifndef NO_DCBZL
190
/* check dcbz report how many bytes are set to 0 by dcbz */
191 192 193 194 195
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
long check_dcbzl_effect(void)
196
{
197
  register char *fakedata = (char*)av_malloc(1024);
198 199 200 201 202
  register char *fakedata_middle;
  register long zero = 0;
  register long i = 0;
  long count = 0;

203
  if (!fakedata)
204 205 206 207 208 209 210 211
  {
    return 0L;
  }

  fakedata_middle = (fakedata + 512);

  memset(fakedata, 0xFF, 1024);

212
  asm volatile("dcbzl %0, %1" : : "r" (fakedata_middle), "r" (zero));
213 214 215 216 217 218 219

  for (i = 0; i < 1024 ; i ++)
  {
    if (fakedata[i] == (char)0)
      count++;
  }

220
  av_free(fakedata);
221 222 223
  
  return count;
}
224 225 226 227 228 229
#else
long check_dcbzl_effect(void)
{
  return 0;
}
#endif
230

231
/**
 * Install the PowerPC-specific routines into a DSP context.
 *
 * The clear_blocks routine is chosen from the result of
 * check_dcbzl_effect(); the remaining overrides are installed only when
 * the file is built with HAVE_ALTIVEC and has_altivec() reports support.
 *
 * @param c     DSP context whose function pointers are overridden
 * @param avctx codec context; only idct_algo is read, to decide whether
 *              the AltiVec IDCT is installed
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
    int i;

    /* Reset the counters unconditionally: the clear_blocks_dcbz*_ppc
       routines installed below are instrumented as well and are used
       with or without AltiVec. */
    for (i = 0 ; i < powerpc_perf_total ; i++)
    {
        perfdata[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
        perfdata[i][powerpc_data_max] = 0x0000000000000000ULL;
        perfdata[i][powerpc_data_sum] = 0x0000000000000000ULL;
        perfdata[i][powerpc_data_num] = 0x0000000000000000ULL;
#ifdef POWERPC_PERF_USE_PMC
        perfdata_miss[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
        perfdata_miss[i][powerpc_data_max] = 0x0000000000000000ULL;
        perfdata_miss[i][powerpc_data_sum] = 0x0000000000000000ULL;
        perfdata_miss[i][powerpc_data_num] = 0x0000000000000000ULL;
#endif /* POWERPC_PERF_USE_PMC */
    }
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */

    // Common optimizations whether Altivec is available or not

    switch (check_dcbzl_effect()) {
    case 32:
        c->clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:
        c->clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:
        /* any other result: leave c->clear_blocks as it was */
        break;
    }

#if HAVE_ALTIVEC
    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        // Altivec specific optimisations
        c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
        c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
        c->pix_abs16x16 = pix_abs16x16_altivec;
        c->pix_abs8x8 = pix_abs8x8_altivec;
        c->sad[0]= sad16x16_altivec;
        c->sad[1]= sad8x8_altivec;
        c->pix_norm1 = pix_norm1_altivec;
        c->sse[1]= sse8_altivec;
        c->sse[0]= sse16_altivec;
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
// next one disabled as it's untested.
#if 0
        c->add_bytes= add_bytes_altivec;
#endif /* 0 */
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
// next one disabled as it's untested.
#if 0
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
#endif /* 0 */
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

        c->gmc1 = gmc1_altivec;

        /* the AltiVec IDCT is used only when the caller asked for it
           explicitly or left the choice open */
        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC))
        {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
            c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        }
    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}