/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "../dsputil.h"

#include "dsputil_ppc.h"

#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"
#endif

/* memset(); kept after the project headers so their feature macros still apply */
#include <string.h>

/* Prototypes for the AltiVec DCT/IDCT routines that dsputil_init_ppc()
   installs into the DSPContext. */
extern void fdct_altivec(int16_t *block);
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* CPU feature flags; dsputil_init_ppc() ORs in MM_ALTIVEC when
   has_altivec() reports true. */
int mm_flags = 0;

/**
 * Build the mask of multimedia extensions usable on this machine.
 *
 * @return MM_ALTIVEC when the file is built with HAVE_ALTIVEC and
 *         has_altivec() reports true; 0 otherwise.
 */
int mm_support(void)
{
    int result = 0;
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        result |= MM_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return result;
}

#ifdef POWERPC_PERFORMANCE_REPORT
/* Counter statistics, indexed as perfdata[pmc][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* list below must match enum in dsputil_ppc.h */
/* Plain (const) char pointers: the entries are string literals and are
   printed with "%s" by powerpc_display_perf_report(). */
static const char * const perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "hadamard8_diff8x8_altivec",
  "hadamard8_diff16_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
};
#include <stdio.h>
#endif

#ifdef POWERPC_PERFORMANCE_REPORT
/**
 * Log the collected performance-counter statistics through av_log().
 *
 * For every function index and every enabled PMC, prints the minimum,
 * maximum and average (sum / sample count) of the recorded values.
 * Entries whose sample count is zero are skipped, which also avoids a
 * division by zero in the average.
 */
void powerpc_display_perf_report(void)
{
  int i, j;

  av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
  for (i = 0; i < powerpc_perf_total; i++) {
    for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
      if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) {
        av_log(NULL, AV_LOG_INFO,
               " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
               perfname[i],
               j+1,
               perfdata[j][i][powerpc_data_min],
               perfdata[j][i][powerpc_data_max],
               (double)perfdata[j][i][powerpc_data_sum] /
               (double)perfdata[j][i][powerpc_data_num],
               perfdata[j][i][powerpc_data_num]);
      }
    }
  }
}
#endif /* POWERPC_PERFORMANCE_REPORT */

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use a 32-byte cache line.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes, so the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/**
 * Zero sizeof(DCTELEM)*6*64 bytes at 'blocks' using the 'dcbz'
 * instruction, which is expected here to clear one 32-byte cache line.
 *
 * dcbz clears the whole line containing the addressed byte, so it may
 * only be issued for lines that lie entirely inside the buffer. When
 * the buffer starts in the upper half of a line (address bit 4 set),
 * the first and last 16 bytes are cleared with ordinary stores and
 * dcbz handles the full lines in between.
 *
 * NOTE(review): only bit 4 of the address is examined, so this
 * presumably relies on 'blocks' being at least 16-byte aligned --
 * confirm against the callers.
 *
 * @param blocks start of the coefficient buffer to clear
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    int misal = ((unsigned long)blocks & 0x00000010);
    int i = 0;
    const int total = sizeof(DCTELEM)*6*64;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if (misal) {
      /* leading half line; sized in bytes so it does not depend on
         sizeof(long) */
      memset(blocks, 0, 16);
      i += 16;
    }
    /* "- 31": stop before a line that would extend past the end of the
       buffer. Without it the misaligned case issued a final dcbz that
       zeroed 16 bytes beyond the buffer. */
    for ( ; i < total - 31 ; i += 32) {
#ifndef __MWERKS__
      asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
#else
      __dcbz( blocks, i );
#endif
    }
    if (misal) {
      /* trailing half line */
      memset((char *)blocks + total - 16, 0, 16);
    }
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

154 155 156 157 158
/* same as above, when dcbzl clear a whole 128B cache line
   i.e. the PPC970 aka G5 */
#ifndef NO_DCBZL
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
159
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
160 161
    register int misal = ((unsigned long)blocks & 0x0000007f);
    register int i = 0;
162
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
163 164 165 166 167 168 169 170 171
#if 1
 if (misal) {
   // we could probably also optimize this case,
   // but there's not much point as the machines
   // aren't available yet (2003-06-26)
      memset(blocks, 0, sizeof(DCTELEM)*6*64);
    }
    else
      for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
172
	asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
173 174 175 176
      }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
177
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
178 179 180 181 182 183 184 185 186
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
  memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif

#ifndef NO_DCBZL
187
/* check dcbz report how many bytes are set to 0 by dcbz */
188 189 190 191 192
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
long check_dcbzl_effect(void)
193
{
194
  register char *fakedata = (char*)av_malloc(1024);
195 196 197 198 199
  register char *fakedata_middle;
  register long zero = 0;
  register long i = 0;
  long count = 0;

200
  if (!fakedata)
201 202 203 204 205 206 207 208
  {
    return 0L;
  }

  fakedata_middle = (fakedata + 512);

  memset(fakedata, 0xFF, 1024);

209 210 211
  /* below the constraint "b" seems to mean "Address base register"
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
212 213 214 215 216 217 218

  for (i = 0; i < 1024 ; i ++)
  {
    if (fakedata[i] == (char)0)
      count++;
  }

219
  av_free(fakedata);
220 221 222
  
  return count;
}
223 224 225 226 227 228
#else
long check_dcbzl_effect(void)
{
  return 0;
}
#endif
229

230
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
231
{
232
    // Common optimizations whether Altivec is available or not
233

234
  switch (check_dcbzl_effect()) {
235 236 237
  case 32:
    c->clear_blocks = clear_blocks_dcbz32_ppc;
    break;
238 239 240
  case 128:
    c->clear_blocks = clear_blocks_dcbz128_ppc;
    break;
241 242 243 244
  default:
    break;
  }
  
245
#ifdef HAVE_ALTIVEC
246
    if (has_altivec()) {
247 248
        mm_flags |= MM_ALTIVEC;
        
249
        // Altivec specific optimisations
M
Michael Niedermayer 已提交
250 251 252 253 254 255 256
        c->pix_abs[0][1] = sad16_x2_altivec;
        c->pix_abs[0][2] = sad16_y2_altivec;
        c->pix_abs[0][3] = sad16_xy2_altivec;
        c->pix_abs[0][0] = sad16_altivec;
        c->pix_abs[1][0] = sad8_altivec;
        c->sad[0]= sad16_altivec;
        c->sad[1]= sad8_altivec;
257
        c->pix_norm1 = pix_norm1_altivec;
258 259
        c->sse[1]= sse8_altivec;
        c->sse[0]= sse16_altivec;
260 261 262
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
263
// next one disabled as it's untested.
264 265
#if 0
        c->add_bytes= add_bytes_altivec;
266
#endif /* 0 */
267
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
268
        /* the two functions do the same thing, so use the same code */
269
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
270
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
271 272 273
// next one disabled as it's untested.
#if 0
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
274
#endif /* 0 */
275
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
276 277 278
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
279
        
280
	c->gmc1 = gmc1_altivec;
281

282 283 284
	c->hadamard8_diff[0] = hadamard8_diff16_altivec;
	c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;

285 286 287 288 289 290 291 292
#ifdef CONFIG_ENCODERS
	if (avctx->dct_algo == FF_DCT_AUTO ||
	    avctx->dct_algo == FF_DCT_ALTIVEC)
	{
	    c->fdct = fdct_altivec;
	}
#endif //CONFIG_ENCODERS

293 294 295 296 297 298 299 300 301 302 303
        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC))
        {
            c->idct_put = idct_put_altivec;
            c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
            c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
        }
304
        
305
#ifdef POWERPC_PERFORMANCE_REPORT
306
        {
307
          int i, j;
308
          for (i = 0 ; i < powerpc_perf_total ; i++)
309
          {
310 311 312 313 314 315 316 317
	    for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
	      {
		perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF;
		perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000;
		perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000;
		perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000;
	      }
	  }
318
        }
319
#endif /* POWERPC_PERFORMANCE_REPORT */
320
    } else
321
#endif /* HAVE_ALTIVEC */
322
    {
323 324 325
        // Non-AltiVec PPC optimisations

        // ... pending ...
326 327
    }
}