/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

21 22
#include "../dsputil.h"

23 24
#include "dsputil_ppc.h"

25 26 27 28
#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"
#endif

29
/* AltiVec DCT/IDCT routines that dsputil_init_ppc() installs into the
   DSP context; they are only declared here. */
extern void fdct_altivec(int16_t *block);
extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* Capability bitmask; starts empty and gets MM_ALTIVEC or'ed in by
   dsputil_init_ppc() when has_altivec() reports support. */
int mm_flags = 0;

35 36 37
/* Report the SIMD capabilities usable at run time.
   Returns a bitmask: MM_ALTIVEC is set when the build defines HAVE_ALTIVEC
   and has_altivec() returns non-zero; otherwise the result is 0. */
int mm_support(void)
{
    int result = 0;
#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        result |= MM_ALTIVEC;
    }
#endif /* HAVE_ALTIVEC */
    return result;
}

46 47
#ifdef POWERPC_PERFORMANCE_REPORT
#include <stdio.h>

/* Performance counter storage, indexed as
   [PMC index][function id][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

/* list below must match enum in dsputil_ppc.h */
/* Plain (const) char pointers: the entries are string literals and are
   printed with "%s" by powerpc_display_perf_report(). */
static const char *perfname[] = {
  "ff_fft_calc_altivec",
  "gmc1_altivec",
  "dct_unquantize_h263_altivec",
  "fdct_altivec",
  "idct_add_altivec",
  "idct_put_altivec",
  "put_pixels16_altivec",
  "avg_pixels16_altivec",
  "avg_pixels8_altivec",
  "put_pixels8_xy2_altivec",
  "put_no_rnd_pixels8_xy2_altivec",
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "hadamard8_diff8x8_altivec",
  "hadamard8_diff16_altivec",
  "avg_pixels8_xy2_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc",
  "put_h264_chroma_mc8_altivec",
  "avg_h264_chroma_mc8_altivec",
  "put_h264_qpel16_h_lowpass_altivec",
  "avg_h264_qpel16_h_lowpass_altivec",
  "put_h264_qpel16_v_lowpass_altivec",
  "avg_h264_qpel16_v_lowpass_altivec",
  "put_h264_qpel16_hv_lowpass_altivec",
  "avg_h264_qpel16_hv_lowpass_altivec",
  ""
};
#endif /* POWERPC_PERFORMANCE_REPORT */

81
#ifdef POWERPC_PERFORMANCE_REPORT
/* Log, through av_log() at AV_LOG_INFO level, the min/max/average of every
   (function, PMC) pair in perfdata that recorded at least one sample. */
void powerpc_display_perf_report(void)
{
  int i, j;
  av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
  for (i = 0 ; i < powerpc_perf_total ; i++)
  {
    for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
    {
      /* Skip pairs without samples; this also keeps the average below
         from dividing by zero. */
      if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
      {
        av_log(NULL, AV_LOG_INFO,
               " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
               perfname[i],
               j+1,
               perfdata[j][i][powerpc_data_min],
               perfdata[j][i][powerpc_data_max],
               (double)perfdata[j][i][powerpc_data_sum] /
               (double)perfdata[j][i][powerpc_data_num],
               perfdata[j][i][powerpc_data_num]);
      }
    }
  }
}
#endif /* POWERPC_PERFORMANCE_REPORT */
104 105 106 107 108 109 110 111 112 113 114 115 116

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
  clear_blocks_dcbz32_ppc will not work properly
  on PowerPC processors with a cache line size
  not equal to 32 bytes.
  Fortunately all processors used by Apple up to
  at least the 7450 (aka second generation G4)
  use 32-byte cache lines.
  This is due to the use of the 'dcbz' instruction.
  It simply clears a single cache line to zero,
  so you need to know the cache line size to use it!
  It's absurd, but it's fast...

  update 24/06/2003: Apple released the G5 yesterday,
  with a PPC970. Cache line size: 128 bytes. Oops.
  The semantics of dcbz were changed: it always clears
  32 bytes. So the function below will work, but will
  be slow. So I fixed check_dcbzl_effect to use dcbzl,
  which is defined to clear a cache line (as dcbz did before).
  So we can still distinguish, and use dcbz (32 bytes)
  or dcbzl (one cache line) as required.

  see <http://developer.apple.com/technotes/tn/tn2087.html>
  and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/* Zero sizeof(DCTELEM)*6*64 bytes starting at blocks, using 'dcbz' to
   clear 32 bytes at a time.
   Only bit 4 of the address is tested, so blocks is presumably expected to
   be at least 16-byte aligned (TODO confirm against callers). When the
   buffer starts in the middle of a 32-byte line, the first and the last
   16 bytes are cleared with plain stores so dcbz never touches memory
   outside the buffer. */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    /* Size in bytes of the area to clear. */
    const int total = (int)(sizeof(DCTELEM)*6*64);
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if (misal) {
        /* Leading half line. Byte-counted on purpose: indexing through
           'unsigned long *' only covers 16 bytes when long is 4 bytes. */
        memset(blocks, 0, 16);
        i += 16;
    }
    /* Every offset visited here is the start of a full 32-byte line that
       lies entirely inside the buffer. */
    for ( ; i < total - 31 ; i += 32) {
#ifndef __MWERKS__
        asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
#else
        __dcbz( blocks, i );
#endif
    }
    if (misal) {
        /* Trailing half line: the last 16 bytes of the buffer. */
        memset((char *)blocks + total - 16, 0, 16);
    }
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

164 165 166 167 168
/* Variant of clear_blocks_dcbz32_ppc for CPUs where dcbzl clears a whole
   128B cache line, i.e. the PPC970 aka G5.
   Zeroes sizeof(DCTELEM)*6*64 bytes starting at blocks. When NO_DCBZL is
   defined the function is a plain memset. */
#ifndef NO_DCBZL
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    /* Non-zero when blocks does not start on a 128-byte boundary. */
    register int misal = ((unsigned long)blocks & 0x0000007f);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
    if (misal) {
        // we could probably also optimize this case,
        // but there's not much point as the machines
        // aren't available yet (2003-06-26)
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    } else {
        /* Aligned buffer: one dcbzl per 128-byte line. */
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
            asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
        }
    }
POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif

#ifndef NO_DCBZL
197
/* check dcbz report how many bytes are set to 0 by dcbz */
198 199 200 201 202
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
long check_dcbzl_effect(void)
203
{
204
  register char *fakedata = (char*)av_malloc(1024);
205 206 207 208 209
  register char *fakedata_middle;
  register long zero = 0;
  register long i = 0;
  long count = 0;

210
  if (!fakedata)
211 212 213 214 215 216 217 218
  {
    return 0L;
  }

  fakedata_middle = (fakedata + 512);

  memset(fakedata, 0xFF, 1024);

219 220 221
  /* below the constraint "b" seems to mean "Address base register"
     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
222 223 224 225 226 227 228

  for (i = 0; i < 1024 ; i ++)
  {
    if (fakedata[i] == (char)0)
      count++;
  }

229
  av_free(fakedata);
230

231 232
  return count;
}
233 234 235 236 237 238
#else
long check_dcbzl_effect(void)
{
  return 0;
}
#endif
239

240 241 242

void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);

/* Install the PowerPC-specific routines into the DSP context c.
   - clear_blocks is chosen from the byte count reported by
     check_dcbzl_effect(); any count other than 32 or 128 leaves it
     untouched.
   - With HAVE_ALTIVEC, dsputil_h264_init_ppc() is called, and when
     has_altivec() succeeds the AltiVec routines are installed and
     MM_ALTIVEC is added to mm_flags.
   avctx is read for dct_algo, idct_algo and lowres. */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether Altivec is available or not

    switch (check_dcbzl_effect()) {
    case 32:
        c->clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:
        c->clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:
        break;
    }

#ifdef HAVE_ALTIVEC
    dsputil_h264_init_ppc(c, avctx);

    if (has_altivec()) {
        mm_flags |= MM_ALTIVEC;

        // Altivec specific optimisations
        c->pix_abs[0][1] = sad16_x2_altivec;
        c->pix_abs[0][2] = sad16_y2_altivec;
        c->pix_abs[0][3] = sad16_xy2_altivec;
        c->pix_abs[0][0] = sad16_altivec;
        c->pix_abs[1][0] = sad8_altivec;
        c->sad[0]= sad16_altivec;
        c->sad[1]= sad8_altivec;
        c->pix_norm1 = pix_norm1_altivec;
        c->sse[1]= sse8_altivec;
        c->sse[0]= sse16_altivec;
        c->pix_sum = pix_sum_altivec;
        c->diff_pixels = diff_pixels_altivec;
        c->get_pixels = get_pixels_altivec;
// next one disabled as it's untested.
#if 0
        c->add_bytes= add_bytes_altivec;
#endif /* 0 */
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        /* the two functions do the same thing, so use the same code */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

        c->gmc1 = gmc1_altivec;

#ifdef CONFIG_DARWIN // ATM gcc-3.3 and gcc-3.4 fail to compile these in linux...
        c->hadamard8_diff[0] = hadamard8_diff16_altivec;
        c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#endif

#ifdef CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC)
        {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        /* The AltiVec IDCT is only installed when lowres is 0. */
        if (avctx->lowres==0)
        {
            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC))
            {
                c->idct_put = idct_put_altivec;
                c->idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
                c->idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
            }
        }

#ifdef POWERPC_PERFORMANCE_REPORT
        {
            /* Reset every counter: min starts at the largest value so the
               first sample replaces it; max, sum and num start at zero. */
            int i, j;
            for (i = 0 ; i < powerpc_perf_total ; i++)
            {
                for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
                {
                    perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                    perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
                }
            }
        }
#endif /* POWERPC_PERFORMANCE_REPORT */
    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations

        // ... pending ...
    }
}