Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)

Originally committed as revision 1147 to svn://svn.ffmpeg.org/ffmpeg/trunk

Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
Originally committed as revision 1147 to svn://svn.ffmpeg.org/ffmpeg/trunk
05c4072b · Michael Niedermayer · 26b35efb · 05c4072b · 05c4072b · 05c4072b
10 changed files
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -75,7 +75,7 @@ CFLAGS += -fforce-addr -freduce-all-givs
 endif

 ifeq ($(TARGET_ARCH_POWERPC),yes)
-OBJS += ppc/dsputil_ppc.o
+OBJS += ppc/dsputil_ppc.o ppc/mpegvideo_ppc.o
 endif

 ifeq ($(TARGET_MMI),yes)
@@ -84,7 +84,7 @@ endif

 ifeq ($(TARGET_ALTIVEC),yes)
 CFLAGS += -faltivec
-OBJS += ppc/dsputil_altivec.o
+OBJS += ppc/dsputil_altivec.o ppc/mpegvideo_altivec.o ppc/idct_altivec.o
 endif

 SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.S)

--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -650,6 +650,7 @@ typedef struct AVCodecContext {
 #define FF_DCT_INT     2
 #define FF_DCT_MMX     3
 #define FF_DCT_MLIB    4
+#define FF_DCT_ALTIVEC 5

    /**
     * presentation timestamp in micro seconds (time when frame should be shown to user)
@@ -716,6 +717,7 @@ typedef struct AVCodecContext {
 #define FF_IDCT_PS2          5
 #define FF_IDCT_MLIB         6
 #define FF_IDCT_ARM          7
+#define FF_IDCT_ALTIVEC      8

    /**
     * slice count

--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -157,6 +157,9 @@ void ff_init_scantable(MpegEncContext *s, ScanTable *st, const UINT8 *src_scanta
        int j;
        j = src_scantable[i];
        st->permutated[i] = s->idct_permutation[j];
+#ifdef ARCH_POWERPC
+        st->inverse[j] = i;
+#endif
    }
    
    end=-1;
@@ -221,6 +224,9 @@ int DCT_common_init(MpegEncContext *s)
 #ifdef ARCH_ARMV4L
    MPV_common_init_armv4l();
 #endif
+#ifdef ARCH_POWERPC
+    MPV_common_init_ppc(s);
+#endif

    switch(s->idct_permutation_type){
    case FF_NO_IDCT_PERM:
@@ -3011,7 +3017,7 @@ static int dct_quantize_c(MpegEncContext *s,
    int bias;
    int max=0;
    unsigned int threshold1, threshold2;
-    
+
    s->fdct (block);

    if (s->mb_intra) {

--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -103,6 +103,10 @@ typedef struct ScanTable{
    const UINT8 *scantable;
    UINT8 permutated[64];
    UINT8 raster_end[64];
+#ifdef ARCH_POWERPC
+    /* Used by dct_quantize_altivec to find the last non-zero coefficient */
+    UINT8 __align8 inverse[64];
+#endif
 } ScanTable;

 typedef struct MpegEncContext {
@@ -287,8 +291,8 @@ typedef struct MpegEncContext {
    int min_qcoeff;          /* minimum encodable coefficient */
    int max_qcoeff;          /* maximum encodable coefficient */
    /* precomputed matrix (combine qscale and DCT renorm) */
-    int q_intra_matrix[32][64];
-    int q_inter_matrix[32][64];
+    int __align8 q_intra_matrix[32][64];
+    int __align8 q_inter_matrix[32][64];
    /* identical to the above but for MMX & these are not permutated */
    UINT16 __align8 q_intra_matrix16[32][64];
    UINT16 __align8 q_inter_matrix16[32][64];
@@ -296,7 +300,7 @@ typedef struct MpegEncContext {
    UINT16 __align8 q_inter_matrix16_bias[32][64];
    int block_last_index[6];  /* last non zero coefficient in block */
    /* scantables */
-    ScanTable intra_scantable;
+    ScanTable __align8 intra_scantable;
    ScanTable intra_h_scantable;
    ScanTable intra_v_scantable;
    ScanTable inter_scantable; // if inter == intra then intra should be used to reduce tha cache usage
@@ -535,6 +539,9 @@ void MPV_common_init_mlib(MpegEncContext *s);
 #ifdef HAVE_MMI
 void MPV_common_init_mmi(MpegEncContext *s);
 #endif
+#ifdef ARCH_POWERPC
+void MPV_common_init_ppc(MpegEncContext *s);
+#endif
 extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
 void ff_conceal_past_errors(MpegEncContext *s, int conceal_all);
 void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length);

--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+ 
 #include "../dsputil.h"
+#include "dsputil_altivec.h"

 #if CONFIG_DARWIN
 #include <sys/sysctl.h>
 #endif

-int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
-int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
-int pix_sum_altivec(UINT8 * pix, int line_size);
-
-int has_altivec(void);
-
 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 {
    int i, s;
@@ -127,6 +141,105 @@ int pix_sum_altivec(UINT8 * pix, int line_size)
    return s;
 }

+void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
+{   // Widen 8 rows of 8 unsigned bytes from `pixels` (rows line_size bytes apart) into 64 shorts at `block`.
+    int i;                                   // row counter
+    vector unsigned char perm, bytes, *pixv; // realignment mask, realigned source bytes, vector view of the source row
+    vector unsigned char zero = (vector unsigned char) (0);
+    vector signed short shorts;              // the 8 widened pixels of the current row
+
+    for(i=0;i<8;i++)
+    {
+        // Read potentially unaligned pixels: load the two aligned vectors
+        // that straddle `pixels` and realign them with the vec_lvsl mask.
+        // 16 pixels are fetched but only the first 8 are used below.
+        perm = vec_lvsl(0, pixels);
+        pixv = (vector unsigned char *) pixels;
+        bytes = vec_perm(pixv[0], pixv[1], perm); // NOTE(review): pixv[1] is read even when pixels is already aligned - confirm the over-read is safe
+
+        // convert the first 8 bytes into shorts by interleaving a zero byte ahead of each one
+        shorts = (vector signed short)vec_mergeh(zero, bytes);
+
+        // save row i at byte offset i*16 (8 shorts per row); we assume the block is 16-byte aligned
+        vec_st(shorts, i*16, (vector signed short*)block);
+
+        pixels += line_size; // advance to the next source row
+    }
+}
+
+void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
+        const UINT8 *s2, int stride)
+{   // Store the per-pixel difference s1 - s2 of two 8x8 byte blocks (rows `stride` bytes apart) as 64 shorts at `block`.
+    int i;                                   // counts pairs of rows: 4 iterations x 2 rows = 8 rows
+    vector unsigned char perm, bytes, *pixv; // realignment mask, realigned source bytes, vector view of a source row
+    vector unsigned char zero = (vector unsigned char) (0);
+    vector signed short shorts1, shorts2;    // widened row of s1 (later the difference) and widened row of s2
+
+    for(i=0;i<4;i++)
+    {
+        // Read potentially unaligned pixels: load the two aligned vectors
+        // that straddle `s1` and realign them with the vec_lvsl mask.
+        // 16 pixels are fetched but only the first 8 are used below.
+        perm = vec_lvsl(0, s1);
+        pixv = (vector unsigned char *) s1;
+        bytes = vec_perm(pixv[0], pixv[1], perm); // NOTE(review): pixv[1] is read even when s1 is already aligned - confirm the over-read is safe
+
+        // convert the first 8 bytes into shorts by interleaving a zero byte ahead of each one
+        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the same for the second block of pixels
+        perm = vec_lvsl(0, s2);
+        pixv = (vector unsigned char *) s2;
+        bytes = vec_perm(pixv[0], pixv[1], perm);
+
+        // convert the bytes into shorts
+        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the subtraction (s1 row minus s2 row, per 16-bit lane)
+        shorts1 = vec_sub(shorts1, shorts2);
+
+        // save the data to the block, we assume the block is 16-byte aligned
+        vec_st(shorts1, 0, (vector signed short*)block);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8; // 8 shorts were written; move to the next output row
+
+
+        // The code below is a copy of the code above... This is a manual
+        // unroll, so each loop iteration handles two rows.
+
+        // Read potentially unaligned pixels
+        // We're reading 16 pixels, and actually only want 8,
+        // but we simply ignore the extras.
+        perm = vec_lvsl(0, s1);
+        pixv = (vector unsigned char *) s1;
+        bytes = vec_perm(pixv[0], pixv[1], perm);
+
+        // convert the bytes into shorts
+        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the same for the second block of pixels
+        perm = vec_lvsl(0, s2);
+        pixv = (vector unsigned char *) s2;
+        bytes = vec_perm(pixv[0], pixv[1], perm);
+
+        // convert the bytes into shorts
+        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
+
+        // Do the subtraction
+        shorts1 = vec_sub(shorts1, shorts2);
+
+        // save the data to the block, we assume the block is 16-byte aligned
+        vec_st(shorts1, 0, (vector signed short*)block);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+    }
+}
+
+
 int has_altivec(void)
 {
 #if CONFIG_DARWIN
@@ -141,3 +254,4 @@ int has_altivec(void)
 #endif
    return 0;
 }
+
--- a/libavcodec/ppc/dsputil_altivec.h
+++ b/libavcodec/ppc/dsputil_altivec.h
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+ 
 extern int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
 extern int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
 extern int pix_sum_altivec(UINT8 * pix, int line_size);
+extern void diff_pixels_altivec(DCTELEM* block, const UINT8* s1, const UINT8* s2, int stride);
+extern void get_pixels_altivec(DCTELEM* block, const UINT8 * pixels, int line_size);

 extern int has_altivec(void);
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
 #include "../dsputil.h"

 #ifdef HAVE_ALTIVEC
@@ -6,14 +25,23 @@

 void dsputil_init_ppc(void)
 {
+    // Optimisations common to every PowerPC CPU, with or without AltiVec
+
+    // ... pending: none implemented yet ...
+
 #if HAVE_ALTIVEC
    if (has_altivec()) {
+        // has_altivec() returned non-zero: point the dsputil hooks below at the AltiVec versions
         pix_abs16x16 = pix_abs16x16_altivec;
         pix_abs8x8 = pix_abs8x8_altivec;
         pix_sum = pix_sum_altivec;
+        diff_pixels = diff_pixels_altivec;
+        get_pixels = get_pixels_altivec;
    } else
 #endif
    {
-        /* Non-AltiVec PPC optimisations here */
+        // Non-AltiVec PPC optimisations
+
+        // ... pending: none implemented yet ...
    }
 }
--- a/libavcodec/ppc/idct_altivec.c
+++ b/libavcodec/ppc/idct_altivec.c
+/*
+ * Copyright (c) 2001 Michel Lespinasse
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/*
+ * NOTE: This code is based on GPL code from the libmpeg2 project.  The
+ * author, Michel Lespinasse, has given explicit permission to release
+ * under LGPL as part of ffmpeg.
+ *
+ */
+
+/*
+ * FFMpeg integration by Dieter Shirley
+ *
+ * This file is a direct copy of the altivec idct module from the libmpeg2
+ * project.  I've deleted all of the libmpeg2 specific code, renamed the functions and
+ * re-ordered the function parameters.  The only change to the IDCT function
+ * itself was to factor out the partial transposition, and to perform a full
+ * transpose at the end of the function.
+ */
+
+
+#include <stdlib.h>                                      /* malloc(), free() */
+#include <string.h>
+#include "../dsputil.h"
+
+#define vector_s16_t vector signed short
+#define vector_u16_t vector unsigned short
+#define vector_s8_t vector signed char
+#define vector_u8_t vector unsigned char
+#define vector_s32_t vector signed int
+#define vector_u32_t vector unsigned int
+
+#define IDCT_HALF /* reads vx0..vx7, writes vy0..vy7, clobbers t0..t8; needs a0,a1,a2,ma2,c4,mc4,zero in scope */	\
+    /* 1st stage: odd inputs vx1/vx7 (scaled by a1) and vx3/vx5 (scaled by a2/ma2) */	\
+    t1 = vec_mradds (a1, vx7, vx1 );			\
+    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));	\
+    t7 = vec_mradds (a2, vx5, vx3);			\
+    t3 = vec_mradds (ma2, vx3, vx5);			\
+							\
+    /* 2nd stage: even inputs vx0/vx4 and vx2/vx6 (scaled by a0), plus sums/differences of the 1st-stage terms */	\
+    t5 = vec_adds (vx0, vx4);				\
+    t0 = vec_subs (vx0, vx4);				\
+    t2 = vec_mradds (a0, vx6, vx2);			\
+    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));	\
+    t6 = vec_adds (t8, t3);				\
+    t3 = vec_subs (t8, t3);				\
+    t8 = vec_subs (t1, t7);				\
+    t1 = vec_adds (t1, t7);				\
+							\
+    /* 3rd stage: saturating sum/difference pairs of the 2nd-stage results */	\
+    t7 = vec_adds (t5, t2);				\
+    t2 = vec_subs (t5, t2);				\
+    t5 = vec_adds (t0, t4);				\
+    t0 = vec_subs (t0, t4);				\
+    t4 = vec_subs (t8, t3);				\
+    t3 = vec_adds (t8, t3);				\
+							\
+    /* 4th stage: form the outputs; vy1/vy6 and vy2/vy5 go through the c4/mc4 multiply-add */	\
+    vy0 = vec_adds (t7, t1);				\
+    vy7 = vec_subs (t7, t1);				\
+    vy1 = vec_mradds (c4, t3, t5);			\
+    vy6 = vec_mradds (mc4, t3, t5);			\
+    vy2 = vec_mradds (c4, t4, t0);			\
+    vy5 = vec_mradds (mc4, t4, t0);			\
+    vy3 = vec_adds (t2, t6);				\
+    vy4 = vec_subs (t2, t6);
+
+	
+#define IDCT /* declares its own locals; needs `block` and `constants` in scope; results end up in vx0..vx7 */	\
+    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;		\
+    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;		\
+    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;			\
+    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;			\
+    vector_u16_t shift;							\
+    /* broadcast the scalar factors packed in constants[0] (16-bit elements 0..5) */	\
+    c4 = vec_splat (constants[0], 0);					\
+    a0 = vec_splat (constants[0], 1);					\
+    a1 = vec_splat (constants[0], 2);					\
+    a2 = vec_splat (constants[0], 3);					\
+    mc4 = vec_splat (constants[0], 4);					\
+    ma2 = vec_splat (constants[0], 5);					\
+    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3);	\
+    /* bias: 32-bit element 3 of constants[0], i.e. its last two 16-bit entries repeated */	\
+    zero = vec_splat_s16 (0);						\
+    shift = vec_splat_u16 (4);						\
+    /* pre-scale: each input row shifted left by 4, then multiplied by its row of factors via vec_mradds */	\
+    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);	\
+    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);	\
+    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);	\
+    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);	\
+    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);	\
+    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);	\
+    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);	\
+    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);	\
+    /* first pass: vx0..vx7 -> vy0..vy7 */				\
+    IDCT_HALF								\
+    /* three rounds of mergeh/mergel: transpose the 8x8 matrix held in vy0..vy7 */	\
+    vx0 = vec_mergeh (vy0, vy4);					\
+    vx1 = vec_mergel (vy0, vy4);					\
+    vx2 = vec_mergeh (vy1, vy5);					\
+    vx3 = vec_mergel (vy1, vy5);					\
+    vx4 = vec_mergeh (vy2, vy6);					\
+    vx5 = vec_mergel (vy2, vy6);					\
+    vx6 = vec_mergeh (vy3, vy7);					\
+    vx7 = vec_mergel (vy3, vy7);					\
+									\
+    vy0 = vec_mergeh (vx0, vx4);					\
+    vy1 = vec_mergel (vx0, vx4);					\
+    vy2 = vec_mergeh (vx1, vx5);					\
+    vy3 = vec_mergel (vx1, vx5);					\
+    vy4 = vec_mergeh (vx2, vx6);					\
+    vy5 = vec_mergel (vx2, vx6);					\
+    vy6 = vec_mergeh (vx3, vx7);					\
+    vy7 = vec_mergel (vx3, vx7);					\
+    /* last merge round; bias is added to vx0 only (presumably rounding for the final >>6 - confirm) */	\
+    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);			\
+    vx1 = vec_mergel (vy0, vy4);					\
+    vx2 = vec_mergeh (vy1, vy5);					\
+    vx3 = vec_mergel (vy1, vy5);					\
+    vx4 = vec_mergeh (vy2, vy6);					\
+    vx5 = vec_mergel (vy2, vy6);					\
+    vx6 = vec_mergeh (vy3, vy7);					\
+    vx7 = vec_mergel (vy3, vy7);					\
+    /* second pass: vx0..vx7 -> vy0..vy7 */				\
+    IDCT_HALF								\
+    /* final descale: arithmetic shift right by 6, results left in vx0..vx7 */	\
+    shift = vec_splat_u16 (6);						\
+    vx0 = vec_sra (vy0, shift);						\
+    vx1 = vec_sra (vy1, shift);						\
+    vx2 = vec_sra (vy2, shift);						\
+    vx3 = vec_sra (vy3, shift);						\
+    vx4 = vec_sra (vy4, shift);						\
+    vx5 = vec_sra (vy5, shift);						\
+    vx6 = vec_sra (vy6, shift);						\
+    vx7 = vec_sra (vy7, shift);
+
+static const vector_s16_t constants[5] = { /* fixed-point tables consumed by the IDCT macro */
+    (vector_s16_t)(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), /* c4, a0, a1, a2, mc4, ma2, then the pair splatted into bias */
+    (vector_s16_t)(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), /* pre-scale factors for block[0] and block[4] */
+    (vector_s16_t)(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), /* pre-scale factors for block[1] and block[7] */
+    (vector_s16_t)(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), /* pre-scale factors for block[2] and block[6] */
+    (vector_s16_t)(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)  /* pre-scale factors for block[3] and block[5] */
+};
+
+void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
+{   /* Run the IDCT on the 8 vectors at `block` and store the result as 8 rows of 8 bytes at `dest`, rows `stride` bytes apart. */
+    vector_u8_t tmp;    /* scratch: one output row packed to bytes */
+    /* IDCT declares its own locals and leaves the eight result rows in vx0..vx7. */
+    IDCT
+    /* COPY: saturate one row to unsigned bytes (the same 8 bytes in both halves of tmp), then store two 32-bit elements at dest and dest+4. NOTE(review): presumably assumes dest is 8-byte aligned - confirm. */
+#define COPY(dest,src)						\
+    tmp = vec_packsu (src, src);				\
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	\
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    /* One COPY per output row; dest advances by stride between rows. */
+    COPY (dest, vx0)	dest += stride;
+    COPY (dest, vx1)	dest += stride;
+    COPY (dest, vx2)	dest += stride;
+    COPY (dest, vx3)	dest += stride;
+    COPY (dest, vx4)	dest += stride;
+    COPY (dest, vx5)	dest += stride;
+    COPY (dest, vx6)	dest += stride;
+    COPY (dest, vx7)
+}
+
+void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
+{   /* Run the IDCT on the 8 vectors at `block` and add the result to the 8 rows of 8 bytes at `dest` (rows `stride` bytes apart), saturating to unsigned bytes. */
+    vector_u8_t tmp;            /* loaded destination bytes, later the packed result row */
+    vector_s16_t tmp2, tmp3;    /* destination row widened to shorts; widened row plus IDCT row */
+    vector_u8_t perm0;          /* widening mask built from dest's alignment */
+    vector_u8_t perm1;          /* widening mask built from (dest + stride)'s alignment */
+    vector_u8_t p0, p1, p;
+    /* IDCT declares its own locals and leaves the eight result rows in vx0..vx7. */
+    IDCT
+    /* Interleave an all-ones index byte with each vec_lvsl index: the masks pair every picked dest byte with a byte taken from the zero vector in ADD. */
+    p0 = vec_lvsl (0, dest);
+    p1 = vec_lvsl (stride, dest);
+    p = vec_splat_u8 (-1);
+    perm0 = vec_mergeh (p, p0);
+    perm1 = vec_mergeh (p, p1);
+    /* ADD: load the aligned vector containing dest, widen 8 of its bytes to shorts via perm, add one IDCT row (saturating), pack to bytes, store two 32-bit elements. */
+#define ADD(dest,src,perm)						\
+    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */			\
+    tmp = vec_ld (0, dest);						\
+    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);	\
+    tmp3 = vec_adds (tmp2, src);					\
+    tmp = vec_packsu (tmp3, tmp3);					\
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);		\
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    /* Even rows use perm0, odd rows perm1. NOTE(review): presumably assumes dest is 8-byte aligned and that rows two apart share the same 16-byte offset - confirm. */
+    ADD (dest, vx0, perm0)	dest += stride;
+    ADD (dest, vx1, perm1)	dest += stride;
+    ADD (dest, vx2, perm0)	dest += stride;
+    ADD (dest, vx3, perm1)	dest += stride;
+    ADD (dest, vx4, perm0)	dest += stride;
+    ADD (dest, vx5, perm1)	dest += stride;
+    ADD (dest, vx6, perm0)	dest += stride;
+    ADD (dest, vx7, perm1)
+}
+
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
--- a/libavcodec/ppc/mpegvideo_ppc.c
+++ b/libavcodec/ppc/mpegvideo_ppc.c
+/*
+ * Copyright (c) 2002 Dieter Shirley
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+ 
+#include <time.h>
+#include "../../config.h"
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+
+#ifdef HAVE_ALTIVEC
+#include "dsputil_altivec.h"
+#endif
+
+extern int dct_quantize_altivec(MpegEncContext *s,  
+        DCTELEM *block, int n,
+        int qscale, int *overflow);
+
+extern void idct_put_altivec(UINT8 *dest, int line_size, INT16 *block);
+extern void idct_add_altivec(UINT8 *dest, int line_size, INT16 *block);
+
+
+void MPV_common_init_ppc(MpegEncContext *s)
+{   // Install the PowerPC/AltiVec IDCT and DCT-quantize hooks on `s` when the CPU and the codec settings allow it.
+#if HAVE_ALTIVEC // NOTE(review): the include above is guarded with #ifdef HAVE_ALTIVEC, this test uses #if - confirm both agree
+    if (has_altivec())
+    {
+        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
+                (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
+        {
+            s->idct_put = idct_put_altivec;
+            s->idct_add = idct_add_altivec;
+            s->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; // permutation type paired with the AltiVec IDCT
+        }
+        // NOTE(review): the checks below run even when dct_algo does not ask for the AltiVec DCT - confirm that is intended.
+        // Test to make sure that the dct required alignments are met (low four address bits clear = 16-byte aligned).
+        if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
+                (((long)(s->q_inter_matrix) & 0x0f) != 0))
+        {
+            fprintf(stderr, "Internal Error: q-matrix blocks must be 16-byte aligned " // NOTE(review): no <stdio.h> include is visible in this file - presumably pulled in by a project header
+                    "to use Altivec DCT. Reverting to non-altivec version.\n");
+            return; // dct_quantize is left untouched; any IDCT hooks installed above stay in place
+        }
+        // Same 16-byte test for the scan-table lookup that the mpegvideo.h comment says dct_quantize_altivec uses.
+        if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
+        {
+            fprintf(stderr, "Internal Error: scan table blocks must be 16-byte aligned "
+                    "to use Altivec DCT. Reverting to non-altivec version.\n");
+            return; // dct_quantize is left untouched; any IDCT hooks installed above stay in place
+        }
+
+        // Alignment is fine: switch the quantizer if the caller asked for auto or AltiVec.
+        if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
+                (s->avctx->dct_algo == FF_DCT_ALTIVEC))
+        {
+            s->dct_quantize = dct_quantize_altivec;
+        }
+    } else
+#endif
+    {
+        /* Non-AltiVec PPC optimisations would go here; none yet */
+    }
+}
+