powerpc tuning - mad-dev

31 Jan 2001

We did some tuning for an embedded PowerPC platform.  FPM_64BIT works
fairly well; the assembly tuning and the changes to imdct36() improve
things by another 60% or so over FPM_64BIT using gcc -O3.  Performance
with and without OPT_ACCURACY is pretty close, but OPT_SSO is still
somewhat faster (~5%).
There are three sets of changes:
1) changes to mad/configure and mad/libmad/configure to recognize
--host=powerpc (which sets FPM_PPC); to allow -O3 to be set as an
optimization level; and to avoid wiping out the CFLAGS setting before
running mad/libmad/configure (otherwise OPTIMIZER doesn't get set in
libmad/Makefile).
2) additions to mad/libmad/fixed.h for asm-coded PPC routines
3) modifications to layer3.c:imdct36() to use mad_f_mla for the longer
multiply-accumulate sequences if mla is available.  The mla code is
written such that the MLA macros could simply be redefined to use the
regular mad_f_mul() code when the mla routine isn't available, but I
chickened out and kept the original code in the #else branch of an #ifdef.
Diffs attached.  Running "patch -p0 < diffs" from inside the mad-0.11.4b
directory should apply them.
david
Index: configure
===================================================================
diff -u -r1.1.1.1 -r1.3

--- configure	2001/01/31 03:19:46	1.1.1.1
+++ configure	2001/01/31 04:24:18	1.3
@@ -1061,6 +1061,7 @@
    arm-*)       ARCH="-march=armv4 -mtune=strongarm" ;;
    mips-*)      ARCH="-mips3" ;;
    	     #-mcpu=vr4100
+	powerpc-*)   ARCH="" ;;
     esac
 fi
@@ -1069,10 +1070,11 @@
 esac
case "$CFLAGS" in
+    *-O3) OPTIMIZER="-O3" ;;
     *-O*) OPTIMIZER="-O2" ;;
 esac
-CFLAGS=""
+CFLAGS="$OPTIMIZER $DEBUGGER $PROFILER"
if test -n "$OPTIMIZER" && test "$GCC" = yes
 then
Index: libmad/configure
===================================================================
diff -u -r1.1.1.1 -r1.3
--- libmad/configure	2001/01/31 03:19:46	1.1.1.1
+++ libmad/configure	2001/01/31 04:24:18	1.3
@@ -1086,6 +1086,7 @@
    arm-*)       ARCH="-march=armv4" ;;
    mips-*)      ARCH="-mips3" ;;
    	     #-mcpu=vr4100
+	powerpc*-*)  ARCH="" ;;
     esac
 fi
@@ -1094,6 +1095,7 @@
 esac
case "$CFLAGS" in
+    *-O3*) OPTIMIZER="-O3" ;;
     *-O*) OPTIMIZER="-O" ;;
 esac
@@ -1630,6 +1632,7 @@
    	arm-*)    FPM="ARM"   ;;
    	mips-*)   FPM="MIPS"  ;;
    	sparc*-*) FPM="SPARC" ;;
+		powerpc*-*) FPM="PPC" ;;
        esac
        ;;
@@ -1639,6 +1642,7 @@
    sparc)      FPM="SPARC"  ;;
    64bit)      FPM="64BIT"  ;;
    approx)     FPM="APPROX" ;;
+	powerpc)    FPM="PPC"	 ;;
*) { echo "configure: error: bad --enable-fpm option" 1>&2; exit 1; }
       ;;
@@ -1653,6 +1657,7 @@
        arm-*)    FPM="ARM"   ;;
        mips-*)   FPM="MIPS"  ;;
        sparc*-*) FPM="SPARC" ;;
+	    powerpc*-*) FPM="PPC" ;;
        # FIXME: need to test 64-bit long long...
        *)
        echo "configure: warning: using fixed-point math approximations (see README)" 1>&2
Index: libmad/fixed.h
===================================================================
diff -u -r1.1.1.1 -r1.2
--- libmad/fixed.h	2001/01/31 03:19:46	1.1.1.1
+++ libmad/fixed.h	2001/01/31 04:02:51	1.2
@@ -268,6 +268,79 @@
        : "%r" (x), "rI" (y));  \
        mad_f_scale64(__hi, __lo);  \
     })
+
+#  define MAD_F_SCALEBITS  MAD_F_FRACBITS
+
+# elif defined(FPM_PPC)
+
+/*
+ * This PowerPC version is tuned for the 4xx embedded processors.  It is
+ * effectively a tuned version of FPM_64BIT.  It is a little faster and
+ * just as accurate.  The disposition of the least significant bit depends
+ * on OPT_ACCURACY via mad_f_scale64().
+ */
+#  define mad_f_mul(x, y)  /* multiply x by y and reduce the hi:lo product with mad_f_scale64() */  \
+    ({ mad_fixed64hi_t __hi;  /* operand %1, written by mulhw */  \
+       mad_fixed64lo_t __lo;  /* operand %0, written by mullw */  \
+       asm ("mulhw %1, %2, %3\n\t"  \
+            "mullw %0, %2, %3"  \
+	    : "=&r" (__lo), "=&r" (__hi)  /* '&': mulhw writes %1 before mullw has read %2 and %3 */ \
+	    : "%r" (x), "r" (y)); \
+       mad_f_scale64(__hi, __lo); \
+    })
+
+#  define MAD_F_HAVEMLA
+/*
+ * mad_f_mla(hi, lo, x, y): add the 64-bit product x * y to the
+ * accumulator whose high and low words are pointed to by hi and lo.
+ *
+ * The addc/adde pair is kept in a single asm statement and declares the
+ * carry ("xer") as clobbered: split across two asm statements the
+ * compiler is free to schedule carry-changing code between them.
+ * Outputs that are written before the last input has been read are
+ * marked early-clobber ('&'), matching mad_f_mul().
+ */
+#  define mad_f_mla(hi, lo, x, y)  \
+    ({ mad_fixed64hi_t __hi;  \
+       mad_fixed64lo_t __lo;  \
+       asm ("mulhw %1, %2, %3\n\t"  \
+            "mullw %0, %2, %3"  \
+	    : "=&r" (__lo), "=&r" (__hi) \
+	    : "%r" (x), "r" (y)); \
+       asm ("addc %0, %0, %2\n\t"  \
+            "adde %1, %1, %3"  \
+	    : "+&r" (*(lo)), "+r" (*(hi))  \
+	    : "r" (__lo), "r" (__hi)  \
+	    : "xer");  \
+    })
+
+#  if defined(OPT_ACCURACY)
+/*
+ * This is accurate and ~2 - 2.5 times slower than the unrounded version.
+ * We let the compiler deal with putting the constant in a register
+ * since the value of MAD_F_SCALEBITS is redefined in some parts of the
+ * code and tracking the magnitude of (1<<(MAD_F_SCALEBITS-1)) is too
+ * complicated.
+ * The __volatile__ qualifiers improve the generated code by another 5%
+ * (fewer spills to memory); eventually they should be removed.
+ *
+ * _lo and __result are early-clobber ("=&r") because each is written by
+ * the first instruction of its asm while the second instruction still
+ * has an input operand left to read.  addc/addze change the carry, so
+ * "xer" is listed as clobbered.
+ */
+#   undef mad_f_scale64
+#   define mad_f_scale64(hi, lo)  \
+    ({ mad_fixed_t __result; \
+       mad_fixed64hi_t _hi;  \
+       mad_fixed64lo_t _lo;  \
+       asm __volatile__ ("addc %0, %2, %4\n\t" \
+	    "addze %1, %3" \
+	    : "=&r" (_lo), "=r" (_hi)  \
+	    : "r" (lo), "r" (hi), "r" (1<<(MAD_F_SCALEBITS-1))  \
+	    : "xer"); \
+       asm __volatile__ ("rlwinm %0, %2,32-%3,0,%3-1\n\t" \
+	    "rlwimi %0, %1,32-%3,%3,31" \
+	    : "=&r" (__result)  \
+	    : "r" (_lo), "r" (_hi), "I" (MAD_F_SCALEBITS));  \
+       __result; \
+    })
+#  else
+/*
+ * Unrounded variant: the same rlwinm/rlwimi pair applied directly to
+ * hi and lo, without the rounding add.  __result is early-clobber
+ * because rlwinm writes it before rlwimi reads lo.
+ */
+#   undef mad_f_scale64
+#   define mad_f_scale64(hi, lo)  \
+    ({ mad_fixed_t __result;  \
+       asm ("rlwinm %0, %2,32-%3,0,%3-1\n\t" \
+	    "rlwimi %0, %1,32-%3,%3,31" \
+	    : "=&r" (__result)  \
+	    : "r" (lo), "r" (hi), "I" (MAD_F_SCALEBITS));  \
+       __result; \
+    })
+#  endif  /* OPT_ACCURACY */
#  define MAD_F_SCALEBITS  MAD_F_FRACBITS
Index: libmad/layer3.c
===================================================================
diff -u -r1.1.1.1 -r1.2
--- libmad/layer3.c	2001/01/31 03:19:46	1.1.1.1
+++ libmad/layer3.c	2001/01/31 04:02:51	1.2
@@ -1253,6 +1253,294 @@
   mad_fixed_t t6,  t7,  t8,  t9,  t10, t11;
   mad_fixed_t t12, t13, t14, t15, t16, t17;
+#ifdef MAD_F_HAVEMLA
+  mad_fixed64hi_t hi;
+  mad_fixed64lo_t lo;
+
+#define MLA(res, first, acc)	\
+  hi = lo = 0;	\
+  acc;	\
+  res = first + mad_f_scale64(hi, lo)
+
+#define MLA_STEP(x, y)	\
+  mad_f_mla(&hi, &lo, x, y);
+
+  t6 =
+    mad_f_mul(X[4],  0x0ec835e8L) +
+    mad_f_mul(X[13], 0x061f78aaL);
+
+  t0 = t6 +
+    (t7 = mad_f_mul((t16 = X[1] - X[10]), -0x061f78aaL)) +
+    (t8 = mad_f_mul((t17 = X[7] + X[16]), -0x0ec835e8L));
+
+  x[7] = t0 +
+    mad_f_mul((t10 = X[0] - X[11] - X[12]),  0x0216a2a2L) +
+    mad_f_mul((t11 = X[2] - X[9]  - X[14]),  0x09bd7ca0L) +
+    mad_f_mul((t12 = X[3] - X[8]  - X[15]), -0x0cb19346L) +
+    mad_f_mul((t13 = X[5] - X[6]  - X[17]), -0x0fdcf549L);
+  x[10] = -x[7];
+
+  x[19] = x[34] = -t6 + -t7 + -t8 +
+    mad_f_mul(t10, -0x0cb19346L) +
+    mad_f_mul(t11,  0x0fdcf549L) +
+    mad_f_mul(t12,  0x0216a2a2L) +
+    mad_f_mul(t13, -0x09bd7ca0L);
+
+  t14 = X[0] - X[3] + X[8] - X[11] - X[12] + X[15];
+  t15 = X[2] + X[5] - X[6] - X[9]  - X[14] - X[17];
+
+  x[22] = x[31] = t0 +
+    mad_f_mul(t14, -0x0ec835e8L) +
+    mad_f_mul(t15,  0x061f78aaL);
+
+  t1 = t6 +
+    mad_f_mul(X[1],  -0x09bd7ca0L) +
+    mad_f_mul(X[7],   0x0216a2a2L) +
+    mad_f_mul(X[10], -0x0fdcf549L) +
+    mad_f_mul(X[16],  0x0cb19346L);
+
+  MLA(
+  x[6] , t1, 
+    MLA_STEP(X[0],   0x03768962L)
+    MLA_STEP(X[2],   0x0e313245L)
+    MLA_STEP(X[3],  -0x0ffc19fdL)
+    MLA_STEP(X[5],  -0x0acf37adL)
+    MLA_STEP(X[6],   0x04cfb0e2L)
+    MLA_STEP(X[8],  -0x0898c779L)
+    MLA_STEP(X[9],   0x0d7e8807L)
+    MLA_STEP(X[11],  0x0f426cb5L)
+    MLA_STEP(X[12], -0x0bcbe352L)
+    MLA_STEP(X[14],  0x00b2aa3eL)
+    MLA_STEP(X[15], -0x07635284L)
+    MLA_STEP(X[17], -0x0f9ee890L)
+    );
+  x[11] = -x[6];
+
+  MLA(
+  x[23] = x[30] , t1,
+    MLA_STEP(X[0],  -0x0f426cb5L)
+    MLA_STEP(X[2],  -0x00b2aa3eL)
+    MLA_STEP(X[3],   0x0898c779L)
+    MLA_STEP(X[5],   0x0f9ee890L)
+    MLA_STEP(X[6],   0x0acf37adL)
+    MLA_STEP(X[8],  -0x07635284L)
+    MLA_STEP(X[9],  -0x0e313245L)
+    MLA_STEP(X[11], -0x0bcbe352L)
+    MLA_STEP(X[12], -0x03768962L)
+    MLA_STEP(X[14],  0x0d7e8807L)
+    MLA_STEP(X[15],  0x0ffc19fdL)
+    MLA_STEP(X[17],  0x04cfb0e2L)
+    );
+
+  MLA(
+  x[18] = x[35] , -t1,
+    MLA_STEP(X[0],  -0x0bcbe352L)
+    MLA_STEP(X[2],   0x0d7e8807L)
+    MLA_STEP(X[3],  -0x07635284L)
+    MLA_STEP(X[5],   0x04cfb0e2L)
+    MLA_STEP(X[6],   0x0f9ee890L)
+    MLA_STEP(X[8],  -0x0ffc19fdL)
+    MLA_STEP(X[9],  -0x00b2aa3eL)
+    MLA_STEP(X[11],  0x03768962L)
+    MLA_STEP(X[12], -0x0f426cb5L)
+    MLA_STEP(X[14],  0x0e313245L)
+    MLA_STEP(X[15],  0x0898c779L)
+    MLA_STEP(X[17], -0x0acf37adL)
+    );
+
+  t9 =
+    mad_f_mul(X[4],   0x061f78aaL) +
+    mad_f_mul(X[13], -0x0ec835e8L);
+
+  t2 = t9 +
+    mad_f_mul(X[1],  -0x0cb19346L)  +
+    mad_f_mul(X[7],   0x0fdcf549L)  +
+    mad_f_mul(X[10],  0x0216a2a2L)  +
+    mad_f_mul(X[16], -0x09bd7ca0L);
+
+  MLA(
+  x[5] , t2,
+    MLA_STEP(X[0],   0x04cfb0e2L)
+    MLA_STEP(X[2],   0x0ffc19fdL)
+    MLA_STEP(X[3],  -0x0d7e8807L)
+    MLA_STEP(X[5],   0x03768962L)
+    MLA_STEP(X[6],  -0x0bcbe352L)
+    MLA_STEP(X[8],  -0x0e313245L)
+    MLA_STEP(X[9],   0x07635284L)
+    MLA_STEP(X[11], -0x0acf37adL)
+    MLA_STEP(X[12],  0x0f9ee890L)
+    MLA_STEP(X[14],  0x0898c779L)
+    MLA_STEP(X[15],  0x00b2aa3eL)
+    MLA_STEP(X[17],  0x0f426cb5L)
+    );
+  x[12] = -x[5];
+
+  MLA(
+  x[0] , t2,
+    MLA_STEP(X[0],   0x0acf37adL)
+    MLA_STEP(X[2],  -0x0898c779L)
+    MLA_STEP(X[3],   0x0e313245L)
+    MLA_STEP(X[5],  -0x0f426cb5L)
+    MLA_STEP(X[6],  -0x03768962L)
+    MLA_STEP(X[8],   0x00b2aa3eL)
+    MLA_STEP(X[9],  -0x0ffc19fdL)
+    MLA_STEP(X[11],  0x0f9ee890L)
+    MLA_STEP(X[12], -0x04cfb0e2L)
+    MLA_STEP(X[14],  0x07635284L)
+    MLA_STEP(X[15],  0x0d7e8807L)
+    MLA_STEP(X[17], -0x0bcbe352L)
+    );
+  x[17] = -x[0];
+
+  MLA(
+  x[24] = x[29] , t2,
+    MLA_STEP(X[0],  -0x0f9ee890L)
+    MLA_STEP(X[2],  -0x07635284L)
+    MLA_STEP(X[3],  -0x00b2aa3eL)
+    MLA_STEP(X[5],   0x0bcbe352L)
+    MLA_STEP(X[6],   0x0f426cb5L)
+    MLA_STEP(X[8],   0x0d7e8807L)
+    MLA_STEP(X[9],   0x0898c779L)
+    MLA_STEP(X[11], -0x04cfb0e2L)
+    MLA_STEP(X[12], -0x0acf37adL)
+    MLA_STEP(X[14], -0x0ffc19fdL)
+    MLA_STEP(X[15], -0x0e313245L)
+    MLA_STEP(X[17], -0x03768962L)
+    );
+
+  t3 = t9 +
+    mad_f_mul(X[1],  -0x0216a2a2L) +
+    mad_f_mul(X[7],  -0x09bd7ca0L) +
+    mad_f_mul(X[10],  0x0cb19346L) +
+    mad_f_mul(X[16],  0x0fdcf549L);
+
+  MLA(
+  x[8] , t3,
+    MLA_STEP(X[0],   0x00b2aa3eL)
+    MLA_STEP(X[2],   0x03768962L)
+    MLA_STEP(X[3],  -0x04cfb0e2L)
+    MLA_STEP(X[5],  -0x07635284L)
+    MLA_STEP(X[6],   0x0898c779L)
+    MLA_STEP(X[8],   0x0acf37adL)
+    MLA_STEP(X[9],  -0x0bcbe352L)
+    MLA_STEP(X[11], -0x0d7e8807L)
+    MLA_STEP(X[12],  0x0e313245L)
+    MLA_STEP(X[14],  0x0f426cb5L)
+    MLA_STEP(X[15], -0x0f9ee890L)
+    MLA_STEP(X[17], -0x0ffc19fdL)
+    );
+  x[9] = -x[8];
+
+  MLA(
+  x[21] = x[32] , t3,
+    MLA_STEP(X[0],  -0x0e313245L)
+    MLA_STEP(X[2],   0x0bcbe352L)
+    MLA_STEP(X[3],   0x0f9ee890L)
+    MLA_STEP(X[5],  -0x0898c779L)
+    MLA_STEP(X[6],  -0x0ffc19fdL)
+    MLA_STEP(X[8],   0x04cfb0e2L)
+    MLA_STEP(X[9],   0x0f426cb5L)
+    MLA_STEP(X[11], -0x00b2aa3eL)
+    MLA_STEP(X[12], -0x0d7e8807L)
+    MLA_STEP(X[14], -0x03768962L)
+    MLA_STEP(X[15],  0x0acf37adL)
+    MLA_STEP(X[17],  0x07635284L)
+    );
+
+  MLA(
+  x[20] = x[33] , -t3,
+    MLA_STEP(X[0],  -0x0d7e8807L)
+    MLA_STEP(X[2],   0x0f426cb5L)
+    MLA_STEP(X[3],   0x0acf37adL)
+    MLA_STEP(X[5],  -0x0ffc19fdL)
+    MLA_STEP(X[6],  -0x07635284L)
+    MLA_STEP(X[8],   0x0f9ee890L)
+    MLA_STEP(X[9],   0x03768962L)
+    MLA_STEP(X[11], -0x0e313245L)
+    MLA_STEP(X[12],  0x00b2aa3eL)
+    MLA_STEP(X[14],  0x0bcbe352L)
+    MLA_STEP(X[15], -0x04cfb0e2L)
+    MLA_STEP(X[17], -0x0898c779L)
+    );
+
+  t4 = -t9 +
+    mad_f_mul(t16, -0x0ec835e8L) +
+    mad_f_mul(t17,  0x061f78aaL);
+
+  x[4] = t4 +
+    mad_f_mul(t14, 0x061f78aaL) +
+    mad_f_mul(t15, 0x0ec835e8L);
+  x[13] = -x[4];
+
+  x[1] = t4 +
+    mad_f_mul(t10,  0x09bd7ca0L) +
+    mad_f_mul(t11, -0x0216a2a2L) +
+    mad_f_mul(t12,  0x0fdcf549L) +
+    mad_f_mul(t13, -0x0cb19346L);
+  x[16] = -x[1];
+
+  x[25] = x[28] = t4 +
+    mad_f_mul(t10, -0x0fdcf549L) +
+    mad_f_mul(t11, -0x0cb19346L) +
+    mad_f_mul(t12, -0x09bd7ca0L) +
+    mad_f_mul(t13, -0x0216a2a2L);
+
+  t5 = -t6 +
+    mad_f_mul(X[1],  -0x0fdcf549L) +
+    mad_f_mul(X[7],  -0x0cb19346L) +
+    mad_f_mul(X[10], -0x09bd7ca0L) +
+    mad_f_mul(X[16], -0x0216a2a2L);
+
+  MLA(
+  x[2] , t5,
+    MLA_STEP(X[0],   0x0898c779L)
+    MLA_STEP(X[2],   0x04cfb0e2L)
+    MLA_STEP(X[3],   0x0bcbe352L)
+    MLA_STEP(X[5],   0x00b2aa3eL)
+    MLA_STEP(X[6],   0x0e313245L)
+    MLA_STEP(X[8],  -0x03768962L)
+    MLA_STEP(X[9],   0x0f9ee890L)
+    MLA_STEP(X[11], -0x07635284L)
+    MLA_STEP(X[12],  0x0ffc19fdL)
+    MLA_STEP(X[14], -0x0acf37adL)
+    MLA_STEP(X[15],  0x0f426cb5L)
+    MLA_STEP(X[17], -0x0d7e8807L)
+    );
+  x[15] = -x[2];
+
+  MLA(
+  x[3] , t5,
+    MLA_STEP(X[0],   0x07635284L)
+    MLA_STEP(X[2],   0x0acf37adL)
+    MLA_STEP(X[3],   0x03768962L)
+    MLA_STEP(X[5],   0x0d7e8807L)
+    MLA_STEP(X[6],  -0x00b2aa3eL)
+    MLA_STEP(X[8],   0x0f426cb5L)
+    MLA_STEP(X[9],  -0x04cfb0e2L)
+    MLA_STEP(X[11],  0x0ffc19fdL)
+    MLA_STEP(X[12], -0x0898c779L)
+    MLA_STEP(X[14],  0x0f9ee890L)
+    MLA_STEP(X[15], -0x0bcbe352L)
+    MLA_STEP(X[17],  0x0e313245L)
+    );
+  x[14] = -x[3];
+
+  MLA(
+  x[26] = x[27] , t5,
+    MLA_STEP(X[0],  -0x0ffc19fdL)
+    MLA_STEP(X[2],  -0x0f9ee890L)
+    MLA_STEP(X[3],  -0x0f426cb5L)
+    MLA_STEP(X[5],  -0x0e313245L)
+    MLA_STEP(X[6],  -0x0d7e8807L)
+    MLA_STEP(X[8],  -0x0bcbe352L)
+    MLA_STEP(X[9],  -0x0acf37adL)
+    MLA_STEP(X[11], -0x0898c779L)
+    MLA_STEP(X[12], -0x07635284L)
+    MLA_STEP(X[14], -0x04cfb0e2L)
+    MLA_STEP(X[15], -0x03768962L)
+    MLA_STEP(X[17], -0x00b2aa3eL)
+    );
+#else
   t6 =
     mad_f_mul(X[4],  0x0ec835e8L) +
     mad_f_mul(X[13], 0x061f78aaL);
@@ -1504,6 +1792,7 @@
     mad_f_mul(X[14], -0x04cfb0e2L) +
     mad_f_mul(X[15], -0x03768962L) +
     mad_f_mul(X[17], -0x00b2aa3eL);
+#endif
 }
/*