We did some tuning for an embedded PowerPC platform. FPM_64BIT works
fairly well; the assembly tuning and changes to imdct36() improve things
by another 60% or so over FPM_64BIT using gcc -O3. Performance with and
without OPT_ACCURACY is pretty close, but OPT_SSO is still somewhat
faster (~5%).
There are 3 sets of changes:
1) changes to mad/configure and mad/libmad/configure to recognize
--host=powerpc and set FPM_PPC; to allow -O3 to be set as an
optimization level; and also to not wipe out the CFLAGS setting before
running mad/libmad/configure (otherwise OPTIMIZER doesn't get set in
libmad/Makefile).
2) additions to mad/libmad/fixed.h for asm coded PPC routines
3) modifications to layer3.c:imdct36() to use mad_f_mla for longer
mac sequences if mla is available. The mla code is written such that
the MLA macros could simply be redefined to use the regular mad_f_mul()
code when the mla routine isn't available, but I chickened out and left
the original code in the #else branch of an #ifdef.
Diffs attached. Running "patch -p0 < diffs" from inside the mad-0.11.4b
directory should do it.
david
Index: configure
===================================================================
diff -u -r1.1.1.1 -r1.3
--- configure 2001/01/31 03:19:46 1.1.1.1
+++ configure 2001/01/31 04:24:18 1.3
@@ -1061,6 +1061,7 @@
arm-*) ARCH="-march=armv4 -mtune=strongarm" ;;
mips-*) ARCH="-mips3" ;;
#-mcpu=vr4100
+ # No extra ARCH flags for PowerPC; same host glob as libmad/configure.
+ powerpc*-*) ARCH="" ;;
esac
fi
@@ -1069,10 +1070,11 @@
esac
case "$CFLAGS" in
+ # Test for -O3 before the generic -O* arm, and match it anywhere in
+ # CFLAGS (not only as the last word) so "-O3 -g" is not demoted to -O2.
+ *-O3*) OPTIMIZER="-O3" ;;
*-O*) OPTIMIZER="-O2" ;;
esac
-CFLAGS=""
+CFLAGS="$OPTIMIZER $DEBUGGER $PROFILER"
if test -n "$OPTIMIZER" && test "$GCC" = yes
then
Index: libmad/configure
===================================================================
diff -u -r1.1.1.1 -r1.3
--- libmad/configure 2001/01/31 03:19:46 1.1.1.1
+++ libmad/configure 2001/01/31 04:24:18 1.3
@@ -1086,6 +1086,7 @@
arm-*) ARCH="-march=armv4" ;;
mips-*) ARCH="-mips3" ;;
#-mcpu=vr4100
+ powerpc*-*) ARCH="" ;;
esac
fi
@@ -1094,6 +1095,7 @@
esac
case "$CFLAGS" in
+ *-O3*) OPTIMIZER="-O3" ;;
*-O*) OPTIMIZER="-O" ;;
esac
@@ -1630,6 +1632,7 @@
arm-*) FPM="ARM" ;;
mips-*) FPM="MIPS" ;;
sparc*-*) FPM="SPARC" ;;
+ powerpc*-*) FPM="PPC" ;;
esac
;;
@@ -1639,6 +1642,7 @@
sparc) FPM="SPARC" ;;
64bit) FPM="64BIT" ;;
approx) FPM="APPROX" ;;
+ powerpc) FPM="PPC" ;;
*) { echo "configure: error: bad --enable-fpm option" 1>&2; exit 1; }
;;
@@ -1653,6 +1657,7 @@
arm-*) FPM="ARM" ;;
mips-*) FPM="MIPS" ;;
sparc*-*) FPM="SPARC" ;;
+ powerpc*-*) FPM="PPC" ;;
# FIXME: need to test 64-bit long long...
*)
echo "configure: warning: using fixed-point math approximations (see README)" 1>&2
Index: libmad/fixed.h
===================================================================
diff -u -r1.1.1.1 -r1.2
--- libmad/fixed.h 2001/01/31 03:19:46 1.1.1.1
+++ libmad/fixed.h 2001/01/31 04:02:51 1.2
@@ -268,6 +268,79 @@
: "%r" (x), "rI" (y)); \
mad_f_scale64(__hi, __lo); \
})
+
+# define MAD_F_SCALEBITS MAD_F_FRACBITS
+
+# elif defined(FPM_PPC)
+
+/*
+ * PowerPC version tuned for the 4xx embedded processors: in effect a
+ * slightly faster, equally accurate FPM_64BIT. Outputs are early-clobber
+ * ("=&r") since mulhw writes __hi before mullw reads x and y. The
+ * disposition of the least significant bit depends on OPT_ACCURACY via
+ * mad_f_scale64().
+ */
+# define mad_f_mul(x, y) \
+ ({ mad_fixed64hi_t __hi; \
+ mad_fixed64lo_t __lo; \
+ asm ("mulhw %1, %2, %3\n\t" \
+ "mullw %0, %2, %3" \
+ : "=&r" (__lo), "=&r" (__hi) \
+ : "%r" (x), "r" (y)); \
+ mad_f_scale64(__hi, __lo); \
+ })
+
+# define MAD_F_HAVEMLA
+/*
+ * mad_f_mla(hi, lo, x, y): add the 64-bit product x * y to the accumulator
+ * whose high and low words are *hi and *lo.
+ *
+ * The product outputs are early-clobber ("=&r") because mulhw writes __hi
+ * before mullw has read x and y. addc and adde are kept in a single asm
+ * statement so that nothing can be scheduled between them and destroy the
+ * carry; *lo is early-clobber because it is written before adde reads its
+ * inputs, and "xer" is clobbered because the carry bit is modified.
+ */
+# define mad_f_mla(hi, lo, x, y) \
+ ({ mad_fixed64hi_t __hi; \
+ mad_fixed64lo_t __lo; \
+ asm ("mulhw %1, %2, %3\n\t" \
+ "mullw %0, %2, %3" \
+ : "=&r" (__lo), "=&r" (__hi) \
+ : "%r" (x), "r" (y)); \
+ asm ("addc %0, %0, %2\n\t" \
+ "adde %1, %1, %3" \
+ : "+&r" (*lo), "+r" (*hi) \
+ : "r" (__lo), "r" (__hi) \
+ : "xer"); \
+ })
+
+# if defined(OPT_ACCURACY)
+/*
+ * Rounded scaling: adds 1<<(MAD_F_SCALEBITS-1) to hi:lo before shifting.
+ * This is accurate and ~2 - 2.5 times slower than the unrounded version.
+ * We let the compiler deal with putting the constant in a register
+ * since the value of MAD_F_SCALEBITS is redefined in some parts of the
+ * code and tracking the magnitude of (1<<(MAD_F_SCALEBITS-1)) is too
+ * complicated.
+ * _lo is early-clobber ("=&r") because addc writes it before addze reads
+ * hi; "xer" is clobbered because addc/addze modify the carry bit.
+ * The __volatile__ improve the generated code by another 5% (fewer
+ * spills to memory), eventually they should be removed.
+ */
+# undef mad_f_scale64
+# define mad_f_scale64(hi, lo) \
+ ({ mad_fixed_t __result; \
+ mad_fixed64hi_t _hi; \
+ mad_fixed64lo_t _lo; \
+ asm __volatile__ ("addc %0, %2, %4\n\t" \
+ "addze %1, %3" \
+ : "=&r" (_lo), "=r" (_hi) \
+ : "r" (lo), "r" (hi), "r" (1<<(MAD_F_SCALEBITS-1)) \
+ : "xer"); \
+ asm __volatile__ ("rlwinm %0, %2,32-%3,0,%3-1\n\t" \
+ "rlwimi %0, %1,32-%3,%3,31" \
+ : "=&r" (__result) \
+ : "r" (_lo), "r" (_hi), "I" (MAD_F_SCALEBITS)); \
+ __result; \
+ })
+# else
+/*
+ * Unrounded scaling: the low MAD_F_SCALEBITS bits of hi:lo are dropped.
+ * __result is early-clobber ("=&r") because rlwinm writes it before
+ * rlwimi reads lo.
+ */
+# undef mad_f_scale64
+# define mad_f_scale64(hi, lo) \
+ ({ mad_fixed_t __result; \
+ asm ("rlwinm %0, %2,32-%3,0,%3-1\n\t" \
+ "rlwimi %0, %1,32-%3,%3,31" \
+ : "=&r" (__result) \
+ : "r" (lo), "r" (hi), "I" (MAD_F_SCALEBITS)); \
+ __result; \
+ })
+# endif /* OPT_ACCURACY */
# define MAD_F_SCALEBITS MAD_F_FRACBITS
Index: libmad/layer3.c
===================================================================
diff -u -r1.1.1.1 -r1.2
--- libmad/layer3.c 2001/01/31 03:19:46 1.1.1.1
+++ libmad/layer3.c 2001/01/31 04:02:51 1.2
@@ -1253,6 +1253,294 @@
mad_fixed_t t6, t7, t8, t9, t10, t11;
mad_fixed_t t12, t13, t14, t15, t16, t17;
+#ifdef MAD_F_HAVEMLA
+ mad_fixed64hi_t hi;
+ mad_fixed64lo_t lo;
+
+#define MLA(res, first, acc) \
+ hi = lo = 0; \
+ acc; \
+ res = first + mad_f_scale64(hi, lo)
+
+#define MLA_STEP(x, y) \
+ mad_f_mla(&hi, &lo, x, y);
+
+ t6 =
+ mad_f_mul(X[4], 0x0ec835e8L) +
+ mad_f_mul(X[13], 0x061f78aaL);
+
+ t0 = t6 +
+ (t7 = mad_f_mul((t16 = X[1] - X[10]), -0x061f78aaL)) +
+ (t8 = mad_f_mul((t17 = X[7] + X[16]), -0x0ec835e8L));
+
+ x[7] = t0 +
+ mad_f_mul((t10 = X[0] - X[11] - X[12]), 0x0216a2a2L) +
+ mad_f_mul((t11 = X[2] - X[9] - X[14]), 0x09bd7ca0L) +
+ mad_f_mul((t12 = X[3] - X[8] - X[15]), -0x0cb19346L) +
+ mad_f_mul((t13 = X[5] - X[6] - X[17]), -0x0fdcf549L);
+ x[10] = -x[7];
+
+ x[19] = x[34] = -t6 + -t7 + -t8 +
+ mad_f_mul(t10, -0x0cb19346L) +
+ mad_f_mul(t11, 0x0fdcf549L) +
+ mad_f_mul(t12, 0x0216a2a2L) +
+ mad_f_mul(t13, -0x09bd7ca0L);
+
+ t14 = X[0] - X[3] + X[8] - X[11] - X[12] + X[15];
+ t15 = X[2] + X[5] - X[6] - X[9] - X[14] - X[17];
+
+ x[22] = x[31] = t0 +
+ mad_f_mul(t14, -0x0ec835e8L) +
+ mad_f_mul(t15, 0x061f78aaL);
+
+ t1 = t6 +
+ mad_f_mul(X[1], -0x09bd7ca0L) +
+ mad_f_mul(X[7], 0x0216a2a2L) +
+ mad_f_mul(X[10], -0x0fdcf549L) +
+ mad_f_mul(X[16], 0x0cb19346L);
+
+ MLA(
+ x[6] , t1,
+ MLA_STEP(X[0], 0x03768962L)
+ MLA_STEP(X[2], 0x0e313245L)
+ MLA_STEP(X[3], -0x0ffc19fdL)
+ MLA_STEP(X[5], -0x0acf37adL)
+ MLA_STEP(X[6], 0x04cfb0e2L)
+ MLA_STEP(X[8], -0x0898c779L)
+ MLA_STEP(X[9], 0x0d7e8807L)
+ MLA_STEP(X[11], 0x0f426cb5L)
+ MLA_STEP(X[12], -0x0bcbe352L)
+ MLA_STEP(X[14], 0x00b2aa3eL)
+ MLA_STEP(X[15], -0x07635284L)
+ MLA_STEP(X[17], -0x0f9ee890L)
+ );
+ x[11] = -x[6];
+
+ MLA(
+ x[23] = x[30] , t1,
+ MLA_STEP(X[0], -0x0f426cb5L)
+ MLA_STEP(X[2], -0x00b2aa3eL)
+ MLA_STEP(X[3], 0x0898c779L)
+ MLA_STEP(X[5], 0x0f9ee890L)
+ MLA_STEP(X[6], 0x0acf37adL)
+ MLA_STEP(X[8], -0x07635284L)
+ MLA_STEP(X[9], -0x0e313245L)
+ MLA_STEP(X[11], -0x0bcbe352L)
+ MLA_STEP(X[12], -0x03768962L)
+ MLA_STEP(X[14], 0x0d7e8807L)
+ MLA_STEP(X[15], 0x0ffc19fdL)
+ MLA_STEP(X[17], 0x04cfb0e2L)
+ );
+
+ MLA(
+ x[18] = x[35] , -t1,
+ MLA_STEP(X[0], -0x0bcbe352L)
+ MLA_STEP(X[2], 0x0d7e8807L)
+ MLA_STEP(X[3], -0x07635284L)
+ MLA_STEP(X[5], 0x04cfb0e2L)
+ MLA_STEP(X[6], 0x0f9ee890L)
+ MLA_STEP(X[8], -0x0ffc19fdL)
+ MLA_STEP(X[9], -0x00b2aa3eL)
+ MLA_STEP(X[11], 0x03768962L)
+ MLA_STEP(X[12], -0x0f426cb5L)
+ MLA_STEP(X[14], 0x0e313245L)
+ MLA_STEP(X[15], 0x0898c779L)
+ MLA_STEP(X[17], -0x0acf37adL)
+ );
+
+ t9 =
+ mad_f_mul(X[4], 0x061f78aaL) +
+ mad_f_mul(X[13], -0x0ec835e8L);
+
+ t2 = t9 +
+ mad_f_mul(X[1], -0x0cb19346L) +
+ mad_f_mul(X[7], 0x0fdcf549L) +
+ mad_f_mul(X[10], 0x0216a2a2L) +
+ mad_f_mul(X[16], -0x09bd7ca0L);
+
+ MLA(
+ x[5] , t2,
+ MLA_STEP(X[0], 0x04cfb0e2L)
+ MLA_STEP(X[2], 0x0ffc19fdL)
+ MLA_STEP(X[3], -0x0d7e8807L)
+ MLA_STEP(X[5], 0x03768962L)
+ MLA_STEP(X[6], -0x0bcbe352L)
+ MLA_STEP(X[8], -0x0e313245L)
+ MLA_STEP(X[9], 0x07635284L)
+ MLA_STEP(X[11], -0x0acf37adL)
+ MLA_STEP(X[12], 0x0f9ee890L)
+ MLA_STEP(X[14], 0x0898c779L)
+ MLA_STEP(X[15], 0x00b2aa3eL)
+ MLA_STEP(X[17], 0x0f426cb5L)
+ );
+ x[12] = -x[5];
+
+ MLA(
+ x[0] , t2,
+ MLA_STEP(X[0], 0x0acf37adL)
+ MLA_STEP(X[2], -0x0898c779L)
+ MLA_STEP(X[3], 0x0e313245L)
+ MLA_STEP(X[5], -0x0f426cb5L)
+ MLA_STEP(X[6], -0x03768962L)
+ MLA_STEP(X[8], 0x00b2aa3eL)
+ MLA_STEP(X[9], -0x0ffc19fdL)
+ MLA_STEP(X[11], 0x0f9ee890L)
+ MLA_STEP(X[12], -0x04cfb0e2L)
+ MLA_STEP(X[14], 0x07635284L)
+ MLA_STEP(X[15], 0x0d7e8807L)
+ MLA_STEP(X[17], -0x0bcbe352L)
+ );
+ x[17] = -x[0];
+
+ MLA(
+ x[24] = x[29] , t2,
+ MLA_STEP(X[0], -0x0f9ee890L)
+ MLA_STEP(X[2], -0x07635284L)
+ MLA_STEP(X[3], -0x00b2aa3eL)
+ MLA_STEP(X[5], 0x0bcbe352L)
+ MLA_STEP(X[6], 0x0f426cb5L)
+ MLA_STEP(X[8], 0x0d7e8807L)
+ MLA_STEP(X[9], 0x0898c779L)
+ MLA_STEP(X[11], -0x04cfb0e2L)
+ MLA_STEP(X[12], -0x0acf37adL)
+ MLA_STEP(X[14], -0x0ffc19fdL)
+ MLA_STEP(X[15], -0x0e313245L)
+ MLA_STEP(X[17], -0x03768962L)
+ );
+
+ t3 = t9 +
+ mad_f_mul(X[1], -0x0216a2a2L) +
+ mad_f_mul(X[7], -0x09bd7ca0L) +
+ mad_f_mul(X[10], 0x0cb19346L) +
+ mad_f_mul(X[16], 0x0fdcf549L);
+
+ MLA(
+ x[8] , t3,
+ MLA_STEP(X[0], 0x00b2aa3eL)
+ MLA_STEP(X[2], 0x03768962L)
+ MLA_STEP(X[3], -0x04cfb0e2L)
+ MLA_STEP(X[5], -0x07635284L)
+ MLA_STEP(X[6], 0x0898c779L)
+ MLA_STEP(X[8], 0x0acf37adL)
+ MLA_STEP(X[9], -0x0bcbe352L)
+ MLA_STEP(X[11], -0x0d7e8807L)
+ MLA_STEP(X[12], 0x0e313245L)
+ MLA_STEP(X[14], 0x0f426cb5L)
+ MLA_STEP(X[15], -0x0f9ee890L)
+ MLA_STEP(X[17], -0x0ffc19fdL)
+ );
+ x[9] = -x[8];
+
+ MLA(
+ x[21] = x[32] , t3,
+ MLA_STEP(X[0], -0x0e313245L)
+ MLA_STEP(X[2], 0x0bcbe352L)
+ MLA_STEP(X[3], 0x0f9ee890L)
+ MLA_STEP(X[5], -0x0898c779L)
+ MLA_STEP(X[6], -0x0ffc19fdL)
+ MLA_STEP(X[8], 0x04cfb0e2L)
+ MLA_STEP(X[9], 0x0f426cb5L)
+ MLA_STEP(X[11], -0x00b2aa3eL)
+ MLA_STEP(X[12], -0x0d7e8807L)
+ MLA_STEP(X[14], -0x03768962L)
+ MLA_STEP(X[15], 0x0acf37adL)
+ MLA_STEP(X[17], 0x07635284L)
+ );
+
+ MLA(
+ x[20] = x[33] , -t3,
+ MLA_STEP(X[0], -0x0d7e8807L)
+ MLA_STEP(X[2], 0x0f426cb5L)
+ MLA_STEP(X[3], 0x0acf37adL)
+ MLA_STEP(X[5], -0x0ffc19fdL)
+ MLA_STEP(X[6], -0x07635284L)
+ MLA_STEP(X[8], 0x0f9ee890L)
+ MLA_STEP(X[9], 0x03768962L)
+ MLA_STEP(X[11], -0x0e313245L)
+ MLA_STEP(X[12], 0x00b2aa3eL)
+ MLA_STEP(X[14], 0x0bcbe352L)
+ MLA_STEP(X[15], -0x04cfb0e2L)
+ MLA_STEP(X[17], -0x0898c779L)
+ );
+
+ t4 = -t9 +
+ mad_f_mul(t16, -0x0ec835e8L) +
+ mad_f_mul(t17, 0x061f78aaL);
+
+ x[4] = t4 +
+ mad_f_mul(t14, 0x061f78aaL) +
+ mad_f_mul(t15, 0x0ec835e8L);
+ x[13] = -x[4];
+
+ x[1] = t4 +
+ mad_f_mul(t10, 0x09bd7ca0L) +
+ mad_f_mul(t11, -0x0216a2a2L) +
+ mad_f_mul(t12, 0x0fdcf549L) +
+ mad_f_mul(t13, -0x0cb19346L);
+ x[16] = -x[1];
+
+ x[25] = x[28] = t4 +
+ mad_f_mul(t10, -0x0fdcf549L) +
+ mad_f_mul(t11, -0x0cb19346L) +
+ mad_f_mul(t12, -0x09bd7ca0L) +
+ mad_f_mul(t13, -0x0216a2a2L);
+
+ t5 = -t6 +
+ mad_f_mul(X[1], -0x0fdcf549L) +
+ mad_f_mul(X[7], -0x0cb19346L) +
+ mad_f_mul(X[10], -0x09bd7ca0L) +
+ mad_f_mul(X[16], -0x0216a2a2L);
+
+ MLA(
+ x[2] , t5,
+ MLA_STEP(X[0], 0x0898c779L)
+ MLA_STEP(X[2], 0x04cfb0e2L)
+ MLA_STEP(X[3], 0x0bcbe352L)
+ MLA_STEP(X[5], 0x00b2aa3eL)
+ MLA_STEP(X[6], 0x0e313245L)
+ MLA_STEP(X[8], -0x03768962L)
+ MLA_STEP(X[9], 0x0f9ee890L)
+ MLA_STEP(X[11], -0x07635284L)
+ MLA_STEP(X[12], 0x0ffc19fdL)
+ MLA_STEP(X[14], -0x0acf37adL)
+ MLA_STEP(X[15], 0x0f426cb5L)
+ MLA_STEP(X[17], -0x0d7e8807L)
+ );
+ x[15] = -x[2];
+
+ MLA(
+ x[3] , t5,
+ MLA_STEP(X[0], 0x07635284L)
+ MLA_STEP(X[2], 0x0acf37adL)
+ MLA_STEP(X[3], 0x03768962L)
+ MLA_STEP(X[5], 0x0d7e8807L)
+ MLA_STEP(X[6], -0x00b2aa3eL)
+ MLA_STEP(X[8], 0x0f426cb5L)
+ MLA_STEP(X[9], -0x04cfb0e2L)
+ MLA_STEP(X[11], 0x0ffc19fdL)
+ MLA_STEP(X[12], -0x0898c779L)
+ MLA_STEP(X[14], 0x0f9ee890L)
+ MLA_STEP(X[15], -0x0bcbe352L)
+ MLA_STEP(X[17], 0x0e313245L)
+ );
+ x[14] = -x[3];
+
+ MLA(
+ x[26] = x[27] , t5,
+ MLA_STEP(X[0], -0x0ffc19fdL)
+ MLA_STEP(X[2], -0x0f9ee890L)
+ MLA_STEP(X[3], -0x0f426cb5L)
+ MLA_STEP(X[5], -0x0e313245L)
+ MLA_STEP(X[6], -0x0d7e8807L)
+ MLA_STEP(X[8], -0x0bcbe352L)
+ MLA_STEP(X[9], -0x0acf37adL)
+ MLA_STEP(X[11], -0x0898c779L)
+ MLA_STEP(X[12], -0x07635284L)
+ MLA_STEP(X[14], -0x04cfb0e2L)
+ MLA_STEP(X[15], -0x03768962L)
+ MLA_STEP(X[17], -0x00b2aa3eL)
+ );
+#else
t6 =
mad_f_mul(X[4], 0x0ec835e8L) +
mad_f_mul(X[13], 0x061f78aaL);
@@ -1504,6 +1792,7 @@
mad_f_mul(X[14], -0x04cfb0e2L) +
mad_f_mul(X[15], -0x03768962L) +
mad_f_mul(X[17], -0x00b2aa3eL);
+#endif
}
/*