We did some tuning for an embedded PowerPC platform. FPM_64BIT works fairly well; the assembly tuning and the changes to imdct36() improve things by another 60% or so over FPM_64BIT when built with gcc -O3. Performance with and without OPT_ACCURACY is pretty close, but OPT_SSO is still somewhat faster (~5%).
There are 3 sets of changes: 1) changes to mad/configure and mad/libmad/configure so that they recognize --host=powerpc and select FPM_PPC for it; allow -O3 to be set as an optimization level; and no longer wipe out the CFLAGS setting before running mad/libmad/configure (otherwise OPTIMIZER doesn't get set in libmad/Makefile).
2) additions to mad/libmad/fixed.h for asm-coded PPC routines
3) modifications to layer3.c:imdct36() to use mad_f_mla() for the longer MAC (multiply-accumulate) sequences when an MLA routine is available. The MLA code is written such that the MLA macros could simply be redefined to use the regular mad_f_mul() code when the MLA routine isn't available, but I chickened out and left the original code in place in the #else branch of an #ifdef.
Diffs attached. Running "patch -p0 < diffs" from inside the mad-0.11.4b directory should apply them.
david
Index: configure =================================================================== diff -u -r1.1.1.1 -r1.3 --- configure 2001/01/31 03:19:46 1.1.1.1 +++ configure 2001/01/31 04:24:18 1.3 @@ -1061,6 +1061,7 @@ arm-*) ARCH="-march=armv4 -mtune=strongarm" ;; mips-*) ARCH="-mips3" ;; #-mcpu=vr4100 + powerpc-*) ARCH="" ;; esac fi
@@ -1069,10 +1070,11 @@ esac
case "$CFLAGS" in + *-O3) OPTIMIZER="-O3" ;; *-O*) OPTIMIZER="-O2" ;; esac
-CFLAGS="" +CFLAGS="$OPTIMIZER $DEBUGGER $PROFILER"
if test -n "$OPTIMIZER" && test "$GCC" = yes then Index: libmad/configure =================================================================== diff -u -r1.1.1.1 -r1.3 --- libmad/configure 2001/01/31 03:19:46 1.1.1.1 +++ libmad/configure 2001/01/31 04:24:18 1.3 @@ -1086,6 +1086,7 @@ arm-*) ARCH="-march=armv4" ;; mips-*) ARCH="-mips3" ;; #-mcpu=vr4100 + powerpc*-*) ARCH="" ;; esac fi
@@ -1094,6 +1095,7 @@ esac
case "$CFLAGS" in + *-O3*) OPTIMIZER="-O3" ;; *-O*) OPTIMIZER="-O" ;; esac
@@ -1630,6 +1632,7 @@ arm-*) FPM="ARM" ;; mips-*) FPM="MIPS" ;; sparc*-*) FPM="SPARC" ;; + powerpc*-*) FPM="PPC" ;; esac ;;
@@ -1639,6 +1642,7 @@ sparc) FPM="SPARC" ;; 64bit) FPM="64BIT" ;; approx) FPM="APPROX" ;; + powerpc) FPM="PPC" ;;
*) { echo "configure: error: bad --enable-fpm option" 1>&2; exit 1; } ;; @@ -1653,6 +1657,7 @@ arm-*) FPM="ARM" ;; mips-*) FPM="MIPS" ;; sparc*-*) FPM="SPARC" ;; + powerpc*-*) FPM="PPC" ;; # FIXME: need to test 64-bit long long... *) echo "configure: warning: using fixed-point math approximations (see README)" 1>&2 Index: libmad/fixed.h =================================================================== diff -u -r1.1.1.1 -r1.2 --- libmad/fixed.h 2001/01/31 03:19:46 1.1.1.1 +++ libmad/fixed.h 2001/01/31 04:02:51 1.2 @@ -268,6 +268,79 @@ : "%r" (x), "rI" (y)); \ mad_f_scale64(__hi, __lo); \ }) + +# define MAD_F_SCALEBITS MAD_F_FRACBITS + +# elif defined(FPM_PPC) + +/* + * This PowerPC version is tuned for the 4xx embedded processors. It is + * effectively a tuned version of FPM_64BIT. It is a little faster and + * just as accurate. The disposition of the least significant bit depends + * on OPT_ACCURACY via mad_f_scale64(). + */ +# define mad_f_mul(x, y) \ + ({ mad_fixed64hi_t __hi; \ + mad_fixed64lo_t __lo; \ + asm ("mulhw %1, %2, %3\n\t" \ + "mullw %0, %2, %3" \ + : "=&r" (__lo), "=&r" (__hi) \ + : "%r" (x), "r" (y)); \ + mad_f_scale64(__hi, __lo); \ + }) + +# define MAD_F_HAVEMLA +# define mad_f_mla(hi, lo, x, y) \ + ({ mad_fixed64hi_t __hi; \ + mad_fixed64lo_t __lo; \ + asm ("mulhw %1, %2, %3\n\t" \ + "mullw %0, %2, %3" \ + : "=r" (__lo), "=r" (__hi) \ + : "%r" (x), "r" (y)); \ + asm ("addc %0, %1, %2" \ + : "=r" (*lo) \ + : "%r" (__lo), "0" (*lo)); \ + asm ("adde %0, %1, %2" \ + : "=r" (*hi) \ + : "%r" (__hi), "0" (*hi)); \ + }) + +# if defined(OPT_ACCURACY) +/* + * This is accurate and ~2 - 2.5 times slower than the unrounded version. + * We let the compiler deal with putting the constant in a register + * since the value of MAD_F_SCALEBITS is redefined in some parts of the + * code and tracking the magnitude of (1<<(MAD_F_SCALEBITS-1)) is too + * complicated. 
+ * The __volatile__ improve the generated code by another 5% (fewer + * spills to memory), eventually they should be removed. + */ +# undef mad_f_scale64 +# define mad_f_scale64(hi, lo) \ + ({ mad_fixed_t __result; \ + mad_fixed64hi_t _hi; \ + mad_fixed64lo_t _lo; \ + asm __volatile__ ("addc %0, %2, %4\n\t" \ + "addze %1, %3" \ + : "=r" (_lo), "=r" (_hi) \ + : "r" (lo), "r" (hi), "r" (1<<(MAD_F_SCALEBITS-1))); \ + asm __volatile__ ("rlwinm %0, %2,32-%3,0,%3-1\n\t" \ + "rlwimi %0, %1,32-%3,%3,31" \ + : "=&r" (__result) \ + : "r" (_lo), "r" (_hi), "I" (MAD_F_SCALEBITS)); \ + __result; \ + }) +# else +# undef mad_f_scale64 +# define mad_f_scale64(hi, lo) \ + ({ mad_fixed_t __result; \ + asm ("rlwinm %0, %2,32-%3,0,%3-1\n\t" \ + "rlwimi %0, %1,32-%3,%3,31" \ + : "=r" (__result) \ + : "r" (lo), "r" (hi), "I" (MAD_F_SCALEBITS)); \ + __result; \ + }) +# endif /* OPT_ACCURACY */
# define MAD_F_SCALEBITS MAD_F_FRACBITS
Index: libmad/layer3.c =================================================================== diff -u -r1.1.1.1 -r1.2 --- libmad/layer3.c 2001/01/31 03:19:46 1.1.1.1 +++ libmad/layer3.c 2001/01/31 04:02:51 1.2 @@ -1253,6 +1253,294 @@ mad_fixed_t t6, t7, t8, t9, t10, t11; mad_fixed_t t12, t13, t14, t15, t16, t17;
+#ifdef MAD_F_HAVEMLA + mad_fixed64hi_t hi; + mad_fixed64lo_t lo; + +#define MLA(res, first, acc) \ + hi = lo = 0; \ + acc; \ + res = first + mad_f_scale64(hi, lo) + +#define MLA_STEP(x, y) \ + mad_f_mla(&hi, &lo, x, y); + + t6 = + mad_f_mul(X[4], 0x0ec835e8L) + + mad_f_mul(X[13], 0x061f78aaL); + + t0 = t6 + + (t7 = mad_f_mul((t16 = X[1] - X[10]), -0x061f78aaL)) + + (t8 = mad_f_mul((t17 = X[7] + X[16]), -0x0ec835e8L)); + + x[7] = t0 + + mad_f_mul((t10 = X[0] - X[11] - X[12]), 0x0216a2a2L) + + mad_f_mul((t11 = X[2] - X[9] - X[14]), 0x09bd7ca0L) + + mad_f_mul((t12 = X[3] - X[8] - X[15]), -0x0cb19346L) + + mad_f_mul((t13 = X[5] - X[6] - X[17]), -0x0fdcf549L); + x[10] = -x[7]; + + x[19] = x[34] = -t6 + -t7 + -t8 + + mad_f_mul(t10, -0x0cb19346L) + + mad_f_mul(t11, 0x0fdcf549L) + + mad_f_mul(t12, 0x0216a2a2L) + + mad_f_mul(t13, -0x09bd7ca0L); + + t14 = X[0] - X[3] + X[8] - X[11] - X[12] + X[15]; + t15 = X[2] + X[5] - X[6] - X[9] - X[14] - X[17]; + + x[22] = x[31] = t0 + + mad_f_mul(t14, -0x0ec835e8L) + + mad_f_mul(t15, 0x061f78aaL); + + t1 = t6 + + mad_f_mul(X[1], -0x09bd7ca0L) + + mad_f_mul(X[7], 0x0216a2a2L) + + mad_f_mul(X[10], -0x0fdcf549L) + + mad_f_mul(X[16], 0x0cb19346L); + + MLA( + x[6] , t1, + MLA_STEP(X[0], 0x03768962L) + MLA_STEP(X[2], 0x0e313245L) + MLA_STEP(X[3], -0x0ffc19fdL) + MLA_STEP(X[5], -0x0acf37adL) + MLA_STEP(X[6], 0x04cfb0e2L) + MLA_STEP(X[8], -0x0898c779L) + MLA_STEP(X[9], 0x0d7e8807L) + MLA_STEP(X[11], 0x0f426cb5L) + MLA_STEP(X[12], -0x0bcbe352L) + MLA_STEP(X[14], 0x00b2aa3eL) + MLA_STEP(X[15], -0x07635284L) + MLA_STEP(X[17], -0x0f9ee890L) + ); + x[11] = -x[6]; + + MLA( + x[23] = x[30] , t1, + MLA_STEP(X[0], -0x0f426cb5L) + MLA_STEP(X[2], -0x00b2aa3eL) + MLA_STEP(X[3], 0x0898c779L) + MLA_STEP(X[5], 0x0f9ee890L) + MLA_STEP(X[6], 0x0acf37adL) + MLA_STEP(X[8], -0x07635284L) + MLA_STEP(X[9], -0x0e313245L) + MLA_STEP(X[11], -0x0bcbe352L) + MLA_STEP(X[12], -0x03768962L) + MLA_STEP(X[14], 0x0d7e8807L) + MLA_STEP(X[15], 0x0ffc19fdL) + MLA_STEP(X[17], 
0x04cfb0e2L) + ); + + MLA( + x[18] = x[35] , -t1, + MLA_STEP(X[0], -0x0bcbe352L) + MLA_STEP(X[2], 0x0d7e8807L) + MLA_STEP(X[3], -0x07635284L) + MLA_STEP(X[5], 0x04cfb0e2L) + MLA_STEP(X[6], 0x0f9ee890L) + MLA_STEP(X[8], -0x0ffc19fdL) + MLA_STEP(X[9], -0x00b2aa3eL) + MLA_STEP(X[11], 0x03768962L) + MLA_STEP(X[12], -0x0f426cb5L) + MLA_STEP(X[14], 0x0e313245L) + MLA_STEP(X[15], 0x0898c779L) + MLA_STEP(X[17], -0x0acf37adL) + ); + + t9 = + mad_f_mul(X[4], 0x061f78aaL) + + mad_f_mul(X[13], -0x0ec835e8L); + + t2 = t9 + + mad_f_mul(X[1], -0x0cb19346L) + + mad_f_mul(X[7], 0x0fdcf549L) + + mad_f_mul(X[10], 0x0216a2a2L) + + mad_f_mul(X[16], -0x09bd7ca0L); + + MLA( + x[5] , t2, + MLA_STEP(X[0], 0x04cfb0e2L) + MLA_STEP(X[2], 0x0ffc19fdL) + MLA_STEP(X[3], -0x0d7e8807L) + MLA_STEP(X[5], 0x03768962L) + MLA_STEP(X[6], -0x0bcbe352L) + MLA_STEP(X[8], -0x0e313245L) + MLA_STEP(X[9], 0x07635284L) + MLA_STEP(X[11], -0x0acf37adL) + MLA_STEP(X[12], 0x0f9ee890L) + MLA_STEP(X[14], 0x0898c779L) + MLA_STEP(X[15], 0x00b2aa3eL) + MLA_STEP(X[17], 0x0f426cb5L) + ); + x[12] = -x[5]; + + MLA( + x[0] , t2, + MLA_STEP(X[0], 0x0acf37adL) + MLA_STEP(X[2], -0x0898c779L) + MLA_STEP(X[3], 0x0e313245L) + MLA_STEP(X[5], -0x0f426cb5L) + MLA_STEP(X[6], -0x03768962L) + MLA_STEP(X[8], 0x00b2aa3eL) + MLA_STEP(X[9], -0x0ffc19fdL) + MLA_STEP(X[11], 0x0f9ee890L) + MLA_STEP(X[12], -0x04cfb0e2L) + MLA_STEP(X[14], 0x07635284L) + MLA_STEP(X[15], 0x0d7e8807L) + MLA_STEP(X[17], -0x0bcbe352L) + ); + x[17] = -x[0]; + + MLA( + x[24] = x[29] , t2, + MLA_STEP(X[0], -0x0f9ee890L) + MLA_STEP(X[2], -0x07635284L) + MLA_STEP(X[3], -0x00b2aa3eL) + MLA_STEP(X[5], 0x0bcbe352L) + MLA_STEP(X[6], 0x0f426cb5L) + MLA_STEP(X[8], 0x0d7e8807L) + MLA_STEP(X[9], 0x0898c779L) + MLA_STEP(X[11], -0x04cfb0e2L) + MLA_STEP(X[12], -0x0acf37adL) + MLA_STEP(X[14], -0x0ffc19fdL) + MLA_STEP(X[15], -0x0e313245L) + MLA_STEP(X[17], -0x03768962L) + ); + + t3 = t9 + + mad_f_mul(X[1], -0x0216a2a2L) + + mad_f_mul(X[7], -0x09bd7ca0L) + + mad_f_mul(X[10], 
0x0cb19346L) + + mad_f_mul(X[16], 0x0fdcf549L); + + MLA( + x[8] , t3, + MLA_STEP(X[0], 0x00b2aa3eL) + MLA_STEP(X[2], 0x03768962L) + MLA_STEP(X[3], -0x04cfb0e2L) + MLA_STEP(X[5], -0x07635284L) + MLA_STEP(X[6], 0x0898c779L) + MLA_STEP(X[8], 0x0acf37adL) + MLA_STEP(X[9], -0x0bcbe352L) + MLA_STEP(X[11], -0x0d7e8807L) + MLA_STEP(X[12], 0x0e313245L) + MLA_STEP(X[14], 0x0f426cb5L) + MLA_STEP(X[15], -0x0f9ee890L) + MLA_STEP(X[17], -0x0ffc19fdL) + ); + x[9] = -x[8]; + + MLA( + x[21] = x[32] , t3, + MLA_STEP(X[0], -0x0e313245L) + MLA_STEP(X[2], 0x0bcbe352L) + MLA_STEP(X[3], 0x0f9ee890L) + MLA_STEP(X[5], -0x0898c779L) + MLA_STEP(X[6], -0x0ffc19fdL) + MLA_STEP(X[8], 0x04cfb0e2L) + MLA_STEP(X[9], 0x0f426cb5L) + MLA_STEP(X[11], -0x00b2aa3eL) + MLA_STEP(X[12], -0x0d7e8807L) + MLA_STEP(X[14], -0x03768962L) + MLA_STEP(X[15], 0x0acf37adL) + MLA_STEP(X[17], 0x07635284L) + ); + + MLA( + x[20] = x[33] , -t3, + MLA_STEP(X[0], -0x0d7e8807L) + MLA_STEP(X[2], 0x0f426cb5L) + MLA_STEP(X[3], 0x0acf37adL) + MLA_STEP(X[5], -0x0ffc19fdL) + MLA_STEP(X[6], -0x07635284L) + MLA_STEP(X[8], 0x0f9ee890L) + MLA_STEP(X[9], 0x03768962L) + MLA_STEP(X[11], -0x0e313245L) + MLA_STEP(X[12], 0x00b2aa3eL) + MLA_STEP(X[14], 0x0bcbe352L) + MLA_STEP(X[15], -0x04cfb0e2L) + MLA_STEP(X[17], -0x0898c779L) + ); + + t4 = -t9 + + mad_f_mul(t16, -0x0ec835e8L) + + mad_f_mul(t17, 0x061f78aaL); + + x[4] = t4 + + mad_f_mul(t14, 0x061f78aaL) + + mad_f_mul(t15, 0x0ec835e8L); + x[13] = -x[4]; + + x[1] = t4 + + mad_f_mul(t10, 0x09bd7ca0L) + + mad_f_mul(t11, -0x0216a2a2L) + + mad_f_mul(t12, 0x0fdcf549L) + + mad_f_mul(t13, -0x0cb19346L); + x[16] = -x[1]; + + x[25] = x[28] = t4 + + mad_f_mul(t10, -0x0fdcf549L) + + mad_f_mul(t11, -0x0cb19346L) + + mad_f_mul(t12, -0x09bd7ca0L) + + mad_f_mul(t13, -0x0216a2a2L); + + t5 = -t6 + + mad_f_mul(X[1], -0x0fdcf549L) + + mad_f_mul(X[7], -0x0cb19346L) + + mad_f_mul(X[10], -0x09bd7ca0L) + + mad_f_mul(X[16], -0x0216a2a2L); + + MLA( + x[2] , t5, + MLA_STEP(X[0], 0x0898c779L) + MLA_STEP(X[2], 
0x04cfb0e2L) + MLA_STEP(X[3], 0x0bcbe352L) + MLA_STEP(X[5], 0x00b2aa3eL) + MLA_STEP(X[6], 0x0e313245L) + MLA_STEP(X[8], -0x03768962L) + MLA_STEP(X[9], 0x0f9ee890L) + MLA_STEP(X[11], -0x07635284L) + MLA_STEP(X[12], 0x0ffc19fdL) + MLA_STEP(X[14], -0x0acf37adL) + MLA_STEP(X[15], 0x0f426cb5L) + MLA_STEP(X[17], -0x0d7e8807L) + ); + x[15] = -x[2]; + + MLA( + x[3] , t5, + MLA_STEP(X[0], 0x07635284L) + MLA_STEP(X[2], 0x0acf37adL) + MLA_STEP(X[3], 0x03768962L) + MLA_STEP(X[5], 0x0d7e8807L) + MLA_STEP(X[6], -0x00b2aa3eL) + MLA_STEP(X[8], 0x0f426cb5L) + MLA_STEP(X[9], -0x04cfb0e2L) + MLA_STEP(X[11], 0x0ffc19fdL) + MLA_STEP(X[12], -0x0898c779L) + MLA_STEP(X[14], 0x0f9ee890L) + MLA_STEP(X[15], -0x0bcbe352L) + MLA_STEP(X[17], 0x0e313245L) + ); + x[14] = -x[3]; + + MLA( + x[26] = x[27] , t5, + MLA_STEP(X[0], -0x0ffc19fdL) + MLA_STEP(X[2], -0x0f9ee890L) + MLA_STEP(X[3], -0x0f426cb5L) + MLA_STEP(X[5], -0x0e313245L) + MLA_STEP(X[6], -0x0d7e8807L) + MLA_STEP(X[8], -0x0bcbe352L) + MLA_STEP(X[9], -0x0acf37adL) + MLA_STEP(X[11], -0x0898c779L) + MLA_STEP(X[12], -0x07635284L) + MLA_STEP(X[14], -0x04cfb0e2L) + MLA_STEP(X[15], -0x03768962L) + MLA_STEP(X[17], -0x00b2aa3eL) + ); +#else t6 = mad_f_mul(X[4], 0x0ec835e8L) + mad_f_mul(X[13], 0x061f78aaL); @@ -1504,6 +1792,7 @@ mad_f_mul(X[14], -0x04cfb0e2L) + mad_f_mul(X[15], -0x03768962L) + mad_f_mul(X[17], -0x00b2aa3eL); +#endif }
/*