Rob,
It doesn't make a huge difference to the overall execution time, but the following patch almost doubles the speed of III_imdct_s() (and reduces code size slightly).
Could you please do a quick sanity check on the accuracy for me? I would expect it to be about the same as the old version, but checksums of the WAV file outputs seem to be different (suggesting differences at least as significant as bit 16). Is there an obvious reason for this?
Andre --
____________________________________________________________ Do You Yahoo!? Get your free @yahoo.co.uk address at http://mail.yahoo.co.uk or your free @yahoo.ie address at http://mail.yahoo.ie
diff -u -r mad-0.11.3b_original/libmad/imdct_s.dat mad-0.11.3b/libmad/imdct_s.dat --- mad-0.11.3b_original/libmad/imdct_s.dat Sun Sep 24 09:20:09 2000 +++ mad-0.11.3b/libmad/imdct_s.dat Sun Sep 24 09:58:28 2000 @@ -19,50 +19,26 @@ * $Id: imdct_s.dat,v 1.1 2000/08/02 05:48:51 rob Exp $ */
- { 0x09bd7ca0L /* 0.608761429 */, -0x0ec835e8L /* -0.923879533 */, /* 0 */ - -0x0216a2a2L /* -0.130526192 */, 0x0fdcf549L /* 0.991444861 */, - -0x061f78aaL /* -0.382683432 */, -0x0cb19346L /* -0.793353340 */ }, - - { 0x061f78aaL /* 0.382683432 */, -0x0ec835e8L /* -0.923879533 */, /* 1 */ - 0x0ec835e8L /* 0.923879533 */, -0x061f78aaL /* -0.382683432 */, - -0x061f78aaL /* -0.382683432 */, 0x0ec835e8L /* 0.923879533 */ }, - - { 0x0216a2a2L /* 0.130526192 */, -0x061f78aaL /* -0.382683432 */, /* 2 */ - 0x09bd7ca0L /* 0.608761429 */, -0x0cb19346L /* -0.793353340 */, - 0x0ec835e8L /* 0.923879533 */, -0x0fdcf549L /* -0.991444861 */ }, - - { -0x0216a2a2L /* -0.130526192 */, 0x061f78aaL /* 0.382683432 */, /* 3 */ + { -0x0216a2a2L /* -0.130526192 */, 0x061f78aaL /* 0.382683432 */, /* 0 */ -0x09bd7ca0L /* -0.608761429 */, 0x0cb19346L /* 0.793353340 */, -0x0ec835e8L /* -0.923879533 */, 0x0fdcf549L /* 0.991444861 */ },
- { -0x061f78aaL /* -0.382683432 */, 0x0ec835e8L /* 0.923879533 */, /* 4 */ + { -0x061f78aaL /* -0.382683432 */, 0x0ec835e8L /* 0.923879533 */, /* 1 */ -0x0ec835e8L /* -0.923879533 */, 0x061f78aaL /* 0.382683432 */, 0x061f78aaL /* 0.382683432 */, -0x0ec835e8L /* -0.923879533 */ },
- { -0x09bd7ca0L /* -0.608761429 */, 0x0ec835e8L /* 0.923879533 */, /* 5 */ + { -0x09bd7ca0L /* -0.608761429 */, 0x0ec835e8L /* 0.923879533 */, /* 2 */ 0x0216a2a2L /* 0.130526192 */, -0x0fdcf549L /* -0.991444861 */, 0x061f78aaL /* 0.382683432 */, 0x0cb19346L /* 0.793353340 */ },
- { -0x0cb19346L /* -0.793353340 */, 0x061f78aaL /* 0.382683432 */, /* 6 */ + { -0x0cb19346L /* -0.793353340 */, 0x061f78aaL /* 0.382683432 */, /* 3 */ 0x0fdcf549L /* 0.991444861 */, 0x0216a2a2L /* 0.130526192 */, -0x0ec835e8L /* -0.923879533 */, -0x09bd7ca0L /* -0.608761429 */ },
- { -0x0ec835e8L /* -0.923879533 */, -0x061f78aaL /* -0.382683432 */, /* 7 */ + { -0x0ec835e8L /* -0.923879533 */, -0x061f78aaL /* -0.382683432 */, /* 4 */ 0x061f78aaL /* 0.382683432 */, 0x0ec835e8L /* 0.923879533 */, 0x0ec835e8L /* 0.923879533 */, 0x061f78aaL /* 0.382683432 */ },
- { -0x0fdcf549L /* -0.991444861 */, -0x0ec835e8L /* -0.923879533 */, /* 8 */ - -0x0cb19346L /* -0.793353340 */, -0x09bd7ca0L /* -0.608761429 */, - -0x061f78aaL /* -0.382683432 */, -0x0216a2a2L /* -0.130526192 */ }, - - { -0x0fdcf549L /* -0.991444861 */, -0x0ec835e8L /* -0.923879533 */, /* 9 */ + { -0x0fdcf549L /* -0.991444861 */, -0x0ec835e8L /* -0.923879533 */, /* 5 */ -0x0cb19346L /* -0.793353340 */, -0x09bd7ca0L /* -0.608761429 */, - -0x061f78aaL /* -0.382683432 */, -0x0216a2a2L /* -0.130526192 */ }, - - { -0x0ec835e8L /* -0.923879533 */, -0x061f78aaL /* -0.382683432 */, /* 10 */ - 0x061f78aaL /* 0.382683432 */, 0x0ec835e8L /* 0.923879533 */, - 0x0ec835e8L /* 0.923879533 */, 0x061f78aaL /* 0.382683432 */ }, - - { -0x0cb19346L /* -0.793353340 */, 0x061f78aaL /* 0.382683432 */, /* 11 */ - 0x0fdcf549L /* 0.991444861 */, 0x0216a2a2L /* 0.130526192 */, - -0x0ec835e8L /* -0.923879533 */, -0x09bd7ca0L /* -0.608761429 */ } + -0x061f78aaL /* -0.382683432 */, -0x0216a2a2L /* -0.130526192 */ } diff -u -r mad-0.11.3b_original/libmad/layer3.c mad-0.11.3b/libmad/layer3.c --- mad-0.11.3b_original/libmad/layer3.c Sun Sep 24 09:20:09 2000 +++ mad-0.11.3b/libmad/layer3.c Sun Sep 24 15:02:20 2000 @@ -222,10 +222,10 @@ * IMDCT coefficients for short blocks * derived from section 2.4.3.4.10.2 of ISO/IEC 11172-3 * - * imdct_s[i][k] = cos((PI / 24) * (2 * i + 7) * (2 * k + 1)) + * imdct_s[i][k] = cos((PI / 24) * (2 * (i+3) + 7) * (2 * k + 1)) */ static -mad_fixed_t const imdct_s[12][6] = { +mad_fixed_t const imdct_s[6][6] = { # include "imdct_s.dat" };
@@ -1555,32 +1555,40 @@ static void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36]) { - unsigned int w, i, k; - mad_fixed_t y[3][12]; + static const unsigned char map_duplicates[6] = { 2, 1, 0, 11, 10, 9 };
- /* IMDCT and windowing */ + int i, k; + mad_fixed_t y[36];
- for (w = 0; w < 3; ++w) { - for (i = 0; i < 12; ++i) { - register mad_fixed_t sum; - - sum = 0; - - for (k = 0; k < 6; ++k) - sum += mad_f_mul(X[6 * w + k], imdct_s[i][k]); + /* IMDCT */
- y[w][i] = mad_f_mul(sum, window_s[i]); + for (i = 0; i < 6; i++) + { + mad_fixed_t sum0 = 0, sum1 = 0, sum2 = 0; + + for (k = 0; k < 6; k++) + { + sum0 += mad_f_mul(X[k ], imdct_s[i][k]); + sum1 += mad_f_mul(X[k+ 6], imdct_s[i][k]); + sum2 += mad_f_mul(X[k+12], imdct_s[i][k]); } + + y[i+ 3] = sum0; y[map_duplicates[i] ] = (i < 3) ? -sum0 : sum0; + y[i+15] = sum1; y[map_duplicates[i]+12] = (i < 3) ? -sum1 : sum1; + y[i+27] = sum2; y[map_duplicates[i]+24] = (i < 3) ? -sum2 : sum2; }
- /* overlapping and concatenation */ + /* Windowing, overlapping and concatenation */
- for (i = 0; i < 6; ++i) z[i] = 0; - for (i = 6; i < 12; ++i) z[i] = y[0][i - 6]; - for (i = 12; i < 18; ++i) z[i] = y[0][i - 6] + y[1][i - 12]; - for (i = 18; i < 24; ++i) z[i] = y[1][i - 12] + y[2][i - 18]; - for (i = 24; i < 30; ++i) z[i] = y[2][i - 18]; - for (i = 30; i < 36; ++i) z[i] = 0; + for (i = 0; i < 6; i++) + { + z[i ] = 0; + z[i+ 6] = mad_f_mul(y[i+ 0], window_s[i+0]); + z[i+12] = mad_f_mul(y[i+ 6], window_s[i+6]) + mad_f_mul(y[i+12], window_s[i]); + z[i+18] = mad_f_mul(y[i+18], window_s[i+6]) + mad_f_mul(y[i+24], window_s[i]); + z[i+24] = mad_f_mul(y[i+30], window_s[i+6]); + z[i+30] = 0; + } }
/*