Rob,
It doesn't make a huge difference to the overall execution time, but the following patch almost doubles the speed of III_imdct_s() (and reduces code size slightly).
Could you please do a quick sanity check on the accuracy for me? I would expect it to be about the same as the old version, but checksums of the WAV file outputs seem to be different (suggesting differences at least as significant as bit 16). Is there an obvious reason for this?
Andre --
____________________________________________________________ Do You Yahoo!? Get your free @yahoo.co.uk address at http://mail.yahoo.co.uk or your free @yahoo.ie address at http://mail.yahoo.ie
diff -u -r mad-0.11.3b_original/libmad/imdct_s.dat mad-0.11.3b/libmad/imdct_s.dat --- mad-0.11.3b_original/libmad/imdct_s.dat Sun Sep 24 09:20:09 2000 +++ mad-0.11.3b/libmad/imdct_s.dat Sun Sep 24 09:58:28 2000 @@ -19,50 +19,26 @@ * $Id: imdct_s.dat,v 1.1 2000/08/02 05:48:51 rob Exp $ */
- { 0x09bd7ca0L /* 0.608761429 */, -0x0ec835e8L /* -0.923879533 */, /* 0 */ - -0x0216a2a2L /* -0.130526192 */, 0x0fdcf549L /* 0.991444861 */, - -0x061f78aaL /* -0.382683432 */, -0x0cb19346L /* -0.793353340 */ }, - - { 0x061f78aaL /* 0.382683432 */, -0x0ec835e8L /* -0.923879533 */, /* 1 */ - 0x0ec835e8L /* 0.923879533 */, -0x061f78aaL /* -0.382683432 */, - -0x061f78aaL /* -0.382683432 */, 0x0ec835e8L /* 0.923879533 */ }, - - { 0x0216a2a2L /* 0.130526192 */, -0x061f78aaL /* -0.382683432 */, /* 2 */ - 0x09bd7ca0L /* 0.608761429 */, -0x0cb19346L /* -0.793353340 */, - 0x0ec835e8L /* 0.923879533 */, -0x0fdcf549L /* -0.991444861 */ }, - - { -0x0216a2a2L /* -0.130526192 */, 0x061f78aaL /* 0.382683432 */, /* 3 */ + { -0x0216a2a2L /* -0.130526192 */, 0x061f78aaL /* 0.382683432 */, /* 0 */ -0x09bd7ca0L /* -0.608761429 */, 0x0cb19346L /* 0.793353340 */, -0x0ec835e8L /* -0.923879533 */, 0x0fdcf549L /* 0.991444861 */ },
- { -0x061f78aaL /* -0.382683432 */, 0x0ec835e8L /* 0.923879533 */, /* 4 */ + { -0x061f78aaL /* -0.382683432 */, 0x0ec835e8L /* 0.923879533 */, /* 1 */ -0x0ec835e8L /* -0.923879533 */, 0x061f78aaL /* 0.382683432 */, 0x061f78aaL /* 0.382683432 */, -0x0ec835e8L /* -0.923879533 */ },
- { -0x09bd7ca0L /* -0.608761429 */, 0x0ec835e8L /* 0.923879533 */, /* 5 */ + { -0x09bd7ca0L /* -0.608761429 */, 0x0ec835e8L /* 0.923879533 */, /* 2 */ 0x0216a2a2L /* 0.130526192 */, -0x0fdcf549L /* -0.991444861 */, 0x061f78aaL /* 0.382683432 */, 0x0cb19346L /* 0.793353340 */ },
- { -0x0cb19346L /* -0.793353340 */, 0x061f78aaL /* 0.382683432 */, /* 6 */ + { -0x0cb19346L /* -0.793353340 */, 0x061f78aaL /* 0.382683432 */, /* 3 */ 0x0fdcf549L /* 0.991444861 */, 0x0216a2a2L /* 0.130526192 */, -0x0ec835e8L /* -0.923879533 */, -0x09bd7ca0L /* -0.608761429 */ },
- { -0x0ec835e8L /* -0.923879533 */, -0x061f78aaL /* -0.382683432 */, /* 7 */ + { -0x0ec835e8L /* -0.923879533 */, -0x061f78aaL /* -0.382683432 */, /* 4 */ 0x061f78aaL /* 0.382683432 */, 0x0ec835e8L /* 0.923879533 */, 0x0ec835e8L /* 0.923879533 */, 0x061f78aaL /* 0.382683432 */ },
- { -0x0fdcf549L /* -0.991444861 */, -0x0ec835e8L /* -0.923879533 */, /* 8 */ - -0x0cb19346L /* -0.793353340 */, -0x09bd7ca0L /* -0.608761429 */, - -0x061f78aaL /* -0.382683432 */, -0x0216a2a2L /* -0.130526192 */ }, - - { -0x0fdcf549L /* -0.991444861 */, -0x0ec835e8L /* -0.923879533 */, /* 9 */ + { -0x0fdcf549L /* -0.991444861 */, -0x0ec835e8L /* -0.923879533 */, /* 5 */ -0x0cb19346L /* -0.793353340 */, -0x09bd7ca0L /* -0.608761429 */, - -0x061f78aaL /* -0.382683432 */, -0x0216a2a2L /* -0.130526192 */ }, - - { -0x0ec835e8L /* -0.923879533 */, -0x061f78aaL /* -0.382683432 */, /* 10 */ - 0x061f78aaL /* 0.382683432 */, 0x0ec835e8L /* 0.923879533 */, - 0x0ec835e8L /* 0.923879533 */, 0x061f78aaL /* 0.382683432 */ }, - - { -0x0cb19346L /* -0.793353340 */, 0x061f78aaL /* 0.382683432 */, /* 11 */ - 0x0fdcf549L /* 0.991444861 */, 0x0216a2a2L /* 0.130526192 */, - -0x0ec835e8L /* -0.923879533 */, -0x09bd7ca0L /* -0.608761429 */ } + -0x061f78aaL /* -0.382683432 */, -0x0216a2a2L /* -0.130526192 */ } diff -u -r mad-0.11.3b_original/libmad/layer3.c mad-0.11.3b/libmad/layer3.c --- mad-0.11.3b_original/libmad/layer3.c Sun Sep 24 09:20:09 2000 +++ mad-0.11.3b/libmad/layer3.c Sun Sep 24 15:02:20 2000 @@ -222,10 +222,10 @@ * IMDCT coefficients for short blocks * derived from section 2.4.3.4.10.2 of ISO/IEC 11172-3 * - * imdct_s[i][k] = cos((PI / 24) * (2 * i + 7) * (2 * k + 1)) + * imdct_s[i][k] = cos((PI / 24) * (2 * (i+3) + 7) * (2 * k + 1)) */ static -mad_fixed_t const imdct_s[12][6] = { +mad_fixed_t const imdct_s[6][6] = { # include "imdct_s.dat" };
@@ -1555,32 +1555,40 @@ static void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36]) { - unsigned int w, i, k; - mad_fixed_t y[3][12]; + static const unsigned char map_duplicates[6] = { 2, 1, 0, 11, 10, 9 };
- /* IMDCT and windowing */ + int i, k; + mad_fixed_t y[36];
- for (w = 0; w < 3; ++w) { - for (i = 0; i < 12; ++i) { - register mad_fixed_t sum; - - sum = 0; - - for (k = 0; k < 6; ++k) - sum += mad_f_mul(X[6 * w + k], imdct_s[i][k]); + /* IMDCT */
- y[w][i] = mad_f_mul(sum, window_s[i]); + for (i = 0; i < 6; i++) + { + mad_fixed_t sum0 = 0, sum1 = 0, sum2 = 0; + + for (k = 0; k < 6; k++) + { + sum0 += mad_f_mul(X[k ], imdct_s[i][k]); + sum1 += mad_f_mul(X[k+ 6], imdct_s[i][k]); + sum2 += mad_f_mul(X[k+12], imdct_s[i][k]); } + + y[i+ 3] = sum0; y[map_duplicates[i] ] = (i < 3) ? -sum0 : sum0; + y[i+15] = sum1; y[map_duplicates[i]+12] = (i < 3) ? -sum1 : sum1; + y[i+27] = sum2; y[map_duplicates[i]+24] = (i < 3) ? -sum2 : sum2; }
- /* overlapping and concatenation */ + /* Windowing, overlapping and concatenation */
- for (i = 0; i < 6; ++i) z[i] = 0; - for (i = 6; i < 12; ++i) z[i] = y[0][i - 6]; - for (i = 12; i < 18; ++i) z[i] = y[0][i - 6] + y[1][i - 12]; - for (i = 18; i < 24; ++i) z[i] = y[1][i - 12] + y[2][i - 18]; - for (i = 24; i < 30; ++i) z[i] = y[2][i - 18]; - for (i = 30; i < 36; ++i) z[i] = 0; + for (i = 0; i < 6; i++) + { + z[i ] = 0; + z[i+ 6] = mad_f_mul(y[i+ 0], window_s[i+0]); + z[i+12] = mad_f_mul(y[i+ 6], window_s[i+6]) + mad_f_mul(y[i+12], window_s[i]); + z[i+18] = mad_f_mul(y[i+18], window_s[i+6]) + mad_f_mul(y[i+24], window_s[i]); + z[i+24] = mad_f_mul(y[i+30], window_s[i+6]); + z[i+30] = 0; + } }
/*
Andre,
> Could you please do a quick sanity check on the accuracy for me. I would expect it to be about the same as the old version, but checksums of wav file outputs seem to be different (suggesting differences at least as significant as bit 16). Is there an obvious reason for this ?
Oddly, the Layer III compliance test bitstream doesn't contain any short blocks, so I can't use it to check.
Comparing MAD's output with itself (with and without your patch), I'm finding differences of around 4.768e-07 (about 2^-21), i.e. the two outputs agree to roughly 21 bits.
The reason for this difference isn't obvious to me, but I haven't studied the patch carefully. In any case, the difference is very small.
Cheers, -rob