I was testing the Szu-Wei Lee algorithm and found that it is faster than the one in libmad. A very simple implementation that directly follows the derivation of the algorithm is:
/* fast 9-point transform (Szu-Wei Lee); only 8 multiplications */
static void sdct9(const mad_fixed_t x[9], mad_fixed_t y[9])
{
  mad_fixed_t x0, x1, x2, x3, x4, x5, x6, x7, x8;
  mad_fixed_t a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13,
              a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26;
  mad_fixed_t m1, m2, m3, m4, m5, m6, m7, m8;

  const mad_fixed_t d[] = {
    0x1bb67ae8,  /* sqrt(3)         */
    0xe1ee09bf,  /* 2*cos(8*M_PI/9) */
    0x058e86a0,  /* 2*cos(4*M_PI/9) */
    0x18836fa3,  /* 2*cos(2*M_PI/9) */
    0x0af1d43a,  /* 2*sin(8*M_PI/9) */
    0x1f838b8d,  /* 2*sin(4*M_PI/9) */
    0x1491b752   /* 2*sin(2*M_PI/9) */
  };

  x0 = x[0]; x1 = x[1]; x2 = x[2];
  x3 = x[3]; x4 = x[4]; x5 = x[5];
  x6 = x[6]; x7 = x[7]; x8 = x[8];

  a1  = x3 + x5;
  a2  = x3 - x5;
  a3  = x6 + x2;
  a4  = x6 - x2;
  a5  = x1 + x7;
  a6  = x1 - x7;
  a7  = x8 + x0;
  a8  = x8 - x0;

  a9  = x4  + a5;
  a10 = a1  + a3;
  a11 = a10 + a7;
  a12 = a3  - a7;
  a13 = a1  - a7;
  a14 = a1  - a3;
  a15 = a2  - a4;
  a16 = a15 + a8;
  a17 = a4  + a8;
  a18 = a2  - a8;
  a19 = a2  + a4;

  m1 = mad_f_mul(-d[0], a6);
  m2 = mad_f_mul(-d[1], a12);
  m3 = mad_f_mul(-d[2], a13);
  m4 = mad_f_mul(-d[3], a14);
  m5 = mad_f_mul(-d[0], a16);
  m6 = mad_f_mul(-d[4], a17);
  m7 = mad_f_mul(-d[5], a18);
  m8 = mad_f_mul(-d[6], a19);

  a20 = x4 + x4 - a5;
  a21 = a20 + m2;
  a22 = a20 - m2;
  a23 = a20 + m3;
  a24 = m1  + m6;
  a25 = m1  - m6;
  a26 = m1  + m7;

  y[0] = a9  + a11;
  y[1] = m8  - a26;
  y[2] = m4  - a21;
  y[3] = m5;
  y[4] = a22 - m3;
  y[5] = a25 - m7;
  y[6] = a11 - a9 - a9;
  y[7] = a24 + m8;
  y[8] = a23 + m4;
}
/* 18-point scaled DCT, decomposed into two 9-point transforms */
static void sdct18(const mad_fixed_t x[18], mad_fixed_t X[36])
{
  mad_fixed_t u[9];
  mad_fixed_t v[9];
  mad_fixed_t x1[9];
  mad_fixed_t x2[9];
  int i;

  const mad_fixed_t costab[] = {
    /* 2*cos(M_PI*(2*i+1)/(2*18)), i = 0..8 */
    0x1fe0d3b4, 0x1ee8dd47, 0x1d007930,
    0x1a367e59, 0x16a09e66, 0x125abcf8,
    0x0d8616bc, 0x08483ee1, 0x02c9fad7
  };

  for (i = 0; i < 9; i++) {
    x1[i] = x[i] + x[18 - 1 - i];
    x2[i] = mad_f_mul(x[i] - x[18 - 1 - i], costab[i]);
  }

  sdct9(x1, u);
  sdct9(x2, v);

  for (i = 1; i < 9; i++)
    v[i] = v[i] - v[i - 1];

  X[0] = u[0];
  X[1] = v[0];

  for (i = 1; i < 9; i++) {
    X[2 * i]     = u[i];
    X[2 * i + 1] = v[i];
  }
}
/* 18-point DCT-IV, computed via sdct18 with a cosine pre-multiplication
   and a difference on the outputs */
static void dctIV(mad_fixed_t const y[18], mad_fixed_t X[36])
{
  int i;
  mad_fixed_t temp[18];

  const mad_fixed_t costab[] = {
    /* cos(M_PI*(2*i+1)/(2*36)), i = 0..17 */
    0x0ffc19fd, 0x0fdcf549, 0x0f9ee890, 0x0f426cb5, 0x0ec835e8, 0x0e313245,
    0x0d7e8807, 0x0cb19346, 0x0bcbe352, 0x0acf37ad, 0x09bd7ca0, 0x0898c779,
    0x07635284, 0x061f78aa, 0x04cfb0e2, 0x03768962, 0x0216a2a2, 0x00b2aa3e
  };

  for (i = 0; i < 18; i++)
    temp[i] = mad_f_mul(y[i], costab[i]);

  sdct18(temp, X);

  for (i = 1; i < 18; i++)
    X[i] = X[i] - X[i - 1];
}
/* 36-point IMDCT: an 18-point DCT-IV followed by the symmetry extension
   and reordering of the output */
static inline void my_imdct36(mad_fixed_t const X[18], mad_fixed_t y[36])
{
  mad_fixed_t temp[36];
  int m;

  dctIV(X, temp);

  for (m = 0; m < 18; m++)
    temp[35 - m] = -temp[m];

  for (m = 0; m <= 8; m++)
    y[m + 3 * 9] = -temp[m];

  for (m = 9; m < 36; m++)
    y[m - 9] = temp[m];
}
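For sanity checking, my_imdct36 can be compared against a direct evaluation of the Layer III IMDCT definition, y[i] = sum_{k=0}^{17} X[k] * cos(M_PI/72 * (2*i+1+18) * (2*k+1)). Below is a minimal floating-point sketch of such a reference; ref_imdct36 is only an illustrative name, and I have not re-checked here whether a constant scale factor separates it from the fixed-point routine above:

#include <math.h>

/* direct (slow) IMDCT of size 36, straight from the Layer III definition:
   y[i] = sum_{k=0}^{17} X[k] * cos(M_PI/72 * (2*i + 1 + 18) * (2*k + 1)) */
static void ref_imdct36(double const X[18], double y[36])
{
  int i, k;

  for (i = 0; i < 36; ++i) {
    double sum = 0.0;

    for (k = 0; k < 18; ++k)
      sum += X[k] * cos(M_PI / 72 * (2 * i + 1 + 18) * (2 * k + 1));

    y[i] = sum;
  }
}

The fixed-point values can be converted with mad_f_todouble() for the comparison; the tables above are in the default Q28 format (the sqrt(3) entry 0x1bb67ae8 is sqrt(3)*2^28), so only a small absolute tolerance should be needed.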
The imdct of size 12 has not been implemented. I will now work on it and on some optimisations.
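For the size-12 transform (used for short blocks), the corresponding definition is y[i] = sum_{k=0}^{5} X[k] * cos(M_PI/24 * (2*i+1+6) * (2*k+1)). Until a fast version exists, a direct form can again serve as a reference; this is only a sketch in the same spirit as ref_imdct36 above, not an optimised implementation:

/* direct (slow) IMDCT of size 12, straight from the Layer III definition;
   assumes <math.h> as in ref_imdct36 above */
static void ref_imdct12(double const X[6], double y[12])
{
  int i, k;

  for (i = 0; i < 12; ++i) {
    double sum = 0.0;

    for (k = 0; k < 6; ++k)
      sum += X[k] * cos(M_PI / 24 * (2 * i + 1 + 6) * (2 * k + 1));

    y[i] = sum;
  }
}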
Hope this is useful.
Rafael.
> I was testing the Szu-Wei Lee algorithm and found that it is faster than the one in libmad.
As a matter of fact I have also been experimenting with a new Layer III IMDCT implementation based on the same algorithm. It is nice because it reduces the number of multiplications from 188 to 43.
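(For comparison, the code posted above also needs 43 fixed-point multiplications: 18 in the dctIV pre-multiplication, 9 in sdct18, and 8 in each of the two sdct9 calls.)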
I have had to modify the algorithm slightly to improve the output accuracy. Compared to the libmad 0.15.0b release, it improves overall performance on many platforms by as much as 6-7%, and accuracy as well.
My current implementation does not appear to improve performance over Andre McCurdy's hand-written assembly implementation for ARM platforms, however. Maybe that assembly could be rewritten according to the new algorithm for significant gains.
My implementation is attached; this or something very similar will be incorporated into the next libmad release.