-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
I was testing the Szu-Wei Lee algorithm and found that it is faster than the
one in libmad. A very simplistic implementation that directly follows the
algorithm deduction is:
static void sdct9(const mad_fixed_t x[9], mad_fixed_t y[9]){
mad_fixed_t x0,x1,x2,x3,x4,x5,x6,x7,x8;
mad_fixed_t
a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25,a26;
mad_fixed_t m1,m2,m3,m4,m5,m6,m7,m8;
const mad_fixed_t d[]={
0x1bb67ae8, /*sqrt(3)*/
0xe1ee09bf, /*2*cos(8*M_PI/9)*/
0x058e86a0, /*2*cos(4*M_PI/9)*/
0x18836fa3, /*2*cos(2*M_PI/9)*/
0x0af1d43a, /*2*sin(8*M_PI/9)*/
0x1f838b8d, /*2*sin(4*M_PI/9)*/
0x1491b752 /*2*sin(2*M_PI/9)*/
};
x0=x[0];
x1=x[1];
x2=x[2];
x3=x[3];
x4=x[4];
x5=x[5];
x6=x[6];
x7=x[7];
x8=x[8];
a1 =x3 + x5;
a2 =x3 - x5;
a3 =x6 + x2;
a4 =x6 - x2;
a5 =x1 + x7;
a6 =x1 - x7;
a7 =x8 + x0;
a8 =x8 - x0;
a9 =x4 + a5;
a10=a1 + a3;
a11=a10 + a7;
a12=a3 - a7;
a13=a1 - a7;
a14=a1 - a3;
a15=a2 - a4;
a16=a15 + a8;
a17=a4 + a8;
a18=a2 - a8;
a19=a2 + a4;
m1 =mad_f_mul(-d[0], a6);
m2 =mad_f_mul(-d[1], a12);
m3 =mad_f_mul(-d[2], a13);
m4 =mad_f_mul(-d[3], a14);
m5 =mad_f_mul(-d[0], a16);
m6 =mad_f_mul(-d[4], a17);
m7 =mad_f_mul(-d[5], a18);
m8 =mad_f_mul(-d[6], a19);
a20=x4 + x4 -a5;
a21=a20+m2;
a22=a20-m2;
a23=a20+m3;
a24=m1+m6;
a25=m1-m6;
a26=m1+m7;
y[0]=a9+a11;
y[1]=m8-a26;
y[2]=m4-a21;
y[3]=m5;
y[4]=a22-m3;
y[5]=a25-m7;
y[6]=a11-a9-a9;
y[7]=a24+m8;
y[8]=a23+m4;
}
static void sdct18(const mad_fixed_t x[18], mad_fixed_t X[36]){
mad_fixed_t u[9];
mad_fixed_t v[9];
mad_fixed_t x1[9];
mad_fixed_t x2[9];
int i;
const mad_fixed_t costab[]={
0x1fe0d3b4, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x1ee8dd47, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x1d007930, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x1a367e59, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x16a09e66, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x125abcf8, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x0d8616bc, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x08483ee1, /*2*cos(M_PI*(2*i+1)/(2*18))*/
0x02c9fad7 /*2*cos(M_PI*(2*i+1)/(2*18))*/
};
for(i=0;i<9;i++){
x1[i]=x[i]+x[18-1-i];
x2[i]=mad_f_mul((x[i]-x[18-1-i]),costab[i]);
}
sdct9(x1,u);
sdct9(x2,v);
for(i=1;i<9;i++)
v[i]=v[i]-v[i-1];
X[0]=u[0];
X[1]=v[0];
for(i=1;i<9;i++){
X[2*i]=u[i];
X[2*i+1]=v[i];
}
}
static void dctIV(mad_fixed_t const y[18], mad_fixed_t X[36]){
int i;
mad_fixed_t temp[18];
mad_fixed_t costab[]={
0x0ffc19fd, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0fdcf549, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0f9ee890, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0f426cb5, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0ec835e8, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0e313245, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0d7e8807, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0cb19346, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0bcbe352, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0acf37ad, /*cos(M_PI*(2*i+1)/(2*36))*/
0x09bd7ca0, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0898c779, /*cos(M_PI*(2*i+1)/(2*36))*/
0x07635284, /*cos(M_PI*(2*i+1)/(2*36))*/
0x061f78aa, /*cos(M_PI*(2*i+1)/(2*36))*/
0x04cfb0e2, /*cos(M_PI*(2*i+1)/(2*36))*/
0x03768962, /*cos(M_PI*(2*i+1)/(2*36))*/
0x0216a2a2, /*cos(M_PI*(2*i+1)/(2*36))*/
0x00b2aa3e /*cos(M_PI*(2*i+1)/(2*36))*/
};
for(i=0;i<18;i++){
temp[i]=mad_f_mul(y[i],costab[i]);
}
sdct18(temp, X);
for(i=1;i<18;i++)
X[i]=X[i]-X[i-1];
}
static inline
void my_imdct36(mad_fixed_t const X[18], mad_fixed_t y[36]){
mad_fixed_t temp[36];
int m;
dctIV(X,temp);
for(m=0;m<18;m++)
temp[35-m]=-temp[m];
for(m=0;m<=8;m++)
y[m+3*9]=-temp[m];
for(m=9;m<36;m++)
y[m-9]=temp[m];
}
The imdct of size 12 has not been implemented. I will now work on it and on
some optimisations.
Hope this is useful.
Rafael.
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)
iD8DBQE/b7nBLlrfGJ8JUHwRAvkcAJoCSK5uIWWmbZNru3U7ATfllFk71ACdHtUP
mrFQRfHU8Wl+vdUZ5kwt0cw=
=k3e9
-----END PGP SIGNATURE-----