```// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// The module below implements the fast Fourier transform as a circuit having //
// K=log(N) columns for N=2^K inputs. In each column a butterfly permutation  //
// is applied and the lower half is multiplied with powers of the principal   //
// root of unity. At the end the results appear with bit reversed indices, so //
// that the final step is a bit reversal of the indices.                      //
//   As can be seen, the FFT can be implemented in O(log(N)) depth using O(N) //
// processors. Since, e.g. multiplication of polynoms and radix-B numbers can //
// be implemented by the FFT as well, we also obtain efficient algorithms for //
// these problems by means of the FFT.                                        //
// By definition, the FFT is a matrix multiplication of the Fourier matrix    //
// F_N and the vector x[n-1..0], where the coefficient f[j][k] of the Fourier //
// matrix F_N are defined as f[j][k] := w^{j*k} = proot(j*k), i.e.            //
//                                                                            //
//    | proot(0*0)  proot(0*1)  proot(0*2)  proot(0*3) |   | x[0] |           //
//    | proot(1*0)  proot(1*1)  proot(1*2)  proot(1*3) |   | x[1] |           //
//    | proot(2*0)  proot(2*1)  proot(2*2)  proot(2*3) | * | x[2] |           //
//    | proot(3*0)  proot(3*1)  proot(3*2)  proot(3*3) |   | x[3] |           //
//                                                                            //
// where proot(k) is defined as a macro below (proot(1) is the principal N-th //
// root of unity, i.e., proot(1)^N = proot(N) = 1 and proot(1) is no power of //
// another N-the root of unity).                                              //
// Thus, the vector y[0…N-1] defined as follows is the DFT of x[0…N-1]:       //
// y[j] = sum(k=0..N-1) exp(e,-(2*pi*i*j*k)/N) * x[k]                         //
//      = sum(k=0..N-1) (cos((2*pi*j*k)/N) - i * sin((2*pi*j*k)/N)) * x[k]    //
// Such a matrix/vector product requires sequential time O(N^2). However,     //
// there is a simple divide and conquer algorithm known as the Fast Fourier   //
// Transform (FFT) that exploits the symmetries of the coefficients. The FFT  //
// consists of O(log(N)) steps where each step requires O(N) time, hence, in  //
// total O(N*log(N)) time is required. The version below is a simple parallel //
// execution of each step in time O(1) with O(N) processors, so that O(N)     //
// processors can execute the FFT in time O(log(N)).                          //
// ************************************************************************** //

macro K = 4;
macro N = exp(2,K);

// arithmetic on complex numbers
macro csub(x,y) = (x.0-y.0,x.1-y.1);
macro cmul(x,y) = (x.0*y.0-x.1*y.1,x.1*y.0+x.0*y.1);
macro cdiv(x,y) = ((x.0*y.0+x.1*y.1)/(y.0*y.0+y.1*y.1),
(x.1*y.0-x.0*y.1)/(y.0*y.0+y.1*y.1));
macro Re(x) = x.0;
macro Im(x) = x.1;

// numeric precision
macro eps = 1.0e-4;
macro almost_equal(x,y) = y-x<eps & x-y < eps ;
macro rnd(x) = (almost_equal(x,0.0) ? 0.0 : (almost_equal(x,1.0) ? 1.0 : x));

// The principal N-th root of unity is exp(e,(-2*pi*i*j*k)/N) with i=sqrt(-1)
// which can be also written as cos((2*pi*k)/N) - i * sin((2*pi*k)/N).
macro e  = 2.7182818284590452353602874713527;
macro pi = 3.1415926535897932384626433832795;
macro proot(k) = (rnd(cos((2*pi*k)/N)),-rnd(sin((2*pi*k)/N))); // =

module FastFourierTransform([N](real * real) ?x,y) {
[K+1][N](real * real) z;

// initial column
for(i=0..N-1)
z[0][i] = x[i];

// there are K columns connected by butterfly permutations
for(h=0..K-1) {
// column h consists of exp(2,h) blocks of size exp(2,K-h)
for(b=0..exp(2,h)-1)
let(boff = b*exp(2,K-h))
let(half = exp(2,K-h-1)) {
// upper half of block b
for(j=0..half-1)
// lower half of block b
for(j=half..2*half-1)
let(w2 = proot(exp(2,h) * (j-half)))
z[h+1][boff+j] = cmul(w2,csub(z[h][boff+j-half],z[h][boff+j]));
}
}

// rightmost row is mapped to outputs by bitreversal of indices
// (all K perfect shuffle permutations are done at the end)
for(j=0..N-1)
y[j] = z[K][bv2nat(reverse(nat2bv(j,K)))];

}
drivenby {
for(j=0..N-1)
x[j] = (j,j);
// check equality up to numeric precision with the formal definition
// y[j] == sum(k=0..N-1) (cos((2*pi*j*k)/N) - i * sin((2*pi*j*k)/N)) * x[k]
for(j=0..N-1)
let(re = sum(k=0..N-1) Re(cmul((rnd(cos((2*pi*j*k)/N)),-rnd(sin((2*pi*j*k)/N))),x[k])))
let(im = sum(k=0..N-1) Im(cmul((rnd(cos((2*pi*j*k)/N)),-rnd(sin((2*pi*j*k)/N))),x[k])))
assert(almost_equal(Re(y[j]),re) & almost_equal(Im(y[j]),im));
}
```