// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This Quartz file implements a Benes network based on the butterfly         //
// permutation. Benes networks for N=exp(2,K) inputs consist of 2*K-1 columns //
// that are partitioned into blocks. If columns are enumerated 0,…,2*K-2 from //
// left to right, then col. i has (i<K ? exp(2,i) : exp(2,2*K-2-col)) blocks. //
// The blocks of the columns are connected with those of the previous column  //
// by a permutation, in case below, the butterfly permutation is used.        //
// The construction can also be recursively defined: A Benes network for N    //
// inputs is obtained by two Benes networks for N/2 inputs that are connected //
// with a new leftmost and a new rightmost column that have both N/2 switches.//
// These are connected with the two subnetworks with the chosen permutation.  //
// Benes networks can implement all permutations, but the routing information //
// is difficult to compute. It is however easily seen that the configurations //
// of switches in columns i for i<K depend on the configurations of those in  //
// column 2*K-2+i which leads to the looping algorithm due to Waksman [Waks69]//
// that requires O(N log(N)) time on a single processor. The version below is //
// a parallel algorithm similar to [NaSa82] that only requires parallel time  //
// O(log(N)^2) with O(N) processors.                                          //
// ************************************************************************** //

macro K = 4;                // log(N), i.e., the number of columns
macro N = exp(2,K);         // number of inputs, i.e. twice the number of rows
macro btfly(x)  = (sizeOf(x)<=2 ? reverse(x) : x{0}@x{-2:1}@x{-1});
macro numb(col) = (col<K ? exp(2,col-1) : exp(2,2*K-2-col));
macro bits(col) = (col<K ? K+1-col : col-K+2);
macro even(x) = ((x/2)*2 == x);


module BenesButterfly([N]nat ?x,[N]nat{N} ?target,[N]nat y,event !rdy) {
    [N][2*K-1]nat z;        // data wires through the network
    [N]nat{N} tg,itg;       // target and inverse target addresses of curr. col.
    [N/2]nat{N} pre,suc;    // next switches on the cycle of dependent switches
    [N/2]nat{N} rep;        // rep[i] is the representative of the cycle of switch i
    [N/2][2*K-1]bool cross; // whether switch[i][j] is crossed
    [N/2][2*K-1]bool known; // whether value of cross[i][j] is known

    // ------------------------------------------------------------------------
    // construction of the Benes network
    // ------------------------------------------------------------------------
    // copy inputs to the leftmost column of the network
    for(i=0..N/2-1) {
        z[2*i][0]   = (cross[i][0] ? x[2*i+1] : x[2*i]   );
        z[2*i+1][0] = (cross[i][0] ? x[2*i]   : x[2*i+1] );
    }

    // intermediate columns of the network
    for(col=1..2*K-2) {                     // for all columns    
        for(b=0..numb(col)-1) {             // for all blocks
            let(bsizew = N/numb(col))       // #wires in a block
            let(bsizes = bsizew/2)          // #switches in a block
            let(boffw  = b*bsizew)          // wire   offset of the block
            let(boffs  = b*bsizes)          // switch offset of the block
            for(i=0..bsizes-1) {            // for all switches in a block
                // determine outputs of switch i in block b of column col
                let(i0 = 2*i)               // block index of input 0
                let(i1 = 2*i+1)             // block index of input 1
                let(s0 = (col<=0 ? i0 : bv2nat(btfly(nat2bv(i0,bits(col))))))
                let(s1 = (col<=0 ? i1 : bv2nat(btfly(nat2bv(i1,bits(col))))))
                let(j0 = boffw + s0)         // absolute source index of i0
                let(j1 = boffw + s1)         // absolute source index of i1
                let(cr = cross[boffs+i][col]) {
                z[boffw+i0][col] = (cr ? z[j1][col-1] : z[j0][col-1]);
                z[boffw+i1][col] = (cr ? z[j0][col-1] : z[j1][col-1]);
                }
            }
        }
    }

    // receiving the final outputs
    for(i=0..N-1)
        y[i] = z[i][2*K-2];



    // ------------------------------------------------------------------------
    // routing by the looping algorithm
    // ------------------------------------------------------------------------
    // The routing algorithm below is a parallel version of Waksman's looping
    // algorithm. We exploit the dependencies of the switches in column c and
    // column 2K-2-c. To this end, we first determine the target and inverse
    // target addresses desired by these columns. Since each switch [i][c] in
    // determines the switches [tg[2*i]/2][2K-2-c] and  [tg[2*i+1]/2][2K-2-c]
    // and each switch [j][2K-2-c] determines two switches [itg[2*j]/2][c] and
    // [itg[2*j+1]/2][c], we see that each switch [i][c] depends on two
    // switches in its own column. For switch [i][c], the rows of these switches
    // are stored in suc[i] and pre[i] without any preference which one goes
    // to suc[i] and pre[i], respectively. This proves that the dependency 
    // graph of the switches is a set of unconnected cycles. We then compute 
    // for each of these cycles the switch with the minimal row in rep[i]
    // so that rep[i] is a representative of the equivalence class that is
    // given by the cycle of i. This is done in parallel time O(log(N)) using
    // O(N) processors. The switches [rep[i]][c] can now be set arbitrarily;
    // we prefer to bring them in the straight position. After this, the 
    // dependent configurations are determined in a single step by propagating
    // the made choices so that all configurations of the switches in columns
    // c and 2K-2-c are known. Then, we proceed to columns c+1 and 2K-2-(c+1).
    // ------------------------------------------------------------------------

    // target and inverse targets for columns 0 and 2K-2, respectively
    for(i=0..N-1) {
        tg[i] = target[i];
    }
   
    for(col=0..K-2) {
        // compute inverse targets
        for(i=0..N-1) {
            itg[tg[i]] = i; 
        }
        // Switch [i][col] shall be connected to switch [tg[2*i]/2][2*K-2-col] and
        // also to switch [tg[2*i+1]/2][2*K-2-col]. Moreover these switches are
        // connected to two (possibly other) switches in column col. These two
        // are determined in suc[i] and pre[i] without preferences which 
        // one goes to suc or pr. Note the following where j is one of tg[2*i]/2
        // or tg[2*i+1]/2:
        //
        //  2*i+1 -|   |          |   |-> 2*j+1  
        //         | i | -------> | j |
        //  2*i   -|   |          |   |-> 2*j
        //
        for(i=0..N/2-1) {
            rep[i] = i;
            let(tg1 = tg[2*i])
            suc[i]  = itg[(even(tg1) ? tg1+1 : tg1-1)]/2;
            let(tg2 = tg[2*i+1])
            pre[i]  = itg[(even(tg2) ? tg2+1 : tg2-1)]/2;
        }
        
        // compute the transitive hull of the dependencies by iterative squaring
        for(h=0..log(N/2)-1) {
            for(i=0..N/2-1) {
                let(v1 = rep[i])
                let(v2 = rep[suc[i]])
                let(v3 = rep[pre[i]])
                let(m12 = (v1<v2 ? v1 : v2))
                let(m123 = (m12<v3 ? m12 : v3))
                next(rep[i]) = m123;
            }
            rep_iter: pause;
        }
        // make decisions for the representatives of each SCC in the left column
        for(i=0..N/2-1) {
            cross[rep[i]][col] = false;
            known[rep[i]][col] = true;
        }
        // propagate the decisions using the dependencies between columns
        for(i=0..N/2-1) {
            if(known[i][col]) {
                // If cross[i][col] holds, z[2*i][col-1] is sent to the upper
                // network while z[2*i+1][col-1] is sent to the lower network.
                // Thus, tg[2*i] must be received from the upper network, which
                // implies crossed position iff tg[2*i] is odd.
                // The other implications are analogous.
                known[tg[2*i  ]/2][2*K-2-col] = true;
                known[tg[2*i+1]/2][2*K-2-col] = true;
                let(cr = cross[i][col]) {
                cross[tg[2*i  ]/2][2*K-2-col] = cr xor nat2bv(tg[2*i  ],K){0};
                cross[tg[2*i+1]/2][2*K-2-col] = cr eqv nat2bv(tg[2*i+1],K){0};
                }
            }
            if(known[i][2*K-2-col]) {
                known[itg[2*i  ]/2][col] = true;
                known[itg[2*i+1]/2][col] = true;
                let(cr = cross[i][2*K-2-col]) {
                cross[itg[2*i  ]/2][col] = cr xor nat2bv(itg[2*i  ],K){0};
                cross[itg[2*i+1]/2][col] = cr eqv nat2bv(itg[2*i+1],K){0};
                }
            }
        }
        // compute next target addresses (i.e. new desired permutations)
        // by propagating the already know targets through the columns;
        // for the computations below, consider the left column:
        //                     butterfly
        //    swj -> |\/| - j -----------> boffw+i
        //           |/\|
        //
        // and the right column
        //
        //                butterfly
        //  boffw + ntg <----------- swt <- |\/|<-- tg[swj] 
        //                                  |/\|
        //
        for(b=0..numb(col+1)-1) {           // for all blocks
            let(bsizew = N/numb(col+1))     // #wires in a block
            let(boffw  = b*bsizew)          // wire offset of the block
            for(i=0..bsizew-1) {            // for all wires in a block
                // pass the permutation (butterfly = inverse butterfly)
                let(bti = bv2nat(btfly(nat2bv(i,bits(col+1)))))
                let(j = boffw + bti)        // absolute source index of i
                // pass the switch in left column 
                let(bj  = nat2bv(j,K))
                let(swj = bv2nat(bj{-1:1}@(cross[j/2][col] xor bj{0})))
                // determine target address of swj
                let(tgj = tg[swj])           // abs. target of abs. addr. of swj0
                // pass the switch in right column 
                let(btj = nat2bv(tgj,bits(col+1)))
                let(swt = btj{-1:1}@(cross[tgj/2][2*K-2-col] xor btj{0}))
                // pass the permutation (butterfly = inverse butterfly)
                let(ntg  = bv2nat(btfly(swt)))
                next(tg[boffw+i]) = boffw + ntg;
            }
        }
        cw : pause;
    }
    // final decisions for middle column K-1
    for(i=0..N/2-1) {
        cross[i][K-1] = tg[2*i] != 2*i;
        known[i][K-1] = true;
    }

    emit(rdy);
}
//drivenby shift1 {
//    for(i=0..N-1) {
//        x[i] = i;
//        target[i] = (i+1) % N;
//    }
//    await(rdy);
//}
drivenby PerfectShuffle {
    for(i=0..N-1) {
        x[i] = i;
        let(b = nat2bv(i,K))
        target[i] = bv2nat(b{-2:0}@b{-1});
    }
    await(rdy);
}
//drivenby InversePerfectShuffle {
//    for(i=0..N-1) {
//        x[i] = i;
//        let(b = nat2bv(i,K))
//        target[i] = bv2nat(b{0}@b{-1:1});
//    }
//    await(rdy);
//}
//drivenby reversal {
//    for(i=0..N-1) {
//        x[i] = i;
//        target[i] = N-1-i;
//    }
//    await(rdy);
//}