// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// PairwiseSort implements Parberry's pairwise sorting network [Parb92].      //
// This sorting network has depth log(N)(log(N)+1)/2 and requires             //
// N(log(N)-1)log(N)/4+N-1 compare/exchange operations. Thus, exploiting its  //
// full parallelism (with N processors), sorting can be done in time          //
// O(log^2(N)), while a sequential implementation requires O(Nlog^2(N)) and   //
// is thus slower than the typical MergeSort.                                 //
//  Typically, the algorithm is presented by recursive functions, while the   //
// version below implements an iterative approach whose equivalence is not    //
// easily seen; see the comments in MergeSortOddEven.                         //
// ************************************************************************** //

macro aSize = 16;
macro iSize = 64;

module PairwiseSort([aSize]int{iSize} ?a,b, event !ready) {
    
    // gather the inputs
    for(i=0..aSize-1)
        b[i] = a[i];

    // implement half cleaner columns in all partitions
    for(l=0..log(aSize)-1) {
        // consider partitions (= subtrees of call tree) of size L defined below
        let(L=exp(2,l+1))
        for(p=0..aSize/L-1) {
            let(l = p*L)    // leftmost index of partition p
            let(g = L/2)    // gap for half cleaner
            for(k=0..g-1)   // enumerate swaps of half cleaner
                let(x = l+k)
                let(y = x+g)
                if(b[x]>b[y]) {
                    next(b[x]) = b[y];
                    next(b[y]) = b[x];
                }
        }
        w1: pause;
    }
    // after the half cleaner columns, the shuffle columns are added
    for(l=1..log(aSize)-1) {
        for(d=1..l) { 
            let(g = exp(2,l+1-d)-1)     // gaps for the shuffle
            let(L = exp(2,l+1))         // size of a partition
            let(n = aSize/L)          // number of partitions
            for(p=0..n-1) {             // for each partition p 
                for(i=0..(L-g-1)/2-1) {
                    let(x = p+(2*i+1)*n)
                    let(y = x+n*g)
                    if(b[x]>b[y]) {
                        next(b[x]) = b[y];
                        next(b[y]) = b[x];
                    }
                }
            }
        w2: pause;
        }
    }
    emit(ready);
}
drivenby Test01 {
    for(i=0..aSize-1) a[i] = i;
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test02 {
    for(i=0..aSize-1) a[i] = aSize-1-i;
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test03 {
    for(i=0..aSize-1)
        a[i] = ((i%2==0)?i:((aSize%2==0)?aSize-i:aSize-1-i));
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}