// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// MergeSortBitonic implements Batcher's MergeSort algorithm using bitonic    //
// mergers [Batc68]. As can be seen, only for-loops are used, so that this    //
// algorithm directly implements a sorting network with depth                 //
// log(N)*(log(N)+1)/2 and N*log(N)*(log(N)+1)/4 compare/exchange operations. //
// Using its full parallelism (with N processors), sorting can be done in time//
// O(log^2(N)), while a sequential implementation requires O(Nlog^2(N)) and is//
// thus slower than MergeSort.                                                //
// ************************************************************************** //

macro aSize = 16;
macro iSize = 64;

module MergeSortBitonic([aSize]int{iSize} ?a,b, event !ready) {
    
    // gather the inputs
    for(i=0..aSize-1)
        b[i] = a[i];

    // perform log(aSize) in-place merging phases of partitions
    // b[2*p*L..(2*p+1)*L-1] and b[(2*p+1)*L..(2*p+2)*L-1] of length
    // L=exp(2,l)

    for(l=0..log(aSize)-1) {
        let(L=exp(2,l))
        // the first column of half-cleaners rotates the second sequence
        for(p=0..(aSize/L-2)/2) {
            let(left=2*p*L)
            let(right=(2*p+2)*L-1)
            for(k=0..L-1)
                if(b[left+k]>b[right-k]) {
                    next(b[left+k]) = b[right-k];
                    next(b[right-k]) = b[left+k];
                }
        }
        w1: pause;

        // next columns of half-cleaners consider partitions of 
        // lengths exp(2,l-1),...,exp(2,0)
        for(d=1..l) {
            let(L = exp(2,l-d))
            for(p=0..(aSize/L-2)/2) {
                // half cleaner for b[2*p*L..(2*p+1)*L-1] and b[(2*p+1)*L..(2*p+2)*L-1]
                let(left=2*p*L)
                let(right=left+L)
                for(k=0..L-1) {
                    if(b[left+k]>b[right+k]) {
                        next(b[left+k]) = b[right+k];
                        next(b[right+k]) = b[left+k];
                    }
                }
            }
            w2: pause;
        }
    }
    emit(ready);
}
drivenby Test01 {
    for(i=0..aSize-1) a[i] = i;
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test02 {
    for(i=0..aSize-1) a[i] = aSize-1-i;
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test03 {
    for(i=0..aSize-1)
        a[i] = ((i%2==0)?i:((aSize%2==0)?aSize-i:aSize-1-i));
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}