// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// MergeSortOddEven implements Batcher's MergeSort algorithm using OddEven
// mergers [Batc68]. As can be seen, only for-loops are used, so that this
// algorithm directly implements a sorting network with depth log(N)(log(N)+1)/2
// and N(log(N)-1)log(N)/4+N-1 compare/exchange operations. Thus, exploiting its full
// parallelism (with N processors), sorting can be done in time O(log^2(N)),
// while a sequential implementation requires O(Nlog^2(N)) and is thus slower
// than the typical MergeSort.
//  Typically, the algorithm is presented by recursive functions, while the
// version below implements an iterative approach whose equivalence is not
// easily seen, and is thus worth some comments: 
//  In general, the recursive calls to a MergeSort of a list of size N=2^k 
// where the end of the recursion is given when lists of length 1 are to be 
// sorted, will produce a call tree whose leaves are singleton lists. Then, 
// returning from these recursive calls, one works this call tree upwards 
// to the root. During this, the merge operations are applied to subtrees 
// consisting of 1,2,4,8, ... 2^{k-1} leaf nodes. Let us call these operations 
// the Merge(1), Merge(2), Merge(4), Merge(8), ..., Merge(2^{k-1}) operations. 
// These operations are applied in sequence and form the columns of the sorting 
// network, where Merge(2^j) generates j+1 columns. To construct a sorting 
// network, assume further that these operations are in-place operations, i.e. 
// that the results are stored in the array.
//  Hence, the sorting network of OddEvenMergeSort of size N=2^k consists of 
// columns that stem from the subnets obtained from Merge(1), Merge(2), Merge(4), 
// Merge(8), ..., Merge(2^{k-1}) operations. According to OddEvenMergeSort, 
// these merge operations work as follows: The given list a[0],...,a[N-1] is 
// split into halves a[0],a[2],a[4],...,a[N-2] of even and a[1],a[3],a[5],...,a[N-1] 
// of odd indices, respectively. Then, the OddEvenMerge operation is applied 
// recursively to these lists, and assuming that the result is again in the array 
// a[0..N-1], then a final column of swaps (1 2) (3 4) (5 6) ... (N-3 N-2) is added 
// before terminating the operation. Let us call this column the `combine' column.
//  Following the call tree of these OddEvenMergeSort operations, one can see that
// the argument lists generated by the recursive calls contain first the indices
// whose binary representation ends with either 0 or 1 (i.e. even and odd ones),
// then those ending with either 00,10,01,11, and so on. At the level of numbers,
// these are the array indices partitioned modulo 2^i for i=1,...,2^{k-1}. 
//  The first column of Merge(2^k) generate a half cleaner with gaps 2^{k-1}, i.e.
// the swaps (0 2^{k-1}) (1 2^{k-1}+1) ... (2^{k-1}-1 2^k-1), since these swaps are
// performed at the leaves of the call tree of Merge(2^k), i.e., when the OddEvenMerge
// operations terminate. After this first column of a half cleaner, the combine
// columns are added that stem from working the call tree of the recursive calls to
// the merge operation upwards.
//  The combine columns at level i are constructed as follows: Generate the sorted
// list of array indices modulo 2^i, ignore the first and last elements, and construct
// pairs for swap operations of the rest. For example, MergeSortOddEven for N=16,
// consists of the following swaps (i j):
//
//    Merge(1): (variable l=0; eight partitions)
//        HC(l=0): [(0 1)] [(2 3)] [(4 5)] [(6 7)] [(8 9)] [(10 11)] [(12 13)] [(14 15)]
//
//    Merge(2): (variable l=1; four partitions)
//        HC(l=1): [(0 2) (1 3)]  [(4 6) (5 7)]  [(8 10) (9 11)]  [(12 14) (13 15)]
//        C(d=1):   [0 (1 2) 3]    [4 (5 6) 7]    [8 (9 10) 11]    [12 (13 14) 15]
//
//    Merge(4): (variable l=2; two partitions)
//        HC(l=2): [(0 4) (1 5) (2 6) (3 7)]         [(8 12) (9 13) (10 14) (11 15)]
//        C(d=1):  [0 (2 4) 6]  [1 (3 5) 7]          [8 (10 12) 14]  [9 (11 13) 15]
//        C(d=2):  [0 (1 2) (3 4) (5 6) 7]           [8 (9 10) (11 12) (13 14) 15]
//
//    Merge(8): (variable l=3; one partition)
//        HC(l=3): [(0 8) (1 9) (2 10) (3 11) (4 12) (5 13) (6 14) (7 15)]
//        C(d=1):  [0 (4 8) 12] [1 (5 9) 13] [2 (6 10) 14] [3 (7 11) 15]
//        C(d=2):  [0 (2 4) (6 8) (10 12) 14] [1 (3 5) (7 9) (11 13) 15]
//        C(d=3):  [0 (1 2) (3 4) (5 6) (7 8) (9 10) (11 12) (13 14) 15]
//
// Note that Merge(2^l) consists of aSize/exp(2,l+1) partitions and at level
// d, each partition has exp(2,l-d) classes that correspond with the numbers
// of these partitions modulo the gap g = exp(2,l-d).
// ************************************************************************** //

macro aSize = 32;
macro iSize = 64;

module MergeSortOddEven([aSize]int{iSize} ?a,b, event !ready) {
    
    // gather the inputs
    for(i=0..aSize-1)
        b[i] = a[i];

    // implement Merge(2^l) for l=0,...,k-1 as described in the comments above 
    for(l=0..log(aSize)-1) {
        // consider partitions (= subtrees of call tree) of size L defined below
        // the first column is a half cleaner in all of the partitions of this stage
        let(L=exp(2,l+1))
        for(p=0..aSize/L-1) {
            let(l = p*L)    // leftmost index of partition p
            let(g = L/2)    // gap for half cleaner
            for(k=0..g-1)   // enumerate swaps of half cleaner
                let(x = l+k)
                let(y = x+g)
                if(b[x]>b[y]) {
                    next(b[x]) = b[y];
                    next(b[y]) = b[x];
                }
        }
        w1: pause;

        // after the half cleaner column, the combine columns are added
        for(d=1..l) { 
            let(g = exp(2,l-d))             // gaps for the swaps
            let(L = exp(2,l+1))             // size of a partition
            for(p=0..aSize/L-1) {         // for each partition p 
                for(c=0..g-1) {             // for each modulo class in partition p
                    for(i=0..exp(2,d)-2) {  // for all swap pairs in class c of partition p
                        let(x = p*L+c+g+i*2*g)
                        let(y = x+g)
                        if(b[x]>b[y]) {
                            next(b[x]) = b[y];
                            next(b[y]) = b[x];
                        }
                    }
                }
            }
        w2: pause;
        }
    }
    emit(ready);
}
drivenby Test01 {
    for(i=0..aSize-1) a[i] = i;
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test02 {
    for(i=0..aSize-1) a[i] = aSize-1-i;
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test03 {
    for(i=0..aSize-1)
        a[i] = ((i%2==0)?i:((aSize%2==0)?aSize-i:aSize-1-i));
    dw: await(ready);
    for(i=0..aSize-1) assert(b[i] == i);
}