```// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// MergeSortOddEven implements Batcher's MergeSort algorithm using OddEven
// mergers [Batc68]. As can be seen, only for-loops are used, so that this
// algorithm directly implements a sorting network with depth log(N)(log(N)+1)/2
// and N(log(N)-1)log(N)/4+N-1 compare/exchange operations. Thus, exploiting its full
// parallelism (with N processors), sorting can be done in time O(log^2(N)),
// while a sequential implementation requires O(Nlog^2(N)) and is thus slower
// than the typical MergeSort.
//  Typically, the algorithm is presented by recursive functions, while the
// version below implements an iterative approach whose equivalence is not
// easily seen, and is thus worth some comments:
//  In general, the recursive calls to a MergeSort of a list of size N=2^k
// where the end of the recursion is given when lists of length 1 are to be
// sorted, will produce a call tree whose leaves are singleton lists. Then,
// returning from these recursive calls, one works this call tree upwards
// to the root. During this, the merge operations are applied to subtrees
// consisting of 1,2,4,8, ... 2^{k-1} leaf nodes. Let us call these operations
// the Merge(1), Merge(2), Merge(4), Merge(8), ..., Merge(2^{k-1}) operations.
// These operations are applied in sequence and form the columns of the sorting
// network, where Merge(2^j) generates j+1 columns. To construct a sorting
// network, assume further that these operations are in-place operations, i.e.
// that the results are stored in the array.
//  Hence, the sorting network of OddEvenMergeSort of size N=2^k consists of
// columns that stem from the subnets obtained from Merge(1), Merge(2), Merge(4),
// Merge(8), ..., Merge(2^{k-1}) operations. According to OddEvenMergeSort,
// these merge operations work as follows: The given list a,...,a[N-1] is
// split into halves a,a,a,...,a[N-2] of even and a,a,a,...,a[N-1]
// of odd indices, respectively. Then, the OddEvenMerge operation is applied
// recursively to these lists, and assuming that the result is again in the array
// a[0..N-1], then a final column of swaps (1 2) (3 4) (5 6) ... (N-3 N-2) is added
// before terminating the operation. Let us call this column the `combine' column.
//  Following the call tree of these OddEvenMergeSort operations, one can see that
// the argument lists generated by the recursive calls contain first the indices
// whose binary representation ends with either 0 or 1 (i.e. even and odd ones),
// then those ending with either 00,10,01,11, and so on. At the level of numbers,
// these are the array indices partitioned modulo 2^i for i=1,...,2^{k-1}.
//  The first column of Merge(2^k) generate a half cleaner with gaps 2^{k-1}, i.e.
// the swaps (0 2^{k-1}) (1 2^{k-1}+1) ... (2^{k-1}-1 2^k-1), since these swaps are
// performed at the leaves of the call tree of Merge(2^k), i.e., when the OddEvenMerge
// operations terminate. After this first column of a half cleaner, the combine
// columns are added that stem from working the call tree of the recursive calls to
// the merge operation upwards.
//  The combine columns at level i are constructed as follows: Generate the sorted
// list of array indices modulo 2^i, ignore the first and last elements, and construct
// pairs for swap operations of the rest. For example, MergeSortOddEven for N=16,
// consists of the following swaps (i j):
//
//    Merge(1): (variable l=0; eight partitions)
//        HC(l=0): [(0 1)] [(2 3)] [(4 5)] [(6 7)] [(8 9)] [(10 11)] [(12 13)] [(14 15)]
//
//    Merge(2): (variable l=1; four partitions)
//        HC(l=1): [(0 2) (1 3)]  [(4 6) (5 7)]  [(8 10) (9 11)]  [(12 14) (13 15)]
//        C(d=1):   [0 (1 2) 3]    [4 (5 6) 7]    [8 (9 10) 11]    [12 (13 14) 15]
//
//    Merge(4): (variable l=2; two partitions)
//        HC(l=2): [(0 4) (1 5) (2 6) (3 7)]         [(8 12) (9 13) (10 14) (11 15)]
//        C(d=1):  [0 (2 4) 6]  [1 (3 5) 7]          [8 (10 12) 14]  [9 (11 13) 15]
//        C(d=2):  [0 (1 2) (3 4) (5 6) 7]           [8 (9 10) (11 12) (13 14) 15]
//
//    Merge(8): (variable l=3; one partition)
//        HC(l=3): [(0 8) (1 9) (2 10) (3 11) (4 12) (5 13) (6 14) (7 15)]
//        C(d=1):  [0 (4 8) 12] [1 (5 9) 13] [2 (6 10) 14] [3 (7 11) 15]
//        C(d=2):  [0 (2 4) (6 8) (10 12) 14] [1 (3 5) (7 9) (11 13) 15]
//        C(d=3):  [0 (1 2) (3 4) (5 6) (7 8) (9 10) (11 12) (13 14) 15]
//
// Note that Merge(2^l) consists of aSize/exp(2,l+1) partitions and at level
// d, each partition has exp(2,l-d) classes that correspond with the numbers
// of these partitions modulo the gap g = exp(2,l-d).
// ************************************************************************** //

macro aSize = 32;
macro iSize = 64;

module MergeSortOddEven([aSize]int{iSize} ?a,b, event !ready) {

// gather the inputs
for(i=0..aSize-1)
b[i] = a[i];

// implement Merge(2^l) for l=0,...,k-1 as described in the comments above
for(l=0..log(aSize)-1) {
// consider partitions (= subtrees of call tree) of size L defined below
// the first column is a half cleaner in all of the partitions of this stage
let(L=exp(2,l+1))
for(p=0..aSize/L-1) {
let(l = p*L)    // leftmost index of partition p
let(g = L/2)    // gap for half cleaner
for(k=0..g-1)   // enumerate swaps of half cleaner
let(x = l+k)
let(y = x+g)
if(b[x]>b[y]) {
next(b[x]) = b[y];
next(b[y]) = b[x];
}
}
w1: pause;

// after the half cleaner column, the combine columns are added
for(d=1..l) {
let(g = exp(2,l-d))             // gaps for the swaps
let(L = exp(2,l+1))             // size of a partition
for(p=0..aSize/L-1) {         // for each partition p
for(c=0..g-1) {             // for each modulo class in partition p
for(i=0..exp(2,d)-2) {  // for all swap pairs in class c of partition p
let(x = p*L+c+g+i*2*g)
let(y = x+g)
if(b[x]>b[y]) {
next(b[x]) = b[y];
next(b[y]) = b[x];
}
}
}
}
w2: pause;
}
}
}
drivenby Test01 {
for(i=0..aSize-1) a[i] = i;
for(i=0..aSize-1) assert(b[i] == i);
}
drivenby Test02 {
for(i=0..aSize-1) a[i] = aSize-1-i;