// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// CountSortPar is a simple algorithm that sorts an array with N elements in  //
// time O(log(N)) with O(N^2) many processors. The version below requires only//
// two macro steps that both execute O(N^2) many actions with a dependency    //
// depth of O(log(N)). By declaring a second local array m2, the sorting can  //
// even be done in a single macro step. Nevertheless the parallel time is in  //
// all cases O(log(N)) with O(N^2) many actions per macro step.               //
// ************************************************************************** //

macro N = 22;
macro M = 64;

module CountSortPar([N]nat{M} ?a,b) {
    [N]nat{N+1} ls,gt;

    // for every a[i] count the number of a[j] that are less than a[i]
    // each sum requires N processors with log(N) depth;
    // thus, in total N^2 processors with depth log(N) are required
    for(i=0..N-1)
        ls[i] = sum(j=0..N-1) (a[j]<a[i]?1:0);

    // for every a[i] count the number of a[j] that are greater than a[i]
    // each sum requires N processors with log(N) depth;
    // thus, in total N^2 processors with depth log(N) are required
    for(i=0..N-1)
        gt[i] = sum(j=0..N-1) (a[j]>a[i]?1:0);

    // generate the sorted array in one step with N processors
    for(i=0..N-1)
        for(j=0..N-1)
            if(ls[i]<=j & j<N-gt[i])
                b[j] = a[i];
}
drivenby Test01 {
    for(i=0..N-1) a[i] = i;
    for(i=0..N-1) assert(b[i] == i);
}
drivenby Test02 {
    for(i=0..N-1) a[i] = N-1-i;
    for(i=0..N-1) assert(b[i] == i);
}
drivenby Test03 {
    for(i=0..N-1)
        a[i] = ((i%2==0)?i:((N%2==0)?N-i:N-1-i));
    for(i=0..N-1) assert(b[i] == i);
}