// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This module implements a systolic array to compute the edit distance of    //
// many pairs of sequences in a pipelined computation. It is a refinement of  //
// module EditDistanceParallel in that not only one diagonal of the matrix is //
// active for the computation of the distance of two input sequences (which   //
// takes O(M+N) many steps). Instead, all diagonals work in parallel and they //
// perform overlapped parts of the computations of M+N edit distances.        //
// Since diagonals refer to the computation of a part of the edit distance of //
// two input sequences, the components of these input sequences must be piped //
// into the systolic array in a skewed manner: s[i] has to enter at time i of //
// one sequence s (and also t[j] has to enter j times later than t[0]). See   //
// the comment in the driver below for more details.                          //
// Another problem is that element d[i][j] of the distance matrix of a pair of//
// sequences depends on the element d[i-1][j-1], d[i-1][j], and d[i][j-1] that//
// unfortunately lie on the two previous diagonals. Thus, one has the choice  //
// to use only every other cycle or to introduce intermediate stores between  //
// d[i-1][j-1] and d[i][j] to store the value of d[i-1][j-1] for one cycle.   //
// The implementation below makes use of the additional delay elements in the //
// array e. Note that the entire array implements a pipeline with M+N-1 stages//
// since every diagonal is one stage.                                         //
// ************************************************************************** //

macro M = 4;        // size of first sequence
macro N = 5;        // size of second sequence
macro C = 8;        // sequence elements have type nat{C}
macro P = M+N+1;    // number of pairs of input sequences
macro max(x,y) = (x<y ? y : x);
macro min(x,y) = (x<y ? x : y);
macro min3(x,y,z) = min(x,min(y,z));


module EditDistance2D([M]nat{C} ?s,[N]nat{C} ?t, nat{C} dist) {
    [M][N]nat{C} a,b;
    [M][N]nat{max(M,N)} d,e;
    // the systolic array
    loop {
        pause;
        for(i=0..M-1) {
            for(j=0..N-1) {
                let(a_in = (j==0 ? s[i] : a[i][j-1]))
                let(b_in = (i==0 ? t[j] : b[i-1][j]))
                let(e_in = (i==0 ? j : (j==0 ? i : e[i-1][j-1])))
                let(d1 = e_in + (a_in==b_in ? 0 : 1))
                let(d2 = (j==0 ? i : d[i][j-1]) + 1)
                let(d3 = (i==0 ? j : d[i-1][j]) + 1) {
                    next(d[i][j]) = min3(d1,d2,d3);
                    next(a[i][j]) = a_in;
                    next(b[i][j]) = b_in;
                    next(e[i][j]) = d[i][j];
                }
            }
        }
    // final result is in the lower right corner of d
    dist = d[M-1][N-1];
    }
}
drivenby {
    [M+N+1][M]nat{C} s_seq;
    [M+N+1][N]nat{C} t_seq;
    // input sequence of sequences for s
    s_seq = [
         [1,3,3,4],
         [1,2,3,4],
         [1,3,2,4],
         [1,3,5,4],
         [5,3,3,4],
         [5,2,3,3],
         [4,1,2,3],
         [1,3,4,5],
         [1,1,1,1],
         [3,3,3,3]
        ];
    // input sequence of sequences for t
    t_seq = [
         [2,0,3,4,5],
         [3,2,3,4,5],
         [3,5,5,4,5],
         [5,3,6,4,5],
         [5,5,3,3,4],
         [4,2,3,3,3],
         [1,2,3,1,6],
         [1,1,1,1,1],
         [3,2,3,1,3],
         [0,2,2,5,6]
        ];

    // feed in the input sequences with the required delay, i.e.
    //
    //                                           5
    //                                        4  5
    //                                     3  4  5
    //                                  0  3  4  5
    //                               2  2  5  4  4  
    //                               3  5  6  3  3  
    //                               3  3  3  3  6  
    //                               5  5  3  1  1  
    //                               5  2  3  1  4  
    //                               4  2  1  1  6  
    //                               1  1  3  5  |
    //                               1  1  2  |  |
    //                               3  2  |  |  |
    //                               0  |  |  |  |
    //                               |  |  |  |  |
    //                               v  v  v  v  v
    //          3 1 1 4 5 5 1 1 1 -> [] [] [] [] []
    //        3 1 3 1 2 3 3 3 2 ---> [] [] [] [] []
    //      3 1 4 2 3 3 5 2 3 -----> [] [] [] [] []
    //    3 1 5 3 3 4 4 4 4 -------> [] [] [] [] []
    //

    for(i=0..M-1) do || {       // for all input pairs
        for(j=1..i+1)           // delay component j of sequence s_seq[i]
            pause;
        for(j=0..P-1) {
            s[i] = s_seq[j][i];
            pause;
        }
    }
    ||
    for(i=0..N-1) do || {       // for all input pairs
        for(j=1..i+1)           // delay component j of sequence s_seq[i]
            pause;
        for(j=0..P-1) {
            t[i] = t_seq[j][i];
            pause;
        }
    }
    // wait until sequences were piped through
    for(i=1..max(M,N))
        pause;
}