// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This file implements Cannon's parallel matrix multiplication algorithm.    //
// The inputs are NxN matrices a and b, that are partitioned into PxP         //
// submatrices. In each step, the submatrices of a and b are multiplied,      //
// and then the partitions of a are rotated column-wise, while the            //
// partitions of b are rotated row-wise. The algorithm requires N/P steps,    //
// where in each step, two PxP submatrices are multiplied by one processor,   //
// and thus, (N/P)x(N/P) processors are required. The advantage is that       //
// the communication paths are optimized since on a toroidal mesh             //
// interconnection, only neighbor processors need to communicate.             //
// ************************************************************************** //

macro N = 6;    // size of the matrix
macro P = 2;    // size of partions (sub-matrices)

module MatrixMultCannon([N][N]nat ?a,?b,la,lb,c,event !ready) {

    // skewed storage of the input matrices: shift row i of blocks in matrix a
    // by i, and shift column j of blocks in matrix b by j
    for(rw=0..N/P-1)
        for(cl=0..N/P-1)
            for(i=0..P-1)
                for(j=0..P-1) {
                    la[i+rw*P][j+cl*P] = a[i+rw*P][j+((rw+cl)*P)%N];
                    lb[i+rw*P][j+cl*P] = b[i+((rw+cl)*P)%N][j+cl*P];
                }

    // for the number of partitions
    for(s=1..N/P) {
        // multiply and rotate the partitions
        for(rw=0..N/P-1)
            for(cl=0..N/P-1)
                // usual sequential matrix multiplication of partition [rw][cl]
                // within a macro step (this part is typically scheduled to one
                // processor of a N/P x N/P processor array
                for(i=0..P-1)
                    for(j=0..P-1) {
                        next(la[i+rw*P][j+cl*P]) = la[i+rw*P][j+((cl+1)*P)%N];
                        next(lb[i+rw*P][j+cl*P]) = lb[i+((rw+1)*P)%N][j+cl*P];
                        next(c[i+rw*P][j+cl*P]) = 
                             c[i+rw*P][j+cl*P] + 
                                sum(k=0 .. P-1) 
                                    (la[i+rw*P][k+cl*P] * lb[k+rw*P][j+cl*P]);
                    }
        pause;
    }
    emit(ready);
}
drivenby t00 {
    // generate input matrices
    for(i=0..N-1)
        for(j=0..N-1) {
            a[i][j] = i;
            b[i][j] = j;
        }
     await(ready);
     // check correctness of matrix multiplication
     for(i=0 .. N-1)
         for(j=0 ..N-1)
             assert(c[i][j] == sum(k=0 .. N-1) (a[i][k] * b[k][j]));

}
drivenby t01 {
    // generate input matrices
    for(i=0..N-1)
        for(j=0..N-1) {
            a[i][j] = i*N+j;
            b[i][j] = i*N+j;
        }
     await(ready);
     // check correctness of matrix multiplication
     for(i=0 .. N-1)
         for(j=0 ..N-1)
             assert(c[i][j] == sum(k=0 .. N-1) (a[i][k] * b[k][j]));
}