```// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This file implements Cannon's parallel matrix multiplication algorithm.    //
// The inputs are NxN matrices a and b, that are partitioned into PxP         //
// submatrices. In each step, the submatrices of a and b are multiplied,      //
// and then the partitions of a are rotated column-wise, while the            //
// partitions of b are rotated row-wise. The algorithm requires N/P steps,    //
// where in each step, two PxP submatrices are multiplied by one processor,   //
// and thus, (N/P)x(N/P) processors are required. The advantage is that       //
// the communication paths are optimized since on a toroidal mesh             //
// interconnection, only neighbor processors need to communicate.             //
// ************************************************************************** //

macro N = 6;    // size of the matrix
macro P = 2;    // size of partions (sub-matrices)

// skewed storage of the input matrices: shift row i of blocks in matrix a
// by i, and shift column j of blocks in matrix b by j
for(rw=0..N/P-1)
for(cl=0..N/P-1)
for(i=0..P-1)
for(j=0..P-1) {
la[i+rw*P][j+cl*P] = a[i+rw*P][j+((rw+cl)*P)%N];
lb[i+rw*P][j+cl*P] = b[i+((rw+cl)*P)%N][j+cl*P];
}

// for the number of partitions
for(s=1..N/P) {
// multiply and rotate the partitions
for(rw=0..N/P-1)
for(cl=0..N/P-1)
// usual sequential matrix multiplication of partition [rw][cl]
// within a macro step (this part is typically scheduled to one
// processor of a N/P x N/P processor array
for(i=0..P-1)
for(j=0..P-1) {
next(la[i+rw*P][j+cl*P]) = la[i+rw*P][j+((cl+1)*P)%N];
next(lb[i+rw*P][j+cl*P]) = lb[i+((rw+1)*P)%N][j+cl*P];
next(c[i+rw*P][j+cl*P]) =
c[i+rw*P][j+cl*P] +
sum(k=0 .. P-1)
(la[i+rw*P][k+cl*P] * lb[k+rw*P][j+cl*P]);
}
pause;
}
}
drivenby t00 {
// generate input matrices
for(i=0..N-1)
for(j=0..N-1) {
a[i][j] = i;
b[i][j] = j;
}
// check correctness of matrix multiplication
for(i=0 .. N-1)
for(j=0 ..N-1)
assert(c[i][j] == sum(k=0 .. N-1) (a[i][k] * b[k][j]));

}
drivenby t01 {
// generate input matrices
for(i=0..N-1)
for(j=0..N-1) {
a[i][j] = i*N+j;
b[i][j] = i*N+j;
}