// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This file implements Cannon's parallel matrix multiplication algorithm.    //
// The inputs are NxN matrices a and b, that are partitioned into PxP         //
// submatrices. In each step, the submatrices of a and b are multiplied,      //
// and then the partitions of a are rotated column-wise, while the            //
// partitions of b are rotated row-wise. The algorithm requires N/P steps,    //
// where in each step, two PxP submatrices are multiplied by one processor,   //
// and thus, (N/P)x(N/P) processors are required. The advantage is that       //
// the communication paths are optimized since on a toroidal mesh             //
// interconnection, only neighbor processors need to communicate.             //
// ************************************************************************** //


//define global variables
[8][8]nat la, lb, c, a, b;
nat n;
nat p;

function sum(nat s, e, i, j, cl, rw):nat{
	nat res,k;
	for(k=s..e){
		res = res + (la[i+rw*p][k+cl*p] * lb[k+rw*p][j+cl*p]);
	} 
	return res;
}

//for testing purposes
function sum2(nat e,i,j):nat{
	nat res, k;
	for(k=0..e){
		res = res + (a[i][k] * b[k][j]);
	}
    return res;
}


function MatrixMultCannon():nat{
	nat rw, cl, i, j, s;	
	n = 8;
    p = 6;
	
    // skewed storage of the input matrices: shift row i of blocks in matrix a
    // by i, and shift column j of blocks in matrix b by j

    for(rw=0..n/p-1){
        for(cl=0..n/p-1){
            for(i=0..p-1){
                for(j=0..p-1) {
                    la[i+rw*p][j+cl*p] = a[i+rw*p][j+((rw+cl)*p)%n];
                    lb[i+rw*p][j+cl*p] = b[i+((rw+cl)*p)%n][j+cl*p];
                }
            }    
        }        
    }

    // for the number of partitions
    for(s=1..n/p) {
    // multiply and rotate the partitions
        for(rw=0..n/p-1){
        	for(cl=0..n/p-1){
        	// usual sequential matrix multiplication of partition [rw][cl]
                // within a macro step (this part is typically scheduled to one
                // processor of a N/P x N/P processor array
                 for(i=0..p-1){
                 	for(j=0..p-1) {
                 		(la[i+rw*p][j+cl*p]) = la[i+rw*p][j+((cl+1)*p)%n];
                 		(lb[i+rw*p][j+cl*p]) = lb[i+((rw+1)*p)%n][j+cl*p];
                 		(c[i+rw*p][j+cl*p]) = c[i+rw*p][j+cl*p] + sum(0,p-1,i,j,cl,rw);
                    }
            	}
            }         
        }
    }

return 0;
}

thread test{

nat test, k, l, i, j;
bool correct;

//generate input matrices
	for(k=0..n-1){
       		for(l=0..n-1) {
           	 	a[k][l] = k;
            		b[k][l] = l;
        	}
	}

	test = MatrixMultCannon();

	// check correctness

	for(i=0..n-1){
       		for(j=0..n-1){ 
			if(c[i][j] == (sum2(n-1,i,j))){
				 correct = true;			
			}else{
			     correct = false;
			     }		
		    }
    }

}