```// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This file implements a simple matrix multiplication algorithm as found in  //
// many textbooks. It can be used to study the effect of loop ordering on the //
// caches as well as the effect of the cache architecture on the runtime of   //
// this algorithm.                                                            //
// ************************************************************************** //

// -----------------------------------------------------------------------------
// This procedure is used to initialize the matrices a and b with some values.
// -----------------------------------------------------------------------------

procedure Initialize ([][]nat a,b,c,nat d1,d2,d3) {
nat i,j;
for(i=0..d1-1)
for(j=0..d2-1)
a[i][j] = i+j;
for(i=0..d2-1)
for(j=0..d3-1)
b[i][j] = (i+1)*(j+1);
for(i=0..d1-1)
for(j=0..d3-1)
c[i][j] = 0;
return;
}

// -----------------------------------------------------------------------------
// The version below uses order k-j-i which is very bad, since the inner loop
// will then access elements from a,b and c in different rows so that cache
// hits are  unlikely for all of these accesses.
// -----------------------------------------------------------------------------

procedure MatrixMultV0([][]nat a,b,c,nat d1,d2,d3) {
nat i,j,k;
for(k=0..d2-1)
for(j=0..d3-1)
for(i=0..d1-1)
c[i][j] = c[i][j] + a[i][k] * b[k][j];
return;
}

// -----------------------------------------------------------------------------
// This version uses loop ordering i-j-k which is usually the definition found
// in textbooks. This version is better than the previous one, since the
// accesses to a and c are done in the inner loop in the same rows, but the
// inner loop accesses in each iteration another row of b.
// -----------------------------------------------------------------------------

procedure MatrixMultV1([][]nat a,b,c,nat d1,d2,d3) {
nat i,j,k;
for(i=0..d1-1)
for(j=0..d3-1)
for(k=0..d2-1)
c[i][j] = c[i][j] + a[i][k] * b[k][j];
return;
}

// -----------------------------------------------------------------------------
// The version below is much more cache-friendly than the previous versions,
// since now the innermost loop accesses all array accesses in the same rows
// which increases the chances to have cache hits.
// -----------------------------------------------------------------------------

procedure MatrixMultV2([][]nat a,b,c,nat d1,d2,d3) {
nat i,j,k;
for(i=0..d1-1)
for(k=0..d2-1)
for(j=0..d3-1)
c[i][j] = c[i][j] + a[i][k] * b[k][j];
return;
}

// -----------------------------------------------------------------------------
// We can optimize the above version in that we factor out the array access to
// matrix a which is always the same in the innermost loop. This way the number
// of load instructions is reduced.
// -----------------------------------------------------------------------------

procedure MatrixMultV3([][]nat a,b,c,nat d1,d2,d3) {
nat i,j,k,x;
for(i=0..d1-1)
for(k=0..d2-1) {
x = a[i][k];
for(j=0..d3-1)
c[i][j] = c[i][j] + x * b[k][j];
}
return;
}

// -----------------------------------------------------------------------------
// The main thread generates some test examples, and then calls one of versions.
// -----------------------------------------------------------------------------