// ************************************************************************** // // // // eses eses // // eses eses // // eses eseses esesese eses Embedded Systems Group // // ese ese ese ese ese // // ese eseseses eseseses ese Department of Computer Science // // eses eses ese eses // // eses eseses eseseses eses University of Kaiserslautern // // eses eses // // // // ************************************************************************** // // This file implements a simple matrix multiplication algorithm as found in // // many textbooks. It can be used to study the effect of loop ordering on the // // caches as well as the effect of the cache architecture on the runtime of // // this algorithm. // // ************************************************************************** // // ----------------------------------------------------------------------------- // This procedure is used to initialize the matrices a and b with some values. // ----------------------------------------------------------------------------- procedure Initialize ([][]nat a,b,c,nat d1,d2,d3) { nat i,j; for(i=0..d1-1) for(j=0..d2-1) a[i][j] = i+j; for(i=0..d2-1) for(j=0..d3-1) b[i][j] = (i+1)*(j+1); for(i=0..d1-1) for(j=0..d3-1) c[i][j] = 0; return; } // ----------------------------------------------------------------------------- // The version below uses order k-j-i which is very bad, since the inner loop // will then access elements from a,b and c in different rows so that cache // hits are unlikely for all of these accesses. // ----------------------------------------------------------------------------- procedure MatrixMultV0([][]nat a,b,c,nat d1,d2,d3) { nat i,j,k; for(k=0..d2-1) for(j=0..d3-1) for(i=0..d1-1) c[i][j] = c[i][j] + a[i][k] * b[k][j]; return; } // ----------------------------------------------------------------------------- // This version uses loop ordering i-j-k which is usually the definition found // in textbooks. This version is better than the previous one, since the // accesses to a and c are done in the inner loop in the same rows, but the // inner loop accesses in each iteration another row of b. // ----------------------------------------------------------------------------- procedure MatrixMultV1([][]nat a,b,c,nat d1,d2,d3) { nat i,j,k; for(i=0..d1-1) for(j=0..d3-1) for(k=0..d2-1) c[i][j] = c[i][j] + a[i][k] * b[k][j]; return; } // ----------------------------------------------------------------------------- // The version below is much more cache-friendly than the previous versions, // since now the innermost loop accesses all array accesses in the same rows // which increases the chances to have cache hits. // ----------------------------------------------------------------------------- procedure MatrixMultV2([][]nat a,b,c,nat d1,d2,d3) { nat i,j,k; for(i=0..d1-1) for(k=0..d2-1) for(j=0..d3-1) c[i][j] = c[i][j] + a[i][k] * b[k][j]; return; } // ----------------------------------------------------------------------------- // We can optimize the above version in that we factor out the array access to // matrix a which is always the same in the innermost loop. This way the number // of load instructions is reduced. // ----------------------------------------------------------------------------- procedure MatrixMultV3([][]nat a,b,c,nat d1,d2,d3) { nat i,j,k,x; for(i=0..d1-1) for(k=0..d2-1) { x = a[i][k]; for(j=0..d3-1) c[i][j] = c[i][j] + x * b[k][j]; } return; } // ----------------------------------------------------------------------------- // The main thread generates some test examples, and then calls one of versions. // ----------------------------------------------------------------------------- thread MatrixMult { [8][8]nat a,b,c; // first create some test matrices a and b Initialize(a,b,c,8,8,8); // now call one of the above procedures MatrixMultV0(a,b,c,8,8,8); }