// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// The module below computes the discrete convolution of an input stream x(t) //
// with respect to given weights w[0],...w[N-1], i.e., it computes an output  //
// stream y(t) defined as                                                     //
//                                                                            //
//        y_out(t+N-1) = sum(k=0..N-1) w[k] * x_in(t+k)                       //
//                                                                            //
// The weights w[i] can be loaded into the module by piping them in via the   //
// input while setting input lw=True until all weights have arrived at the    //
// right places. Having N cells, the module can compute the products and      //
// their sums in one step, and in each step one convolution is obtained.      //
// In the version below, the w[k] stay in the cells, while both x(t) and y(t) //
// are piped through the array. In contrast to ConvArray03, however, x and y  //
// flow in opposite directions, so only every other step can be used to pipe  //
// in an element of an input stream (see [Kung82]). This is because           //
// the relative speed of both streams is double the cycle frequency. Thus,    //
// the performance of the module is only half of that of ConvArray01 and      //
// ConvArray02. However, the array can perform two convolutions of two input  //
// streams that must then be piped in in an interleaved way in the unique     //
// input x_in. Note also that the weights have to be piped in in the reverse  //
// ordering when the results should be the same as for ConvArray01 (and thus  //
// in the same ordering as in ConvArray02).                                   //
//                                                                            //
// For N=5, the array implements the equations:                               //
//                                                                            //
//            next(w[0]) = lw?x_in:w[0]                                       //
//            next(w[1]) = lw?w[0]:w[1]                                       //
//            next(w[2]) = lw?w[1]:w[2]                                       //
//            next(w[3]) = lw?w[2]:w[3]                                       //
//            next(w[4]) = lw?w[3]:w[4]                                       //
//            next(x[0]) = x_in                                               //
//            next(x[1]) = x[0]                                               //
//            next(x[2]) = x[1]                                               //
//            next(x[3]) = x[2]                                               //
//            next(x[4]) = x[3]                                               //
//            next(y[0]) = w[4]*x[4]                                          //
//            next(y[1]) = w[3]*x[3]+y[0]                                     //
//            next(y[2]) = w[2]*x[2]+y[1]                                     //
//            next(y[3]) = w[1]*x[1]+y[2]                                     //
//            next(y[4]) = w[0]*x[0]+y[3]                                     //
//            y_out = y[4]                                                    //
//                                                                            //
// Thus, we have                                                              //
//      x[k](t+k+1) = x(t) and                                                //
//      y_out(t+1) = sum(k=0..N-1) (w[k] * x[k](t-k))                         //
// so that the output stream is computed as desired above.                    //
// For weights w[4..0] = [10,8,6,4,2] and inputs x_in = 0,1,2,3,4,5,...,      //
// we therefore obtain:                                                       //
//                                                                            //
//    y_out(15) = y_out(16) = 10*1 + 8*2 + 6*3 + 4*4 + 2*5 =  70              //
//    y_out(17) = y_out(18) = 10*2 + 8*3 + 6*4 + 4*5 + 2*6 = 100              //
//    y_out(19) = y_out(20) = 10*3 + 8*4 + 6*5 + 4*6 + 2*7 = 130              //
//    y_out(21) = y_out(22) = 10*4 + 8*5 + 6*6 + 4*7 + 2*8 = 160              //
//    y_out(23) = y_out(24) = 10*5 + 8*6 + 6*7 + 4*8 + 2*9 = 190              //
//                                                                            //
// ************************************************************************** //

// Number of cells in the array: sizes the register arrays w, x, y of the
// module (and dx, dw of its driver) and bounds every loop over the cells.
macro N = 5;

// Convolution cell array with N cells. Each cell j owns a weight register
// w[j], a sample register x[j] and a partial-sum register y[j].
//   x_in  : input; always shifted into the x chain, and additionally shifted
//           into the w chain while lw holds
//   y_out : output; in every step the current value of y[N-1]
//   lw    : load-weights flag; true  -> the w chain shifts and w[0] takes x_in,
//                              false -> every w[j] keeps its value
// NOTE(review): the file header contrasts this design with "ConvArray03",
// yet the module itself is named ConvArray03 -- confirm the intended name.
module ConvArray03(int ?x_in,!y_out,bool ?lw) {
    [N]int w,x,y;
    
    loop {
        // expose the last partial-sum register in the current step
        y_out = y[N-1];
        // register updates of all N cells; next(...) assignments become
        // visible in the following step
        for(j=0..N-1) {
            // load mode: weights move one cell onward, x_in enters at w[0];
            // otherwise the weight is held
            next(w[j]) = (lw ? (j==0 ? x_in : w[j-1]) : w[j]);
            // samples always move one cell onward, x_in enters at x[0]
            next(x[j]) = (j==0 ? x_in : x[j-1]);
            // partial sum of the previous cell (0 for the first cell) plus the
            // product of weight and sample at the mirrored index N-1-j, i.e.
            // the sums travel against the direction of the samples
            next(y[j]) = (j==0 ? 0 : y[j-1]) + w[N-1-j] * x[N-1-j];
        }
        pause;
    }
}
// Test driver: loads the weights, then feeds samples 1..2*N and checks y_out
// against a weighted sum computed from driver-side copies.
drivenby {
    [N]int dx,dw; // local stores for x and w in driver

    // first load the weights: x_in carries 2*N,...,4,2 (10,8,6,4,2 for N=5),
    // one value per step, so that after N steps w[k] holds 2*k+2; dw[k] keeps
    // the same value as reference for the check below
    for(i=0..N-1) {
        x_in = 2*(N-1-i)+2;
        dw[i] = 2*i+2;
        lw = true;
        pause;
    }
    // now do some computation: sample i+1 is presented in iteration i, and
    // every iteration spans two steps (two pauses)
    // NOTE(review): x_in is assigned only in the first of the two steps;
    // presumably it keeps its value in the second -- confirm
    for(i=0..2*N-1) {
        x_in = i+1;
        lw = false;
        // driver-side copy of the sample chain, advanced once per iteration
        for(j=0..N-1) 
            next(dx[j]) = (j==0 ? x_in : dx[j-1]);
        // from iteration N on, the output must equal the weighted sum over
        // the driver's copies of weights and samples
        if(i>=N)
            assert(y_out == sum(k=0..N-1) (dw[k] * dx[k]));
        pause;
        pause;
    }
}