// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// It is known that the division of N-bit radix-2 numbers can be reduced to   //
// the powering problem, which requires to compute for an N-bit radix-2 number//
// x all N^2-bit numbers x^2,x^3,…,x^N. For a proof of the reductions between //
// these two problems see [BeCH84,Reif93]. The module implements powering by  //
// the parallel prefix computation. Thus, it requires O(log(N)) steps which   //
// determines the depth, and in total the number of multiplications is O(N)   //
// which determines the work.                                                 //
// ************************************************************************** //

macro N = 8;
macro R  = exp(2,N);
macro R2 = exp(2,N*N);

module Powering(nat{R} ?x,[N]nat{R2} p,event !rdy) {
    for(i=0..N-1)
        p[i] = x;

    // bottom-up traversal requires time log(N)
    // with N-1 work and N/2 processors
    for(l=1..log(N)) {
        for(j=1..N/exp(2,l)) {
            let(s = exp(2,l-1))
            next(p[2*j*s-1]) = p[2*j*s-1] * p[(2*j-1)*s-1];
        }
        pause;
    }

    // top-down traversal requires time log(N)-1
    // with N-log(N)-1 work and N/2 processors
    for(i=1..log(N)-1) {
        let(l = log(N)-i)
        for(j=1..exp(2,i)-1) {
            let(s = exp(2,l-1))
            next(p[(2*j+1)*s-1]) = p[(2*j+1)*s-1] * p[2*j*s-1];
        }
        pause;
    }

    emit(rdy);
}
drivenby {
    x = bv2nat(0b10000000);
    await(rdy);
    for(i=0..N-1)
        assert(p[i] == exp(x,i+1));
}
drivenby {
    x = bv2nat(0b11111111);
    await(rdy);
    for(i=0..N-1)
        assert(p[i] == exp(x,i+1));
}
drivenby {
    x = 2;
    await(rdy);
    for(i=0..N-1)
        assert(p[i] == exp(x,i+1));
}