// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This file implements a transport triggered architecture whose function     //
// units are connected by a bus with several sockets and lanes. All input and //
// output ports of each functional unit have a unique address so that data can//
// be transported from each of these output ports to each of the input ports. //
// The register file has four inputs and can read and write to one of its     //
// registers in parallel. The control unit is able to deal with the following //
// move instructions:                                                         //
//                                                                            //
//  * 00@src@tgt: data transfer from output port src to input port tgt        //
//  * 01@cst@tgt: data transfer of constant cst to input port tgt             //
//  * 10@src@cst: add cst to pc if value received from port src is non-zero   //
//  * 11@src@tgt: no operation                                                //
//                                                                            //
// The control unit maintains the program counter pc and reads the program    //
// memory for fetching the next instruction to be executed. The general way   //
// of execution is that the control unit decodes the instruction, and puts the//
// addresses for a data transfer on the bus. The function units with these    //
// addresses will automatically answer, and as a side effect, new results are //
// computed when a function unit receives new input. In more detail:          //
//  If a "00" instruction is fetched, then the control unit puts the addresses//
// src and tgt found in the instruction on the bus, so that the function unit //
// having an output with address src will put the corresponding value on the  //
// bus, and the function unit having an input with address tgt will read this //
// value from the bus.                                                        //
//  If a "01" instruction is fetched, then the control unit puts the address  //
// tgt and the constant cst as the value on the bus, and the function unit    //
// having an input with address tgt will read this value from the bus. Since  //
// the src part on bus is reset to zero, and zero is not used as an address,  //
// no function unit will put a value on the bus in this case.                 //
//  If a "10" instruction is fetched, then the control unit puts address src  //
// on the bus, so that the function unit having an output with address src    //
// will put the corresponding value on the bus. The control unit will then    //
// read the value, and if it is not zero, then the value cst is added to the  //
// current pc.                                                                //
//  If a "11" instruction is fetched, then the control unit puts nothing on   //
// the bus, and therefore no other function unit will answer.                 //
// ************************************************************************** //

// global configuration parameters of the architecture
macro NumPorts = 37;    // size of the port arrays e and x in module CPU (indices 0..36)
macro NumBus = 4;       // number of busses, i.e., move slots per instruction word
macro AW =  8;          // address width (bits of the src and tgt fields)
macro DW =  8;          // data width (bits of the value carried by a bus)
macro PL = 10;          // length of program (number of instruction words)

// macros to construct machine instructions; every instruction consists of
// a 2-bit operation code followed by two fields of AW bits each, i.e., it
// is a bitvector of width 2*AW+2

// 00@src@tgt: data transfer from output port src to input port tgt
macro MvData(src,tgt) = 0b00@nat2bv(src,AW)@nat2bv(tgt,AW);
// 01@cst@tgt: data transfer of the (signed) constant cst to input port tgt
macro MvCnst(cst,tgt) = 0b01@int2bv(cst,AW)@nat2bv(tgt,AW);
// 10@src@cst: add the (signed) constant cst to pc if port src yields non-zero
macro RdCnst(src,cst) = 0b10@nat2bv(src,AW)@int2bv(cst,AW);
// 11@0@0: no operation (both fields are all zero)
macro NOP = 0b11@{false::AW}@{false::AW};

// addresses of the ports of the function units
// Address 0 is never assigned to a port: the control unit writes 0 into
// the src/tgt part of a bus to express "no port addressed".
// Each arithmetic unit owns four consecutive addresses: two argument
// (input) ports followed by two result (output) ports.

// register file
macro REG_ARG1 =  1;
macro REG_ARG2 =  2;
macro REG_ARG3 =  3;
macro REG_RES  =  4;

// unsigned addition
macro ADDU_ARG1 =  5;
macro ADDU_ARG2 =  6;
macro ADDU_RES1 =  7;
macro ADDU_RES2 =  8;

// unsigned subtraction
macro SUBU_ARG1 =  9;
macro SUBU_ARG2 = 10;
macro SUBU_RES1 = 11;
macro SUBU_RES2 = 12;

// unsigned multiplication
macro MULU_ARG1 = 13;
macro MULU_ARG2 = 14;
macro MULU_RES1 = 15;
macro MULU_RES2 = 16;

// unsigned division
macro DIVU_ARG1 = 17;
macro DIVU_ARG2 = 18;
macro DIVU_RES1 = 19;
macro DIVU_RES2 = 20;

// signed addition
macro ADDS_ARG1 = 21;
macro ADDS_ARG2 = 22;
macro ADDS_RES1 = 23;
macro ADDS_RES2 = 24;

// signed subtraction
macro SUBS_ARG1 = 25;
macro SUBS_ARG2 = 26;
macro SUBS_RES1 = 27;
macro SUBS_RES2 = 28;

// signed multiplication
macro MULS_ARG1 = 29;
macro MULS_ARG2 = 30;
macro MULS_RES1 = 31;
macro MULS_RES2 = 32;

// signed division
macro DIVS_ARG1 = 33;
macro DIVS_ARG2 = 34;
macro DIVS_RES1 = 35;
macro DIVS_RES2 = 36;

// The comparison units have two argument ports and six result ports, one
// per relation. Every result port needs its own macro name (RES1..RES6);
// defining the same name repeatedly would leave all but one address
// without a usable name.
// NOTE(review): the addresses 37..52 are outside of the index range
// 0..NumPorts-1 = 0..36 of the port arrays e and x, and module CPU does
// not instantiate comparison units; NumPorts must be increased to at
// least 53 before these units are connected.

// unsigned comparison
macro CMPU_ARG1 = 37;
macro CMPU_ARG2 = 38;
macro CMPU_RES1 = 39; // equal
macro CMPU_RES2 = 40; // not equal
macro CMPU_RES3 = 41; // less than
macro CMPU_RES4 = 42; // less than or equal
macro CMPU_RES5 = 43; // greater than
macro CMPU_RES6 = 44; // greater than or equal

// signed comparison
macro CMPS_ARG1 = 45;
macro CMPS_ARG2 = 46;
macro CMPS_RES1 = 47; // equal
macro CMPS_RES2 = 48; // not equal
macro CMPS_RES3 = 49; // less than
macro CMPS_RES4 = 50; // less than or equal
macro CMPS_RES5 = 51; // greater than
macro CMPS_RES6 = 52; // greater than or equal


// macros for accessing contained data

// fields of an instruction x (bitvector of width 2*AW+2, see MvData,
// MvCnst, RdCnst and NOP): opcode in the two topmost bits, then the
// src field, then the tgt field in the lowest AW bits
macro OpcInstr(x) = x{2*AW+1:2*AW}; // operation code of instruction
macro SrcInstr(x) = x{2*AW-1:AW};   // source address or constant operand (of "01")
macro TgtInstr(x) = x{AW-1:0};      // target address or constant operand (of "10")

// components of a bus x, which is a triple
// (source address, target address, transported value)
macro SrcBus(x) = x.0;
macro TgtBus(x) = x.1;
macro ValBus(x) = x.2;



// CPU: the control unit together with the function units, which are all
// connected by NumBus busses.
//   prog : program memory; PL instruction words, each consisting of NumBus
//          move instructions (one per bus) of width 2*AW+2; only read here
//   pc   : program counter; index of the instruction word that is executed
//          in the current step, updated by the control unit via next(pc)
module CPU([PL][NumBus]bv{2*AW+2} ?prog,nat{PL+1} pc) {
    // one triple (source address, target address, value) per bus, accessed
    // by the macros SrcBus, TgtBus and ValBus
    event [NumBus](bv{AW} * bv{AW} * bv{DW}) bus;
    // emitted in a step where at least one "10" instruction takes its branch
    event branch;
    // per-port signals, indexed by the port address and passed to the bus
    // connectors and function units below
    // NOTE(review): presumably e[p] flags a transfer on port p and x[p]
    // carries the transported data -- confirm against BusConnect
    event [NumPorts]bool e;
    event [NumPorts]bv{DW} x;

    // -------------------------------------------------------------------------
    // the control unit
    // -------------------------------------------------------------------------
    {
    // NOTE(review): presumably this initial step is skipped because the
    // function units start with a non-immediate await (see the remark at
    // the pause statements of the drivers) -- confirm
    pause;
    loop {
        // decode all NumBus move instructions of the current instruction
        // word within the same step; instruction i drives bus i
        for(i=0..NumBus-1) {
            let(instr = prog[pc][i])
            case
                (OpcInstr(instr) == 0b00) do {
                    // move data from SrcInstr(instr) to TgtInstr(instr)
                    SrcBus(bus[i]) = SrcInstr(instr);
                    TgtBus(bus[i]) = TgtInstr(instr);
                    // ValBus(bus[i]) will be defined by function unit
                    // having output address SrcInstr(instr)
                }
                (OpcInstr(instr) == 0b01) do {
                    // move constant SrcInstr(instr) to TgtInstr(instr);
                    // source address 0 means that no output port is addressed
                    SrcBus(bus[i]) = {false::AW};
                    TgtBus(bus[i]) = TgtInstr(instr);
                    // NOTE(review): an AW-bit field is assigned to a DW-bit
                    // value here, which relies on AW == DW
                    ValBus(bus[i]) = SrcInstr(instr);
                }
                (OpcInstr(instr) == 0b10) do {
                    // move data from SrcInstr(instr) to control unit
                    // and modify the pc if the received value is not zero;
                    // target address 0 means that no input port is addressed
                    SrcBus(bus[i]) = SrcInstr(instr);
                    TgtBus(bus[i]) = {false::AW};
                    // ValBus(bus[i]) will be defined by function unit
                    // having output address SrcInstr(instr)
                    if(bv2nat(ValBus(bus[i]))!=0) {
                        // TgtInstr(instr) is a signed offset relative to the
                        // current pc (0 repeats the current instruction word)
                        // NOTE(review): if several "10" instructions of the
                        // same instruction word take their branch, all of
                        // them assign next(pc) -- confirm that programs
                        // never do this with different offsets
                        next(pc) = pc + bv2int(TgtInstr(instr));
                        emit(branch);
                    }
                }
            default {
                // no operation: address no port and put value zero on the bus
                nothing;
                SrcBus(bus[i]) = {false::AW};
                TgtBus(bus[i]) = {false::AW};
                ValBus(bus[i]) = {false::DW};
            }
        }
        // without a taken branch, continue with the next instruction word
        if(!branch)
            next(pc) = pc + 1;
        pause;
    }}
    // -------------------------------------------------------------------------
    // function units
    // -------------------------------------------------------------------------
    // Each function unit runs in parallel to the control unit and is
    // instantiated together with a bus connector that is given the busses,
    // the e and x signals of the unit's four ports, and the addresses of
    // these ports. The register file uses BusRegConnect, all other units
    // use BusConnect. The comparison units (CMPU/CMPS addresses) are not
    // instantiated here.
 || BC_REG: BusRegConnect(bus,e[REG_ARG1],e[REG_ARG2],e[REG_ARG3],e[REG_RES],
                              x[REG_ARG1],x[REG_ARG2],x[REG_ARG3],x[REG_RES],
                                REG_ARG1,   REG_ARG2,   REG_ARG3,   REG_RES);
 || REG: REG(e[REG_ARG1],e[REG_ARG2],e[REG_ARG3],e[REG_RES],
             x[REG_ARG1],x[REG_ARG2],x[REG_ARG3],x[REG_RES]);
 || BC_ADDU: BusConnect(bus,e[ADDU_ARG1],e[ADDU_ARG2],e[ADDU_RES1],e[ADDU_RES2],
                            x[ADDU_ARG1],x[ADDU_ARG2],x[ADDU_RES1],x[ADDU_RES2],
                              ADDU_ARG1,   ADDU_ARG2,   ADDU_RES1,   ADDU_RES2);
 || ADDU: ADDU(e[ADDU_ARG1],e[ADDU_ARG2],e[ADDU_RES1],e[ADDU_RES2],
               x[ADDU_ARG1],x[ADDU_ARG2],x[ADDU_RES1],x[ADDU_RES2]);
 || BC_SUBU: BusConnect(bus,e[SUBU_ARG1],e[SUBU_ARG2],e[SUBU_RES1],e[SUBU_RES2],
                            x[SUBU_ARG1],x[SUBU_ARG2],x[SUBU_RES1],x[SUBU_RES2],
                              SUBU_ARG1,  SUBU_ARG2,    SUBU_RES1,   SUBU_RES2);
 || SUBU: SUBU(e[SUBU_ARG1],e[SUBU_ARG2],e[SUBU_RES1],e[SUBU_RES2],
               x[SUBU_ARG1],x[SUBU_ARG2],x[SUBU_RES1],x[SUBU_RES2]);
 || BC_MULU: BusConnect(bus,e[MULU_ARG1],e[MULU_ARG2],e[MULU_RES1],e[MULU_RES2],
                            x[MULU_ARG1],x[MULU_ARG2],x[MULU_RES1],x[MULU_RES2],
                              MULU_ARG1,    MULU_ARG2,   MULU_RES1,  MULU_RES2);
 || MULU: MULU(e[MULU_ARG1],e[MULU_ARG2],e[MULU_RES1],e[MULU_RES2],
               x[MULU_ARG1],x[MULU_ARG2],x[MULU_RES1],x[MULU_RES2]);
 || BC_DIVU: BusConnect(bus,e[DIVU_ARG1],e[DIVU_ARG2],e[DIVU_RES1],e[DIVU_RES2],
                            x[DIVU_ARG1],x[DIVU_ARG2],x[DIVU_RES1],x[DIVU_RES2],
                              DIVU_ARG1,   DIVU_ARG2,   DIVU_RES1,   DIVU_RES2);
 || DIVU: DIVU(e[DIVU_ARG1],e[DIVU_ARG2],e[DIVU_RES1],e[DIVU_RES2],
               x[DIVU_ARG1],x[DIVU_ARG2],x[DIVU_RES1],x[DIVU_RES2]);
 || BC_ADDS: BusConnect(bus,e[ADDS_ARG1],e[ADDS_ARG2],e[ADDS_RES1],e[ADDS_RES2],
                            x[ADDS_ARG1],x[ADDS_ARG2],x[ADDS_RES1],x[ADDS_RES2],
                              ADDS_ARG1,   ADDS_ARG2,   ADDS_RES1,   ADDS_RES2);
 || ADDS: ADDS(e[ADDS_ARG1],e[ADDS_ARG2],e[ADDS_RES1],e[ADDS_RES2],
               x[ADDS_ARG1],x[ADDS_ARG2],x[ADDS_RES1],x[ADDS_RES2]);
 || BC_SUBS: BusConnect(bus,e[SUBS_ARG1],e[SUBS_ARG2],e[SUBS_RES1],e[SUBS_RES2],
                            x[SUBS_ARG1],x[SUBS_ARG2],x[SUBS_RES1],x[SUBS_RES2],
                              SUBS_ARG1,  SUBS_ARG2,    SUBS_RES1,   SUBS_RES2);
 || SUBS: SUBS(e[SUBS_ARG1],e[SUBS_ARG2],e[SUBS_RES1],e[SUBS_RES2],
               x[SUBS_ARG1],x[SUBS_ARG2],x[SUBS_RES1],x[SUBS_RES2]);
 || BC_MULS: BusConnect(bus,e[MULS_ARG1],e[MULS_ARG2],e[MULS_RES1],e[MULS_RES2],
                            x[MULS_ARG1],x[MULS_ARG2],x[MULS_RES1],x[MULS_RES2],
                              MULS_ARG1,    MULS_ARG2,   MULS_RES1,  MULS_RES2);
 || MULS: MULS(e[MULS_ARG1],e[MULS_ARG2],e[MULS_RES1],e[MULS_RES2],
               x[MULS_ARG1],x[MULS_ARG2],x[MULS_RES1],x[MULS_RES2]);
 || BC_DIVS: BusConnect(bus,e[DIVS_ARG1],e[DIVS_ARG2],e[DIVS_RES1],e[DIVS_RES2],
                            x[DIVS_ARG1],x[DIVS_ARG2],x[DIVS_RES1],x[DIVS_RES2],
                              DIVS_ARG1,   DIVS_ARG2,   DIVS_RES1,   DIVS_RES2);
 || DIVS: DIVS(e[DIVS_ARG1],e[DIVS_ARG2],e[DIVS_RES1],e[DIVS_RES2],
               x[DIVS_ARG1],x[DIVS_ARG2],x[DIVS_RES1],x[DIVS_RES2]);
}
drivenby p1 {
    // -------------------------------------------------------------------------
    // The following implements a program to compute the sum of the first n=10
    // natural numbers. To this end, we send n+1 and 1 as arguments to SUBS, so
    // that 10 is obtained as SUBS_RES1. Then, the VLIW instruction prog[1]
    // repeats itself until SUBS_RES1 is zero. Note that even though it will be
    // finally zero, which lets the CU increase the pc to 2, SUBS and ADDU will
    // perform further action in parallel (so that we need to use SUBS instead
    // of SUBU to avoid a runtime error). Note further that the register file
    // is not used in this program except for storing the final result!
    // Only prog[0..2] are defined by this driver; the driver ends as soon as
    // pc==2 holds.
    // -------------------------------------------------------------------------

    // prog[0]: initialize the counter
    prog[0][0] = MvCnst(11,SUBS_ARG1);  // 11 --> SUBS.ARG1
    prog[0][1] = MvCnst( 1,SUBS_ARG2);  //  1 --> SUBS.ARG2
    prog[0][2] = NOP;
    prog[0][3] = NOP;

    // prog[1]: loop body, repeated as long as SUBS.RES1 is non-zero
    prog[1][0] = MvData(SUBS_RES1,SUBS_ARG1);   // feed the counter back into SUBS
    prog[1][1] = MvData(SUBS_RES1,ADDU_ARG1);   // counter --> ADDU.ARG1
    prog[1][2] = MvData(ADDU_RES1,ADDU_ARG2);   // feed the sum back into ADDU
    prog[1][3] = RdCnst(SUBS_RES1,0);           // offset 0: stay at prog[1]

    // prog[2]: store the result in the register file
    prog[2][0] = MvCnst(1,REG_ARG1);            // 1 --> REG.ARG1
    prog[2][1] = MvData(ADDU_RES1,REG_ARG2);    // ADDU.RES -> reg[1]
    prog[2][2] = NOP;
    prog[2][3] = NOP;

    pause; // is required since all FUs start with a (non-immediate) await 
    await(pc==2); // wait until the program counter has reached prog[2]
}
drivenby p2 {
    // -------------------------------------------------------------------------
    // The following program implements y[i] := y[i] + a * x[i] where x and y
    // are in registers reg[0..15] and reg[16..31], respectively, and a=3. The
    // instructions prog[0..1] will first write values reg[i]:=i to this end.
    // If prog[6] is reached, we should have x[0..15] = reg[0..15] = 0..15 and
    // y[0..15] = reg[16..31] = 16 20 24 .. 76.
    // NOTE(review): judging from the transfers below, REG.ARG1/REG.ARG2 are
    // the write address/write data, REG.ARG3 is the read address, and
    // REG.RES is the read data of the register file -- confirm against REG.
    // -------------------------------------------------------------------------

    // prog[0]: initialize the counter of the initialization loop
    prog[0][0] = MvCnst(32,SUBS_ARG1);
    prog[0][1] = MvCnst( 1,SUBS_ARG2);
    prog[0][2] = NOP;
    prog[0][3] = NOP;

    // prog[1]: initialization loop, repeated as long as SUBS.RES1 is
    // non-zero; the counter is sent to both REG.ARG1 and REG.ARG2
    prog[1][0] = MvData(SUBS_RES1,SUBS_ARG1);   // feed the counter back into SUBS
    prog[1][1] = MvData(SUBS_RES1,REG_ARG1);
    prog[1][2] = MvData(SUBS_RES1,REG_ARG2);
    prog[1][3] = RdCnst(SUBS_RES1,0);           // offset 0: stay at prog[1]

    // prog[2]: initialize the two counters of the main loop; SUBU is used
    // for the accesses to y and SUBS for the accesses to x (see below)
    prog[2][0] = MvCnst(32,SUBU_ARG1);
    prog[2][1] = MvCnst( 1,SUBU_ARG2);
    prog[2][2] = MvCnst(16,SUBS_ARG1);
    prog[2][3] = MvCnst( 1,SUBS_ARG2);

    // prog[3..5]: main loop
    prog[3][0] = MvCnst(3,MULU_ARG1);           // a=3 --> MULU.ARG1
    prog[3][1] = MvData(REG_RES,MULU_ARG2);     // REG.RES --> MULU.ARG2
    prog[3][2] = MvData(SUBS_RES1,REG_ARG3);    // SUBS.RES1 --> REG.ARG3
    prog[3][3] = NOP;

    prog[4][0] = MvData(MULU_RES1,ADDU_ARG1);   // product --> ADDU.ARG1
    prog[4][1] = MvData(REG_RES,ADDU_ARG2);     // REG.RES --> ADDU.ARG2
    prog[4][2] = MvData(SUBU_RES1,REG_ARG3);    // SUBU.RES1 --> REG.ARG3
    prog[4][3] = MvData(SUBS_RES1,SUBS_ARG1);   // feed SUBS.RES1 back into SUBS

    prog[5][0] = MvData(SUBU_RES1,REG_ARG1);    // SUBU.RES1 --> REG.ARG1
    prog[5][1] = MvData(ADDU_RES1,REG_ARG2);    // sum --> REG.ARG2
    prog[5][2] = MvData(SUBU_RES1,SUBU_ARG1);   // feed SUBU.RES1 back into SUBU
    prog[5][3] = RdCnst(SUBS_RES1,-2);          // offset -2: back to prog[3]

    // prog[6]: end of the program
    prog[6][0] = NOP;
    prog[6][1] = NOP;
    prog[6][2] = NOP;
    prog[6][3] = NOP;

    pause; // is required since all FUs start with a (non-immediate) await 
    await(pc==6); // wait until the program counter has reached prog[6]
}