// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
// This module implements a classic RISC pipeline for the ISA defined in file //
// ScalarBehav. To this end, the stages of the erroneous pipeline described in//
// module PipeWithConflicts are endowed with a conflict detection and stalling//
// until the detected conflicts disappear. For the construction, recall that  //
// the pipeline constructed in PipeWithConflicts works on the following pipe- //
// line variables:                                                            //
//                                                                            //
//   +------+    +-------+    +--------+    +--------+    +--------+          //
//   |pc    |    |instr  |    | opc_EX |    | opc_MA |    | opc_WB |          //
//   |Mem[0]|    |overflw|    | fnc_EX |    | fnc_MA |    | fnc_WB |          //
//   |Mem[1]| IF |Reg[0] | ID | adr_EX | EX | adr_MA | MA | adr_WB | WB       //
//   |  :   |--->|   :   |--->|  rd_EX |--->|  rd_MA |--->|  rd_WB |---+      //
//   |      |    |Reg[7] |    | opS_EX |    | opS_MA |    | alu_WB |   |      //
//   |      |    |       |    | opL_EX |    | alu_MA |    | cnd_WB |   |      //
//   |      |    |       |    | opR_EX |    | cnd_MA |    |  ld_WB |   |      //
//   +------+    +-------+    +--------+    +--------+    +--------+   |      //
//      ^            ^        | alu_EX |    |  ld_MA |                 |      //
//      |            |        | cnd_EX |    +--------+                 |      //
//      |            |        +--------+                               |      //
//      |            |                                                 |      //
//      +------------+-------------------------------------------------+      //
//                                                                            //
// The blocks are thereby viewed as registers where the predecessor stage     //
// writes to (with a delayed write) and where the successor stage immediately //
// reads from. As can be seen, we also use variables alu_EX and cnd_EX which  //
// are immediately written by EX, so that result forwarding from EX to ID     //
// becomes possible. Similarly, we let MA immediately write a value ld_MA to  //
// forward the latter to ID one step earlier. Even though these variables are //
// written in EX and MA, respectively, we put them in the registers in front  //
// of these stages due to the immediate writes, since they are available at   //
// the time when the stage writing them (immediately) reads its inputs.       //
//  The resulting pipeline has the following conflicts:                       //
//  (1) InstructionFetch reads the pc, which is written only in the WB phase. //
//  (2) InstructionDecode reads the register file which may be updated by WB. //
//      The same holds for the special register overflw.                      //
// Note that InstructionFetch also reads the main memory, but we distinguish  //
// between program and data memory, and therefore, there is no conflict. Since//
// the data memory is both read and written only by MA, there is also no      //
// conflict on the data memory.                                               //
//   We must therefore stall the pipeline if one of the above two conflicts   //
// is observed, i.e., as long as a branch or jump instruction is in the pipe- //
// line or as long as an instruction writing to a register that is read by ID //
// is in the pipeline. In the first case, IF must be stalled, in the latter   //
// case, we must stall ID.                                                    //
//   For this reason, each stage consumes the values of its input registers   //
// and produces values in its output registers that are available at the next //
// point of time while values produced by immediate writes are immediately    //
// available. One may therefore model a pipeline also as a dataflow graph or  //
// a Petri net consisting of an alternating sequence of process nodes and     //
// buffers holding the values. In addition, the instruction fetch stage reads //
// also the instruction memory and the PC, and the instruction decode stage   //
// reads additionally the register file. Thus, if another stages modifies the //
// PC or a register that is to be read in the decode stage, the corresponding //
// stage has to be stalled.                                                   //
//   If a stage ST1 is stalled while its successor stage ST2 is not stalled,  //
// then ST2 consumes the values in the pipeline registers between ST1 and ST2,//
// but no new values are produced at that point of time. For this reason, the //
// pipeline registers will not contain valid values in the next step, so that //
// ST2 must be stalled next. To distinguish between valid and invalid values, //
// one has to use special flags for each group of pipeline registers between  //
// two pipeline stages. If pipeline registers contain valid inputs, one also  //
// says that there are tokens inside, otherwise they contain bubbles.         //
//   While the described effect can create bubbles, it is also possible that  //
// bubbles disappear: Clearly, this can only happen if one pipeline stage     //
// consumes the values slower than they are produced. This can happen if the  //
// consumer stage is stalled or if its execution takes more time than that of //
// its producer stage. Typically, evaluation of complex operations in the     //
// execution unit or loading values from memory are responsible for the latter//
// effect. In general, a pipeline stage must be stalled for the following two //
// reasons:                                                                   //
//   (1) If it does not have valid inputs for the current reaction or         //
//   (2) if it would overwrite its previous outputs that have not yet been    //
//       consumed by the next pipeline stage.                                 //
// In [CoKG06a,CoKi07a], the implementation of elastic synchronous systems has//
// been described that solves these problems by the replacement of wires by   //
// buffers controlled by a Synchronous Elastic Flow (SELF) protocol. To this  //
// end the modules communicate with each other by signals valid and stop in a //
// similar fashion as the pipeline is constructed below that uses signals get //
// and valid:                                                                 //
//                                                                            //
//    +-----+  valid   +-----+  valid   +-----+                               //
//    |     |--------->|     |--------->|     |                               //
//    |     | f1(data) |     | f2(data) |     |                               //
//    | reg |--------->| reg |--------->| reg |                               //
//    |     |   get    |     |   get    |     |                               //
//    |     |<---------|     |<---------|     |                               //
//    +-----+          +-----+          +-----+                               //
//                                                                            //
// The communication of the stages is as follows: whenever valid data that has//
// not yet been consumed is contained in the registers, the producer stage ST //
// sets a signal "validST" to indicate the consumer stage that it can read the//
// values. The producer stage ST has to wait until this happens, since it     //
// would otherwise overwrite these values. It may however already perform a   //
// new computation at the point of time when the values are consumed since the//
// values are written by a delayed assignment. If the values are consumed by  //
// the following stage, the consumer stage sends a signal "get". The general  //
// code of a pipeline stage ST with producer stage IN and consumer stage OUT  //
// is as follows:                                                             //
//                                                                            //
//    loop {                                                                  //
//        // wait for input data and the resolution of the conflicts          //
//        wvST: immediate await(validIN & !conflictST);                       //
//        // signal that input values are consumed                            //
//        emit(getST);                                                        //
//        // with the input values, compute new output values that            //
//        // are written by delayed assignments                               //
//        Operate(…);                                                         //
//        // wait from the next point of time on until outputs are consumed   //
//        do {                                                                //
//            wgST: pause;                                                    //
//            emit(validST);                                                  //
//        } while(!getOUT);                                                   //
//    }                                                                       //
//                                                                            //
// Thus, if there were no conflicts, the stage would always be able to perform//
// an Operation since then validIN and validOUT would be true, and conflictST //
// would be false. Note that all get-signals would be also invariantly true.  //
//   Recalling the above pipeline, there are two sources for pipeline stalls  //
// which are now expressed by the conditions conflict_IF and conflict_ID:     //
//                                                                            //
// conflict_IF = writePC(opc_ID) or writePC(opc_EX) or writePC(MA)            //
//                                                                            //
// conflict_ID =                                                              //
//   rs1(instr_ID)==rd_EX & readRegL(instr_ID) & writeReg(opc_EX) & valid_EX  //
// | rs1(instr_ID)==rd_EX & readRegL(instr_ID) & writeReg(opc_MA) & valid_MA  //
// | rs2(instr_ID)==rd_EX & readRegR(instr_ID) & writeReg(opc_EX) & valid_EX  //
// | rs2(instr_ID)==rd_EX & readRegR(instr_ID) & writeReg(opc_MA) & valid_MA  //
//                                                                            //
// ************************************************************************** //

macro DataWidth =  8; // bit-width of registers

// -------------------------------------------------------------------------- //
// macros to extract parts of machine instruction i                           //
// -------------------------------------------------------------------------- //

macro OpCode(i)  = i{15:10}; // opcode of instruction i
macro DestReg(i) = i{9:7};   // register index of destination
macro SrcLReg(i) = i{6:4};   // register index of left operand 
macro SrcRReg(i) = i{3:1};   // register index of right operand 
macro VctFlag(i) = i{0};     // whether it is a vector operation
macro ConstOp(i) = i{3:0};   // 4-bit immediate operand
macro JumpAdr(i) = i{9:0};   // address of jump instruction
macro FctCode(i) = i{6:0};   // 7-bit immediate operand or function code

// ----------------------------------------------------------------------------
// macros for conflict detection
// ----------------------------------------------------------------------------

macro writePC(opc) = bv2nat(0b100010) <= bv2nat(opc) & bv2nat(opc)<=bv2nat(0b100101);
macro readRegL(i)  = bv2nat(OpCode(i))<=bv2nat(0b011111) or OpCode(i)==0b100010 or OpCode(i)==0b100011;
macro readRegR(i)  = bv2nat(OpCode(i))<=bv2nat(0b011001) and (!OpCode(i){4} -> !OpCode(i){1});
macro writeReg(opc) = bv2nat(opc)<=bv2nat(0b011001) 
                      or opc==0b011110 or opc==0b100000 
                      or opc==0b100001 or opc==0b100111;



module PipeWithStalling(
    bv{16} ?instr,               // instruction to be performed now    
    nat pc,                      // program counter
    event nat !adrBus,           // address for memory access
    event bv{DataWidth} dataBus, // data for memory access
    event readMem,writeMem,      // whether data is read or written to memory
    event reqMem,ackMem,doneMem  // signals for memory transaction
    )
{
    // ----------------------------------------------------------------------
    // scalar registers
    // ----------------------------------------------------------------------    
    [8]bv{DataWidth} Reg;       // scalar registers
    bv{DataWidth} overflw;      // overflw register (completing ALU result)
    // ----------------------------------------------------------------------
    // auxiliary local variables
    // ----------------------------------------------------------------------
    bv{16} instr_ID;                      // instruction register
    bv{6}  opc_ID,opc_EX,opc_MA,opc_WB;   // opcodes of instructions
    bv{7}  fnc_EX,fnc_MA,fnc_WB;          // function codes or 7-bit constant
    bv{10} adr_EX,adr_MA,adr_WB;          // jump addresses of J-type instruction
    nat{8} rd_EX, rd_MA, rd_WB;           // destination registers
    bv{DataWidth} opS_EX,opS_MA;          // values to be stored
    bv{DataWidth} opL_EX,opR_EX;          // ALU operands
    bv{2*DataWidth} alu_EX,alu_MA,alu_WB; // ALU results
    bool cnd_EX,cnd_MA,cnd_WB;            // results of branch condition
    bv{DataWidth} ld_MA,ld_WB;            // load results
    // ----------------------------------------------------------------------
    // signals for stalling control
    // ----------------------------------------------------------------------    
    event conflict_IF,conflict_ID;
    event get_IF,get_ID,get_EX,get_MA,get_WB;
    event valid_ID,valid_EX,valid_MA,valid_WB;

    next(pc) = 0;

    // ----------------------------------------------------------------------    
    // The following loop is not a pipeline stage. It runs in parallel and is
    // used to evaluate the conditions for conflict detection and to determine
    // the next pc in case no branches are evaluated in WB.
    // ----------------------------------------------------------------------    
    loop {
        wcf: pause;
        // check whether we have a control conflict
        conflict_IF = writePC(opc_ID) or writePC(opc_EX) or writePC(opc_MA);
        // check whether we have a RAW conflict
        let(rs1 = bv2nat(SrcLReg(instr_ID)))
        let(rs2 = bv2nat(SrcRReg(instr_ID)))
        conflict_ID =
            rs1==rd_EX & readRegL(instr_ID) & writeReg(opc_EX) & rd_EX!=0 & valid_EX
          | rs1==rd_MA & readRegL(instr_ID) & writeReg(opc_MA) & rd_MA!=0 & valid_MA
          | rs2==rd_EX & readRegR(instr_ID) & writeReg(opc_EX) & rd_EX!=0 & valid_EX
          | rs2==rd_MA & readRegR(instr_ID) & writeReg(opc_MA) & rd_MA!=0 & valid_MA;
        // determine next value of pc unless a branch is executed in WB
        if(!(get_WB & writePC(opc_WB)))
            next(pc) = (get_IF ? pc+1 : pc);
    }
    // ----------------------------------------------------------------------
    // instruction fetch
    // ----------------------------------------------------------------------
    ||
    loop {
        wvIF: immediate await(!conflict_IF);
        emit(get_IF);
        next(instr_ID) = instr;
        do {
            wgIF: pause;
            emit(valid_ID);
        } while(!get_ID);
    }
    // ----------------------------------------------------------------------
    // instruction decode: determine operands and parts of instruction word
    // ----------------------------------------------------------------------
    ||
    loop {
        wvID: immediate await(valid_ID and !conflict_ID);
        emit(get_ID);
        Decode(instr_ID,Reg,overflw,
               opc_ID,opc_EX,fnc_EX,adr_EX,rd_EX,opS_EX,opL_EX,opR_EX);
        do {
            wgID: pause;
            emit(valid_EX);
        } while(!get_EX);
    }
    // ----------------------------------------------------------------------
    // execute instruction
    // ----------------------------------------------------------------------
    ||
    loop {
        wvEX: immediate await(valid_EX);
        emit(get_EX);
        Execute(opc_EX,fnc_EX,adr_EX,rd_EX,opS_EX,opL_EX,opR_EX,
                opc_MA,fnc_MA,adr_MA,rd_MA,opS_MA,alu_EX,alu_MA,cnd_EX,cnd_MA);
        do {
            wgEX: pause;
            emit(valid_MA);
        } while(!get_MA);
    }
    // ----------------------------------------------------------------------
    // memory transaction
    // ----------------------------------------------------------------------
    ||
    loop {
        wvMA: immediate await(valid_MA);
        emit(get_MA);
        MemAccess(pc,opc_MA,fnc_MA,adr_MA,rd_MA,opS_MA,alu_MA,cnd_MA,
                     opc_WB,fnc_WB,adr_WB,rd_WB,       alu_WB,cnd_WB,
                     ld_MA,ld_WB,
                     adrBus,dataBus,
                     readMem,writeMem,reqMem,ackMem,doneMem);
        do {
            wgMA: pause;
            emit(valid_WB);
        } while(!get_WB);
    }
    // ----------------------------------------------------------------------
    // write back results in registers
    // ----------------------------------------------------------------------
    ||
    loop {
        wvWB: immediate await(valid_WB);
        emit(get_WB);
        WriteBack(pc,opc_WB,fnc_WB,adr_WB,rd_WB,alu_WB,ld_WB,cnd_WB,
                     Reg,overflw);
        wgWB: pause;
    }

}
drivenby DataConflictsALU {
    [13]bv{16} Prog;
    Prog = [
        0b1000000010000001,  //   0:     mov R1,1
        0b1000000100000010,  //   1:     mov R2,2
        0b1000000110000011,  //   2:     mov R3,3
        0b1000001000000100,  //   3:     mov R4,4
        0b1000001010000101,  //   4:     mov R5,5
        0b0000000010100110,  //   5: I1: add R1,R2,R3    
        0b0000001000011000,  //   6: I2: add R4,R1,R4    
        0b0000001010010010,  //   7: I3: add R5,R1,R1    
        0b0000001000010100,  //   8: I3: add R4,R1,R2    
        0b0000100000000000,  //   9:     nop
        0b0000100000000000,  //  10:     nop
        0b0000100000000000,  //  11:     nop
        0b0000100000000000   //  12:     nop
    ];
    pause;
    weak abort {
        loop {
            instr = Prog[pc];
            pause;
        }
    } when(pc>=12);
}