// ************************************************************************** //
//                                                                            //
//    eses                   eses                                             //
//   eses                     eses                                            //
//  eses    eseses  esesese    eses   Embedded Systems Group                  //
//  ese    ese  ese ese         ese                                           //
//  ese    eseseses eseseses    ese   Department of Computer Science          //
//  eses   eses          ese   eses                                           //
//   eses   eseses  eseseses  eses    University of Kaiserslautern            //
//    eses                   eses                                             //
//                                                                            //
// ************************************************************************** //
//                                                                            //
// This file defines the instruction set of a simple RISC processor used for  //
// teaching and research of the Embedded Systems Group at the University of   //
// Kaiserslautern. The module below defines the single cycle behavior where   //
// all instructions, except for memory transactions require only one cycle    //
// for their execution. The module is not intended for synthesis, rather as a //
// golden device for verification of more sophisticated implementations.      //
//                                                                            //
// Instructions are encoded by 16 bits, whereas the uppermost 6 bits encode   //
// the opcode of the instruction. The other bits depend on the opcode in that //
// the following types of instructions are distinguished (by the number of    //
// registers that are affected):                                              //
//                                                                            //
//    R-Type (3 registers):                                                   //
//        i{15:10} : opcode                                                   //
//        i{9:7}   : index of destination register                            //
//        i{6:4}   : index of register providing the left operand             //
//        i{3:1}   : index of register providing the right operand            //
//        i{0}     : additional R-type opcode, e.g. vector/scalar operation   //
//    I-Type (2 registers; 1 immediate operand):                              //
//        i{15:10} : opcode                                                   //
//        i{9:7}   : index of destination register                            //
//        i{6:4}   : index of register providing the left operand             //
//        i{3:0}   : 4-bit constant (often the right operand)                 //
//    S-Type (1 register; 1 immediate operand):                               //
//        i{15:10} : opcode                                                   //
//        i{9:7}   : index of destination register                            //
//        i{6:0}   : 7-bit constant (either operand or function code)         //
//    J-Type (0 register; 1 immediate operand):                               //
//        i{15:10} : opcode                                                   //
//        i{9:0}   : 10-bit constant for address offset                       //
//                                                                            //
// The processor has 8 general purpose registers Reg[0],..,Reg[7] whose width //
// is determined by parameter DataWidth. There is also an additional register //
// called "overflw" that takes the upper half of the result of ALU operations.//
// In the vector machine extension, the CPU has further registers Vec[0],..,  //
// Vec[7] such that each Vec[i] can store a vector of DataWidth scalar values //
// of width DataWidth. The ALU instructions operating on two operand registers//
// refer to the vector registers if bit i{0} of the instruction holds.        //
//                                                                            //
// The communication with the instruction memory is quite simple: The program //
// memory is viewed as an array holding the machine instructions. The CPU     //
// writes via its output "pc" the index of the desired instruction, and the   //
// environment provides via input "instr" the corresponding instruction word. //
// This way of communication allows one to easily convert assembler programs  //
// to drivers so that a simulation of the program simulates the processor.    //
//                                                                            //
// The communication with the data memory is more complicated: This memory is //
// assumed to be word addressed, i.e., the size of a memory cell is DataWidth.//
// A memory transaction works as follows: First, the CPU applies for a memory //
// access by emitting "reqMem" until the memory responds with "ackMem". On a  //
// single processor computer, "reqMem" may arrive at the same point of time   //
// where "ackMem" is set. At that point of time, the address is written to    //
// "adrBus" and the signals "readMem"/"writeMem" have to be provided so that  //
// the memory system receives its task. In case of a store operation, also the//
// data to be written must be provided over "dataBus". Then, the CPU has to   //
// wait until signal "doneMem" is set by the main memory which terminates the //
// memory transaction (at that point of time, the data to be loaded can be    //
// fetched from "dataBus"). Emitting "readMem" and "writeMem" at the same time//
// instructs the memory system to write back the cache content (if caches are //
// used).                                                                     //
// ************************************************************************** //

// -------------------------------------------------------------------------- //
// Opcodes of the instructions: Each opcode consists of six bits, and for the //
// arithmetic instructions, opc{0} holds if the operands are interpreted as   //
// unsigned integers, and opc{1} holds if the right operand is an immediate   //
// constant ConstOp(i). The bits opc{3:2} encode the arithmetic operation.    //
// Note that i{0} holds iff these operations refer to vector registers rather //
// than scalar registers, so that an extension to a vector processor can be   //
// easily obtained. The other opcodes are simply obtained by enumeration of   //
// the instructions.                                                          //
// -------------------------------------------------------------------------- //

// Arithmetic group (opcodes 0b00xxxx): bit 0 selects unsigned, bit 1 selects
// an immediate right operand, bits 3:2 select add/sub/mul/div.
macro ADD   = 0b000000;     //   signed addition of two registers
macro ADDU  = 0b000001;     // unsigned addition of two registers
macro ADDI  = 0b000010;     //   signed addition of a register and a constant
macro ADDIU = 0b000011;     // unsigned addition of a register and a constant
macro SUB   = 0b000100;     //   signed subtraction of two registers
macro SUBU  = 0b000101;     // unsigned subtraction of two registers
macro SUBI  = 0b000110;     //   signed subtraction of a register and a constant
macro SUBIU = 0b000111;     // unsigned subtraction of a register and a constant
macro MUL   = 0b001000;     //   signed multiplication of two registers
macro MULU  = 0b001001;     // unsigned multiplication of two registers
macro MULI  = 0b001010;     //   signed multiplication of a register and a constant
macro MULIU = 0b001011;     // unsigned multiplication of a register and a constant
macro DIV   = 0b001100;     //   signed division of two registers
macro DIVU  = 0b001101;     // unsigned division of two registers
macro DIVI  = 0b001110;     //   signed division of a register and a constant
macro DIVIU = 0b001111;     // unsigned division of a register and a constant

// Comparison group: the result register is set to all ones (true) or all
// zeros (false).
macro SLT   = 0b010000;     //   signed less-than comparison of two registers
macro SLTU  = 0b010001;     // unsigned less-than comparison of two registers
macro SLE   = 0b010010;     //   signed less-than-or-equal comparison of two registers
macro SLEU  = 0b010011;     // unsigned less-than-or-equal comparison of two registers
macro SEQ   = 0b010100;     // bitwise   equality test of two registers
macro SNE   = 0b010101;     // bitwise inequality test of two registers

// Logic group: bitwise operations on two registers.
macro AND   = 0b010110;     // bitwise conjunction of two registers
macro OR    = 0b010111;     // bitwise disjunction of two registers
macro NAND  = 0b011000;     // bitwise negated conjunction of two registers
macro NOR   = 0b011001;     // bitwise negated disjunction of two registers

// Memory and move group.
macro LD    = 0b011010;     // load word from memory to register Reg[rd]
macro ST    = 0b011011;     // store Reg[rd] to memory
macro LVWS  = 0b011100;     // load vector register with stride
macro SVWS  = 0b011101;     // store vector register with stride
macro LL    = 0b011110;     // load linked for multiprocessors
macro SC    = 0b011111;     // store conditional for multiprocessors
macro MOV   = 0b100000;     // move   signed constant FctCode(i) = i{6:0} to Reg[rd]
macro MOVU  = 0b100001;     // move unsigned constant FctCode(i) = i{6:0} to Reg[rd]

// Control-flow group: all targets are relative to the current pc.
macro BEZ   = 0b100010;     // branch to pc+Reg[rs1]+ConstOp(i) if Reg[rd]==Zero
macro BNZ   = 0b100011;     // branch to pc+Reg[rs1]+ConstOp(i) if Reg[rd]!=Zero
macro JMP   = 0b100100;     // jump to pc+Reg[rd]+FctCode(i)
macro J     = 0b100101;     // jump to pc+JumpAdr(i)

// Note: opcode 0b100110 is not assigned to any instruction in this list.

// Special group: all six names below are aliases of the same opcode 0b100111;
// the instruction is selected by the 7-bit function code FctCode(i) = i{6:0}
// (see the fn_* macros that follow).
macro SYNC  = 0b100111;     // note that this group of instructions share the same
macro OVF   = 0b100111;     // opcode, and differ in the additional function code
macro MVTM  = 0b100111;     // listed below with their meaning
macro MVFM  = 0b100111;     // (same opcode; selected by fn_MVFM)
macro MVTL  = 0b100111;     // (same opcode; selected by fn_MVTL)
macro MVFL  = 0b100111;     // (same opcode; selected by fn_MVFL)

// Function codes (7 bits) distinguishing the instructions of the special group.
macro fn_SYNC  = 0b0000000; // flush cache content to main memory
macro fn_OVF   = 0b0000001; // move content of "overflw" to Reg[rd]
macro fn_MVTM  = 0b0000010; // move   to mask register (on vector machines)
macro fn_MVFM  = 0b0000011; // move from mask register (on vector machines)
macro fn_MVTL  = 0b0000100; // move   to length register (on vector machines)
macro fn_MVFL  = 0b0000101; // move from length register (on vector machines)


// -------------------------------------------------------------------------- //
// macros of the processor                                                    //
// -------------------------------------------------------------------------- //

// Width in bits of a scalar register and of a data memory word; ALU results
// are computed with 2*DataWidth bits and split by UpperWord/LowerWord.
macro DataWidth = 8;
macro One           = {true::DataWidth};          // bitvector consisting of 1s
macro Zero          = {false::DataWidth};         // bitvector consisting of 0s
macro UpperWord(r)  = r{2*DataWidth-1:DataWidth}; // upper half of double word
macro LowerWord(r)  = r{DataWidth-1:0};           // lower half of double word


// -------------------------------------------------------------------------- //
// macros to extract parts of machine instruction i                           //
// -------------------------------------------------------------------------- //

// Each macro slices a field out of a 16-bit instruction word i. Several
// fields overlap on purpose: which of them is meaningful depends on the
// instruction type (R/I/S/J) selected by the opcode.
macro OpCode(i)  = i{15:10}; // opcode of instruction i (all types)
macro DestReg(i) = i{9:7};   // register index of destination (R-, I-, S-type)
macro SrcLReg(i) = i{6:4};   // register index of left operand (R-, I-type)
macro SrcRReg(i) = i{3:1};   // register index of right operand (R-type)
macro VctFlag(i) = i{0};     // whether it is a vector operation (R-type)
macro ConstOp(i) = i{3:0};   // 4-bit immediate operand (I-type)
macro JumpAdr(i) = i{9:0};   // address of jump instruction (J-type, 10 bits)
macro FctCode(i) = i{6:0};   // 7-bit immediate operand or function code (S-type)




// ----------------------------------------------------------------------------
//                      single cycle behavior of the CPU
// ----------------------------------------------------------------------------

// ----------------------------------------------------------------------------
// ScalarBehav: reference ("golden") model of the scalar CPU.
//
// After the initial step that sets pc to 0, every iteration of the main loop
// decodes the instruction word currently applied to "instr" and executes it.
// All instructions complete within the macro step in which they are decoded,
// except the memory instructions (LD, LL, ST, SC, SYNC), which hold pc until
// the memory handshake has finished.
//
// Interface:
//   instr    (in)    instruction word for the current pc
//   pc       (inout) program counter; index of the instruction to fetch
//   adrBus   (out)   address of the data memory access
//   dataBus  (inout) data written by a store / delivered by a load
//   readMem, writeMem
//                    emitted to select the kind of memory access; both are
//                    emitted together for SYNC
//   reqMem   emitted until ackMem is observed
//   ackMem   read: memory grants the access
//   doneMem  read: memory transaction has terminated
//
// Every write to the register file with rd==0 stores Zero, so the model never
// writes a non-zero value to Reg[0].
//
// Instructions that match no case (e.g. R-type arithmetic/comparison with the
// vector flag set, LVWS/SVWS, MVTM/MVFM/MVTL/MVFL) only increment pc.
// ----------------------------------------------------------------------------
module ScalarBehav (
    bv{16} ?instr,               // instruction to be performed now    
    nat pc,                      // program counter
    event nat !adrBus,           // address for memory access
    event bv{DataWidth} dataBus, // data for memory access
    event readMem,writeMem,      // whether data is read or written to memory
    event reqMem,ackMem,doneMem  // signals for memory transaction
    )
{
    // ----------------------------------------------------------------------
    // registers for scalar data (required in all versions)
    // ----------------------------------------------------------------------    
    [8]bv{DataWidth} Reg;       // scalar registers
    bv{DataWidth} overflw;      // overflw register (completing result)
    bv{2*DataWidth} AluOut;     // intermediate result of ALU operations

    // ----------------------------------------------------------------------
    // auxiliary local variables to increase readability; they are assigned
    // once at decode time and are still read in later steps of a memory
    // transaction
    // ----------------------------------------------------------------------    
    bv{6} opc;                  // opcode of instr
    nat{8} rd,rs1,rs2;          // register indices taken from instr
    bv{4} cst;                  // constant operand of I-type instructions
    bv{7} fnc;                  // constant operand of S-type instructions
    bv{10} adr;                 // jump address of J-type instruction
    bool vct;                   // whether it's a vector instruction

    // execution starts at instruction index 0
    next(pc) = 0;
    loop {
        waitInstr: pause;
        // --------------------------------------------------------------------
        // instruction decode: all fields are extracted unconditionally; which
        // of them are meaningful depends on the instruction type
        // --------------------------------------------------------------------
        opc = OpCode(instr);
        rd  = bv2nat(DestReg(instr));
        rs1 = bv2nat(SrcLReg(instr));
        rs2 = bv2nat(SrcRReg(instr));
        cst = ConstOp(instr);
        fnc = FctCode(instr);
        adr = JumpAdr(instr);
        vct = VctFlag(instr);

        // --------------------------------------------------------------------
        // execute current instruction
        // --------------------------------------------------------------------
        case
            // ----------------------------------------------------------------
            // arithmetic instructions with register operands: the result is
            // computed with double width; the lower half goes to Reg[rd], the
            // upper half to overflw. For divisions, overflw receives the
            // remainder and Reg[rd] the quotient.
            // NOTE(review): divisions have no guard against a zero divisor.
            // ----------------------------------------------------------------
            (opc==ADD & !vct)  do {
                AluOut = int2bv(bv2int(Reg[rs1]) + bv2int(Reg[rs2]),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==ADDU & !vct) do {
                AluOut = nat2bv(bv2nat(Reg[rs1]) + bv2nat(Reg[rs2]),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==SUB & !vct) do {
                AluOut = int2bv(bv2int(Reg[rs1]) - bv2int(Reg[rs2]),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==SUBU & !vct) do {
                AluOut = nat2bv(bv2nat(Reg[rs1]) - bv2nat(Reg[rs2]),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==MUL & !vct) do {
                AluOut = int2bv(bv2int(Reg[rs1]) * bv2int(Reg[rs2]),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==MULU & !vct) do {
                AluOut = nat2bv(bv2nat(Reg[rs1]) * bv2nat(Reg[rs2]),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==DIV & !vct) do {
                next(overflw) = int2bv(bv2int(Reg[rs1]) % bv2int(Reg[rs2]),DataWidth);
                next(Reg[rd]) = (rd==0 ? Zero : int2bv(bv2int(Reg[rs1]) / bv2int(Reg[rs2]),DataWidth));
                }
            (opc==DIVU & !vct) do {
                next(overflw) = nat2bv(bv2nat(Reg[rs1]) % bv2nat(Reg[rs2]),DataWidth);
                next(Reg[rd]) = (rd==0 ? Zero : nat2bv(bv2nat(Reg[rs1]) / bv2nat(Reg[rs2]),DataWidth));
                }
            // ----------------------------------------------------------------
            // arithmetic instructions with constant operands: same scheme as
            // above, with the 4-bit constant cst as right operand (interpreted
            // as signed or unsigned according to the instruction)
            // ----------------------------------------------------------------
            (opc==ADDI) do {
                AluOut = int2bv(bv2int(Reg[rs1]) + bv2int(cst),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==ADDIU) do {
                AluOut = nat2bv(bv2nat(Reg[rs1]) + bv2nat(cst),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==SUBI) do {
                AluOut = int2bv(bv2int(Reg[rs1]) - bv2int(cst),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==SUBIU) do {
                AluOut = nat2bv(bv2nat(Reg[rs1]) - bv2nat(cst),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==MULI) do {
                AluOut = int2bv(bv2int(Reg[rs1]) * bv2int(cst),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==MULIU) do {
                AluOut = nat2bv(bv2nat(Reg[rs1]) * bv2nat(cst),2*DataWidth);
                next(overflw) = UpperWord(AluOut);
                next(Reg[rd]) = (rd==0 ? Zero : LowerWord(AluOut));
                }
            (opc==DIVI) do {
                next(overflw) = int2bv(bv2int(Reg[rs1]) % bv2int(cst),DataWidth);
                next(Reg[rd]) = (rd==0 ? Zero : int2bv(bv2int(Reg[rs1]) / bv2int(cst),DataWidth));
                }
            (opc==DIVIU) do {
                next(overflw) = nat2bv(bv2nat(Reg[rs1]) % bv2nat(cst),DataWidth);
                next(Reg[rd]) = (rd==0 ? Zero : nat2bv(bv2nat(Reg[rs1]) / bv2nat(cst),DataWidth));
                }
            // ----------------------------------------------------------------
            // comparison instructions: Reg[rd] becomes One (all bits set) if
            // the comparison holds and Zero otherwise
            // ----------------------------------------------------------------
            (opc==SLT & !vct) do 
                next(Reg[rd]) = (rd==0 ? Zero : (bv2int(Reg[rs1]) <  bv2int(Reg[rs2])?One:Zero));
            (opc==SLTU & !vct) do 
                next(Reg[rd]) = (rd==0 ? Zero : (bv2nat(Reg[rs1]) <  bv2nat(Reg[rs2])?One:Zero));
            (opc==SLE & !vct) do 
                next(Reg[rd]) = (rd==0 ? Zero : (bv2int(Reg[rs1]) <= bv2int(Reg[rs2])?One:Zero));
            (opc==SLEU & !vct) do 
                next(Reg[rd]) = (rd==0 ? Zero : (bv2nat(Reg[rs1]) <= bv2nat(Reg[rs2])?One:Zero));
            (opc==SEQ & !vct) do
                next(Reg[rd]) = (rd==0 ? Zero : (Reg[rs1] == Reg[rs2]?One:Zero));
            // SNE yields One iff the operands differ. No outer negation here:
            // negating the selected bitvector would turn SNE into SEQ.
            (opc==SNE & !vct) do
                next(Reg[rd]) = (rd==0 ? Zero : (Reg[rs1] != Reg[rs2]?One:Zero));
            // ----------------------------------------------------------------
            // logic instructions
            // NOTE(review): unlike the arithmetic and comparison cases, these
            // guards do not test !vct, so the scalar operation is performed
            // even if the vector flag is set - confirm that this is intended.
            // ----------------------------------------------------------------
            (opc==AND)  do next(Reg[rd]) = (rd==0 ? Zero :   Reg[rs1] & Reg[rs2]);
            (opc==OR)   do next(Reg[rd]) = (rd==0 ? Zero :   Reg[rs1] | Reg[rs2]);
            (opc==NAND) do next(Reg[rd]) = (rd==0 ? Zero : !(Reg[rs1] & Reg[rs2]));
            (opc==NOR)  do next(Reg[rd]) = (rd==0 ? Zero : !(Reg[rs1] | Reg[rs2]));
            // ----------------------------------------------------------------
            // load and store instructions (LL/SC are handled exactly like
            // LD/ST in this model). SYNC shares its opcode with other special
            // instructions, hence the additional test of the function code;
            // the parentheses make this grouping explicit.
            // ----------------------------------------------------------------
            (opc==LD | opc==LL | opc==ST | opc==SC | (opc==SYNC & fnc==fn_SYNC)) do {
                // apply for memory access by emitting reqMem until
                // ackMem holds; may take time on multiprocessors.
                // pc is held while the access has not been granted.
                weak immediate abort {
                    loop {
                        emit(reqMem);
                        if(!ackMem) next(pc) = pc;
                        waitMem1: pause;
                    }
                } when(ackMem);
                // provide address and read/write request signal
                // until memory transaction is done; this phase starts in the
                // same step in which ackMem was observed
                weak immediate abort {
                    loop {
                        // effective address: both parts are taken as unsigned
                        adrBus = bv2nat(Reg[rs1]) + bv2nat(cst);
                        case
                            (opc==LD)   do emit(readMem);
                            (opc==LL)   do emit(readMem);
                            (opc==ST)   do {emit(writeMem); dataBus = Reg[rd];}
                            (opc==SC)   do {emit(writeMem); dataBus = Reg[rd];}
                            // read+write together requests a cache write-back
                            (opc==SYNC) do {emit(readMem); emit(writeMem);}
                        default nothing;
                        // hold pc while the transaction is still running
                        if(!doneMem) next(pc) = pc;
                        waitMem2: pause;
                    }
                } when(doneMem);
                // in case of load, store the data in the register
                if(opc==LD | opc==LL)
                    next(Reg[rd]) = (rd==0 ? Zero : dataBus);
                }
            // ----------------------------------------------------------------
            // moving constants to registers: the 7-bit constant fnc is sign-
            // extended (MOV) or zero-extended (MOVU) to DataWidth bits
            // ----------------------------------------------------------------
            (opc==MOV)  do next(Reg[rd]) = (rd==0 ? Zero : int2bv(bv2int(fnc),DataWidth));
            (opc==MOVU) do next(Reg[rd]) = (rd==0 ? Zero : nat2bv(bv2nat(fnc),DataWidth));
            // ----------------------------------------------------------------
            // branch and jump instructions: all targets are relative to pc and
            // all offsets are interpreted as signed numbers
            // ----------------------------------------------------------------
            (opc==BEZ) do 
                next(pc) = (Reg[rd]==Zero?pc+bv2int(Reg[rs1])+bv2int(cst):pc+1);
            (opc==BNZ) do 
                next(pc) = (Reg[rd]!=Zero?pc+bv2int(Reg[rs1])+bv2int(cst):pc+1);
            (opc==JMP) do 
                next(pc) = pc+bv2int(Reg[rd])+bv2int(fnc);
            (opc==J) do 
                next(pc) = pc+bv2int(adr);
            // ----------------------------------------------------------------
            // move content of overflow register to destination register rd
            // ----------------------------------------------------------------
            (opc==OVF & fnc==fn_OVF) do 
                next(Reg[rd]) = (rd==0 ? Zero : overflw);
            // ----------------------------------------------------------------
        default nothing;
        // --------------------------------------------------------------------
        // if no branch instruction is executed, the pc has to be incremented
        // to fetch the next instruction; for memory instructions this point
        // is reached only in the step where doneMem holds
        // --------------------------------------------------------------------
        if(opc!=BEZ & opc!=BNZ & opc!=JMP & opc!=J)
            next(pc) = pc+1;
    }
}