`timescale 1ns / 1ps 
module TinyComp( 
 input Ph0In, Ph1In, //clock phases 
 input Reset, 
 input [31:00] InData, // I/O input 
 input InRdy, 
 output InStrobe, //We are executing an Input instruction 
 output OutStrobe //We are executing an Output instruction 
); 
wire doSkip; 
wire [31:00] WD; //write data to the register file 
wire [23:00] WDmid;  //the WD mux intermediate outputs 
wire [31:00] RFAout; //register file port A read data 
wire [31:00] RFBout; //register file port B read data 
reg  [9:0]   PC; //10-bit program counter 
wire [9:0]   PCinc, PCinc2, PCmux; 
wire [31:00] ALU; // ALUoutput 
wire [31:00] AddSubUnit; 
wire [31:00] ALUresult; 
wire [31:00] DM; //the Data memory (1K x 32) outputs 
wire [31:00] IM; //the Instruction memory (1K x 32) outputs 
wire Ph0, Ph1; //the (buffered) clocks 
wire WriteIM, WriteDM, Jump, LoadDM, LoadALU; //Opcode decodes 
//----------End of declarations------------ 
//this is the only register, other than the registers in the block RAMs. 
// Everything else is combinational. 
 always @(posedge Ph0) 
  if(Reset) PC <= 0; 
  else PC <= PCmux; 
// the Phases.  They are asymmetric -- see .ucf file 
BUFG ph0Buf(.I(Ph0In), .O(Ph0)); //this is Xilinx - specific 
BUFG ph1Buf(.I(Ph1In), .O(Ph1)); 
//the Skip Tester. 1 LUT 
assign doSkip = (~IM[24] & ~IM[4] &  IM[3] & ALU[31]) | 
                (~IM[24] &  IM[4] & ~IM[3] & (ALU == 0)) | 
                (~IM[24] &  IM[4] &  IM[3] & InRdy); 
                
//Opcode decode.  7 LUTs 
 assign WriteIM   = ~IM[24] & ~IM[2] & ~IM[1] &  IM[0]; //Op 1 
 assign WriteDM   = ~IM[24] & ~IM[2] &  IM[1] & ~IM[0]; //Op 2 
 assign OutStrobe = ~IM[24] & ~IM[2] &  IM[1] &  IM[0]; //Op 3 
 assign LoadDM    = ~IM[24] &  IM[2] & ~IM[1] & ~IM[0]; //Op 4 
 assign InStrobe  = ~IM[24] &  IM[2] & ~IM[1] &  IM[0]; //Op 5 
 assign Jump      = ~IM[24] &  IM[2] &  IM[1] & ~IM[0]; //op 6 
 assign LoadALU  =  ~IM[24] & ~IM[2] ;                  //Ops 0..3 
// instantiate the WD multiplexer. 24*2 + 8 = 56 LUTs 
 genvar i; 
 generate 
  for(i = 0; i < 32; i = i+1) 
  begin: wsblock 
   if(i < 10 )begin 
    assign WDmid[i] = (LoadALU & ALU[i]) | (InStrobe & InData[i]) | (LoadDM & DM[i]); 
//6-in 
    assign WD[i] = (Jump & PCinc[i]) | (IM[24] & IM[i]) | WDmid[i]; //5-in 
  end else if(i < 24) begin 
    assign WDmid[i] = (LoadALU & ALU[i]) | (InStrobe & InData[i]) | (LoadDM & DM[i]); 
//6-in 
    assign WD[i] = (IM[24] & IM[i]) | WDmid[i]; //3-in 
  end else 
    assign WD[i] = (LoadALU & ALU[i]) | (InStrobe & InData[i]) | (LoadDM & DM[i]); //6-in 
  end //wsblock 
endgenerate 
//the PC-derived signals 

 assign PCinc =  PC + 1; 
 assign PCinc2 = PC + 2; 
 assign PCmux = Jump ? ALU[9:0] : doSkip ? PCinc2 : PCinc; 
//instantiate the IM. Read during Ph0, written (if needed) at the beginning of the next Ph0
 ramx im( 
   .clkb(Ph0 ), .addrb(PCmux[9:0]),                .doutb(IM),     //the read port 
   .clka(Ph0), .addra(RFBout[9:0]), .wea(WriteIM), .dina(RFAout)); //the write port 
    
//instantiate the DM. Read during Ph1, written (if needed) at the beginning of the next Ph0 
 ramx dm( 
   .clkb(Ph1), .addrb(RFBout[9:0]),                .doutb(DM),  //the read port 
   .clka(Ph0), .addra(RFBout[9:0]), .wea(WriteDM), .dina(RFAout)); //the write port 
   
//instantiate the register file.  This has three independent addresses, so two BRAMs are needed. 
// read after the read and write addresses are stable (rise of Ph1) written at the end of the 
// instruction (rise of Ph0). 
 ramx rfA(.addra({3'b0, IM[31:25]}), .clka(Ph0), .wea(1'b0), .dina(WD), //write port 
   .clkb(Ph1), .addrb({3'b0, IM[23:17]}), .doutb(RFAout));              //read port 
 ramx rfB(.addra({3'b0, IM[31:25]}), .clka(Ph0), .wea(1'b1), .dina(WD), //write port 
   .clkb(Ph1), .addrb({3'b0, IM[16:10]}), .doutb(RFBout));              //read port 
//instantiate the ALU: An adder/subtractor followed by a shifter 
//32 LUTs. IM[8] => mask A, IM[7] => complement B, insert Cin 
 assign AddSubUnit = ((IM[8]? 32'b0 : RFAout) + (IM[7] ? ~RFBout : RFBout)) + IM[7];  
//generate the ALU and shifter one bit at a time 
 genvar j; 
 generate 
  for(j = 0; j < 32; j = j+1) 
   begin: shblock 
    assign ALUresult[j] =  //32 LUTs 
     (~IM[9] & AddSubUnit[j]) |  //0-3: A+B, A-B, B+1, B-1 
     ( IM[9] & ~IM[8] & ~IM[7] & (RFAout[j] & RFBout[j])) | //4: and 
     ( IM[9] & ~IM[8] &  IM[7] & (RFAout[j] | RFBout[j])) | //5: or 
     ( IM[9] &  IM[8] &  IM[7] & (RFAout[j] ^ RFBout[j])) ; //6: xor 
    assign ALU[j] = //32 LUTs 
     (~IM[6] & ~IM[5] & ALUresult[j]) | //0: no cycle 
     (~IM[6] &  IM[5] & ALUresult[(j + 1)  % 32]) | //1: rcy 1 
     ( IM[6] & ~IM[5] & ALUresult[(j + 8)  % 32]) | //2: rcy 8 
     ( IM[6] &  IM[5] & ALUresult[(j + 16) % 32]) ; //rcy 16 
   end //shblock 
 endgenerate 
endmodule