// BPU.scala
package noop

import chisel3._
import chisel3.util._
import chisel3.util.experimental.BoringUtils

import utils._

/** Splits an `AddrBits`-wide address into the fields used to index a
  * prediction table.
  *
  * Chisel packs Bundle fields with the first-declared field in the most
  * significant bits, so the layout from MSB to LSB is `tag | idx | pad`.
  *
  * @param idxBits width of the index field (callers in this file pass
  *                `log2Up` of the number of table entries)
  */
class TableAddr(val idxBits: Int) extends NOOPBundle {
  // The tag is whatever remains after the index and the 2 pad bits.
  def tagBits = AddrBits - 2 - idxBits

  val tag = UInt(tagBits.W)
  val idx = UInt(idxBits.W)
  val pad = UInt(2.W)//TODO

  /** Reinterprets `x`, first coerced to `AddrBits` bits, as this bundle. */
  def fromUInt(x: UInt) = x.asTypeOf(UInt(AddrBits.W)).asTypeOf(this)
  /** Tag field of address `x`. */
  def getTag(x: UInt) = fromUInt(x).tag
  /** Index field of address `x`. */
  def getIdx(x: UInt) = fromUInt(x).idx
}

/** Two-bit encoding of the kind of control-flow instruction recorded in a
  * BTB entry.
  */
object BTBtype {
  /** Hardware type used for a BTB-type field: a 2-bit unsigned integer. */
  def apply(): UInt = UInt(2.W)

  def B: UInt = "b00".U  // branch
  def J: UInt = "b01".U  // jump
  def I: UInt = "b10".U  // indirect
  def R: UInt = "b11".U  // return
}

/** Branch-resolution feedback consumed by [[BPU1]] to train its BTB, PHT
  * and RAS.
  */
class BPUUpdateReq extends NOOPBundle {
  // Qualifies every other field; BPU1 ignores the request when low.
  val valid = Output(Bool())
  // PC of the resolved instruction; BPU1 derives the BTB/PHT index and tag from it.
  val pc = Output(UInt(AddrBits.W))
  // BPU1 writes the BTB only when this is set together with `valid`.
  val isMissPredict = Output(Bool())
  // Stored as the `target` of the BTB entry written by BPU1.
  val actualTarget = Output(UInt(AddrBits.W))
  val actualTaken = Output(Bool())  // for branch
  // BPU1 compares this against ALUOpType branch/call/ret encodings.
  val fuOpType = Output(FuOpType())
  // Stored as the `_type` of the BTB entry written by BPU1.
  val btbType = Output(BTBtype())
  val isRVC = Output(Bool()) // for ras, save PC+2 to stack if is RVC
}

/** First-stage branch predictor, looked up with the fetch PC.
  *
  * Combines three structures:
  *  - a BTB of `NRbtb` (512) entries held in a single-port `SRAMTemplate`,
  *    providing the target, the branch type and `brIdx`;
  *  - a PHT of `NRbtb` 2-bit counters whose upper bit is the taken
  *    prediction for `BTBtype.B` entries;
  *  - a RAS of `NRras` (16) entries that supplies the target for
  *    `BTBtype.R` entries.
  *
  * Training data arrives as a [[BPUUpdateReq]] through the BoringUtils sink
  * "bpuUpdateReq".
  */
class BPU1 extends NOOPModule {
  val io = IO(new Bundle {
    // Fetch PC to look up; the tables are read when `pc.valid` is high.
    val in = new Bundle { val pc = Flipped(Valid((UInt(AddrBits.W)))) }
    // Predicted redirect (`valid` and `target` are driven at the end of this module).
    val out = new RedirectIO
    val flush = Input(Bool())
    // brIdx of the hitting BTB entry, masked by the prediction result.
    val brIdx = Output(UInt(3.W))
    // High when the hitting BTB entry has brIdx(2) set.
    val lateJump = Output(Bool())
  })

  // NOTE(review): presumably stays high from io.flush until the next valid pc;
  // confirm against utils.BoolStopWatch.
  val flush = BoolStopWatch(io.flush, io.in.pc.valid, startHighPriority = true)

  // BTB
  val NRbtb = 512
  val btbAddr = new TableAddr(log2Up(NRbtb))
  def btbEntry() = new Bundle {
    val tag = UInt(btbAddr.tagBits.W)
    val _type = UInt(2.W)
    val target = UInt(AddrBits.W)
    val brIdx = UInt(3.W)
    val valid = Bool()
  }

  val btb = Module(new SRAMTemplate(btbEntry(), set = NRbtb, shouldReset = true, holdRead = true, singlePort = true))
  // flush BTB when executing fence.i
  val flushBTB = WireInit(false.B)
  val flushTLB = WireInit(false.B)
  BoringUtils.addSink(flushBTB, "MOUFlushICache")
  BoringUtils.addSink(flushTLB, "MOUFlushTLB")
  // Either flush signal re-runs the SRAM's reset, in addition to the module reset.
  btb.reset := reset.asBool || (flushBTB || flushTLB)

  Debug() {
    when (reset.asBool || (flushBTB || flushTLB)) {
      printf("[BPU-RESET] %d bpu-reset flushBTB:%d flushTLB:%d\n", GTimer(), flushBTB, flushTLB)
    }
  }

  btb.io.r.req.valid := io.in.pc.valid
  btb.io.r.req.bits.setIdx := btbAddr.getIdx(io.in.pc.bits)

  Debug() {
    when (btb.io.r.req.valid) {
      printf("[BTB-read] %d pc:%x setIdx:%x\n", GTimer(), io.in.pc.bits, btbAddr.getIdx(io.in.pc.bits))
    }
  }

  val btbRead = Wire(btbEntry())
  btbRead := btb.io.r.resp.data(0)
  // since there is one cycle latency to read SyncReadMem,
  // we should latch the input pc for one cycle
  val pcLatch = RegEnable(io.in.pc.bits, io.in.pc.valid)
  val btbHit = btbRead.tag === btbAddr.getTag(pcLatch) && !flush && RegNext(btb.io.r.req.ready, init = false.B) && !(pcLatch(1) && btbRead.brIdx(0)) && btbRead.valid
  // btbHit will ignore pc(1,0). pc(1,0) is used to build brIdx
  // !(pcLatch(1) && btbRead.brIdx(0)) is used to deal with the following case:
  // -------------------------------------------------
  // 0 jump rvc // marked as "take branch" in BTB
  // 2 xxx  rvc <-- jump to here
  // -------------------------------------------------
  val lateJump = btbRead.brIdx(2) && btbHit
  io.lateJump := lateJump
  // val lateJumpLatch = RegNext(lateJump)
  // val lateJumpTarget = RegEnable(btbRead.target, lateJump)

  // PHT
  val pht = Mem(NRbtb, UInt(2.W))
  // Bit 1 of the 2-bit counter is the taken/not-taken prediction.
  val phtTaken = RegEnable(pht.read(btbAddr.getIdx(io.in.pc.bits))(1), io.in.pc.valid)

  // RAS

  val NRras = 16
  val ras = Mem(NRras, UInt(AddrBits.W))
  // val raBrIdxs = Mem(NRras, UInt(2.W))
  val sp = Counter(NRras)
  val rasTarget = RegEnable(ras.read(sp.value), io.in.pc.valid)
  // val rasBrIdx = RegEnable(raBrIdxs.read(sp.value), io.in.pc.valid)

  // update
  val req = WireInit(0.U.asTypeOf(new BPUUpdateReq))
  val btbWrite = WireInit(0.U.asTypeOf(btbEntry()))
  BoringUtils.addSink(req, "bpuUpdateReq")

  // This debug block reads rasTarget, pht, phtTaken, req and btbWrite, so it
  // must come after their definitions: vals in a class body are initialised
  // in textual order, and an earlier position would see them as null.
  Debug(){
    //printf("[BTBHT] lateJump %x lateJumpLatch %x lateJumpTarget %x\n", lateJump, lateJumpLatch, lateJumpTarget)
    when(btbHit){
      printf("[BTBHT1] %d pc=%x tag=%x,%x index=%x bridx=%x tgt=%x,%x flush %x type:%x\n", GTimer(), pcLatch, btbRead.tag, btbAddr.getTag(pcLatch), btbAddr.getIdx(pcLatch), btbRead.brIdx, btbRead.target, io.out.target, flush,btbRead._type)
      printf("[BTBHT2] btbRead.brIdx %x mask %x\n", btbRead.brIdx, Cat(lateJump, Fill(2, io.out.valid)))
      printf(p"[BTBHT3] rasTarget:${rasTarget} pht:${pht} phtTaken:${phtTaken}\n")
      printf(p"[BTBHT4] io.out:${io.out} btbRead:${btbRead} btbWrite:${btbWrite}\n")
      printf("[BTBHT5] btbReqValid:%d btbReqSetIdx:%x\n",btb.io.r.req.valid, btb.io.r.req.bits.setIdx)
    }

    when(true.B) {
      //when(req.btbType === BTBtype.R) {
      //  printf("[BTBHT5] btbWrite.type is BTBtype.R/RET!!!\n")
      //}
      printf(p"[BTBHT5] req:${req} \n")

      //printf("[BTBHT5] tag: target:%x type:%d brIdx:%d\n", req.actualTarget, req.btbType, Cat(req.pc(2,0)==="h6".U && !req.isRVC, req.pc(1), ~req.pc(1)))
    }

    when (true.B) {
      printf("[BTB-read2] %d btbValid:%x pc:%x tag:%x target:%x brIdx:%x\n", GTimer(), btbRead.valid, io.in.pc.bits, btbRead.tag, btbRead.target, btbRead.brIdx)
    }
  }

  Debug(){
    when(req.valid){
      printf("[BTBUP] pc=%x tag=%x index=%x bridx=%x tgt=%x type=%x\n", req.pc, btbAddr.getTag(req.pc), btbAddr.getIdx(req.pc), Cat(req.pc(1), ~req.pc(1)), req.actualTarget, req.btbType)
    }
  }

    //val fflag = req.btbType===3.U && btb.io.w.req.valid && btb.io.w.req.bits.setIdx==="hc9".U
    //when(fflag && GTimer()>2888000.U) {
    //  printf("%d\n", GTimer())
    //  printf("[BTBHT6] btbWrite.type is BTBtype.R/RET!!! Inpc:%x btbWrite.brIdx:%x setIdx:%x\n", io.in.pc.bits, btbWrite.brIdx, btb.io.w.req.bits.setIdx)
    //  printf("[BTBHT6] tag:%x target:%x _type:%x bridx:%x\n", btbWrite.tag,btbWrite.target,btbWrite._type,btbWrite.brIdx)
    //  printf(p"[BTBHT6] req:${req} \n")
    //}
    //printf("[BTBHT5] tag: target:%x type:%d brIdx:%d\n", req.actualTarget, req.btbType, Cat(req.pc(2,0)==="h6".U && !req.isRVC, req.pc(1), ~req.pc(1)))

  btbWrite.tag := btbAddr.getTag(req.pc)
  btbWrite.target := req.actualTarget
  btbWrite._type := req.btbType
  // brIdx(2): non-RVC instruction whose pc(2,0) is 6; brIdx(1): pc(1); brIdx(0): !pc(1).
  btbWrite.brIdx := Cat(req.pc(2,0)==="h6".U && !req.isRVC, req.pc(1), ~req.pc(1))
  btbWrite.valid := true.B
  // NOTE: We only update BTB at a miss prediction.
  // If a miss prediction is found, the pipeline will be flushed
  // in the next cycle. Therefore it is safe to use single-port
  // SRAM to implement BTB, since write requests have higher priority
  // than read request. Again, since the pipeline will be flushed
  // in the next cycle, the read request will be useless.
  btb.io.w.req.valid := req.isMissPredict && req.valid
  btb.io.w.req.bits.setIdx := btbAddr.getIdx(req.pc)
  btb.io.w.req.bits.data := btbWrite

  // NOTE(review): the two printfs below are not wrapped in Debug() (the guard
  // is commented out), so they are emitted regardless of the debug setting.
  // They look like leftover ad-hoc tracing for one address / time window;
  // confirm whether they should be guarded or removed.
  //Debug(true) {
    when (btb.io.w.req.valid && btbWrite.tag === btbAddr.getTag("hffffffff803541a4".U)) {
      printf("[BTBWrite] %d setIdx:%x req.valid:%d pc:%x target:%x bridx:%x\n", GTimer(), btbAddr.getIdx(req.pc), req.valid, req.pc, req.actualTarget, btbWrite.brIdx)
    }
  //}

  when (GTimer() > 77437484.U) {
    printf("[BTBWrite-ALL] %d setIdx:%x req.valid:%d pc:%x target:%x bridx:%x\n", GTimer(), btbAddr.getIdx(req.pc), req.valid, req.pc, req.actualTarget, btbWrite.brIdx)
  }

  // The counter value and the request are both registered, so the PHT is
  // updated one cycle after the request is seen.
  val cnt = RegNext(pht.read(btbAddr.getIdx(req.pc)))
  val reqLatch = RegNext(req)
  when (reqLatch.valid && ALUOpType.isBranch(reqLatch.fuOpType)) {
    val taken = reqLatch.actualTaken
    val newCnt = Mux(taken, cnt + 1.U, cnt - 1.U)
    // Saturate: no write when already at "b11" (taken) or "b00" (not taken).
    val wen = (taken && (cnt =/= "b11".U)) || (!taken && (cnt =/= "b00".U))
    when (wen) {
      pht.write(btbAddr.getIdx(reqLatch.pc), newCnt)
      //Debug(){
        //printf("BPUPDATE: pc %x cnt %x\n", reqLatch.pc, newCnt)
      //}
    }
  }
  when (req.valid) {
    when (req.fuOpType === ALUOpType.call)  {
      // Push the return address: pc + 2 for a compressed call, pc + 4 otherwise.
      ras.write(sp.value + 1.U, Mux(req.isRVC, req.pc + 2.U, req.pc + 4.U))
      // raBrIdxs.write(sp.value + 1.U, Mux(req.pc(1), 2.U, 1.U))
      sp.value := sp.value + 1.U
    }
    .elsewhen (req.fuOpType === ALUOpType.ret) {
      when(sp.value === 0.U) {
        //printf("ATTTTT: sp.value is 0.U\n") //TODO: sp.value may equal to 0.U
      }
      // Pop, holding the stack pointer at 0 instead of wrapping.
      sp.value := Mux(sp.value===0.U, 0.U, sp.value - 1.U) //TODO: sp.value may less than 0.U
    }
  }

  io.out.target := Mux(btbRead._type === BTBtype.R, rasTarget, btbRead.target)
  // io.out.target := Mux(lateJumpLatch && !flush, lateJumpTarget, Mux(btbRead._type === BTBtype.R, rasTarget, btbRead.target))
  // io.out.brIdx  := btbRead.brIdx & Fill(3, io.out.valid)
  io.brIdx  := btbRead.brIdx & Cat(true.B, lateJump, Fill(2, io.out.valid))
  // NOTE(review): the `rasTarget=/=0.U` term applies to every non-branch type
  // (J, I and R), not only to returns; see the TODO on the line below.
  io.out.valid := btbHit && Mux(btbRead._type === BTBtype.B, phtTaken, true.B && rasTarget=/=0.U) //TODO: add rasTarget=/=0.U, need fix
  // io.out.valid := btbHit && Mux(btbRead._type === BTBtype.B, phtTaken, true.B) && !lateJump || lateJumpLatch && !flush && !lateJump
  // Note:
  // btbHit && Mux(btbRead._type === BTBtype.B, phtTaken, true.B) && !lateJump : normal branch predict
  // lateJumpLatch && !flush && !lateJump : cross line branch predict, bpu will require imem to fetch the next 16bit of current inst in next instline
  // `&& !lateJump` is used to make sure this logic will run correctly when imem stalls (pcUpdate === false)
  // by using `instline`, we mean a 64 bit instfetch result from imem
  // ROCKET uses a 32 bit instline, and its IDU logic is more simple than this implementation.
}

/** Second-stage static predictor that works from the fetched instruction bits.
  *
  * JAL is always predicted taken. The six conditional branches are predicted
  * taken when `instr(31)` is set, i.e. when the top bit of the B-type
  * immediate is 1 (a backward branch, assuming `SignExt` sign-extends).
  * Any other instruction is never predicted.
  */
class BPU2 extends NOOPModule {
  val io = IO(new Bundle {
    val in = Flipped(Valid(new CtrlFlowIO))
    val out = new RedirectIO
  })

  val instr = io.in.bits.instr
  // J-type and B-type immediates reassembled from the instruction, with the
  // implicit low zero bit appended, then extended to XLEN.
  val immJ = SignExt(Cat(instr(31), instr(19, 12), instr(20), instr(30, 21), 0.U(1.W)), XLEN)
  val immB = SignExt(Cat(instr(31), instr(7), instr(30, 25), instr(11, 8), 0.U(1.W)), XLEN)
  // Per instruction: (offset to add to the pc, predicted-taken flag).
  val table = Array(
    RV32I_BRUInstr.JAL  -> List(immJ, true.B),
    RV32I_BRUInstr.BNE  -> List(immB, instr(31)),
    RV32I_BRUInstr.BEQ  -> List(immB, instr(31)),
    RV32I_BRUInstr.BLT  -> List(immB, instr(31)),
    RV32I_BRUInstr.BGE  -> List(immB, instr(31)),
    RV32I_BRUInstr.BLTU -> List(immB, instr(31)),
    RV32I_BRUInstr.BGEU -> List(immB, instr(31))
  )
  // Instructions not in the table fall back to "not taken".
  val default = List(immB, false.B)
  val offset :: predict :: Nil = ListLookup(instr, default, table)

  io.out.target := io.in.bits.pc + offset
  io.out.valid := io.in.valid && predict(0)
}