From 78105e83748a88b101c34805b2f51c1f51690391 Mon Sep 17 00:00:00 2001 From: zoujr <18870680299@163.com> Date: Wed, 4 Nov 2020 15:25:32 +0800 Subject: [PATCH] LoopBuffer: Separate the LoopBuffer and the IBuffer --- src/main/scala/xiangshan/backend/fu/CSR.scala | 5 +- .../scala/xiangshan/frontend/Frontend.scala | 12 +- src/main/scala/xiangshan/frontend/IFU.scala | 81 +- .../scala/xiangshan/frontend/Ibuffer.scala | 14 +- .../scala/xiangshan/frontend/LoopBuffer.scala | 751 +++++++++--------- .../xiangshan/frontend/NewLoopBuffer.scala | 252 ++++++ .../scala/xiangshan/testutils/AddSinks.scala | 5 +- 7 files changed, 711 insertions(+), 409 deletions(-) create mode 100644 src/main/scala/xiangshan/frontend/NewLoopBuffer.scala diff --git a/src/main/scala/xiangshan/backend/fu/CSR.scala b/src/main/scala/xiangshan/backend/fu/CSR.scala index 19987f6ab..68303e751 100644 --- a/src/main/scala/xiangshan/backend/fu/CSR.scala +++ b/src/main/scala/xiangshan/backend/fu/CSR.scala @@ -830,7 +830,10 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ "ICacheReq" -> (0xb28, "perfCntIcacheReqCnt" ), "ICacheMiss" -> (0xb29, "perfCntIcacheMissCnt" ), "FetchFromICache" -> (0xb2a, "CntFetchFromICache"), - "FetchFromLoopBuffer" -> (0xb2b, "CntFetchFromLoopBuffer") + "FetchFromLoopBuffer" -> (0xb2b, "CntFetchFromLoopBuffer"), + "ExitLoop1" -> (0xb2c, "CntExitLoop1"), + "ExitLoop2" -> (0xb2d, "CntExitLoop2"), + "ExitLoop3" -> (0xb2e, "CntExitLoop3") // "Custom1" -> (0xb1b, "Custom1" ), // "Custom2" -> (0xb1c, "Custom2" ), // "Custom3" -> (0xb1d, "Custom3" ), diff --git a/src/main/scala/xiangshan/frontend/Frontend.scala b/src/main/scala/xiangshan/frontend/Frontend.scala index 5249c1645..0f398b47c 100644 --- a/src/main/scala/xiangshan/frontend/Frontend.scala +++ b/src/main/scala/xiangshan/frontend/Frontend.scala @@ -18,8 +18,7 @@ class Frontend extends XSModule { }) val ifu = Module(new IFU) - val ibuffer = if(EnableLB) Module(new LoopBuffer) else Module(new Ibuffer) - // val ibuffer = 
Module(new LoopBuffer) + val ibuffer = Module(new Ibuffer) val needFlush = io.backend.redirect.valid @@ -41,16 +40,9 @@ class Frontend extends XSModule { //ibuffer ibuffer.io.in <> ifu.io.fetchPacket ibuffer.io.flush := needFlush - ifu.io.loopBufPar <> ibuffer.io.loopBufPar io.backend.cfVec <> ibuffer.io.out - - for(out <- ibuffer.io.out){ - XSInfo(out.fire(), - p"inst:${Hexadecimal(out.bits.instr)} pc:${Hexadecimal(out.bits.pc)}\n" - ) - } // for(out <- ibuffer.io.out){ // XSInfo(out.fire(), // p"inst:${Hexadecimal(out.bits.instr)} pc:${Hexadecimal(out.bits.pc)}\n" @@ -58,4 +50,4 @@ class Frontend extends XSModule { // } -} +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/IFU.scala b/src/main/scala/xiangshan/frontend/IFU.scala index 868036352..2a7142ba8 100644 --- a/src/main/scala/xiangshan/frontend/IFU.scala +++ b/src/main/scala/xiangshan/frontend/IFU.scala @@ -40,7 +40,7 @@ class IFUIO extends XSBundle val icacheReq = DecoupledIO(new ICacheReq) val icacheResp = Flipped(DecoupledIO(new ICacheResp)) val icacheFlush = Output(UInt(2.W)) - val loopBufPar = Flipped(new LoopBufferParameters) + // val loopBufPar = Flipped(new LoopBufferParameters) } class IFU extends XSModule with HasIFUConst @@ -48,22 +48,27 @@ class IFU extends XSModule with HasIFUConst val io = IO(new IFUIO) val bpu = BPU(EnableBPU) val pd = Module(new PreDecode) + val loopBuffer = Module(new NewLoopBuffer) val if2_redirect, if3_redirect, if4_redirect = WireInit(false.B) val if1_flush, if2_flush, if3_flush, if4_flush = WireInit(false.B) - val icacheResp = WireInit(Mux(io.loopBufPar.inLoop, io.loopBufPar.LBResp, io.icacheResp.bits)) + val loopBufPar = loopBuffer.io.loopBufPar + val inLoop = WireInit(loopBuffer.io.out.valid) + val icacheResp = WireInit(Mux(inLoop, loopBuffer.io.out.bits, io.icacheResp.bits)) - if4_flush := io.redirect.valid || io.loopBufPar.LBredirect.valid + if4_flush := io.redirect.valid || loopBufPar.LBredirect.valid if3_flush := if4_flush || 
if4_redirect if2_flush := if3_flush || if3_redirect if1_flush := if2_flush || if2_redirect + loopBuffer.io.flush := io.redirect.valid + //********************** IF1 ****************************// val if1_valid = !reset.asBool && GTimer() > 500.U val if1_npc = WireInit(0.U(VAddrBits.W)) val if2_ready = WireInit(false.B) - val if1_fire = if1_valid && (if2_ready || if1_flush) && (io.loopBufPar.inLoop || io.icacheReq.ready) + val if1_fire = if1_valid && (if2_ready || if1_flush) && (inLoop || io.icacheReq.ready) val if1_histPtr, if2_histPtr, if3_histPtr, if4_histPtr = Wire(UInt(log2Up(ExtHistoryLength).W)) @@ -133,7 +138,7 @@ class IFU extends XSModule with HasIFUConst //********************** IF3 ****************************// val if3_valid = RegEnable(next = if2_valid, init = false.B, enable = if2_fire) val if4_ready = WireInit(false.B) - val if3_fire = if3_valid && if4_ready && (io.loopBufPar.inLoop || io.icacheResp.valid) && !if3_flush + val if3_fire = if3_valid && if4_ready && (inLoop || io.icacheResp.valid) && !if3_flush val if3_pc = RegEnable(if2_pc, if2_fire) val if3_GHInfo = RegEnable(if2_realGHInfo, if2_fire) val if3_predHistPtr = RegEnable(if2_predHistPtr, enable=if2_fire) @@ -345,15 +350,15 @@ class IFU extends XSModule with HasIFUConst } } - when (io.loopBufPar.LBredirect.valid) { - if1_npc := io.loopBufPar.LBredirect.bits + when (loopBufPar.LBredirect.valid) { + if1_npc := loopBufPar.LBredirect.bits } when (io.redirect.valid) { if1_npc := io.redirect.bits.target } - when(io.loopBufPar.inLoop) { + when(inLoop) { io.icacheReq.valid := if2_flush }.otherwise { io.icacheReq.valid := if1_valid && if2_ready @@ -373,8 +378,7 @@ class IFU extends XSModule with HasIFUConst // io.loopBufPar.LBReq := snpc(if4_pc) // XSDebug(p"snpc(if4_pc)=${Hexadecimal(snpc(if4_pc))}\n") // } - io.loopBufPar.LBReq := if3_pc - io.loopBufPar.tgtpc := if4_bp.target + loopBufPar.fetchReq := if3_pc io.icacheReq.bits.mask := mask(if1_npc) @@ -389,7 +393,7 @@ class IFU extends XSModule with 
HasIFUConst // bpu.io.flush := Cat(if4_flush, if3_flush, if2_flush) bpu.io.flush := VecInit(if2_flush, if3_flush, if4_flush) - bpu.io.cacheValid := (io.loopBufPar.inLoop || io.icacheResp.valid) + bpu.io.cacheValid := (inLoop || io.icacheResp.valid) bpu.io.in.valid := if1_fire bpu.io.in.bits.pc := if1_npc bpu.io.in.bits.hist := hist.asUInt @@ -404,16 +408,15 @@ class IFU extends XSModule with HasIFUConst bpu.io.predecode.bits.isFetchpcEqualFirstpc := if4_pc === if4_pd.pc(0) bpu.io.branchInfo.ready := if4_fire - when(io.loopBufPar.inLoop) { - pd.io.in := io.loopBufPar.LBResp - pd.io.in.mask := io.loopBufPar.LBResp.mask & mask(io.loopBufPar.LBResp.pc) - XSDebug("Fetch from LB\n") - XSDebug(p"pc=${Hexadecimal(io.loopBufPar.LBResp.pc)}\n") - XSDebug(p"data=${Hexadecimal(io.loopBufPar.LBResp.data)}\n") - XSDebug(p"mask=${Hexadecimal(io.loopBufPar.LBResp.mask)}\n") - }.otherwise { - pd.io.in := icacheResp + pd.io.in := icacheResp + when(inLoop) { + pd.io.in.mask := loopBuffer.io.out.bits.mask & mask(loopBuffer.io.out.bits.pc) + // XSDebug("Fetch from LB\n") + // XSDebug(p"pc=${Hexadecimal(io.loopBufPar.LBResp.pc)}\n") + // XSDebug(p"data=${Hexadecimal(io.loopBufPar.LBResp.data)}\n") + // XSDebug(p"mask=${Hexadecimal(io.loopBufPar.LBResp.mask)}\n") } + pd.io.prev.valid := if3_hasPrevHalfInstr pd.io.prev.bits := prevHalfInstr.instr // if a fetch packet triggers page fault, set the pf instruction to nop @@ -432,27 +435,35 @@ class IFU extends XSModule with HasIFUConst //Performance Counter if (!env.FPGAPlatform ) { - ExcitingUtils.addSource(io.fetchPacket.fire && !io.loopBufPar.inLoop, "CntFetchFromICache", Perf) - ExcitingUtils.addSource(io.fetchPacket.fire && io.loopBufPar.inLoop, "CntFetchFromLoopBuffer", Perf) + ExcitingUtils.addSource(io.fetchPacket.fire && !inLoop, "CntFetchFromICache", Perf) + ExcitingUtils.addSource(io.fetchPacket.fire && inLoop, "CntFetchFromLoopBuffer", Perf) } - io.fetchPacket.valid := if4_valid && !io.redirect.valid - io.fetchPacket.bits.instrs 
:= if4_pd.instrs - io.fetchPacket.bits.mask := if4_pd.mask & (Fill(PredictWidth, !if4_bp.taken) | (Fill(PredictWidth, 1.U(1.W)) >> (~if4_bp.jmpIdx))) - io.fetchPacket.bits.pc := if4_pd.pc - (0 until PredictWidth).foreach(i => io.fetchPacket.bits.pnpc(i) := if4_pd.pc(i) + Mux(if4_pd.pd(i).isRVC, 2.U, 4.U)) + val fetchPacketValid = if4_valid && !io.redirect.valid + val fetchPacketWire = Wire(new FetchPacket) + + // io.fetchPacket.valid := if4_valid && !io.redirect.valid + fetchPacketWire.instrs := if4_pd.instrs + fetchPacketWire.mask := if4_pd.mask & (Fill(PredictWidth, !if4_bp.taken) | (Fill(PredictWidth, 1.U(1.W)) >> (~if4_bp.jmpIdx))) + fetchPacketWire.pc := if4_pd.pc + (0 until PredictWidth).foreach(i => fetchPacketWire.pnpc(i) := if4_pd.pc(i) + Mux(if4_pd.pd(i).isRVC, 2.U, 4.U)) when (if4_bp.taken) { - io.fetchPacket.bits.pnpc(if4_bp.jmpIdx) := if4_bp.target + fetchPacketWire.pnpc(if4_bp.jmpIdx) := if4_bp.target } - io.fetchPacket.bits.brInfo := bpu.io.branchInfo.bits - (0 until PredictWidth).foreach(i => io.fetchPacket.bits.brInfo(i).histPtr := finalPredHistPtr) - (0 until PredictWidth).foreach(i => io.fetchPacket.bits.brInfo(i).predHistPtr := if4_predHistPtr) - io.fetchPacket.bits.pd := if4_pd.pd - io.fetchPacket.bits.ipf := if4_ipf - io.fetchPacket.bits.crossPageIPFFix := if4_crossPageIPF + fetchPacketWire.brInfo := bpu.io.branchInfo.bits + (0 until PredictWidth).foreach(i => fetchPacketWire.brInfo(i).histPtr := finalPredHistPtr) + (0 until PredictWidth).foreach(i => fetchPacketWire.brInfo(i).predHistPtr := if4_predHistPtr) + fetchPacketWire.pd := if4_pd.pd + fetchPacketWire.ipf := if4_ipf + fetchPacketWire.crossPageIPFFix := if4_crossPageIPF // predTaken Vec - io.fetchPacket.bits.predTaken := if4_bp.taken + fetchPacketWire.predTaken := if4_bp.taken + + loopBuffer.io.in.bits := fetchPacketWire + io.fetchPacket.bits := fetchPacketWire + io.fetchPacket.valid := fetchPacketValid + loopBuffer.io.in.valid := io.fetchPacket.fire // debug info if (IFUDebug) { diff 
--git a/src/main/scala/xiangshan/frontend/Ibuffer.scala b/src/main/scala/xiangshan/frontend/Ibuffer.scala index 81b4525d9..143756f13 100644 --- a/src/main/scala/xiangshan/frontend/Ibuffer.scala +++ b/src/main/scala/xiangshan/frontend/Ibuffer.scala @@ -7,8 +7,14 @@ import xiangshan._ import utils._ import xiangshan.backend.fu.HasExceptionNO +class IBufferIO extends XSBundle { + val flush = Input(Bool()) + val in = Flipped(DecoupledIO(new FetchPacket)) + val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) +} + class Ibuffer extends XSModule { - val io = IO(new LoopBufferIO) + val io = IO(new IBufferIO) class IBufEntry extends XSBundle { val inst = UInt(32.W) @@ -21,9 +27,9 @@ class Ibuffer extends XSModule { } // Ignore - io.loopBufPar <> DontCare - io.loopBufPar.LBredirect.valid := false.B - io.loopBufPar.inLoop := false.B + // io.loopBufPar <> DontCare + // io.loopBufPar.LBredirect.valid := false.B + // io.loopBufPar.inLoop := false.B for(out <- io.out) { diff --git a/src/main/scala/xiangshan/frontend/LoopBuffer.scala b/src/main/scala/xiangshan/frontend/LoopBuffer.scala index f7da8ddf6..652da825e 100644 --- a/src/main/scala/xiangshan/frontend/LoopBuffer.scala +++ b/src/main/scala/xiangshan/frontend/LoopBuffer.scala @@ -1,358 +1,393 @@ -package xiangshan.frontend - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import chisel3.ExcitingUtils._ -import utils._ -import xiangshan._ -import xiangshan.cache._ - -class LoopBufferParameters extends XSBundle { - val LBredirect = ValidIO(UInt(VAddrBits.W)) - val tgtpc = Input(UInt(VAddrBits.W)) - val inLoop = Output(Bool()) - val LBReq = Input(UInt(VAddrBits.W)) - val LBResp = Output(new ICacheResp) -} - -class LoopBufferIO extends XSBundle { - val flush = Input(Bool()) - val in = Flipped(DecoupledIO(new FetchPacket)) - val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) - val loopBufPar = new LoopBufferParameters -} - -class LoopBuffer extends XSModule { - val io = IO(new 
LoopBufferIO) - - class IBufEntry extends XSBundle { - val inst = UInt(32.W) - val pc = UInt(VAddrBits.W) - val pnpc = UInt(VAddrBits.W) - val brInfo = new BranchInfo - val pd = new PreDecodeInfo - val ipf = Bool() - val crossPageIPFFix = Bool() - } - - class LBufEntry extends XSBundle { - val inst = UInt(16.W) - } - - // ignore - for(i <- 0 until DecodeWidth) { - // io.out(i).bits.exceptionVec := DontCare - io.out(i).bits.intrVec := DontCare - // io.out(i).bits.crossPageIPFFix := DontCare - } - - def sbbOffest(inst: UInt): UInt = { - val isJal = inst === BitPat("b1111_???????_111111111_?????_1101111") - val isCon = inst === BitPat("b1111???_?????_?????_???_????1_1100011") - val isRVCJal = inst === BitPat("b????????????????_001_1?111??????_01") - val isRVCCon = inst === BitPat("b????????????????_11?_1??_???_?????_01") - - val rst = PriorityMux(Seq( - isJal -> inst(27, 21), - isCon -> Cat(inst(27,25), inst(11,8)), - isRVCJal -> Cat(inst(6), inst(7), inst(2), inst(11), inst(5,3)), - isRVCCon -> Cat(inst(6), inst(5), inst(2), inst(11,10), inst(4,3)), - true.B -> 0.U(7.W) - )) - - (~rst).asUInt + 1.U - } - - def isSBB(inst: UInt): Bool = { - val sbbOffestWire = WireInit(sbbOffest(inst)) - sbbOffestWire > 0.U && sbbOffestWire <= 112.U // TODO < 56.U - } - - // predTaken to OH - val predTakenVec = Mux(io.in.bits.predTaken, Reverse(PriorityEncoderOH(Reverse(io.in.bits.mask))), 0.U(PredictWidth.W)) - - // Loop detect register - val offsetCounter = Reg(UInt((log2Up(IBufSize)+2).W)) - val tsbbPC = RegInit(0.U(VAddrBits.W)) - - val brTaken = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && predTakenVec(i))).orR() - val brIdx = OHToUInt(predTakenVec.asUInt) - val sbbTaken = brTaken && isSBB(io.in.bits.instrs(brIdx)) - - val tsbbVec = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && io.in.bits.pc(i) === tsbbPC)) - val hasTsbb = tsbbVec.orR() - val tsbbIdx = OHToUInt(Reverse(tsbbVec)) - val tsbbTaken = brTaken && io.in.bits.pc(brIdx) 
=== tsbbPC - - // IBuffer define - val ibuf = Mem(IBufSize, new IBufEntry) - val ibufValid = RegInit(VecInit(Seq.fill(IBufSize)(false.B))) - val headPtr = RegInit(0.U(log2Up(IBufSize).W)) - val tailPtr = RegInit(0.U(log2Up(IBufSize).W)) - - // val enqValid = !io.flush && !ibufValid(tailPtr + PopCount(io.in.bits.mask) - 1.U) - val enqValid = !io.flush && (io.in.bits.mask === 0.U || !ibufValid(tailPtr + PopCount(io.in.bits.mask) - 1.U)) - val deqValid = !io.flush && ibufValid(headPtr) - - // LoopBuffer define - val lbuf = Mem(IBufSize*2, new LBufEntry) - val lbufValid = RegInit(VecInit(Seq.fill(IBufSize*2)(false.B))) - - // FSM state define - val s_idle :: s_fill :: s_active :: Nil = Enum(3) - val LBstate = RegInit(s_idle) - - io.loopBufPar.inLoop := LBstate === s_active - - def flushLB() = { - for(i <- 0 until IBufSize*2) { - lbuf(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted - lbufValid(i) := false.B - } - } - - def flushIB() = { - for(i <- 0 until IBufSize) { - ibuf(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted - ibuf(i).pc := 0.U // TODO: This is to make the debugging information clearer, this can be deleted - lbuf(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted - ibufValid(i) := false.B - } - headPtr := 0.U - tailPtr := 0.U - } - - def flush() = { - XSDebug("Loop Buffer Flushed.\n") - LBstate := s_idle - flushLB - flushIB - } - - io.loopBufPar.LBredirect.valid := false.B - io.loopBufPar.LBredirect.bits := DontCare - - /*---------------*/ - /* Dequeue */ - /*---------------*/ - var deq_idx = WireInit(headPtr) - - when(deqValid) { - for(i <- 0 until DecodeWidth) { - var outWire = WireInit(ibuf(deq_idx)) - - io.out(i).valid := ibufValid(deq_idx) - when(ibufValid(deq_idx)) { ibufValid(deq_idx) := !io.out(i).fire } - io.out(i).bits.instr := outWire.inst - - io.out(i).bits.pc := outWire.pc - io.out(i).bits.exceptionVec := 
0.U.asTypeOf(Vec(16, Bool())) - io.out(i).bits.exceptionVec(instrPageFault) := outWire.ipf - - io.out(i).bits.brUpdate := DontCare - io.out(i).bits.brUpdate.pc := outWire.pc - io.out(i).bits.brUpdate.pnpc := outWire.pnpc - io.out(i).bits.brUpdate.pd := outWire.pd - io.out(i).bits.brUpdate.brInfo := outWire.brInfo - io.out(i).bits.crossPageIPFFix := outWire.crossPageIPFFix - - deq_idx = deq_idx + io.out(i).fire - } - headPtr := deq_idx - }.otherwise { - io.out.foreach(_.valid := false.B) - io.out.foreach(_.bits <> DontCare) - } - - /*---------------*/ - /* Enqueue */ - /*---------------*/ - io.in.ready := enqValid - - var enq_idx = WireInit(tailPtr) - // ExcitingUtils.addSource(io.in.fire && LBstate =/= s_active, "CntFetchFromICache", Perf) - // ExcitingUtils.addSource(io.in.fire && LBstate === s_active, "CntFetchFromLoopBuffer", Perf) - when(io.in.fire) { - for(i <- 0 until PredictWidth) { - var inWire = Wire(new IBufEntry) - inWire := DontCare - - when(io.in.bits.mask(i)) { - inWire.inst := io.in.bits.instrs(i) - when(LBstate === s_fill/* || (sbbTaken && i.U > brIdx)*/) { - lbuf(io.in.bits.pc(i)(7,1)).inst := io.in.bits.instrs(i)(15, 0) - // lbuf(io.in.bits.pc(i)(7,1)).pd := io.in.bits.pd(i) - lbufValid(io.in.bits.pc(i)(7,1)) := true.B - when(!io.in.bits.pd(i).isRVC) { - lbuf(io.in.bits.pc(i)(7,1) + 1.U).inst := io.in.bits.instrs(i)(31, 16) - lbufValid(io.in.bits.pc(i)(7,1) + 1.U) := true.B - } - } - inWire.pc := io.in.bits.pc(i) - inWire.pnpc := io.in.bits.pnpc(i) - inWire.brInfo := io.in.bits.brInfo(i) - inWire.pd := io.in.bits.pd(i) - inWire.ipf := io.in.bits.ipf - inWire.crossPageIPFFix := io.in.bits.crossPageIPFFix - - // ibufValid(enq_idx) := Mux(LBstate =/= s_active, true.B, !(hasTsbb && !tsbbTaken && i.U > tsbbIdx)) - ibufValid(enq_idx) := true.B - ibuf(enq_idx) := inWire - } - - enq_idx = enq_idx + io.in.bits.mask(i) - } - - tailPtr := enq_idx - } - - // This is ugly - val pcStep = (0 until PredictWidth).map(i => Mux(!io.in.fire || !io.in.bits.mask(i), 
0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) - val offsetCounterWire = WireInit(offsetCounter + pcStep) - offsetCounter := offsetCounterWire - - // IFU fetch from LB - io.loopBufPar.LBResp.pc := io.loopBufPar.LBReq - io.loopBufPar.LBResp.data := Cat((31 to 0 by -1).map(i => lbuf(io.loopBufPar.LBReq(7,1) + i.U).inst)) - io.loopBufPar.LBResp.mask := Cat((31 to 0 by -1).map(i => lbufValid(io.loopBufPar.LBReq(7,1) + i.U))) - io.loopBufPar.LBResp.ipf := false.B - - /*-----------------------*/ - /* Loop Buffer FSM */ - /*-----------------------*/ - when(io.in.fire) { - switch(LBstate) { - is(s_idle) { - // To FILL - // 检测到sbb且跳转,sbb成为triggering sbb - when(sbbTaken) { - LBstate := s_fill - XSDebug("State change: FILL\n") - // This is ugly - // offsetCounter := Cat("b1".U, sbbOffest(io.in.bits.instrs(brIdx))) + - // (0 until PredictWidth).map(i => Mux(!io.in.bits.mask(i) || i.U < brIdx, 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) - offsetCounter := Cat("b1".U, sbbOffest(io.in.bits.instrs(brIdx))) - tsbbPC := io.in.bits.pc(brIdx) - } - } - is(s_fill) { - // To AVTIVE - // triggering sbb 造成cof - when(offsetCounterWire((log2Up(IBufSize)+2)-1) === 0.U){ - when(hasTsbb && tsbbTaken) { - LBstate := s_active - XSDebug("State change: ACTIVE\n") - }.otherwise { - LBstate := s_idle - XSDebug("State change: IDLE\n") - flushLB() - } - } - - when(brTaken && !tsbbTaken) { - // To IDLE - LBstate := s_idle - XSDebug("State change: IDLE\n") - flushLB() - } - } - is(s_active) { - // To IDLE - // triggering sbb不跳转 退出循环 - val redirect_pc = io.in.bits.pnpc(PredictWidth.U - PriorityEncoder(Reverse(io.in.bits.mask)) - 1.U) - when(hasTsbb && !tsbbTaken) { - XSDebug("tsbb not taken, State change: IDLE\n") - LBstate := s_idle - io.loopBufPar.LBredirect.valid := true.B - // io.loopBufPar.LBredirect.bits := tsbbPC + Mux(io.in.bits.pd(tsbbIdx).isRVC, 2.U, 4.U) - io.loopBufPar.LBredirect.bits := redirect_pc - // 
ExcitingUtils.addSource(true.B, "CntLBRedirect1", Perf) - XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") - flushLB() - } - - when(brTaken && !tsbbTaken) { - XSDebug("cof by other inst, State change: IDLE\n") - LBstate := s_idle - io.loopBufPar.LBredirect.valid := true.B - io.loopBufPar.LBredirect.bits := redirect_pc - // io.loopBufPar.LBredirect.bits := Mux(brIdx > tsbbIdx, tsbbPC + 4.U, io.loopBufPar.LBReq) - // ExcitingUtils.addSource(true.B, "CntLBRedirect2", Perf) - XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") - flushLB() - } - - when(hasTsbb && brTaken && !tsbbTaken) { - XSDebug("tsbb and cof, State change: IDLE\n") - LBstate := s_idle - io.loopBufPar.LBredirect.valid := true.B - io.loopBufPar.LBredirect.bits := redirect_pc - // io.loopBufPar.LBredirect.bits := Mux(brIdx > tsbbIdx, tsbbPC + 4.U, io.loopBufPar.LBReq) - // ExcitingUtils.addSource(true.B, "CntLBRedirect3", Perf) - XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") - flushLB() - } - } - } - } - - when(io.flush){ - flush() - } - - // Debug Info - XSDebug(io.flush, "LoopBuffer Flushed\n") - - XSDebug(LBstate === s_idle, "Current state: IDLE\n") - XSDebug(LBstate === s_fill, "Current state: FILL\n") - XSDebug(LBstate === s_active, "Current state: ACTIVE\n") - - XSDebug(p"offsetCounter = ${Binary(offsetCounterWire)}\n") - XSDebug(p"tsbbIdx = ${tsbbIdx}\n") - when(io.in.fire) { - XSDebug("Enque:\n") - XSDebug(brTaken, p"Detected jump, idx=${brIdx}\n") - XSDebug(p"predTaken=${io.in.bits.predTaken}, predTakenVec=${Binary(predTakenVec)}\n") - XSDebug(p"MASK=${Binary(io.in.bits.mask)}\n") - for(i <- 0 until PredictWidth){ - XSDebug(p"PC=${Hexadecimal(io.in.bits.pc(i))} ${Hexadecimal(io.in.bits.instrs(i))}\n") - } - } - - when(deqValid) { - XSDebug("Deque:\n") - for(i <- 0 until DecodeWidth){ - XSDebug(p"${Hexadecimal(io.out(i).bits.instr)} PC=${Hexadecimal(io.out(i).bits.pc)} v=${io.out(i).valid} r=${io.out(i).ready}\n") - } - } - - XSDebug(p"last_headPtr=$headPtr 
last_tailPtr=$tailPtr\n") - XSDebug("IBuffer:\n") - for(i <- 0 until IBufSize/8) { - XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", - ibuf(i*8+0).inst, ibufValid(i*8+0), - ibuf(i*8+1).inst, ibufValid(i*8+1), - ibuf(i*8+2).inst, ibufValid(i*8+2), - ibuf(i*8+3).inst, ibufValid(i*8+3), - ibuf(i*8+4).inst, ibufValid(i*8+4), - ibuf(i*8+5).inst, ibufValid(i*8+5), - ibuf(i*8+6).inst, ibufValid(i*8+6), - ibuf(i*8+7).inst, ibufValid(i*8+7) - ) - } - - XSDebug("LoopBuffer:\n") - for(i <- 0 until IBufSize*2/8) { - XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", - lbuf(i*8+0).inst, lbufValid(i*8+0), - lbuf(i*8+1).inst, lbufValid(i*8+1), - lbuf(i*8+2).inst, lbufValid(i*8+2), - lbuf(i*8+3).inst, lbufValid(i*8+3), - lbuf(i*8+4).inst, lbufValid(i*8+4), - lbuf(i*8+5).inst, lbufValid(i*8+5), - lbuf(i*8+6).inst, lbufValid(i*8+6), - lbuf(i*8+7).inst, lbufValid(i*8+7) - ) - } -} \ No newline at end of file +// package xiangshan.frontend + +// import chisel3._ +// import chisel3.util._ +// import chisel3.util.experimental.BoringUtils +// import chisel3.ExcitingUtils._ +// import utils._ +// import xiangshan._ +// import xiangshan.cache._ + +// trait HasLoopBufferCst extends HasXSParameter { +// val preFetchBufferSize = 2 +// } + +// class LoopBufferParameters extends XSBundle { +// val LBredirect = ValidIO(UInt(VAddrBits.W)) +// val tgtpc = Input(UInt(VAddrBits.W)) +// val inLoop = Output(Bool()) +// val LBReq = Input(UInt(VAddrBits.W)) +// val LBResp = Output(new ICacheResp) +// val preFetchPC = DecoupledIO(UInt(VAddrBits.W)) +// val preFetchResp = Flipped(DecoupledIO(new ICacheResp)) +// val preFetchSend = DecoupledIO(new ICacheResp) +// } + +// class LoopBufferIO extends XSBundle { +// val flush = Input(Bool()) +// val in = Flipped(DecoupledIO(new FetchPacket)) +// val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) +// val loopBufPar = new LoopBufferParameters +// } + +// class LoopBuffer 
extends XSModule with HasLoopBufferCst{ +// val io = IO(new LoopBufferIO) + +// class IBufEntry extends XSBundle { +// val inst = UInt(32.W) +// val pc = UInt(VAddrBits.W) +// val pnpc = UInt(VAddrBits.W) +// val brInfo = new BranchInfo +// val pd = new PreDecodeInfo +// val ipf = Bool() +// val crossPageIPFFix = Bool() +// } + +// class LBufEntry extends XSBundle { +// val inst = UInt(16.W) +// } + +// // ignore +// io.out.foreach{ +// _.bits.intrVec := DontCare +// } + +// def sbbOffest(inst: UInt): UInt = { +// val isJal = inst === BitPat("b1111_???????_111111111_?????_1101111") +// val isCon = inst === BitPat("b1111???_?????_?????_???_????1_1100011") +// val isRVCJal = inst === BitPat("b????????????????_001_1?111??????_01") +// val isRVCCon = inst === BitPat("b????????????????_11?_1??_???_?????_01") + +// val rst = PriorityMux(Seq( +// isJal -> inst(27, 21), +// isCon -> Cat(inst(27,25), inst(11,8)), +// isRVCJal -> Cat(inst(6), inst(7), inst(2), inst(11), inst(5,3)), +// isRVCCon -> Cat(inst(6), inst(5), inst(2), inst(11,10), inst(4,3)), +// true.B -> 0.U(7.W) +// )) + +// (~rst).asUInt + 1.U +// } + +// def isSBB(inst: UInt): Bool = { +// val sbbOffestWire = WireInit(sbbOffest(inst)) +// sbbOffestWire > 0.U && sbbOffestWire <= 112.U // TODO < 56.U +// } + +// def mask(pc: UInt): UInt = (Fill(PredictWidth * 2, 1.U(1.W)) >> pc(groupAlign - 1, 1))(PredictWidth - 1, 0) +// def snpc(pc: UInt): UInt = pc + (PopCount(mask(pc)) << 1) + +// // predTaken to OH +// val predTakenVec = Mux(io.in.bits.predTaken, Reverse(PriorityEncoderOH(Reverse(io.in.bits.mask))), 0.U(PredictWidth.W)) + +// // Loop detect register +// val offsetCounter = Reg(UInt((log2Up(IBufSize)+2).W)) +// val tsbbPC = RegInit(0.U(VAddrBits.W)) + +// val brTaken = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && predTakenVec(i))).orR() +// val brIdx = OHToUInt(predTakenVec.asUInt) +// val sbbTaken = brTaken && isSBB(io.in.bits.instrs(brIdx)) + +// val tsbbVec = Cat((0 until 
PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && io.in.bits.pc(i) === tsbbPC)) +// val hasTsbb = tsbbVec.orR() +// val tsbbIdx = OHToUInt(Reverse(tsbbVec)) +// val tsbbTaken = brTaken && io.in.bits.pc(brIdx) === tsbbPC + +// // IBuffer define +// val ibuf = Mem(IBufSize, new IBufEntry) +// val ibufValid = RegInit(VecInit(Seq.fill(IBufSize)(false.B))) +// val headPtr = RegInit(0.U(log2Up(IBufSize).W)) +// val tailPtr = RegInit(0.U(log2Up(IBufSize).W)) + +// // val enqValid = !io.flush && !ibufValid(tailPtr + PopCount(io.in.bits.mask) - 1.U) +// val enqValid = !io.flush && (io.in.bits.mask === 0.U || !ibufValid(tailPtr + PopCount(io.in.bits.mask) - 1.U)) +// val deqValid = !io.flush && ibufValid(headPtr) + +// // LoopBuffer define +// val lbuf = Mem(IBufSize*2, new LBufEntry) +// val lbufValid = RegInit(VecInit(Seq.fill(IBufSize*2)(false.B))) +// val redirect_pc = io.in.bits.pnpc(PredictWidth.U - PriorityEncoder(Reverse(io.in.bits.mask)) - 1.U) + +// // 保存循环体最后1拍的指令和之后2拍的指令 +// val preFetchBuffer = Mem(preFetchBufferSize + 1, new ICacheResp) +// val prefetchCounter = 0.U(2.W) + +// // FSM state define +// val s_idle :: s_fill :: s_active :: Nil = Enum(3) +// val LBstate = RegInit(s_idle) + +// io.loopBufPar.inLoop := LBstate === s_active + +// def flushLB() = { +// for(i <- 0 until IBufSize*2) { +// lbuf(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted +// lbufValid(i) := false.B +// } +// } + +// def flushIB() = { +// for(i <- 0 until IBufSize) { +// ibuf(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted +// ibuf(i).pc := 0.U // TODO: This is to make the debugging information clearer, this can be deleted +// lbuf(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted +// ibufValid(i) := false.B +// } +// headPtr := 0.U +// tailPtr := 0.U +// } + +// def flush() = { +// XSDebug("Loop Buffer Flushed.\n") +// LBstate := s_idle +// 
flushLB +// flushIB +// } + +// io.loopBufPar.LBredirect.valid := false.B +// io.loopBufPar.LBredirect.bits := DontCare + +// /*---------------*/ +// /* Dequeue */ +// /*---------------*/ +// var deq_idx = WireInit(headPtr) + +// when(deqValid) { +// for(i <- 0 until DecodeWidth) { +// var outWire = WireInit(ibuf(deq_idx)) + +// io.out(i).valid := ibufValid(deq_idx) +// when(ibufValid(deq_idx)) { ibufValid(deq_idx) := !io.out(i).fire } +// io.out(i).bits.instr := outWire.inst + +// io.out(i).bits.pc := outWire.pc +// io.out(i).bits.exceptionVec := 0.U.asTypeOf(Vec(16, Bool())) +// io.out(i).bits.exceptionVec(instrPageFault) := outWire.ipf + +// io.out(i).bits.brUpdate := DontCare +// io.out(i).bits.brUpdate.pc := outWire.pc +// io.out(i).bits.brUpdate.pnpc := outWire.pnpc +// io.out(i).bits.brUpdate.pd := outWire.pd +// io.out(i).bits.brUpdate.brInfo := outWire.brInfo +// io.out(i).bits.crossPageIPFFix := outWire.crossPageIPFFix + +// deq_idx = deq_idx + io.out(i).fire +// } +// headPtr := deq_idx +// }.otherwise { +// io.out.foreach(_.valid := false.B) +// io.out.foreach(_.bits <> DontCare) +// } + +// /*---------------*/ +// /* Enqueue */ +// /*---------------*/ +// io.in.ready := enqValid + +// var enq_idx = WireInit(tailPtr) +// // ExcitingUtils.addSource(io.in.fire && LBstate =/= s_active, "CntFetchFromICache", Perf) +// // ExcitingUtils.addSource(io.in.fire && LBstate === s_active, "CntFetchFromLoopBuffer", Perf) +// when(io.in.fire) { +// for(i <- 0 until PredictWidth) { +// var inWire = Wire(new IBufEntry) +// inWire := DontCare + +// when(io.in.bits.mask(i)) { +// inWire.inst := io.in.bits.instrs(i) +// when(LBstate === s_fill/* || (sbbTaken && i.U > brIdx)*/) { +// lbuf(io.in.bits.pc(i)(7,1)).inst := io.in.bits.instrs(i)(15, 0) +// // lbuf(io.in.bits.pc(i)(7,1)).pd := io.in.bits.pd(i) +// lbufValid(io.in.bits.pc(i)(7,1)) := true.B +// when(!io.in.bits.pd(i).isRVC) { +// lbuf(io.in.bits.pc(i)(7,1) + 1.U).inst := io.in.bits.instrs(i)(31, 16) +// 
lbufValid(io.in.bits.pc(i)(7,1) + 1.U) := true.B +// } +// } +// inWire.pc := io.in.bits.pc(i) +// inWire.pnpc := io.in.bits.pnpc(i) +// inWire.brInfo := io.in.bits.brInfo(i) +// inWire.pd := io.in.bits.pd(i) +// inWire.ipf := io.in.bits.ipf +// inWire.crossPageIPFFix := io.in.bits.crossPageIPFFix + +// // ibufValid(enq_idx) := Mux(LBstate =/= s_active, true.B, !(hasTsbb && !tsbbTaken && i.U > tsbbIdx)) +// ibufValid(enq_idx) := true.B +// ibuf(enq_idx) := inWire +// } + +// enq_idx = enq_idx + io.in.bits.mask(i) +// } + +// tailPtr := enq_idx +// } + +// // This is ugly +// val pcStep = (0 until PredictWidth).map(i => Mux(!io.in.fire || !io.in.bits.mask(i), 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) +// val offsetCounterWire = WireInit(offsetCounter + pcStep) +// offsetCounter := offsetCounterWire + +// // IFU fetch from LB +// io.loopBufPar.LBResp.pc := io.loopBufPar.LBReq +// io.loopBufPar.LBResp.data := Cat((31 to 0 by -1).map(i => lbuf(io.loopBufPar.LBReq(7,1) + i.U).inst)) +// io.loopBufPar.LBResp.mask := Cat((31 to 0 by -1).map(i => lbufValid(io.loopBufPar.LBReq(7,1) + i.U))) +// io.loopBufPar.LBResp.ipf := false.B + +// // PreFetch from ICache +// when(LBstate === s_active && prefetchCounter =/= 2.U) { +// io.loopBufPar.preFetchPC.valid := true.B +// io.loopBufPar.preFetchPC.bits := redirect_pc + Mux(prefetchCounter(0), snpc(redirect_pc)) +// } + +// when(io.loopBufPar.preFetchResp.fire) { +// preFetchBuffer(prefetchCounter) := io.loopBufPar.preFetchResp.bits +// prefetchCounter := prefetchCounter + 1.U +// } + +// // Send preFetch inst to predecode +// when(LBstate != s_active && prefetchCounter =/= 0.U) { +// io.loopBufPar.preFetchSend.valid := true.B +// io.loopBufPar.preFetchSend.bits := preFetchBuffer(2.U - prefetchCounter) +// prefetchCounter := prefetchCounter - 1.U +// } + +// /*-----------------------*/ +// /* Loop Buffer FSM */ +// /*-----------------------*/ +// when(io.in.fire) { +// switch(LBstate) { +// 
is(s_idle) { +// // To FILL +// // 检测到sbb且跳转,sbb成为triggering sbb +// when(sbbTaken) { +// LBstate := s_fill +// XSDebug("State change: FILL\n") +// // This is ugly +// // offsetCounter := Cat("b1".U, sbbOffest(io.in.bits.instrs(brIdx))) + +// // (0 until PredictWidth).map(i => Mux(!io.in.bits.mask(i) || i.U < brIdx, 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) +// offsetCounter := Cat("b1".U, sbbOffest(io.in.bits.instrs(brIdx))) +// tsbbPC := io.in.bits.pc(brIdx) +// } +// } +// is(s_fill) { +// // To AVTIVE +// // triggering sbb 造成cof +// when(offsetCounterWire((log2Up(IBufSize)+2)-1) === 0.U){ +// when(hasTsbb && tsbbTaken) { +// LBstate := s_active +// XSDebug("State change: ACTIVE\n") +// }.otherwise { +// LBstate := s_idle +// XSDebug("State change: IDLE\n") +// flushLB() +// } +// } + +// when(brTaken && !tsbbTaken) { +// // To IDLE +// LBstate := s_idle +// XSDebug("State change: IDLE\n") +// flushLB() +// } +// } +// is(s_active) { +// // To IDLE +// // triggering sbb不跳转 退出循环 +// when(hasTsbb && !tsbbTaken) { +// XSDebug("tsbb not taken, State change: IDLE\n") +// LBstate := s_idle +// io.loopBufPar.LBredirect.valid := true.B +// // io.loopBufPar.LBredirect.bits := tsbbPC + Mux(io.in.bits.pd(tsbbIdx).isRVC, 2.U, 4.U) +// io.loopBufPar.LBredirect.bits := redirect_pc +// // ExcitingUtils.addSource(true.B, "CntLBRedirect1", Perf) +// XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") +// flushLB() +// } + +// when(brTaken && !tsbbTaken) { +// XSDebug("cof by other inst, State change: IDLE\n") +// LBstate := s_idle +// io.loopBufPar.LBredirect.valid := true.B +// io.loopBufPar.LBredirect.bits := redirect_pc +// // io.loopBufPar.LBredirect.bits := Mux(brIdx > tsbbIdx, tsbbPC + 4.U, io.loopBufPar.LBReq) +// // ExcitingUtils.addSource(true.B, "CntLBRedirect2", Perf) +// XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") +// flushLB() +// } + +// when(hasTsbb && brTaken && !tsbbTaken) { +// XSDebug("tsbb and cof, State change: 
IDLE\n") +// LBstate := s_idle +// io.loopBufPar.LBredirect.valid := true.B +// io.loopBufPar.LBredirect.bits := redirect_pc +// // io.loopBufPar.LBredirect.bits := Mux(brIdx > tsbbIdx, tsbbPC + 4.U, io.loopBufPar.LBReq) +// // ExcitingUtils.addSource(true.B, "CntLBRedirect3", Perf) +// XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") +// flushLB() +// } +// } +// } +// } + +// when(io.flush){ +// flush() +// } + +// //Performance Counter +// if (!env.FPGAPlatform ) { +// ExcitingUtils.addSource(LBstate === s_active && hasTsbb && !tsbbTaken, "ExitLoop", Perf) +// } + +// // Debug Info +// XSDebug(io.flush, "LoopBuffer Flushed\n") + +// XSDebug(LBstate === s_idle, "Current state: IDLE\n") +// XSDebug(LBstate === s_fill, "Current state: FILL\n") +// XSDebug(LBstate === s_active, "Current state: ACTIVE\n") + +// XSDebug(p"offsetCounter = ${Binary(offsetCounterWire)}\n") +// XSDebug(p"tsbbIdx = ${tsbbIdx}\n") +// when(io.in.fire) { +// XSDebug("Enque:\n") +// XSDebug(brTaken, p"Detected jump, idx=${brIdx}\n") +// XSDebug(p"predTaken=${io.in.bits.predTaken}, predTakenVec=${Binary(predTakenVec)}\n") +// XSDebug(p"MASK=${Binary(io.in.bits.mask)}\n") +// for(i <- 0 until PredictWidth){ +// XSDebug(p"PC=${Hexadecimal(io.in.bits.pc(i))} ${Hexadecimal(io.in.bits.instrs(i))}\n") +// } +// } + +// when(deqValid) { +// XSDebug("Deque:\n") +// for(i <- 0 until DecodeWidth){ +// XSDebug(p"${Hexadecimal(io.out(i).bits.instr)} PC=${Hexadecimal(io.out(i).bits.pc)} v=${io.out(i).valid} r=${io.out(i).ready}\n") +// } +// } + +// XSDebug(p"last_headPtr=$headPtr last_tailPtr=$tailPtr\n") +// XSDebug("IBuffer:\n") +// for(i <- 0 until IBufSize/8) { +// XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", +// ibuf(i*8+0).inst, ibufValid(i*8+0), +// ibuf(i*8+1).inst, ibufValid(i*8+1), +// ibuf(i*8+2).inst, ibufValid(i*8+2), +// ibuf(i*8+3).inst, ibufValid(i*8+3), +// ibuf(i*8+4).inst, ibufValid(i*8+4), +// ibuf(i*8+5).inst, ibufValid(i*8+5), +// 
ibuf(i*8+6).inst, ibufValid(i*8+6), +// ibuf(i*8+7).inst, ibufValid(i*8+7) +// ) +// } + +// XSDebug("LoopBuffer:\n") +// for(i <- 0 until IBufSize*2/8) { +// XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", +// lbuf(i*8+0).inst, lbufValid(i*8+0), +// lbuf(i*8+1).inst, lbufValid(i*8+1), +// lbuf(i*8+2).inst, lbufValid(i*8+2), +// lbuf(i*8+3).inst, lbufValid(i*8+3), +// lbuf(i*8+4).inst, lbufValid(i*8+4), +// lbuf(i*8+5).inst, lbufValid(i*8+5), +// lbuf(i*8+6).inst, lbufValid(i*8+6), +// lbuf(i*8+7).inst, lbufValid(i*8+7) +// ) +// } +// } \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/NewLoopBuffer.scala b/src/main/scala/xiangshan/frontend/NewLoopBuffer.scala new file mode 100644 index 000000000..cfdcaff7e --- /dev/null +++ b/src/main/scala/xiangshan/frontend/NewLoopBuffer.scala @@ -0,0 +1,252 @@ +package xiangshan.frontend + +import chisel3._ +import chisel3.util._ +import chisel3.util.experimental.BoringUtils +import chisel3.ExcitingUtils._ +import utils._ +import xiangshan._ +import xiangshan.cache._ + +trait HasLoopBufferCst extends HasXSParameter { + val preFetchBufferSize = 2 +} + +class LoopBufferParameters extends XSBundle { + val LBredirect = ValidIO(UInt(VAddrBits.W)) + val fetchReq = Input(UInt(VAddrBits.W)) + // val preFetchPC = DecoupledIO(UInt(VAddrBits.W)) + // val preFetchResp = Flipped(DecoupledIO(new ICacheResp)) + // val preFetchSend = DecoupledIO(new ICacheResp) +} + +class LoopBufferIO extends XSBundle { + val flush = Input(Bool()) + val in = Flipped(DecoupledIO(new FetchPacket)) + val out = ValidIO(new ICacheResp) + val loopBufPar = new LoopBufferParameters +} + +class NewLoopBuffer extends XSModule with HasLoopBufferCst{ + val io = IO(new LoopBufferIO) + + // FSM state define + val s_idle :: s_fill :: s_active :: Nil = Enum(3) + val LBstate = RegInit(s_idle) + + io.out <> DontCare + io.out.valid := LBstate === s_active + io.in.ready := true.B + io.loopBufPar.fetchReq := 
DontCare + + + class LBufEntry extends XSBundle { + val inst = UInt(16.W) + // val tag = UInt(tagBits.W) + } + + def sbboffset(inst: UInt) = { + val isJal = inst === BitPat("b1111_???????_111111111_?????_1101111") + val isCon = inst === BitPat("b1111???_?????_?????_???_????1_1100011") + val isRVCJal = inst === BitPat("b????????????????_001_1?111??????_01") + val isRVCCon = inst === BitPat("b????????????????_11?_1??_???_?????_01") + + val rst = PriorityMux(Seq( + isJal -> inst(27, 21), + isCon -> Cat(inst(27,25), inst(11,8)), + isRVCJal -> Cat(inst(6), inst(7), inst(2), inst(11), inst(5,3)), + isRVCCon -> Cat(inst(6), inst(5), inst(2), inst(11,10), inst(4,3)), + true.B -> 0.U(7.W) + )) + + ((~rst).asUInt + 1.U, rst) + } + + def isSBB(inst: UInt): Bool = { + val sbboffsetWire = WireInit(sbboffset(inst)._1) + sbboffsetWire > 0.U && sbboffsetWire <= 112.U // TODO < 56.U + } + + // predTaken to OH + val predTakenVec = Mux(io.in.bits.predTaken, Reverse(PriorityEncoderOH(Reverse(io.in.bits.mask))), 0.U(PredictWidth.W)) + + // Loop detect register + val offsetCounter = Reg(UInt((log2Up(IBufSize)+2).W)) + val tsbbPC = RegInit(0.U(VAddrBits.W)) + + val brTaken = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && predTakenVec(i))).orR() + val brIdx = OHToUInt(predTakenVec.asUInt) + val sbbTaken = brTaken && isSBB(io.in.bits.instrs(brIdx)) + + val tsbbVec = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && io.in.bits.pc(i) === tsbbPC)) + val hasTsbb = tsbbVec.orR() + val tsbbIdx = OHToUInt(Reverse(tsbbVec)) + val tsbbTaken = brTaken && io.in.bits.pc(brIdx) === tsbbPC + + val buffer = Mem(IBufSize*2, new LBufEntry) + val bufferValid = RegInit(VecInit(Seq.fill(IBufSize*2)(false.B))) + + val redirect_pc = io.in.bits.pnpc(PredictWidth.U - PriorityEncoder(Reverse(io.in.bits.mask)) - 1.U) + + def flush() = { + XSDebug("Loop Buffer Flushed.\n") + LBstate := s_idle + for(i <- 0 until IBufSize*2) { + buffer(i).inst := 0.U // TODO: This is to 
make the debugging information clearer, this can be deleted + bufferValid(i) := false.B + } + } + + when(io.in.fire && LBstate === s_fill) { + io.in.bits.mask.asBools().zipWithIndex.map {case(m, i) => + when(m) { + buffer(io.in.bits.pc(i)(7,1)).inst := io.in.bits.instrs(i)(15, 0) + bufferValid(io.in.bits.pc(i)(7,1)) := true.B + when(!io.in.bits.pd(i).isRVC) { + buffer(io.in.bits.pc(i)(7,1) + 1.U).inst := io.in.bits.instrs(i)(31, 16) + bufferValid(io.in.bits.pc(i)(7,1) + 1.U) := true.B // May need to be considered already valid + } + } + } + } + + // This is ugly + val pcStep = (0 until PredictWidth).map(i => Mux(!io.in.fire || !io.in.bits.mask(i), 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) + val offsetCounterWire = WireInit(offsetCounter + pcStep) + offsetCounter := offsetCounterWire + + when(LBstate === s_active) { + io.out.bits.pc := io.loopBufPar.fetchReq + io.out.bits.data := Cat((31 to 0 by -1).map(i => buffer(io.loopBufPar.fetchReq(7,1) + i.U).inst)) + io.out.bits.mask := Cat((31 to 0 by -1).map(i => bufferValid(io.loopBufPar.fetchReq(7,1) + i.U))) + io.out.bits.ipf := false.B + } + + io.loopBufPar.LBredirect.valid := false.B + io.loopBufPar.LBredirect.bits := DontCare + + /*-----------------------*/ + /* Loop Buffer FSM */ + /*-----------------------*/ + when(io.in.fire) { + switch(LBstate) { + is(s_idle) { + // To FILL + // An sbb was detected and it is taken; that sbb becomes the triggering sbb + when(sbbTaken) { + LBstate := s_fill + XSDebug("State change: FILL\n") + // This is ugly + // offsetCounter := Cat("b1".U, sbboffset(io.in.bits.instrs(brIdx))) + + // (0 until PredictWidth).map(i => Mux(!io.in.bits.mask(i) || i.U < brIdx, 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) + offsetCounter := Cat("b1".U, sbboffset(io.in.bits.instrs(brIdx))._2) + tsbbPC := io.in.bits.pc(brIdx) + } + } + is(s_fill) { + // To ACTIVE + // the triggering sbb causes a cof + when(offsetCounterWire((log2Up(IBufSize)+2)-1) === 0.U){ + when(hasTsbb && tsbbTaken) { +
LBstate := s_active + XSDebug("State change: ACTIVE\n") + }.otherwise { + LBstate := s_idle + XSDebug("State change: IDLE\n") + flush() + } + } + + when(brTaken && !tsbbTaken) { + // To IDLE + LBstate := s_idle + XSDebug("State change: IDLE\n") + flush() + } + } + is(s_active) { + // To IDLE + // the triggering sbb is not taken: exit the loop + when(hasTsbb && !tsbbTaken) { + XSDebug("tsbb not taken, State change: IDLE\n") + LBstate := s_idle + io.loopBufPar.LBredirect.valid := true.B + // io.loopBufPar.LBredirect.bits := tsbbPC + Mux(io.in.bits.pd(tsbbIdx).isRVC, 2.U, 4.U) + io.loopBufPar.LBredirect.bits := redirect_pc + // ExcitingUtils.addSource(true.B, "CntLBRedirect1", Perf) + XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") + flush() + } + + when(brTaken && !tsbbTaken) { + XSDebug("cof by other inst, State change: IDLE\n") + LBstate := s_idle + io.loopBufPar.LBredirect.valid := true.B + io.loopBufPar.LBredirect.bits := redirect_pc + // io.loopBufPar.LBredirect.bits := Mux(brIdx > tsbbIdx, tsbbPC + 4.U, io.loopBufPar.LBReq) + // ExcitingUtils.addSource(true.B, "CntLBRedirect2", Perf) + XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") + flush() + } + + when(hasTsbb && brTaken && !tsbbTaken) { + XSDebug("tsbb and cof, State change: IDLE\n") + LBstate := s_idle + io.loopBufPar.LBredirect.valid := true.B + io.loopBufPar.LBredirect.bits := redirect_pc + // io.loopBufPar.LBredirect.bits := Mux(brIdx > tsbbIdx, tsbbPC + 4.U, io.loopBufPar.LBReq) + // ExcitingUtils.addSource(true.B, "CntLBRedirect3", Perf) + XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") + flush() + } + } + } + } + + when(io.flush){ + flush() + } + + // XSDebug(io.flush, "LoopBuffer Flushed\n") + if (!env.FPGAPlatform ) { + ExcitingUtils.addSource(LBstate === s_active && hasTsbb && !tsbbTaken, "CntExitLoop1", Perf) + ExcitingUtils.addSource(LBstate === s_active && brTaken && !tsbbTaken, "CntExitLoop2", Perf) + ExcitingUtils.addSource(LBstate === s_active && hasTsbb && brTaken && !tsbbTaken, "CntExitLoop3",
Perf) + } + + XSDebug(LBstate === s_idle, "Current state: IDLE\n") + XSDebug(LBstate === s_fill, "Current state: FILL\n") + XSDebug(LBstate === s_active, "Current state: ACTIVE\n") + + XSDebug(p"offsetCounter = ${Binary(offsetCounterWire)}\n") + XSDebug(p"tsbbIdx = ${tsbbIdx}\n") + when(io.in.fire) { + XSDebug("Enque:\n") + XSDebug(brTaken, p"Detected jump, idx=${brIdx}\n") + XSDebug(p"predTaken=${io.in.bits.predTaken}, predTakenVec=${Binary(predTakenVec)}\n") + XSDebug(p"MASK=${Binary(io.in.bits.mask)}\n") + for(i <- 0 until PredictWidth){ + XSDebug(p"PC=${Hexadecimal(io.in.bits.pc(i))} ${Hexadecimal(io.in.bits.instrs(i))}\n") + } + } + + XSDebug("LoopBuffer:\n") + for(i <- 0 until IBufSize*2/8) { + XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", + buffer(i*8+0).inst, bufferValid(i*8+0), + buffer(i*8+1).inst, bufferValid(i*8+1), + buffer(i*8+2).inst, bufferValid(i*8+2), + buffer(i*8+3).inst, bufferValid(i*8+3), + buffer(i*8+4).inst, bufferValid(i*8+4), + buffer(i*8+5).inst, bufferValid(i*8+5), + buffer(i*8+6).inst, bufferValid(i*8+6), + buffer(i*8+7).inst, bufferValid(i*8+7) + ) + } + + XSDebug(io.out.valid, p"fetch pc: ${Hexadecimal(io.loopBufPar.fetchReq)}\n") + XSDebug(io.out.valid, p"fetchIdx: ${io.loopBufPar.fetchReq(7,1)}\n") + XSDebug(io.out.valid, p"out data: ${Hexadecimal(io.out.bits.data)}\n") + XSDebug(io.out.valid, p"out mask: ${Binary(io.out.bits.mask)}\n") + XSDebug(io.out.valid, p"out pc : ${Hexadecimal(io.out.bits.pc)}\n") +} \ No newline at end of file diff --git a/src/test/scala/xiangshan/testutils/AddSinks.scala b/src/test/scala/xiangshan/testutils/AddSinks.scala index f6bb47044..6c08b906e 100644 --- a/src/test/scala/xiangshan/testutils/AddSinks.scala +++ b/src/test/scala/xiangshan/testutils/AddSinks.scala @@ -32,7 +32,10 @@ object AddSinks { "perfCntCondMbpRRight", "perfCntCondMbpRWrong", "CntFetchFromICache", - "CntFetchFromLoopBuffer" + "CntFetchFromLoopBuffer", + "CntExitLoop1", + "CntExitLoop2", + 
"CntExitLoop3" ) for (s <- sinks){ BoringUtils.addSink(tmp, s) } -- GitLab