From de169c67fc2ef700831ebf032afae0b87c2e5806 Mon Sep 17 00:00:00 2001
From: William Wang
Date: Tue, 11 May 2021 09:42:37 +0800
Subject: [PATCH] backend,mem: add Store Sets memory dependence predictor
 (#796)

* LoadQueue: send stFtqIdx via rollback request

  It makes it possible for the store set to update its SSIT

* StoreSet: setup store set update req
* StoreSet: add store set identifier table (SSIT)
* StoreSet: add last fetched store table (LFST)
* StoreSet: put SSIT into decode stage
* StoreSet: put LFST into dispatch1

  Future work: optimize timing

* RS: store rs now supports delayed issue
* StoreSet: add perf counter
* StoreSet: fix SSIT update logic
* StoreSet: delay LFST update input for 1 cycle
* StoreSet: fix LFST update logic
* StoreSet: fix LFST raddr width
* StoreSet: do not force store in ss issue in order

  The classic store set requires stores in the same store set to issue
  in sequence. However, in the current micro-architecture, such a
  restriction leads to severe perf loss. We choose to disable it until
  we find another way to fix it.

* StoreSet: support ooo store in the same store set
* StoreSet: fix store set merge logic
* StoreSet: check earlier store when read LFST

  If a store-load pair is in the same dispatch bundle, loadWaitBit
  should also be set for the load

* StoreSet: increase default SSIT flush period
* StoreSet: fix LFST read logic
* Fix commit c0e541d14
* StoreSet: add StoreSetEnable parameter
* RSFeedback: add source type
* StoreQueue: split store addr and store data
* StoreQueue: update ls forward logic

  Now it supports split addr and data

* Chore: force assign name for load/store unit
* RS: add rs support for store a-d split
* StoreQueue: fix stlf logic
* StoreQueue: fix addr wb sq update logic
* AtomicsUnit: support split a/d
* Parameters: disable store set by default
* WaitTable: wait table will not cause store delay
* WaitTable: recover default reset period to 2^17
* Fix dev-stad merge conflict
* StoreSet: enable storeset
* RS: disable store rs delay logic

  CI perf shows that the current delay logic causes perf loss. Disabling
  the unnecessary delay logic helps. To be more specific, `io.readyVec`
  caused the problem. It will be updated in future commits.
* RS: opt select logic with load delay (ldWait)
* StoreSet: disable 2-bit lwt

Co-authored-by: ZhangZifei
---
 src/main/scala/xiangshan/Bundle.scala         |  31 ++-
 src/main/scala/xiangshan/Parameters.scala     |  16 +-
 .../scala/xiangshan/backend/CtrlBlock.scala   |  46 ++--
 .../scala/xiangshan/backend/MemBlock.scala    |   9 +-
 .../backend/decode/DecodeStage.scala          |  19 +-
 .../xiangshan/backend/decode/StoreSet.scala   | 224 ++++++++++++++++++
 .../xiangshan/backend/decode/WaitTable.scala  |  15 +-
 .../xiangshan/backend/dispatch/Dispatch.scala |   7 +
 .../backend/dispatch/Dispatch1.scala          |  68 ++++++
 .../scala/xiangshan/backend/ftq/Ftq.scala     |   6 +-
 .../backend/issue/ReservationStation.scala    |  10 +-
 src/main/scala/xiangshan/frontend/IFU.scala   |   5 +-
 .../scala/xiangshan/frontend/Ibuffer.scala    |   7 +-
 .../xiangshan/mem/lsqueue/LoadQueue.scala     |  46 ++--
 14 files changed, 444 insertions(+), 65 deletions(-)
 create mode 100644 src/main/scala/xiangshan/backend/decode/StoreSet.scala

diff --git a/src/main/scala/xiangshan/Bundle.scala b/src/main/scala/xiangshan/Bundle.scala
index ff5f2c414..1466ac99c 100644
--- a/src/main/scala/xiangshan/Bundle.scala
+++ b/src/main/scala/xiangshan/Bundle.scala
@@ -3,7 +3,7 @@ package xiangshan
 import chisel3._
 import chisel3.util._
 import xiangshan.backend.roq.RoqPtr
-import xiangshan.backend.decode.{ImmUnion, WaitTableParameters, XDecode}
+import xiangshan.backend.decode.{ImmUnion, XDecode}
 import xiangshan.mem.{LqPtr, SqPtr}
 import xiangshan.frontend.PreDecodeInfoForDebug
 import xiangshan.frontend.PreDecodeInfo
@@ -23,13 +23,13 @@ import chipsalliance.rocketchip.config.Parameters
 import xiangshan.backend.ftq.FtqPtr
 
 // Fetch FetchWidth x 32-bit insts from Icache
-class FetchPacket(implicit p: Parameters) extends XSBundle with WaitTableParameters {
+class FetchPacket(implicit p: Parameters) extends XSBundle {
   val instrs = Vec(PredictWidth, UInt(32.W))
   val mask = UInt(PredictWidth.W)
   val pdmask = UInt(PredictWidth.W)
   // val pc = UInt(VAddrBits.W)
   val pc = Vec(PredictWidth, UInt(VAddrBits.W))
-  val foldpc = Vec(PredictWidth, UInt(WaitTableAddrWidth.W))
+  val foldpc = Vec(PredictWidth, UInt(MemPredPCWidth.W))
   val pd = Vec(PredictWidth, new PreDecodeInfo)
   val ipf = Bool()
   val acf = Bool()
@@ -179,16 +179,18 @@ class CfiUpdateInfo(implicit p: Parameters) extends XSBundle with HasBPUParamete
 }
 
 // Dequeue DecodeWidth insts from Ibuffer
-class CtrlFlow(implicit p: Parameters) extends XSBundle with WaitTableParameters {
+class CtrlFlow(implicit p: Parameters) extends XSBundle {
   val instr = UInt(32.W)
   val pc = UInt(VAddrBits.W)
-  val foldpc = UInt(WaitTableAddrWidth.W)
+  val foldpc = UInt(MemPredPCWidth.W)
   val exceptionVec = ExceptionVec()
   val intrVec = Vec(12, Bool())
   val pd = new PreDecodeInfo
   val pred_taken = Bool()
   val crossPageIPFFix = Bool()
+  val storeSetHit = Bool() // inst has been allocated a store set
   val loadWaitBit = Bool() // load inst should not be executed until all former store addrs are calculated
+  val ssid = UInt(SSIDWidth.W)
   val ftqPtr = new FtqPtr
   val ftqOffset = UInt(log2Up(PredictWidth).W)
 }
@@ -325,6 +327,11 @@ class MicroOp(implicit p: Parameters) extends CfCtrl {
   }
 }
 
+class MicroOpRbExt(implicit p: Parameters) extends XSBundle {
+  val uop = new MicroOp
+  val flag = UInt(1.W)
+}
+
 class Redirect(implicit p: Parameters) extends XSBundle {
   val roqIdx = new RoqPtr
   val ftqIdx = new FtqPtr
@@ -333,6 +340,8 @@ class Redirect(implicit p: Parameters) extends XSBundle {
   val interrupt = Bool()
   val cfiUpdate = new CfiUpdateInfo
 
+  val stFtqIdx = new FtqPtr // for load violation predict
+  val stFtqOffset = UInt(log2Up(PredictWidth).W)
 
   // def isUnconditional() = RedirectLevel.isUnconditional(level)
   def flushItself() = RedirectLevel.flushItself(level)
@@ -467,10 +476,18 @@ class SfenceBundle(implicit p: Parameters) extends XSBundle {
   }
 }
 
-class WaitTableUpdateReq(implicit p: Parameters) extends XSBundle with WaitTableParameters {
+// Bundle for load violation predictor updating
+class MemPredUpdateReq(implicit p: Parameters) extends XSBundle {
   val valid = Bool()
-  val waddr = UInt(WaitTableAddrWidth.W)
+
+  // wait table update
+  val waddr = UInt(MemPredPCWidth.W)
   val wdata = Bool() // true.B by default
+
+  // store set update
+  // by default, ldpc/stpc should be xor folded
+  val ldpc = UInt(MemPredPCWidth.W)
+  val stpc = UInt(MemPredPCWidth.W)
 }
 
 class PerfInfoIO extends Bundle {
diff --git a/src/main/scala/xiangshan/Parameters.scala b/src/main/scala/xiangshan/Parameters.scala
index bde02eea8..0729a5fbf 100644
--- a/src/main/scala/xiangshan/Parameters.scala
+++ b/src/main/scala/xiangshan/Parameters.scala
@@ -285,7 +285,21 @@ trait HasXSParameter {
       blockBytes = L2BlockSize,
       nEntries = dcacheParameters.nMissEntries * 2 // TODO: this is too large
     ),
-  )
+  )
+
+  // load violation predict
+  val ResetTimeMax2Pow = 20 // 1048576
+  val ResetTimeMin2Pow = 10 // 1024
+  // wait table parameters
+  val WaitTableSize = 1024
+  val MemPredPCWidth = log2Up(WaitTableSize)
+  val LWTUse2BitCounter = true
+  // store set parameters
+  val SSITSize = WaitTableSize
+  val LFSTSize = 32
+  val SSIDWidth = log2Up(LFSTSize)
+  val LFSTWidth = 4
+  val StoreSetEnable = true // LWT will be disabled if SS is enabled
 
   val loadExuConfigs = coreParams.loadExuConfigs
   val storeExuConfigs = coreParams.storeExuConfigs
diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala
index d23bfdfbc..f0a346b6a 100644
--- a/src/main/scala/xiangshan/backend/CtrlBlock.scala
+++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala
@@ -5,7 +5,7 @@ import chisel3._
 import chisel3.util._
 import utils._
 import xiangshan._
-import xiangshan.backend.decode.{DecodeStage, ImmUnion, WaitTableParameters}
+import xiangshan.backend.decode.{DecodeStage, ImmUnion}
 import xiangshan.backend.rename.{BusyTable, Rename}
 import xiangshan.backend.dispatch.Dispatch
 import xiangshan.backend.exu._
@@ -38,13 +38,13 @@ class CtrlToFpBlockIO(implicit p: Parameters) extends XSBundle {
 class CtrlToLsBlockIO(implicit p: Parameters) extends XSBundle {
   val enqIqCtrl = Vec(exuParameters.LsExuCnt, DecoupledIO(new MicroOp))
   val enqLsq = Flipped(new LsqEnqIO)
-  val waitTableUpdate = Vec(StorePipelineWidth, Input(new WaitTableUpdateReq))
+  val memPredUpdate = Vec(StorePipelineWidth, Input(new MemPredUpdateReq))
   val redirect = ValidIO(new Redirect)
   val flush = Output(Bool())
 }
 
 class RedirectGenerator(implicit p: Parameters) extends XSModule
-  with HasCircularQueuePtrHelper with WaitTableParameters with HasFtqHelper {
+  with HasCircularQueuePtrHelper with HasFtqHelper {
   val numRedirect = exuParameters.JmpCnt + exuParameters.AluCnt
   val io = IO(new Bundle() {
     val exuMispredict = Vec(numRedirect, Flipped(ValidIO(new ExuOutput)))
@@ -54,7 +54,8 @@ class RedirectGenerator(implicit p: Parameters) extends XSModule
     val stage2FtqRead = new FtqRead
     val stage2Redirect = ValidIO(new Redirect)
    val stage3Redirect = ValidIO(new Redirect)
-    val waitTableUpdate = Output(new WaitTableUpdateReq)
+    val memPredUpdate = Output(new MemPredUpdateReq)
+    val memPredFtqRead = new FtqRead // read req sent from stage 2
   })
   /*
         LoadQueue  Jump  ALU0  ALU1  ALU2  ALU3  exception    Stage1
@@ -143,12 +144,26 @@ class RedirectGenerator(implicit p: Parameters) extends XSModule
     )
   )
 
-  // update waittable if load violation redirect triggered
-  io.waitTableUpdate.valid := RegNext(s1_isReplay && s1_redirect_valid_reg, init = false.B)
-  io.waitTableUpdate.waddr := RegNext(XORFold(real_pc(VAddrBits-1, 1), WaitTableAddrWidth))
-  io.waitTableUpdate.wdata := true.B
+  // get pc from ftq
+  io.memPredFtqRead.ptr := s1_redirect_bits_reg.stFtqIdx
+  // valid only if redirect is caused by load violation
+  // store_pc is used to update store set
+  val memPredFtqRead = io.memPredFtqRead.entry
+  val store_pc = GetPcByFtq(memPredFtqRead.ftqPC, RegNext(s1_redirect_bits_reg).stFtqOffset,
+    memPredFtqRead.lastPacketPC.valid,
+    memPredFtqRead.lastPacketPC.bits
+  )
+
+  // update load violation predictor if load violation redirect triggered
+  io.memPredUpdate.valid := RegNext(s1_isReplay && s1_redirect_valid_reg, init = false.B)
+  // update wait table
+  io.memPredUpdate.waddr := RegNext(XORFold(real_pc(VAddrBits-1, 1), MemPredPCWidth))
+  io.memPredUpdate.wdata := true.B
+  // update store set
+  io.memPredUpdate.ldpc := RegNext(XORFold(real_pc(VAddrBits-1, 1), MemPredPCWidth))
+  // store pc is ready 1 cycle after s1_isReplay is judged
+  io.memPredUpdate.stpc := XORFold(store_pc(VAddrBits-1, 1), MemPredPCWidth)
-
   io.stage2FtqRead.ptr := s1_redirect_bits_reg.ftqIdx
 
   val s2_br_mask = RegEnable(ftqRead.br_mask, enable = s1_redirect_valid_reg)
   val s2_sawNotTakenBranch = RegEnable(VecInit((0 until PredictWidth).map{ i =>
@@ -242,7 +257,8 @@ class CtrlBlock(implicit p: Parameters) extends XSModule
     init = false.B
   )
   loadReplay.bits := RegEnable(io.fromLsBlock.replay.bits, io.fromLsBlock.replay.valid)
-  VecInit(ftq.io.ftqRead.tail.dropRight(1)) <> redirectGen.io.stage1FtqRead
+  VecInit(ftq.io.ftqRead.tail.dropRight(2)) <> redirectGen.io.stage1FtqRead
+  ftq.io.ftqRead.dropRight(1).last <> redirectGen.io.memPredFtqRead
   ftq.io.cfiRead <> redirectGen.io.stage2FtqRead
   redirectGen.io.exuMispredict <> exuRedirect
   redirectGen.io.loadReplay <> loadReplay
@@ -288,10 +304,10 @@ class CtrlBlock(implicit p: Parameters) extends XSModule
   decode.io.in <> io.frontend.cfVec
 
   // currently, we only update wait table when isReplay
-  decode.io.waitTableUpdate(0) <> RegNext(redirectGen.io.waitTableUpdate)
-  decode.io.waitTableUpdate(1) := DontCare
-  decode.io.waitTableUpdate(1).valid := false.B
-  // decode.io.waitTableUpdate <> io.toLsBlock.waitTableUpdate
+  decode.io.memPredUpdate(0) <> RegNext(redirectGen.io.memPredUpdate)
+  decode.io.memPredUpdate(1) := DontCare
+  decode.io.memPredUpdate(1).valid := false.B
+  // decode.io.memPredUpdate <> io.toLsBlock.memPredUpdate
 
   decode.io.csrCtrl := RegNext(io.csrCtrl)
@@ -335,6 +351,8 @@ class CtrlBlock(implicit p: Parameters) extends XSModule
   dispatch.io.numExist <> io.fromIntBlock.numExist ++ io.fromFpBlock.numExist ++ io.fromLsBlock.numExist
   dispatch.io.enqIQCtrl <> io.toIntBlock.enqIqCtrl ++ io.toFpBlock.enqIqCtrl ++ io.toLsBlock.enqIqCtrl
   // dispatch.io.enqIQData <> io.toIntBlock.enqIqData ++ io.toFpBlock.enqIqData ++ io.toLsBlock.enqIqData
+  dispatch.io.csrCtrl <> io.csrCtrl
+  dispatch.io.storeIssue <> io.fromLsBlock.stIn
 
   fpBusyTable.io.flush := flushReg
diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala
index 2ffb0c463..d516bad4f 100644
--- a/src/main/scala/xiangshan/backend/MemBlock.scala
+++ b/src/main/scala/xiangshan/backend/MemBlock.scala
@@ -17,6 +17,7 @@ import xiangshan.backend.regfile.RfReadPort
 import utils._
 
 class LsBlockToCtrlIO(implicit p: Parameters) extends XSBundle {
+  val stIn = Vec(exuParameters.StuCnt, ValidIO(new ExuInput))
   val stOut = Vec(exuParameters.StuCnt, ValidIO(new ExuOutput))
   val numExist = Vec(exuParameters.LsExuCnt, Output(UInt(log2Ceil(IssQueSize).W)))
   val replay = ValidIO(new Redirect)
@@ -246,7 +247,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
 
     // update waittable
     // TODO: read pc
-    io.fromCtrlBlock.waitTableUpdate(i) := DontCare
+    io.fromCtrlBlock.memPredUpdate(i) := DontCare
     lsq.io.needReplayFromRS(i) <> loadUnits(i).io.lsq.needReplayFromRS
   }
@@ -265,6 +266,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     stu.io.stin <> rs.io.deq
     stu.io.lsq <> lsq.io.storeIn(i)
+    // Lsq to load unit's rs
+    rs.io.stIssuePtr := lsq.io.issuePtrExt
 
     // rs.io.storeData <> lsq.io.storeDataIn(i)
     lsq.io.storeDataIn(i) := rs.io.stData
@@ -272,6 +275,10 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     lsq.io.storeIssue(i).valid := rs.io.deq.valid
     lsq.io.storeIssue(i).bits := rs.io.deq.bits
 
+    // sync issue info to store set LFST
+    io.toCtrlBlock.stIn(i).valid := rs.io.deq.valid
+    io.toCtrlBlock.stIn(i).bits := rs.io.deq.bits
+
     io.toCtrlBlock.stOut(i).valid := stu.io.stout.valid
     io.toCtrlBlock.stOut(i).bits := stu.io.stout.bits
     stu.io.stout.ready := true.B
diff --git a/src/main/scala/xiangshan/backend/decode/DecodeStage.scala b/src/main/scala/xiangshan/backend/decode/DecodeStage.scala
index 661232c56..d399ce6ba 100644
--- a/src/main/scala/xiangshan/backend/decode/DecodeStage.scala
+++ b/src/main/scala/xiangshan/backend/decode/DecodeStage.scala
@@ -11,7 +11,7 @@ class DecodeStage(implicit p: Parameters) extends XSModule {
     // from Ibuffer
     val in = Vec(DecodeWidth, Flipped(DecoupledIO(new CtrlFlow)))
     // from memblock
-    val waitTableUpdate = Vec(StorePipelineWidth, Input(new WaitTableUpdateReq))
+    val memPredUpdate = Vec(StorePipelineWidth, Input(new MemPredUpdateReq))
     // to DecBuffer
     val out = Vec(DecodeWidth, DecoupledIO(new CfCtrl))
     // wait table ctrl
@@ -19,7 +19,12 @@ class DecodeStage(implicit p: Parameters) extends XSModule {
   })
 
   val decoders = Seq.fill(DecodeWidth)(Module(new DecodeUnit))
+
+  // basic wait table load violation predictor (for debug only)
   val waittable = Module(new WaitTable)
+  // store set load violation predictor stage 1: SSIT look up
+  val ssit = Module(new SSIT)
+
   for (i <- 0 until DecodeWidth) {
     decoders(i).io.enq.ctrl_flow <> io.in(i).bits
@@ -27,18 +32,26 @@ class DecodeStage(implicit p: Parameters) extends XSModule {
     waittable.io.raddr(i) := io.in(i).bits.foldpc
     decoders(i).io.enq.ctrl_flow.loadWaitBit := waittable.io.rdata(i)
 
+    // read SSIT, get SSID
+    ssit.io.raddr(i) := io.in(i).bits.foldpc
+    decoders(i).io.enq.ctrl_flow.storeSetHit := ssit.io.rdata(i).valid
+    decoders(i).io.enq.ctrl_flow.ssid := ssit.io.rdata(i).ssid
+
     io.out(i).valid := io.in(i).valid
     io.out(i).bits := decoders(i).io.deq.cf_ctrl
     io.in(i).ready := io.out(i).ready
   }
-
   for (i <- 0 until StorePipelineWidth) {
-    waittable.io.update(i) <> RegNext(io.waitTableUpdate(i))
+    waittable.io.update(i) <> RegNext(io.memPredUpdate(i))
   }
   waittable.io.csrCtrl <> io.csrCtrl
+  ssit.io.update <> RegNext(io.memPredUpdate(0))
+  ssit.io.csrCtrl <> io.csrCtrl
 
   val loadWaitBitSet = PopCount(io.out.map(o => o.fire() && o.bits.cf.loadWaitBit))
   XSPerfAccumulate("loadWaitBitSet", loadWaitBitSet)
+  val storeSetHit = PopCount(io.out.map(o => o.fire() && o.bits.cf.storeSetHit))
+  XSPerfAccumulate("storeset_ssit_hit", storeSetHit)
 
   val hasValid = VecInit(io.in.map(_.valid)).asUInt.orR
   XSPerfAccumulate("utilization", PopCount(io.in.map(_.valid)))
diff --git a/src/main/scala/xiangshan/backend/decode/StoreSet.scala b/src/main/scala/xiangshan/backend/decode/StoreSet.scala
new file mode 100644
index 000000000..9fc8adf52
--- /dev/null
+++ b/src/main/scala/xiangshan/backend/decode/StoreSet.scala
@@ -0,0 +1,224 @@
+package xiangshan.backend.decode
+
+import chipsalliance.rocketchip.config.Parameters
+import chisel3._
+import chisel3.util._
+import xiangshan._
+import utils._
+import xiangshan.mem.{LqPtr, SqPtr}
+import xiangshan.backend.roq.RoqPtr
+
+// store set load violation predictor
+// See "Memory Dependence Prediction using Store Sets" for details
+
+// Store Set Identifier Table Entry
+class SSITEntry(implicit p: Parameters) extends XSBundle {
+  val valid = Bool()
+  val isload = Bool()
+  val ssid = UInt(SSIDWidth.W) // store set identifier
+}
+
+// Store Set Identifier Table
+class SSIT(implicit p: Parameters) extends XSModule {
+  val io = IO(new Bundle {
+    val raddr = Vec(DecodeWidth, Input(UInt(MemPredPCWidth.W))) // xor hashed decode pc(VaddrBits-1, 1)
+    val rdata = Vec(DecodeWidth, Output(new SSITEntry))
+    val update = Input(new MemPredUpdateReq) // RegNext should be added outside
+    val csrCtrl = Input(new CustomCSRCtrlIO)
+  })
+
+  // TODO: use MemTemplate
+  val valid = RegInit(VecInit(Seq.fill(SSITSize)(false.B)))
+  val isload = Reg(Vec(SSITSize, Bool()))
+  val ssid = Reg(Vec(SSITSize, UInt(SSIDWidth.W)))
+
+  val resetCounter = RegInit(0.U(ResetTimeMax2Pow.W))
+  resetCounter := resetCounter + 1.U
+
+  // read SSIT in decode stage
+  for (i <- 0 until DecodeWidth) {
+    // io.rdata(i) := (data(io.raddr(i))(1) || io.csrCtrl.no_spec_load) && !io.csrCtrl.lvpred_disable
+    io.rdata(i).valid := valid(io.raddr(i))
+    io.rdata(i).isload := isload(io.raddr(i))
+    io.rdata(i).ssid := ssid(io.raddr(i))
+  }
+
+  // update SSIT if load violation redirect is detected
+
+  // update stage -1
+  // when io.update.valid, we should RegNext() it for at least 1 cycle
+  // outside of SSIT.
+
+  // update stage 0
+  // RegNext(io.update) while reading SSIT entry for necessary information
+  val memPredUpdateReqValid = RegNext(io.update.valid)
+  val memPredUpdateReqReg = RegEnable(io.update, enable = io.update.valid)
+  // load has already been assigned with a store set
+  val loadAssigned = RegNext(valid(io.update.ldpc))
+  val loadOldSSID = RegNext(ssid(io.update.ldpc))
+  // store has already been assigned with a store set
+  val storeAssigned = RegNext(valid(io.update.stpc))
+  val storeOldSSID = RegNext(ssid(io.update.stpc))
+  // both the load and the store have already been assigned store sets,
+  // the winner is the one whose store set ID is smaller
+  val winnerSSID = Mux(loadOldSSID < storeOldSSID, loadOldSSID, storeOldSSID)
+
+  // for now we just use lowest bits of ldpc as store set id
+  val ssidAllocate = memPredUpdateReqReg.ldpc(SSIDWidth-1, 0)
+
+  // update stage 1
+  when(memPredUpdateReqValid){
+    switch (Cat(loadAssigned, storeAssigned)) {
+      // 1. "If neither the load nor the store has been assigned a store set,
+      // one is allocated and assigned to both instructions."
+      is ("b00".U(2.W)) {
+        valid(memPredUpdateReqReg.ldpc) := true.B
+        isload(memPredUpdateReqReg.ldpc) := true.B
+        ssid(memPredUpdateReqReg.ldpc) := ssidAllocate
+        valid(memPredUpdateReqReg.stpc) := true.B
+        isload(memPredUpdateReqReg.stpc) := false.B
+        ssid(memPredUpdateReqReg.stpc) := ssidAllocate
+      }
+      // 2. "If the load has been assigned a store set, but the store has not,
+      // the store is assigned the load’s store set."
+      is ("b10".U(2.W)) {
+        valid(memPredUpdateReqReg.stpc) := true.B
+        isload(memPredUpdateReqReg.stpc) := false.B
+        ssid(memPredUpdateReqReg.stpc) := loadOldSSID
+      }
+      // 3. "If the store has been assigned a store set, but the load has not,
+      // the load is assigned the store’s store set."
+      is ("b01".U(2.W)) {
+        valid(memPredUpdateReqReg.ldpc) := true.B
+        isload(memPredUpdateReqReg.ldpc) := true.B
+        ssid(memPredUpdateReqReg.ldpc) := storeOldSSID
+      }
+      // 4. "If both the load and the store have already been assigned store sets,
+      // one of the two store sets is declared the "winner".
+      // The instruction belonging to the loser’s store set is assigned the winner’s store set."
+      is ("b11".U(2.W)) {
+        valid(memPredUpdateReqReg.ldpc) := true.B
+        isload(memPredUpdateReqReg.ldpc) := true.B
+        ssid(memPredUpdateReqReg.ldpc) := winnerSSID
+        valid(memPredUpdateReqReg.stpc) := true.B
+        isload(memPredUpdateReqReg.stpc) := false.B
+        ssid(memPredUpdateReqReg.stpc) := winnerSSID
+      }
+    }
+  }
+
+  XSPerfAccumulate("ssit_update_lxsx", memPredUpdateReqValid && !loadAssigned && !storeAssigned)
+  XSPerfAccumulate("ssit_update_lysx", memPredUpdateReqValid && loadAssigned && !storeAssigned)
+  XSPerfAccumulate("ssit_update_lxsy", memPredUpdateReqValid && !loadAssigned && storeAssigned)
+  XSPerfAccumulate("ssit_update_lysy", memPredUpdateReqValid && loadAssigned && storeAssigned)
+
+  // reset period: ResetTimeMax2Pow
+  when(resetCounter(ResetTimeMax2Pow-1, ResetTimeMin2Pow)(RegNext(io.csrCtrl.waittable_timeout))) {
+    for (j <- 0 until SSITSize) {
+      valid(j) := 0.U
+    }
+    resetCounter := 0.U
+  }
+
+  // debug
+  for (i <- 0 until StorePipelineWidth) {
+    when (memPredUpdateReqReg.valid) {
+      XSDebug("%d: SSIT update: load pc %x store pc %x\n", GTimer(), memPredUpdateReqReg.ldpc, memPredUpdateReqReg.stpc)
+      XSDebug("%d: SSIT update: load valid %b ssid %x store valid %b ssid %x\n", GTimer(), loadAssigned, loadOldSSID, storeAssigned, storeOldSSID)
+    }
+  }
+}
+
+
+// Last Fetched Store Table Entry
+class LFSTEntry(implicit p: Parameters) extends XSBundle {
+  val valid = Bool()
+  val sqIdx = new SqPtr
+  val roqIdx = new RoqPtr
+}
+
+class DispatchToLFST(implicit p: Parameters) extends XSBundle {
+  val sqIdx = new SqPtr
+  val roqIdx = new RoqPtr
+  val ssid = UInt(SSIDWidth.W)
+}
+
+class LookupLFST(implicit p: Parameters) extends XSBundle {
+  val raddr = Vec(DecodeWidth, Input(UInt(SSIDWidth.W))) // use ssid to lookup LFST
+  val ren = Vec(DecodeWidth, Input(Bool())) // ren iff uop.cf.storeSetHit
+  val rdata = Vec(DecodeWidth, Output(Bool()))
+}
+
+// Last Fetched Store Table
+class LFST(implicit p: Parameters) extends XSModule {
+  val io = IO(new Bundle {
+    val lookup = new LookupLFST
+    // val update = Input(new MemPredUpdateReq) // RegNext should be added outside
+    // when redirect, mark canceled store as invalid
+    val redirect = Input(Valid(new Redirect))
+    val flush = Input(Bool())
+    // when store is dispatched, mark it as valid
+    val dispatch = Vec(RenameWidth, Flipped(Valid(new DispatchToLFST)))
+    // when store issued, mark store as invalid
+    val storeIssue = Vec(exuParameters.StuCnt, Flipped(Valid(new ExuInput)))
+    val csrCtrl = Input(new CustomCSRCtrlIO)
+  })
+
+  // TODO: use MemTemplate
+  val validVec = RegInit(VecInit(Seq.fill(LFSTSize)(VecInit(Seq.fill(LFSTWidth)(false.B)))))
+  val sqIdxVec = Reg(Vec(LFSTSize, Vec(LFSTWidth, new SqPtr)))
+  val roqIdxVec = Reg(Vec(LFSTSize, Vec(LFSTWidth, new RoqPtr)))
+  val allocPtr = RegInit(VecInit(Seq.fill(LFSTSize)(0.U(log2Up(LFSTWidth).W))))
+  val valid = Wire(Vec(LFSTSize, Bool()))
+  (0 until LFSTSize).map(i => {
+    valid(i) := validVec(i).asUInt.orR
+  })
+
+  // read LFST in rename stage
+  for (i <- 0 until DecodeWidth) {
+    // If a store-load pair is in the same dispatch bundle, loadWaitBit should also be set for the load
+    val hitInDispatchBundle = if(i > 0){
+      (0 until i).map(j =>
+        io.dispatch(j).valid && io.dispatch(j).bits.ssid === io.lookup.raddr(i)
+      ).reduce(_||_)
+    } else {
+      false.B
+    }
+    // Check if store set is valid in LFST
+    io.lookup.rdata(i) := (
+      (valid(io.lookup.raddr(i)) || hitInDispatchBundle) && io.lookup.ren(i) ||
+      io.csrCtrl.no_spec_load // set loadWaitBit for all loads
+    ) && !io.csrCtrl.lvpred_disable
+  }
+
+  // when store is issued, mark it as invalid
+  (0 until exuParameters.StuCnt).map(i => {
+    // TODO: opt timing
+    (0 until LFSTWidth).map(j => {
+      when(io.storeIssue(i).valid && io.storeIssue(i).bits.uop.sqIdx.asUInt === sqIdxVec(io.storeIssue(i).bits.uop.cf.ssid)(j).asUInt){
+        validVec(io.storeIssue(i).bits.uop.cf.ssid)(j) := false.B
+      }
+    })
+  })
+
+  // when store is dispatched, mark it as valid
+  (0 until RenameWidth).map(i => {
+    when(io.dispatch(i).valid){
+      val waddr = io.dispatch(i).bits.ssid
+      val wptr = allocPtr(waddr)
+      allocPtr(waddr) := allocPtr(waddr) + 1.U
+      validVec(waddr)(wptr) := true.B
+      sqIdxVec(waddr)(wptr) := io.dispatch(i).bits.sqIdx
+      roqIdxVec(waddr)(wptr) := io.dispatch(i).bits.roqIdx
+    }
+  })
+
+  // when redirect, cancel the influenced stores
+  (0 until LFSTSize).map(i => {
+    (0 until LFSTWidth).map(j => {
+      when(roqIdxVec(i)(j).needFlush(io.redirect, io.flush)){
+        validVec(i)(j) := false.B
+      }
+    })
+  })
+}
\ No newline at end of file
diff --git a/src/main/scala/xiangshan/backend/decode/WaitTable.scala b/src/main/scala/xiangshan/backend/decode/WaitTable.scala
index 3ece16341..5a8f7374d 100644
--- a/src/main/scala/xiangshan/backend/decode/WaitTable.scala
+++ b/src/main/scala/xiangshan/backend/decode/WaitTable.scala
@@ -6,19 +6,12 @@ import chisel3.util._
 import xiangshan._
 import utils._
 
-trait WaitTableParameters {
-  val WaitTableSize = 1024
-  val WaitTableAddrWidth = log2Up(WaitTableSize)
-  val ResetTimeMax2Pow = 20 //1078576
-  val ResetTimeMin2Pow = 10 //1024
-}
-
 // 21264-like wait table
-class WaitTable(implicit p: Parameters) extends XSModule with WaitTableParameters {
+class WaitTable(implicit p: Parameters) extends XSModule {
   val io = IO(new Bundle {
-    val raddr = Vec(DecodeWidth, Input(UInt(WaitTableAddrWidth.W))) // decode pc(VaddrBits-1, 1)
+    val raddr = Vec(DecodeWidth, Input(UInt(MemPredPCWidth.W))) // decode pc(VaddrBits-1, 1)
     val rdata = Vec(DecodeWidth, Output(Bool())) // loadWaitBit
-    val update = Vec(StorePipelineWidth, Input(new WaitTableUpdateReq)) // RegNext should be added outside
+    val update = Vec(StorePipelineWidth, Input(new MemPredUpdateReq)) // RegNext should be added outside
     val csrCtrl = Input(new CustomCSRCtrlIO)
   })
 
@@ -28,7 +21,7 @@ class WaitTable(implicit p: Parameters) extends XSModule {
 
   // read ports
   for (i <- 0 until DecodeWidth) {
-    io.rdata(i) := (data(io.raddr(i))(1) || io.csrCtrl.no_spec_load) && !io.csrCtrl.lvpred_disable
+    io.rdata(i) := (data(io.raddr(i))(LWTUse2BitCounter.B.asUInt) || io.csrCtrl.no_spec_load) && !io.csrCtrl.lvpred_disable
   }
 
   // write ports (with priority)
diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala
index 75f8f19f3..5237293c5 100644
--- a/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala
+++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala
@@ -50,6 +50,9 @@ class Dispatch(implicit p: Parameters) extends XSModule {
       val fpIndex = Vec(exuParameters.FpExuCnt, Output(UInt(log2Ceil((NRFpReadPorts - exuParameters.StuCnt) / 3).W)))
       // ls: hardwired to (0, 1, 2, 4)
     }
+    val csrCtrl = Input(new CustomCSRCtrlIO)
+    // LFST state sync
+    val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput)))
     val ctrlInfo = new Bundle {
       val roqFull = Output(Bool())
       val intdqFull = Output(Bool())
@@ -80,6 +83,10 @@ class Dispatch(implicit p: Parameters) extends XSModule {
   dispatch1.io.toFpDq <> fpDq.io.enq
   dispatch1.io.toLsDq <> lsDq.io.enq
   dispatch1.io.allocPregs <> io.allocPregs
+  dispatch1.io.csrCtrl <> io.csrCtrl
+  dispatch1.io.storeIssue <> io.storeIssue
+  dispatch1.io.redirect <> io.redirect
+  dispatch1.io.flush <> io.flush
 
   // dispatch queue: queue uops and dispatch them to different reservation stations or issue queues
   // it may cancel the uops
diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala
index e1456333a..44d3fef9d 100644
--- a/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala
+++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala
@@ -9,6 +9,7 @@ import xiangshan.backend.roq.{RoqEnqIO, RoqPtr}
 import xiangshan.backend.rename.RenameBypassInfo
 import xiangshan.mem.LsqEnqIO
 import xiangshan.backend.fu.HasExceptionNO
+import xiangshan.backend.decode.{LFST, DispatchToLFST, LookupLFST}
 
 
 class PreDispatchInfo(implicit p: Parameters) extends XSBundle {
@@ -44,9 +45,30 @@ class Dispatch1(implicit p: Parameters) extends XSModule with HasExceptionNO {
       val needAlloc = Vec(RenameWidth, Output(Bool()))
       val req = Vec(RenameWidth, ValidIO(new MicroOp))
     }
+    // to store set LFST
+    val lfst = Vec(RenameWidth, Valid(new DispatchToLFST))
+    // flush or replay, for LFST
+    val redirect = Flipped(ValidIO(new Redirect))
+    val flush = Input(Bool())
+    // LFST ctrl
+    val csrCtrl = Input(new CustomCSRCtrlIO)
+    // LFST state sync
+    val storeIssue = Vec(StorePipelineWidth, Flipped(Valid(new ExuInput)))
   })
 
+  /**
+    * Store set LFST lookup
+    */
+  // store set LFST lookup may start from rename for better timing
+
+  val lfst = Module(new LFST)
+  lfst.io.redirect <> RegNext(io.redirect)
+  lfst.io.flush <> RegNext(io.flush)
+  lfst.io.storeIssue <> RegNext(io.storeIssue)
+  lfst.io.csrCtrl <> RegNext(io.csrCtrl)
+  lfst.io.dispatch := io.lfst
+
   /**
     * Part 1: choose the target dispatch queue and the corresponding write ports
     */
@@ -124,8 +146,54 @@ class Dispatch1(implicit p: Parameters) extends XSModule with HasExceptionNO {
     // XSError(io.fromRename(i).valid && updatedUop(i).roqIdx.asUInt =/= io.enqRoq.resp(i).asUInt, "they should equal")
     updatedUop(i).lqIdx := io.enqLsq.resp(i).lqIdx
     updatedUop(i).sqIdx := io.enqLsq.resp(i).sqIdx
+
+    // lookup store set LFST
+    lfst.io.lookup.raddr(i) := updatedUop(i).cf.ssid
+    lfst.io.lookup.ren(i) := updatedUop(i).cf.storeSetHit
+
+    // override load delay ctrl signal with store set result
+    if(StoreSetEnable) {
+      // updatedUop(i).cf.loadWaitBit := lfst.io.lookup.rdata(i) // classic store set
+      updatedUop(i).cf.loadWaitBit := lfst.io.lookup.rdata(i) && !isStore(i) // store set lite
+      // updatedUop(i).cf.loadWaitBit := lfst.io.lookup.rdata(i) && io.fromRename(i).bits.cf.loadWaitBit && !isStore(i) // 2-bit store set
+    } else {
+      updatedUop(i).cf.loadWaitBit := io.fromRename(i).bits.cf.loadWaitBit && !isStore(i) // wait table does not require store to be delayed
+    }
+
+    // update store set LFST
+    io.lfst(i).valid := io.fromRename(i).valid && updatedUop(i).cf.storeSetHit && isStore(i)
+    // or io.fromRename(i).ready && updatedUop(i).cf.storeSetHit && isStore(i), which is much slower
+    io.lfst(i).bits.roqIdx := updatedUop(i).roqIdx
+    io.lfst(i).bits.sqIdx := updatedUop(i).sqIdx
+    io.lfst(i).bits.ssid := updatedUop(i).cf.ssid
   }
 
+  // store set perf count
+  XSPerfAccumulate("waittable_load_wait", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && io.fromRename(i).bits.cf.loadWaitBit && !isStore(i) && isLs(i)
+  )))
+  XSPerfAccumulate("storeset_load_wait", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && updatedUop(i).cf.loadWaitBit && !isStore(i) && isLs(i)
+  )))
+  XSPerfAccumulate("storeset_store_wait", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && updatedUop(i).cf.loadWaitBit && isStore(i)
+  )))
+  XSPerfAccumulate("loadwait_diffmat_sywy", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && updatedUop(i).cf.loadWaitBit && io.fromRename(i).bits.cf.loadWaitBit &&
+    !isStore(i) && isLs(i)
+  )))
+  XSPerfAccumulate("loadwait_diffmat_sywx", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && updatedUop(i).cf.loadWaitBit && !io.fromRename(i).bits.cf.loadWaitBit &&
+    !isStore(i) && isLs(i)
+  )))
+  XSPerfAccumulate("loadwait_diffmat_sxwy", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && !updatedUop(i).cf.loadWaitBit && io.fromRename(i).bits.cf.loadWaitBit &&
+    !isStore(i) && isLs(i)
+  )))
+  XSPerfAccumulate("loadwait_diffmat_sxwx", PopCount((0 until RenameWidth).map(i =>
+    io.fromRename(i).fire() && !updatedUop(i).cf.loadWaitBit && !io.fromRename(i).bits.cf.loadWaitBit &&
+    !isStore(i) && isLs(i)
+  )))
+
   /**
     * Part 3:
diff --git a/src/main/scala/xiangshan/backend/ftq/Ftq.scala b/src/main/scala/xiangshan/backend/ftq/Ftq.scala
index 058c7e611..cd64d25d1 100644
--- a/src/main/scala/xiangshan/backend/ftq/Ftq.scala
+++ b/src/main/scala/xiangshan/backend/ftq/Ftq.scala
@@ -108,8 +108,8 @@ class Ftq(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelpe
   val frontendRedirect = Flipped(ValidIO(new Redirect))
   // exu write back, update info
   val exuWriteback = Vec(exuParameters.JmpCnt + exuParameters.AluCnt, Flipped(ValidIO(new ExuOutput)))
-  // pc read reqs (0: jump/auipc 1~6: mispredict/load replay 7: exceptions)
-  val ftqRead = Vec(1 + 6 + 1, Flipped(new FtqRead))
+  // pc read reqs (0: jump/auipc 1~6: mispredict/load replay 7: store pc for store set update 8: exceptions)
+  val ftqRead = Vec(1 + 6 + 1 + 1, Flipped(new FtqRead))
   val cfiRead = Flipped(new FtqRead)
   val bpuInfo = new Bundle {
     val bpRight = Output(UInt(XLEN.W))
@@ -131,7 +131,7 @@ class Ftq(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelpe
 
   val real_fire = io.enq.fire() && !stage2Flush && !stage3Flush
 
-  val ftq_pc_mem = Module(new SyncDataModuleTemplate(new Ftq_4R_SRAMEntry, FtqSize, 9, 1))
+  val ftq_pc_mem = Module(new SyncDataModuleTemplate(new Ftq_4R_SRAMEntry, FtqSize, 10, 1))
   ftq_pc_mem.io.wen(0) := real_fire
   ftq_pc_mem.io.waddr(0) := tailPtr.value
   ftq_pc_mem.io.wdata(0).ftqPC := io.enq.bits.ftqPC
diff --git a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala
index 282ad1355..c387d7dba 100644
--- a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala
+++ b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala
@@ -105,7 +105,7 @@ class ReservationStation
     val stData = if (exuCfg == StExeUnitCfg) ValidIO(new StoreDataBundle) else null
     val srcRegValue = Input(Vec(srcNum, UInt(srcLen.W)))
 
-    val stIssuePtr = if (exuCfg == LdExeUnitCfg) Input(new SqPtr()) else null
+    val stIssuePtr = if (exuCfg == LdExeUnitCfg || exuCfg == StExeUnitCfg) Input(new SqPtr()) else null
     val fpRegValue = if (exuCfg == StExeUnitCfg) Input(UInt(srcLen.W)) else null
     val jumpPc = if(exuCfg == JumpExeUnitCfg) Input(UInt(VAddrBits.W)) else null
@@ -165,7 +165,7 @@ class ReservationStation
     c.valid := i.valid
     c.bits := i.bits.uop
   }
-  if (exuCfg == LdExeUnitCfg) {
+  if (exuCfg == LdExeUnitCfg || exuCfg == StExeUnitCfg) {
     ctrl.io.stIssuePtr := RegNext(io.stIssuePtr)
   }
   if (exuCfg == StExeUnitCfg) {
@@ -541,7 +541,7 @@ class ReservationStationCtrl
     val listen = Output(Vec(srcNum, Vec(iqSize, Vec(fastPortsCnt + slowPortsCnt, Bool()))))
     val enqSrcReady = Output(Vec(srcNum, Bool()))
 
-    val stIssuePtr = if (exuCfg == LdExeUnitCfg) Input(new SqPtr()) else null
+    val stIssuePtr = if (exuCfg == LdExeUnitCfg || exuCfg == StExeUnitCfg) Input(new SqPtr()) else null
   })
 
   val selValid = io.sel.valid
@@ -619,6 +619,7 @@ class ReservationStationCtrl
   if (exuCfg == LdExeUnitCfg) {
     val ldWait = Reg(Vec(iqSize, Bool()))
     val sqIdx = Reg(Vec(iqSize, new SqPtr()))
+    val ldWaitUpdated = WireInit(ldWait)
     ldWait.zip(sqIdx).map{ case (lw, sq) =>
       when (!isAfter(sq, io.stIssuePtr)) {
         lw := true.B
@@ -626,11 +627,12 @@ class ReservationStationCtrl
       }
     }
     when (enqEn) {
       ldWait(enqPtr) := !enqUop.cf.loadWaitBit
+      ldWaitUpdated(enqPtr) := !enqUop.cf.loadWaitBit
       sqIdx(enqPtr) := enqUop.sqIdx
     }
     ldWait.suggestName(s"${this.name}_ldWait")
     sqIdx.suggestName(s"${this.name}_sqIdx")
-    io.readyVec := srcQueueWire.map(Cat(_).andR).zip(ldWait).map{ case (s, l) => s&l }
+    io.readyVec := srcQueueWire.map(Cat(_).andR).zip(ldWaitUpdated).map{ case (s, l) => s&l }
   }
 
   val redirectHit = io.redirectVec(selPtr)
diff --git a/src/main/scala/xiangshan/frontend/IFU.scala b/src/main/scala/xiangshan/frontend/IFU.scala
index e448b2bfb..4e440851f 100644
--- a/src/main/scala/xiangshan/frontend/IFU.scala
+++ b/src/main/scala/xiangshan/frontend/IFU.scala
@@ -9,7 +9,6 @@ import xiangshan.cache._
 import chisel3.experimental.chiselName
 import freechips.rocketchip.tile.HasLazyRoCC
 import xiangshan.backend.ftq.FtqPtr
-import xiangshan.backend.decode.WaitTableParameters
 import system.L1CacheErrorInfo
 
 trait HasInstrMMIOConst extends HasXSParameter with HasIFUConst{
@@ -99,7 +98,7 @@ class PrevHalfInstr(implicit p: Parameters) extends XSBundle {
 }
 
 @chiselName
-class IFU(implicit p: Parameters) extends XSModule with HasIFUConst with HasCircularQueuePtrHelper with WaitTableParameters
+class IFU(implicit p: Parameters) extends XSModule with HasIFUConst with HasCircularQueuePtrHelper
 {
   val io = IO(new IFUIO)
   val bpu = BPU(EnableBPU)
@@ -518,7 +517,7 @@ class IFU(implicit p: Parameters) extends XSModule with HasIFUConst with HasCirc
 
   fetchPacketWire.instrs := expandedInstrs
   fetchPacketWire.pc := if4_pd.pc
-  fetchPacketWire.foldpc := if4_pd.pc.map(i => XORFold(i(VAddrBits-1,1), WaitTableAddrWidth))
+  fetchPacketWire.foldpc := if4_pd.pc.map(i => XORFold(i(VAddrBits-1,1), MemPredPCWidth))
 
   fetchPacketWire.pdmask := if4_pd.mask
   fetchPacketWire.pd := if4_pd.pd
diff --git a/src/main/scala/xiangshan/frontend/Ibuffer.scala b/src/main/scala/xiangshan/frontend/Ibuffer.scala
index 23a8803fa..1b1f91baa 100644
--- a/src/main/scala/xiangshan/frontend/Ibuffer.scala
+++ b/src/main/scala/xiangshan/frontend/Ibuffer.scala
@@ -6,7 +6,6 @@ import chisel3.util._
 import xiangshan._
 import utils._
 import xiangshan.backend.ftq.FtqPtr
-import xiangshan.backend.decode.WaitTableParameters
 
 class IbufPtr(implicit p: Parameters) extends CircularQueuePtr[IbufPtr](
   p => p(XSCoreParamsKey).IBufSize
@@ -24,10 +23,10 @@ class IBufferIO(implicit p: Parameters) extends XSBundle {
 class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
   val io = IO(new IBufferIO)
 
-  class IBufEntry(implicit p: Parameters) extends XSBundle with WaitTableParameters {
+  class IBufEntry(implicit p: Parameters) extends XSBundle {
     val inst = UInt(32.W)
     val pc = UInt(VAddrBits.W)
-    val foldpc = UInt(WaitTableAddrWidth.W)
+    val foldpc = UInt(MemPredPCWidth.W)
     val pd = new PreDecodeInfo
     val ipf = Bool()
     val acf = Bool()
@@ -125,6 +124,8 @@ class Ibuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrH
     io.out(i).bits.crossPageIPFFix := outWire.crossPageIPFFix
     io.out(i).bits.foldpc := outWire.foldpc
     io.out(i).bits.loadWaitBit := DontCare
+    io.out(i).bits.storeSetHit := DontCare
+    io.out(i).bits.ssid := DontCare
   }
   val next_head_vec = VecInit(head_vec.map(_ + numDeq))
   ibuf.io.raddr := VecInit(next_head_vec.map(_.value))
diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala
index 4cd08f438..a5004b076 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala
@@ -11,6 +11,7 @@ import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants, TlbReques
 import xiangshan.mem._
 import xiangshan.backend.roq.RoqLsqIO
 import xiangshan.backend.fu.HasExceptionNO
+import xiangshan.backend.ftq.FtqPtr
 
 
 class LqPtr(implicit p: Parameters) extends CircularQueuePtr[LqPtr](
@@ -455,12 +456,12 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     ((lqViolation, lqViolationUop), (wbViolation, wbViolationUop), (l1Violation, l1ViolationUop))
   }
 
-  def rollbackSel(a: Valid[MicroOp], b: Valid[MicroOp]): ValidIO[MicroOp] = {
+  def rollbackSel(a: Valid[MicroOpRbExt], b: Valid[MicroOpRbExt]): ValidIO[MicroOpRbExt] = {
     Mux(
       a.valid,
       Mux(
         b.valid,
-        Mux(isAfter(a.bits.roqIdx, b.bits.roqIdx), b, a), // a,b both valid, sel oldest
+        Mux(isAfter(a.bits.uop.roqIdx, b.bits.uop.roqIdx), b, a), // a,b both valid, sel oldest
         a // sel a
       ),
       b // sel b
@@ -474,21 +475,29 @@ class LoadQueue(implicit p: Parameters) extends XSModule
   // S2: select rollback (part1) and generate rollback request
   // rollback check
   // Wb/L1 rollback seq check is done in s2
-  val rollbackWb = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
-  val rollbackL1 = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
-  val rollbackL1Wb = Wire(Vec(StorePipelineWidth*2, Valid(new MicroOp)))
+  val rollbackWb = Wire(Vec(StorePipelineWidth, Valid(new MicroOpRbExt)))
+  val rollbackL1 = Wire(Vec(StorePipelineWidth, Valid(new MicroOpRbExt)))
+  val rollbackL1Wb = Wire(Vec(StorePipelineWidth*2, Valid(new MicroOpRbExt)))
   // Lq rollback seq check is done in s3 (next stage), as getting rollbackLq MicroOp is slow
-  val rollbackLq = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
+  val rollbackLq = Wire(Vec(StorePipelineWidth, Valid(new MicroOpRbExt)))
+  // store ftq index for store set update
+  val stFtqIdxS2 = Wire(Vec(StorePipelineWidth, new FtqPtr))
+  val stFtqOffsetS2 = Wire(Vec(StorePipelineWidth, UInt(log2Up(PredictWidth).W)))
   for (i <- 0 until StorePipelineWidth) {
     val detectedRollback = detectRollback(i)
     rollbackLq(i).valid := detectedRollback._1._1 && RegNext(io.storeIn(i).valid)
-    rollbackLq(i).bits := detectedRollback._1._2
+    rollbackLq(i).bits.uop := detectedRollback._1._2
+    rollbackLq(i).bits.flag := i.U
     rollbackWb(i).valid := detectedRollback._2._1 && RegNext(io.storeIn(i).valid)
-    rollbackWb(i).bits := detectedRollback._2._2
+    rollbackWb(i).bits.uop := detectedRollback._2._2
+    rollbackWb(i).bits.flag := i.U
     rollbackL1(i).valid := detectedRollback._3._1 && RegNext(io.storeIn(i).valid)
-    rollbackL1(i).bits := detectedRollback._3._2
+    rollbackL1(i).bits.uop := detectedRollback._3._2
+    rollbackL1(i).bits.flag := i.U
     rollbackL1Wb(2*i) := rollbackL1(i)
     rollbackL1Wb(2*i+1) := rollbackWb(i)
+    stFtqIdxS2(i) := RegNext(io.storeIn(i).bits.uop.cf.ftqPtr)
+    stFtqOffsetS2(i) := RegNext(io.storeIn(i).bits.uop.cf.ftqOffset)
   }
 
   val rollbackL1WbSelected = ParallelOperation(rollbackL1Wb, rollbackSel)
@@ -505,18 +514,23 @@ class LoadQueue(implicit p: Parameters) extends XSModule
 
   // FIXME: this is ugly
   val rollbackValidVec = Seq(rollbackL1WbVReg, rollbackLq0VReg, rollbackLq1VReg)
-  val rollbackUopVec = Seq(rollbackL1WbReg, rollbackLq0Reg, rollbackLq1Reg)
+  val rollbackUopExtVec = Seq(rollbackL1WbReg, rollbackLq0Reg, rollbackLq1Reg)
 
   // select uop in parallel
-  val mask = getAfterMask(rollbackValidVec, rollbackUopVec)
+  val mask = getAfterMask(rollbackValidVec, rollbackUopExtVec.map(i => i.uop))
   val oneAfterZero = mask(1)(0)
-  val rollbackUop = Mux(oneAfterZero && mask(2)(0),
-    rollbackUopVec(0),
-    Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2)))
+  val rollbackUopExt = Mux(oneAfterZero && mask(2)(0),
+    rollbackUopExtVec(0),
+    Mux(!oneAfterZero && mask(2)(1), rollbackUopExtVec(1), rollbackUopExtVec(2)))
+  val stFtqIdxS3 = RegNext(stFtqIdxS2)
+  val stFtqOffsetS3 = RegNext(stFtqOffsetS2)
+  val rollbackUop = rollbackUopExt.uop
+  val rollbackStFtqIdx = stFtqIdxS3(rollbackUopExt.flag)
+  val rollbackStFtqOffset = stFtqOffsetS3(rollbackUopExt.flag)
 
   // check if rollback request is still valid in parallel
   val rollbackValidVecChecked = Wire(Vec(3, Bool()))
-  for(((v, uop), idx) <- rollbackValidVec.zip(rollbackUopVec).zipWithIndex) {
+  for(((v, uop), idx) <- rollbackValidVec.zip(rollbackUopExtVec.map(i => i.uop)).zipWithIndex) {
     rollbackValidVecChecked(idx) := v &&
       (!lastCycleRedirect.valid || isBefore(uop.roqIdx, lastCycleRedirect.bits.roqIdx)) &&
       (!lastlastCycleRedirect.valid || isBefore(uop.roqIdx, lastlastCycleRedirect.bits.roqIdx))
@@ -524,7 +538,9 @@ class LoadQueue(implicit p: Parameters) extends XSModule
 
   io.rollback.bits.roqIdx := rollbackUop.roqIdx
   io.rollback.bits.ftqIdx := rollbackUop.cf.ftqPtr
+  io.rollback.bits.stFtqIdx := rollbackStFtqIdx
   io.rollback.bits.ftqOffset := rollbackUop.cf.ftqOffset
+  io.rollback.bits.stFtqOffset := rollbackStFtqOffset
   io.rollback.bits.level := RedirectLevel.flush
   io.rollback.bits.interrupt := DontCare
   io.rollback.bits.cfiUpdate := DontCare
-- GitLab
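
For reference, the following minimal Scala sketch models the store-set policy
that the SSIT and LFST above implement in hardware. It is illustrative only
and not part of the patch: all names are assumptions chosen to mirror the
patch, pcs are taken to be already XOR-folded to table indices, and each set
tracks a single in-flight store, whereas the real LFST keeps LFSTWidth = 4
slots per set.

object StoreSetSketch {
  val SSITSize = 1024
  val LFSTSize = 32

  // SSIT: folded pc -> store set id (None = no set assigned yet)
  private val ssit = Array.fill[Option[Int]](SSITSize)(None)
  // LFST: ssid -> tag of the last dispatched, not-yet-issued store
  private val lfst = Array.fill[Option[Long]](LFSTSize)(None)

  // On a load violation (the load at ldpc executed before the store at
  // stpc), apply the four assignment rules quoted from the Store Sets paper.
  def updateSSIT(ldpc: Int, stpc: Int): Unit = (ssit(ldpc), ssit(stpc)) match {
    case (None, None) => // 1. allocate one set and assign it to both
      val ssid = ldpc % LFSTSize // the patch likewise uses ldpc's low bits
      ssit(ldpc) = Some(ssid); ssit(stpc) = Some(ssid)
    case (Some(l), None) => ssit(stpc) = Some(l) // 2. store joins load's set
    case (None, Some(s)) => ssit(ldpc) = Some(s) // 3. load joins store's set
    case (Some(l), Some(s)) => // 4. merge: the smaller ssid is the winner
      val winner = l min s
      ssit(ldpc) = Some(winner); ssit(stpc) = Some(winner)
  }

  // Dispatch: a store deposits its tag in its set's LFST entry.
  def dispatchStore(stpc: Int, tag: Long): Unit =
    ssit(stpc).foreach(ssid => lfst(ssid) = Some(tag))

  // Dispatch: a load must wait iff its set still has an in-flight store.
  def loadMustWait(ldpc: Int): Boolean =
    ssit(ldpc).exists(ssid => lfst(ssid).isDefined)

  // Issue: the store clears its entry, releasing the waiting loads.
  def issueStore(stpc: Int): Unit =
    ssit(stpc).foreach(ssid => lfst(ssid) = None)
}

A violating pair first trains updateSSIT (the RedirectGenerator path above);
on the next fetch of the same code, loadMustWait forces the load to wait
until issueStore clears the entry, which mirrors how loadWaitBit is set in
Dispatch1 and cleared by the storeIssue port of the LFST.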