diff --git a/debug/Makefile b/debug/Makefile index d575b3764b725e5838b2b48bc56b900acc9f5970..7ebc4b29610f0bbd5436c47b806df655d337b333 100644 --- a/debug/Makefile +++ b/debug/Makefile @@ -3,7 +3,7 @@ NANOS_HOME ?= $(AM_HOME)/../nanos-lite SINGLETEST = ALL=min3 B ?= 0 -E ?= -1 +E ?= 0 V ?= OFF #V ?= OFF EMU_ARGS = B=$(B) E=$(E) V=$(V) @@ -18,7 +18,7 @@ cache: #2>&1 | tee > loader.log cpu: - $(MAKE) -C $(AM_HOME)/tests/cputest $(ARCH) ALL=dummy $(EMU_ARGS) run 2>&1 | tee > dummy.log + $(MAKE) -C $(AM_HOME)/tests/cputest $(ARCH) ALL=dummy $(EMU_ARGS) run 2>&1 # ------------------------------------------------------------------ # run different test sets diff --git a/src/main/scala/utils/CircularQueuePtr.scala b/src/main/scala/utils/CircularQueuePtr.scala index b0921c1454dfbfa32252a8ebee75c6578b88ec9d..4822307c8271172a18561d5297b543d57313d55f 100644 --- a/src/main/scala/utils/CircularQueuePtr.scala +++ b/src/main/scala/utils/CircularQueuePtr.scala @@ -68,4 +68,8 @@ trait HasCircularQueuePtrHelper { def isAfter[T <: CircularQueuePtr](left: T, right: T): Bool = { Mux(left.flag === right.flag, left.value > right.value, left.value < right.value) } + + def isBefore[T <: CircularQueuePtr](left: T, right: T): Bool = { + Mux(left.flag === right.flag, left.value < right.value, left.value > right.value) + } } diff --git a/src/main/scala/xiangshan/PMA.scala b/src/main/scala/xiangshan/PMA.scala index 93ec79d27130d18c5edd5b9e1180a08663c15a94..41541cbb4fd4d9032e1e687b247d6e5a5bda2b94 100644 --- a/src/main/scala/xiangshan/PMA.scala +++ b/src/main/scala/xiangshan/PMA.scala @@ -17,7 +17,25 @@ object MemMap { } object AddressSpace { - def MemMapList = List( + def SimpleMemMapList = List( + // Base address Top address Width Description Mode (RWXIDSAC) + MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved", ""), + MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"), + MemMap("h00_2000_0000", "h00_2FFF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3000_0000", "h00_3000_FFFF", "h0", "DMA", "RW"), + MemMap("h00_3001_0000", "h00_3004_FFFF", "h0", "GPU", "RWC"), + MemMap("h00_3005_0000", "h00_3006_FFFF", "h0", "USB/SDMMC", "RW"), + MemMap("h00_3007_0000", "h00_30FF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3100_0000", "h00_3111_FFFF", "h0", "MMIO", "RW"), + MemMap("h00_3112_0000", "h00_37FF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3800_0000", "h00_3800_FFFF", "h0", "CLINT", "RW"), + MemMap("h00_3801_0000", "h00_3BFF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3C00_0000", "h00_3FFF_FFFF", "h0", "PLIC", "RW"), + MemMap("h00_4000_0000", "h00_7FFF_FFFF", "h0", "PCIe", "RW"), + MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR", "RWXIDSA"), + ) + + def FullMemMapList = List( // Base address Top address Width Description Mode (RWXIDSAC) MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved", ""), MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"), @@ -55,16 +73,42 @@ object AddressSpace { MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR", "RWXIDSA"), ) + def MemMapList = SimpleMemMapList + def printMemmap(){ println("-------------------- memory map --------------------") for(i <- MemMapList){ - println(i._1._1 + "->" + i._1._2 + " width " + (if(i._2.get("width").get == "0") "unlimited" else i._2.get("width").get) + " " + i._2.get("description").get + " [" + i._2.get("mode").get + "]") + println("[" + i._1._1 + " -> " + i._1._2 + "] Width:" + (if(i._2.get("width").get == "h0") "unlimited" else i._2.get("width").get) + " Description:" + i._2.get("description").get + " [" + 
i._2.get("mode").get + "]") } println("----------------------------------------------------") } + def checkMemmap(){ + for(i <- MemMapList){ + // pma mode check + val s = i._2.get("mode").get + if( + s.toUpperCase.indexOf("A") >= 0 && + !(s.toUpperCase.indexOf("R") >= 0 && s.toUpperCase.indexOf("W") >= 0) + ){ + println("[error] pma atomicable area must be both readable and writeable") + throw new IllegalArgumentException + } + // pma area size check + if(!i._1._1.endsWith("000") || !i._1._2.endsWith("FFF")){ + println("[error] pma area must be larger than 4KB") + throw new IllegalArgumentException() + } + } + } + def genMemmapMatchVec(addr: UInt): UInt = { VecInit(MemMapList.map(i => { + // calculate addr tag and compare mask + // val mask = i._1._2.U - i._1._1.U + // (~(i._1._1.U ^ addr) | mask).andR + + // pma is not current critical path, use simple compare for now i._1._1.U <= addr && addr < i._1._2.U }).toSeq).asUInt } @@ -75,6 +119,30 @@ object AddressSpace { }).toSeq)) } + // TODO: FIXME + def queryModeFast(matchVec: UInt): UInt = { + var r = WireInit(false.B) + var w = WireInit(false.B) + var x = WireInit(false.B) + var i = WireInit(false.B) + var d = WireInit(false.B) + var s = WireInit(false.B) + var a = WireInit(false.B) + var c = WireInit(false.B) + for((j, idx) <- MemMapList.zipWithIndex){ + val modes = j._2.get("mode").get + if (modes.toUpperCase.indexOf("R") >= 0) r = r || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("W") >= 0) w = w || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("X") >= 0) x = x || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("I") >= 0) i = i || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("D") >= 0) d = d || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("S") >= 0) s = s || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("A") >= 0) a = a || matchVec(idx).asBool + if (modes.toUpperCase.indexOf("C") >= 0) c = c || matchVec(idx).asBool + } + VecInit(Seq(r, w, x, i, d, s, a, c)).asUInt + } + def queryWidth(matchVec: UInt): UInt = { Mux1H(matchVec, VecInit(MemMapList.map(i => { i._2.get("width").get.U @@ -83,7 +151,11 @@ object AddressSpace { def memmapAddrMatch(addr: UInt): (UInt, UInt) = { val matchVec = genMemmapMatchVec(addr) - (queryMode(matchVec), queryWidth(matchVec)) + // when(queryMode(matchVec) =/= queryModeFast(matchVec)){ + // printf("pma fail: right %b wrong %b\n", queryMode(matchVec), queryModeFast(matchVec)) + // } + assert(queryMode(matchVec) === queryModeFast(matchVec)) + (queryModeFast(matchVec), queryWidth(matchVec)) } def isDMMIO(addr: UInt): Bool = !PMAMode.dcache(memmapAddrMatch(addr)._1) diff --git a/src/main/scala/xiangshan/XSCore.scala b/src/main/scala/xiangshan/XSCore.scala index c3be93fb4121c754f6556dfa491eea0244af29ed..8ebb3ea880726a8cd11b9e219b534ec52b93e960 100644 --- a/src/main/scala/xiangshan/XSCore.scala +++ b/src/main/scala/xiangshan/XSCore.scala @@ -377,6 +377,7 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) trapIO <> DontCare println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}") + AddressSpace.checkMemmap() AddressSpace.printMemmap() // to fast wake up fp, mem rs diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index a5e7ca0313ca97b301f2ab99d30ce87017528560..e41fdc4c3c3d1564c8b136bc98606425ada3bcc5 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -239,6 +239,7 @@ class MemBlockImp lsq.io.loadIn(i) <> 
loadUnits(i).io.lsq.loadIn lsq.io.ldout(i) <> loadUnits(i).io.lsq.ldout lsq.io.loadDataForwarded(i) <> loadUnits(i).io.lsq.loadDataForwarded + lsq.io.needReplayFromRS(i) <> loadUnits(i).io.lsq.needReplayFromRS } // StoreUnit @@ -274,8 +275,11 @@ class MemBlockImp lsq.io.brqRedirect <> io.fromCtrlBlock.redirect lsq.io.flush <> io.fromCtrlBlock.flush io.toCtrlBlock.replay <> lsq.io.rollback - lsq.io.dcache <> dcache.io.lsu.lsq lsq.io.uncache <> uncache.io.lsq + // delay dcache refill for 1 cycle for better timing + // TODO: remove RegNext after fixing refill paddr timing + // lsq.io.dcache <> dcache.io.lsu.lsq + lsq.io.dcache := RegNext(dcache.io.lsu.lsq) // LSQ to store buffer lsq.io.sbuffer <> sbuffer.io.in @@ -283,6 +287,9 @@ class MemBlockImp // Sbuffer sbuffer.io.dcache <> dcache.io.lsu.store + sbuffer.io.dcache.resp.valid := RegNext(dcache.io.lsu.store.resp.valid) + sbuffer.io.dcache.resp.bits := RegNext(dcache.io.lsu.store.resp.bits) + assert(sbuffer.io.dcache.resp.ready === true.B) // flush sbuffer val fenceFlush = io.fenceToSbuffer.flushSb diff --git a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala index 0f4b2f7b2a2359e51630fb8e71634e1fada0aea4..9d4a10f0128ce86b13c4f28006ed5539e7b4aef7 100644 --- a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala +++ b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala @@ -300,7 +300,9 @@ class ReservationStationSelect if (feedback) { when (io.memfeedback.valid) { - stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay) + when (stateQueue(io.memfeedback.bits.rsIdx) === s_wait) { + stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay) + } when (!io.memfeedback.bits.hit) { countQueue(io.memfeedback.bits.rsIdx) := replayDelay(cntCountQueue(io.memfeedback.bits.rsIdx)) } diff --git a/src/main/scala/xiangshan/cache/TLB.scala b/src/main/scala/xiangshan/cache/TLB.scala index ff9cc8f5a5935b0e7d32b2a374268de34f316411..e01d615e5728249f1228cac8a2cd750152b8e7bd 100644 --- a/src/main/scala/xiangshan/cache/TLB.scala +++ b/src/main/scala/xiangshan/cache/TLB.scala @@ -71,6 +71,7 @@ class PtePermBundle extends TlbBundle { class TlbPermBundle extends TlbBundle { val pf = Bool() // NOTE: if this is true, just raise pf + // pagetable perm (software defined) val d = Bool() val a = Bool() val g = Bool() @@ -78,13 +79,14 @@ class TlbPermBundle extends TlbBundle { val x = Bool() val w = Bool() val r = Bool() + // pma perm (hardwired) + val pr = Bool() //readable + val pw = Bool() //writeable + val pe = Bool() //executable + val pa = Bool() //atom op permitted + val pi = Bool() //icacheable + val pd = Bool() //dcacheable - // pma perm check - // val at = Bool() // Access Type - // val as = Bool() // Atomic Swap - // val al = Bool() // Atomic Logical - // val aa = Bool() // Atomic Arithmetic - // TODO: add pma check override def toPrintable: Printable = { p"pf:${pf} d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}" } @@ -172,6 +174,8 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle { this.tag := vpn this.level.map(_ := level(0)) this.data.ppn := ppn + + // refill pagetable perm val ptePerm = perm.asTypeOf(new PtePermBundle) this.data.perm.pf:= pf this.data.perm.d := ptePerm.d @@ -182,6 +186,15 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle { this.data.perm.w := ptePerm.w this.data.perm.r := ptePerm.r + // get pma perm + val (pmaMode, accessWidth) = 
AddressSpace.memmapAddrMatch(Cat(ppn, 0.U(12.W))) + this.data.perm.pr := PMAMode.read(pmaMode) + this.data.perm.pw := PMAMode.write(pmaMode) + this.data.perm.pe := PMAMode.execute(pmaMode) + this.data.perm.pa := PMAMode.atomic(pmaMode) + this.data.perm.pi := PMAMode.icache(pmaMode) + this.data.perm.pd := PMAMode.dcache(pmaMode) + this } @@ -421,11 +434,22 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ resp(i).bits.excp.pf.st := stPf || update resp(i).bits.excp.pf.instr := instrPf || update + // if vmenable, use pre-calcuated pma check result + resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !perm.pi, !perm.pd) + resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !perm.pa, !perm.pr) && TlbCmd.isRead(cmdReg) + resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !perm.pa, !perm.pw) && TlbCmd.isWrite(cmdReg) + resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !perm.pe) + + // if !vmenable, check pma val (pmaMode, accessWidth) = AddressSpace.memmapAddrMatch(resp(i).bits.paddr) - resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode)) - resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg) - resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg) - resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode)) + when(!vmEnable){ + resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode)) + resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg) + resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg) + resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode)) + } + + // TODO: MMIO check (hit, miss, pfHitVec, multiHit) } diff --git a/src/main/scala/xiangshan/mem/MaskedDataModule.scala b/src/main/scala/xiangshan/mem/MaskedDataModule.scala new file mode 100644 index 0000000000000000000000000000000000000000..25ee80ca554da880f28321a997db528a36cb71ea --- /dev/null +++ b/src/main/scala/xiangshan/mem/MaskedDataModule.scala @@ -0,0 +1,61 @@ +package xiangshan.mem + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import utils._ +import xiangshan.cache._ + +class MaskedSyncDataModuleTemplate[T <: Data](gen: T, numEntries: Int, numRead: Int, numWrite: Int, numMRead: Int = 0, numMWrite: Int = 0) extends XSModule with HasDCacheParameters { + val io = IO(new Bundle { + // address indexed sync read + val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W))) + val rdata = Output(Vec(numRead, gen)) + // masked sync read (1H) + val mrmask = Input(Vec(numMRead, Vec(numEntries, Bool()))) + val mrdata = Output(Vec(numMRead, gen)) + // address indexed write + val wen = Input(Vec(numWrite, Bool())) + val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W))) + val wdata = Input(Vec(numWrite, gen)) + // masked write + val mwmask = Input(Vec(numMWrite, Vec(numEntries, Bool()))) + val mwdata = Input(Vec(numMWrite, gen)) + }) + + val data = Reg(Vec(numEntries, gen)) + + // read ports + for (i <- 0 until numRead) { + io.rdata(i) := data(RegNext(io.raddr(i))) + } + + // masked read ports + for (i <- 0 until numMRead) { + io.mrdata(i) := Mux1H(RegNext(io.mrmask(i)), data) + } + + // write ports (with priorities) + 
for (i <- 0 until numWrite) { + when (io.wen(i)) { + data(io.waddr(i)) := io.wdata(i) + } + } + + // masked write + for (j <- 0 until numEntries) { + val wen = VecInit((0 until numMWrite).map(i => io.mwmask(i)(j))).asUInt.orR + when (wen) { + data(j) := VecInit((0 until numMWrite).map(i => { + Mux(io.mwmask(i)(j), io.mwdata(i), 0.U).asUInt + })).reduce(_ | _) + } + } + + // DataModuleTemplate should not be used when there're any write conflicts + for (i <- 0 until numWrite) { + for (j <- i+1 until numWrite) { + assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j))) + } + } +} diff --git a/src/main/scala/xiangshan/mem/Memend.scala b/src/main/scala/xiangshan/mem/MemUtils.scala similarity index 75% rename from src/main/scala/xiangshan/mem/Memend.scala rename to src/main/scala/xiangshan/mem/MemUtils.scala index 743c0759555329cfd0ef3d065fad60fef2113eb9..036d8dd8f416629da57e41afa34b64e072bff2d6 100644 --- a/src/main/scala/xiangshan/mem/Memend.scala +++ b/src/main/scala/xiangshan/mem/MemUtils.scala @@ -60,3 +60,18 @@ class LoadForwardQueryIO extends XSBundle { // val lqIdx = Output(UInt(LoadQueueIdxWidth.W)) val sqIdx = Output(new SqPtr) } + +class MaskedLoadForwardQueryIO extends XSBundle { + val paddr = Output(UInt(PAddrBits.W)) + val mask = Output(UInt(8.W)) + val uop = Output(new MicroOp) // for replay + val pc = Output(UInt(VAddrBits.W)) //for debug + val valid = Output(Bool()) //for debug + + val forwardMask = Input(Vec(8, Bool())) + val forwardData = Input(Vec(8, UInt(8.W))) + + val sqIdx = Output(new SqPtr) // for debug + // sqIdxMask is calcuated in earlier stage for better timing + val sqIdxMask = Output(UInt(StoreQueueSize.W)) +} diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index 41385c4f93c2374dbdfcd8fcbe688264e03f5330..bbbab1edb0cf772733b2c067015bde180661c9b5 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -43,10 +43,11 @@ class LsqWrappper extends XSModule with HasDCacheParameters { val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle))) val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool())) + val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool())) val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store - val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) + val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO)) val roq = Flipped(new RoqLsqIO) val rollback = Output(Valid(new Redirect)) val dcache = Flipped(ValidIO(new Refill)) @@ -94,6 +95,7 @@ class LsqWrappper extends XSModule with HasDCacheParameters { loadQueue.io.loadIn <> io.loadIn loadQueue.io.storeIn <> io.storeIn loadQueue.io.loadDataForwarded <> io.loadDataForwarded + loadQueue.io.needReplayFromRS <> io.needReplayFromRS loadQueue.io.ldout <> io.ldout loadQueue.io.roq <> io.roq loadQueue.io.rollback <> io.rollback diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index 535ea14d7928741889a2db925bf7eecc3145e734..bcb0d4b2bc2033c2f8d5d5cbc5bddad9fd858c8d 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -68,8 +68,9 @@ class 
LoadQueue extends XSModule val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle))) val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool())) + val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool())) val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load - val load_s1 = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) + val load_s1 = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO)) val roq = Flipped(new RoqLsqIO) val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store val dcache = Flipped(ValidIO(new Refill)) @@ -81,7 +82,7 @@ class LoadQueue extends XSModule // val data = Reg(Vec(LoadQueueSize, new LsRoqEntry)) val dataModule = Module(new LoadQueueData(LoadQueueSize, wbNumRead = LoadPipelineWidth, wbNumWrite = LoadPipelineWidth)) dataModule.io := DontCare - val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1, numWrite = LoadPipelineWidth)) + val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1, numWrite = LoadPipelineWidth)) vaddrModule.io := DontCare val allocated = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // lq entry has been allocated val datavalid = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // data is valid @@ -144,7 +145,7 @@ class LoadQueue extends XSModule */ for (i <- 0 until LoadPipelineWidth) { dataModule.io.wb.wen(i) := false.B - vaddrModule.io.wen(i) := false.B + val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value when(io.loadIn(i).fire()) { when(io.loadIn(i).bits.miss) { XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n", @@ -170,8 +171,9 @@ class LoadQueue extends XSModule io.loadIn(i).bits.forwardMask.asUInt, io.loadIn(i).bits.mmio )} - val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value - datavalid(loadWbIndex) := (!io.loadIn(i).bits.miss || io.loadDataForwarded(i)) && !io.loadIn(i).bits.mmio + datavalid(loadWbIndex) := (!io.loadIn(i).bits.miss || io.loadDataForwarded(i)) && + !io.loadIn(i).bits.mmio && // mmio data is not valid until we finished uncache access + !io.needReplayFromRS(i) // do not writeback if that inst will be resend from rs writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio val loadWbData = Wire(new LQDataEntry) @@ -182,18 +184,19 @@ class LoadQueue extends XSModule dataModule.io.wbWrite(i, loadWbIndex, loadWbData) dataModule.io.wb.wen(i) := true.B - vaddrModule.io.waddr(i) := loadWbIndex - vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr - vaddrModule.io.wen(i) := true.B debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio debug_paddr(loadWbIndex) := io.loadIn(i).bits.paddr val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i) + miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i) && !io.needReplayFromRS(i) pending(loadWbIndex) := io.loadIn(i).bits.mmio uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime } + // vaddrModule write is delayed, as vaddrModule will not be read right after write + vaddrModule.io.waddr(i) := RegNext(loadWbIndex) + vaddrModule.io.wdata(i) := RegNext(io.loadIn(i).bits.vaddr) + vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire()) } when(io.dcache.valid) { @@ -361,11 +364,25 @@ class LoadQueue extends XSModule * Generate 
match vector for store address with rangeMask(stPtr, enqPtr). * Besides, load instructions in LoadUnit_S1 and S2 are also checked. * Cycle 1: Redirect Generation - * There're three possible types of violations. Choose the oldest load. - * Prepare redirect request according to the detected violation. + * There're three possible types of violations, up to 6 possible redirect requests. + * Choose the oldest load (part 1). (4 + 2) -> (1 + 2) * Cycle 2: Redirect Fire + * Choose the oldest load (part 2). (3 -> 1) + * Prepare redirect request according to the detected violation. * Fire redirect request (if valid) */ + + // stage 0: lq l1 wb l1 wb lq + // | | | | | | (paddr match) + // stage 1: lq l1 wb l1 wb lq + // | | | | | | + // | |------------| | + // | | | + // stage 2: lq l1wb lq + // | | | + // -------------------- + // | + // rollback req io.load_s1 := DontCare def detectRollback(i: Int) = { val startIndex = io.storeIn(i).bits.uop.lqIdx.value @@ -413,18 +430,9 @@ class LoadQueue extends XSModule val l1ViolationUop = getOldestInTwo(l1ViolationVec, RegNext(VecInit(io.load_s1.map(_.uop)))) XSDebug(l1Violation, p"${Binary(Cat(l1ViolationVec))}, $l1ViolationUop\n") - val rollbackValidVec = Seq(lqViolation, wbViolation, l1Violation) - val rollbackUopVec = Seq(lqViolationUop, wbViolationUop, l1ViolationUop) - - val mask = getAfterMask(rollbackValidVec, rollbackUopVec) - val oneAfterZero = mask(1)(0) - val rollbackUop = Mux(oneAfterZero && mask(2)(0), - rollbackUopVec(0), - Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2))) - XSDebug( l1Violation, - "need rollback (l4 load) pc %x roqidx %d target %x\n", + "need rollback (l1 load) pc %x roqidx %d target %x\n", io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, l1ViolationUop.roqIdx.asUInt ) XSDebug( @@ -438,15 +446,7 @@ class LoadQueue extends XSModule io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, wbViolationUop.roqIdx.asUInt ) - (RegNext(io.storeIn(i).valid) && Cat(rollbackValidVec).orR, rollbackUop) - } - - // rollback check - val rollback = Wire(Vec(StorePipelineWidth, Valid(new MicroOp))) - for (i <- 0 until StorePipelineWidth) { - val detectedRollback = detectRollback(i) - rollback(i).valid := detectedRollback._1 - rollback(i).bits := detectedRollback._2 + ((lqViolation, lqViolationUop), (wbViolation, wbViolationUop), (l1Violation, l1ViolationUop)) } def rollbackSel(a: Valid[MicroOp], b: Valid[MicroOp]): ValidIO[MicroOp] = { @@ -460,33 +460,72 @@ class LoadQueue extends XSModule b // sel b ) } - - val rollbackSelected = ParallelOperation(rollback, rollbackSel) val lastCycleRedirect = RegNext(io.brqRedirect) + val lastlastCycleRedirect = RegNext(lastCycleRedirect) val lastCycleFlush = RegNext(io.flush) + val lastlastCycleFlush = RegNext(lastCycleFlush) - // S2: select rollback and generate rollback request + // S2: select rollback (part1) and generate rollback request + // rollback check + // Wb/L1 rollback seq check is done in s2 + val rollbackWb = Wire(Vec(StorePipelineWidth, Valid(new MicroOp))) + val rollbackL1 = Wire(Vec(StorePipelineWidth, Valid(new MicroOp))) + val rollbackL1Wb = Wire(Vec(StorePipelineWidth*2, Valid(new MicroOp))) + // Lq rollback seq check is done in s3 (next stage), as getting rollbackLq MicroOp is slow + val rollbackLq = Wire(Vec(StorePipelineWidth, Valid(new MicroOp))) + for (i <- 0 until StorePipelineWidth) { + val detectedRollback = detectRollback(i) + rollbackLq(i).valid := detectedRollback._1._1 && RegNext(io.storeIn(i).valid) + rollbackLq(i).bits := 
detectedRollback._1._2 + rollbackWb(i).valid := detectedRollback._2._1 && RegNext(io.storeIn(i).valid) + rollbackWb(i).bits := detectedRollback._2._2 + rollbackL1(i).valid := detectedRollback._3._1 && RegNext(io.storeIn(i).valid) + rollbackL1(i).bits := detectedRollback._3._2 + rollbackL1Wb(2*i) := rollbackL1(i) + rollbackL1Wb(2*i+1) := rollbackWb(i) + } + + val rollbackL1WbSelected = ParallelOperation(rollbackL1Wb, rollbackSel) + val rollbackL1WbVReg = RegNext(rollbackL1WbSelected.valid) + val rollbackL1WbReg = RegEnable(rollbackL1WbSelected.bits, rollbackL1WbSelected.valid) + val rollbackLq0VReg = RegNext(rollbackLq(0).valid) + val rollbackLq0Reg = RegEnable(rollbackLq(0).bits, rollbackLq(0).valid) + val rollbackLq1VReg = RegNext(rollbackLq(1).valid) + val rollbackLq1Reg = RegEnable(rollbackLq(1).bits, rollbackLq(1).valid) + + // S3: select rollback (part2), generate rollback request, then fire rollback request // Note that we use roqIdx - 1.U to flush the load instruction itself. // Thus, here if last cycle's roqIdx equals to this cycle's roqIdx, it still triggers the redirect. - val rollbackGen = Wire(Valid(new Redirect)) - val rollbackReg = Reg(Valid(new Redirect)) - rollbackGen.valid := rollbackSelected.valid && - !rollbackSelected.bits.roqIdx.needFlush(lastCycleRedirect, lastCycleFlush) - - rollbackGen.bits.roqIdx := rollbackSelected.bits.roqIdx - rollbackGen.bits.ftqIdx := rollbackSelected.bits.cf.ftqPtr - rollbackGen.bits.ftqOffset := rollbackSelected.bits.cf.ftqOffset - rollbackGen.bits.level := RedirectLevel.flush - rollbackGen.bits.interrupt := DontCare - rollbackGen.bits.cfiUpdate := DontCare - rollbackGen.bits.cfiUpdate.target := rollbackSelected.bits.cf.pc - - rollbackReg := rollbackGen - - // S3: fire rollback request - io.rollback := rollbackReg - io.rollback.valid := rollbackReg.valid && - !rollbackReg.bits.roqIdx.needFlush(lastCycleRedirect, lastCycleFlush) + + // FIXME: this is ugly + val rollbackValidVec = Seq(rollbackL1WbVReg, rollbackLq0VReg, rollbackLq1VReg) + val rollbackUopVec = Seq(rollbackL1WbReg, rollbackLq0Reg, rollbackLq1Reg) + + // select uop in parallel + val mask = getAfterMask(rollbackValidVec, rollbackUopVec) + val oneAfterZero = mask(1)(0) + val rollbackUop = Mux(oneAfterZero && mask(2)(0), + rollbackUopVec(0), + Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2))) + + // check if rollback request is still valid in parallel + val rollbackValidVecChecked = Wire(Vec(3, Bool())) + for(((v, uop), idx) <- rollbackValidVec.zip(rollbackUopVec).zipWithIndex) { + rollbackValidVecChecked(idx) := v && + (!lastCycleRedirect.valid || isBefore(uop.roqIdx, lastCycleRedirect.bits.roqIdx)) && + (!lastlastCycleRedirect.valid || isBefore(uop.roqIdx, lastlastCycleRedirect.bits.roqIdx)) + } + + io.rollback.bits.roqIdx := rollbackUop.roqIdx + io.rollback.bits.ftqIdx := rollbackUop.cf.ftqPtr + io.rollback.bits.ftqOffset := rollbackUop.cf.ftqOffset + io.rollback.bits.level := RedirectLevel.flush + io.rollback.bits.interrupt := DontCare + io.rollback.bits.cfiUpdate := DontCare + io.rollback.bits.cfiUpdate.target := rollbackUop.cf.pc + // io.rollback.bits.pc := DontCare + + io.rollback.valid := rollbackValidVecChecked.asUInt.orR && !lastCycleFlush && !lastlastCycleFlush when(io.rollback.valid) { // XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.cfi, io.rollback.bits.roqIdx.asUInt) @@ -503,11 +542,13 @@ class LoadQueue extends XSModule * (5) ROB commits the instruction: same as normal instructions */ //(2) when they reach ROB's head, they can be 
sent to uncache channel + val lqTailMmioPending = WireInit(pending(deqPtr)) + val lqTailAllocated = WireInit(allocated(deqPtr)) val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4) val uncacheState = RegInit(s_idle) switch(uncacheState) { is(s_idle) { - when(io.roq.pendingld && pending(deqPtr) && allocated(deqPtr)) { + when(io.roq.pendingld && lqTailMmioPending && lqTailAllocated) { uncacheState := s_req } } @@ -563,7 +604,7 @@ class LoadQueue extends XSModule } // Read vaddr for mem exception - vaddrModule.io.raddr(0) := deqPtr + commitCount + vaddrModule.io.raddr(0) := deqPtr + io.roq.lcommit io.exceptionAddr.vaddr := vaddrModule.io.rdata(0) // misprediction recovery / exception redirect @@ -596,6 +637,15 @@ class LoadQueue extends XSModule allowEnqueue := validCount + enqNumber <= (LoadQueueSize - RenameWidth).U + // perf counter + XSPerf("lqRollback", io.rollback.valid, acc = true) // rollback redirect generated + XSPerf("lqFull", !allowEnqueue, acc = true) + XSPerf("lqMmioCycle", uncacheState =/= s_idle, acc = true) // lq is busy dealing with uncache req + XSPerf("lqMmioCnt", io.uncache.req.fire(), acc = true) + XSPerf("lqRefill", io.dcache.valid, acc = true) + XSPerf("lqWriteback", PopCount(VecInit(io.ldout.map(i => i.fire()))), acc = true) + XSPerf("lqWbBlocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))), acc = true) + // debug info XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt.flag, deqPtr) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala index 754bc67f306a10a37cff3f681d9b35d035aafeb9..5902326f5f8f8b61b638c1a5e435f39bc91d73b4 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala @@ -106,51 +106,51 @@ class MaskModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule } } -class LQData8Module(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters { - val io = IO(new Bundle { - // read - val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W))) - val rdata = Output(Vec(numRead, UInt(8.W))) - // address indexed write - val wen = Input(Vec(numWrite, Bool())) - val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W))) - val wdata = Input(Vec(numWrite, UInt(8.W))) - // masked write - val mwmask = Input(Vec(blockWords, Vec(numEntries, Bool()))) - val mwdata = Input(Vec(blockWords, UInt(8.W))) - }) - - val data = Reg(Vec(numEntries, UInt(8.W))) - - // read ports - for (i <- 0 until numRead) { - io.rdata(i) := data(RegNext(io.raddr(i))) - } - - // below is the write ports (with priorities) - for (i <- 0 until numWrite) { - when (io.wen(i)) { - data(io.waddr(i)) := io.wdata(i) - } - } - - // masked write - for (j <- 0 until numEntries) { - val wen = VecInit((0 until blockWords).map(i => io.mwmask(i)(j))).asUInt.orR - when (wen) { - data(j) := VecInit((0 until blockWords).map(i => { - Mux(io.mwmask(i)(j), io.mwdata(i), 0.U) - })).reduce(_ | _) - } - } - - // DataModuleTemplate should not be used when there're any write conflicts - for (i <- 0 until numWrite) { - for (j <- i+1 until numWrite) { - assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j))) - } - } -} +// class LQData8Module(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters { +// val io = IO(new Bundle { +// // read +// val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W))) +// val rdata = Output(Vec(numRead, UInt(8.W))) 
+// // address indexed write +// val wen = Input(Vec(numWrite, Bool())) +// val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W))) +// val wdata = Input(Vec(numWrite, UInt(8.W))) +// // masked write +// val mwmask = Input(Vec(blockWords, Vec(numEntries, Bool()))) +// val mwdata = Input(Vec(blockWords, UInt(8.W))) +// }) + +// val data = Reg(Vec(numEntries, UInt(8.W))) + +// // read ports +// for (i <- 0 until numRead) { +// io.rdata(i) := data(RegNext(io.raddr(i))) +// } + +// // below is the write ports (with priorities) +// for (i <- 0 until numWrite) { +// when (io.wen(i)) { +// data(io.waddr(i)) := io.wdata(i) +// } +// } + +// // masked write +// for (j <- 0 until numEntries) { +// val wen = VecInit((0 until blockWords).map(i => io.mwmask(i)(j))).asUInt.orR +// when (wen) { +// data(j) := VecInit((0 until blockWords).map(i => { +// Mux(io.mwmask(i)(j), io.mwdata(i), 0.U) +// })).reduce(_ | _) +// } +// } + +// // DataModuleTemplate should not be used when there're any write conflicts +// for (i <- 0 until numWrite) { +// for (j <- i+1 until numWrite) { +// assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j))) +// } +// } +// } class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters { val io = IO(new Bundle { @@ -177,7 +177,7 @@ class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSMod val paddrWen = Input(Vec(numWrite, Bool())) }) - val data8 = Seq.fill(8)(Module(new LQData8Module(numEntries, numRead, numWrite))) + val data8 = Seq.fill(8)(Module(new MaskedSyncDataModuleTemplate(UInt(8.W), numEntries, numRead, numWrite, numMWrite = blockWords))) val fwdMask = Reg(Vec(numEntries, UInt(8.W))) val wordIndex = Reg(Vec(numEntries, UInt((blockOffBits - wordOffBits).W))) diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index 5c65be6b4c1af9c7e4ade2de3950c21961cd6545..943294bbe7dec3875609fc179d23204c28a4c4ae 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -38,7 +38,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store - val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) + val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO)) val roq = Flipped(new RoqLsqIO) val uncache = new DCacheWordIO // val refill = Flipped(Valid(new DCacheLineReq )) @@ -61,7 +61,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue dataModule.io := DontCare val paddrModule = Module(new SQPaddrModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth)) paddrModule.io := DontCare - val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth)) + val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth)) vaddrModule.io := DontCare // state & misc @@ -104,7 +104,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue dataModule.io.raddr(i) := deqPtrExtNext(i).value paddrModule.io.raddr(i) := deqPtrExtNext(i).value } - vaddrModule.io.raddr(0) 
:= cmtPtr + commitCount + vaddrModule.io.raddr(0) := cmtPtr + io.roq.scommit /** * Enqueue at dispatch @@ -144,9 +144,8 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue for (i <- 0 until StorePipelineWidth) { dataModule.io.wen(i) := false.B paddrModule.io.wen(i) := false.B - vaddrModule.io.wen(i) := false.B + val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value when (io.storeIn(i).fire()) { - val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value datavalid(stWbIndex) := !io.storeIn(i).bits.mmio writebacked(stWbIndex) := !io.storeIn(i).bits.mmio pending(stWbIndex) := io.storeIn(i).bits.mmio @@ -164,9 +163,6 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue paddrModule.io.wdata(i) := io.storeIn(i).bits.paddr paddrModule.io.wen(i) := true.B - vaddrModule.io.waddr(i) := stWbIndex - vaddrModule.io.wdata(i) := io.storeIn(i).bits.vaddr - vaddrModule.io.wen(i) := true.B mmio(stWbIndex) := io.storeIn(i).bits.mmio @@ -179,6 +175,10 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue io.storeIn(i).bits.mmio ) } + // vaddrModule write is delayed, as vaddrModule will not be read right after write + vaddrModule.io.waddr(i) := RegNext(stWbIndex) + vaddrModule.io.wdata(i) := RegNext(io.storeIn(i).bits.vaddr) + vaddrModule.io.wen(i) := RegNext(io.storeIn(i).fire()) } /** @@ -199,7 +199,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue // Forward2: Mux(same_flag, 0.U, range(0, sqIdx) ) // i.e. forward1 is the target entries with the same flag bits and forward2 otherwise val differentFlag = deqPtrExt(0).flag =/= io.forward(i).sqIdx.flag - val forwardMask = UIntToMask(io.forward(i).sqIdx.value, StoreQueueSize) + val forwardMask = io.forward(i).sqIdxMask val storeWritebackedVec = WireInit(VecInit(Seq.fill(StoreQueueSize)(false.B))) for (j <- 0 until StoreQueueSize) { storeWritebackedVec(j) := datavalid(j) && allocated(j) // all datavalid terms need to be checked @@ -388,6 +388,16 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue // for 1 cycle will also promise that sq is empty in that cycle io.sqempty := RegNext(enqPtrExt(0).value === deqPtrExt(0).value && enqPtrExt(0).flag === deqPtrExt(0).flag) + // perf counter + XSPerf("sqFull", !allowEnqueue, acc = true) + XSPerf("sqMmioCycle", uncacheState =/= s_idle, acc = true) // lq is busy dealing with uncache req + XSPerf("sqMmioCnt", io.uncache.req.fire(), acc = true) + XSPerf("sqWriteback", io.mmioStout.fire(), acc = true) + XSPerf("sqWbBlocked", io.mmioStout.valid && !io.mmioStout.ready, acc = true) + XSPerf("sqValidEntryCnt", distanceBetween(enqPtrExt(0), deqPtrExt(0))) + XSPerf("sqCmtEntryCnt", distanceBetween(cmtPtrExt(0), deqPtrExt(0))) + XSPerf("sqNCmtEntryCnt", distanceBetween(enqPtrExt(0), cmtPtrExt(0))) + // debug info XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt(0).flag, deqPtr) diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 61a4a64a0aace2e0fd5c005f5c2512b1ddabee82..f4d404572839acc851f6d02700f4b02e14117d23 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -13,7 +13,8 @@ class LoadToLsqIO extends XSBundle { val loadIn = ValidIO(new LsPipelineBundle) val ldout = Flipped(DecoupledIO(new ExuOutput)) val loadDataForwarded = Output(Bool()) - val forward = new LoadForwardQueryIO + val needReplayFromRS = 
Output(Bool()) + val forward = new MaskedLoadForwardQueryIO } // Load Pipeline Stage 0 @@ -28,17 +29,15 @@ class LoadUnit_S0 extends XSModule { }) val s0_uop = io.in.bits.uop - val s0_vaddr_old = io.in.bits.src1 + SignExt(ImmUnion.I.toImm32(s0_uop.ctrl.imm), XLEN) + // val s0_vaddr = io.in.bits.src1 + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits) + // val s0_mask = genWmask(s0_vaddr, s0_uop.ctrl.fuOpType(1,0)) val imm12 = WireInit(s0_uop.ctrl.imm(11,0)) val s0_vaddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12) - val s0_vaddr_hi = Mux(imm12(11), - Mux((s0_vaddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12)), - Mux((s0_vaddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12)+1.U, io.in.bits.src1(VAddrBits-1, 12)) + val s0_vaddr_hi = Mux(s0_vaddr_lo(12), + Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+1.U), + Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), io.in.bits.src1(VAddrBits-1, 12)), ) val s0_vaddr = Cat(s0_vaddr_hi, s0_vaddr_lo(11,0)) - when(io.in.fire() && s0_vaddr(VAddrBits-1,0) =/= (io.in.bits.src1 + SignExt(ImmUnion.I.toImm32(s0_uop.ctrl.imm), XLEN))(VAddrBits-1,0)){ - printf("s0_vaddr %x s0_vaddr_old %x\n", s0_vaddr, s0_vaddr_old(VAddrBits-1,0)) - } val s0_mask = genWmask(s0_vaddr_lo, s0_uop.ctrl.fuOpType(1,0)) // query DTLB @@ -92,7 +91,7 @@ class LoadUnit_S1 extends XSModule { val dcachePAddr = Output(UInt(PAddrBits.W)) val dcacheKill = Output(Bool()) val sbuffer = new LoadForwardQueryIO - val lsq = new LoadForwardQueryIO + val lsq = new MaskedLoadForwardQueryIO }) val s1_uop = io.in.bits.uop @@ -122,6 +121,7 @@ class LoadUnit_S1 extends XSModule { io.lsq.paddr := s1_paddr io.lsq.uop := s1_uop io.lsq.sqIdx := s1_uop.sqIdx + io.lsq.sqIdxMask := DontCare // will be overwritten by sqIdxMask pre-generated in s0 io.lsq.mask := s1_mask io.lsq.pc := s1_uop.cf.pc // FIXME: remove it @@ -149,6 +149,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { val lsq = new LoadForwardQueryIO val sbuffer = new LoadForwardQueryIO val dataForwarded = Output(Bool()) + val needReplayFromRS = Output(Bool()) }) val s2_uop = io.in.bits.uop @@ -168,10 +169,22 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { io.tlbFeedback.valid := io.in.valid io.tlbFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio) io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx + io.needReplayFromRS := s2_cache_replay + + // merge forward result + // lsq has higher priority than sbuffer + val forwardMask = Wire(Vec(8, Bool())) + val forwardData = Wire(Vec(8, UInt(8.W))) - val forwardMask = io.out.bits.forwardMask - val forwardData = io.out.bits.forwardData val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U + io.lsq := DontCare + io.sbuffer := DontCare + + // generate XLEN/8 Muxs + for (i <- 0 until XLEN / 8) { + forwardMask(i) := io.lsq.forwardMask(i) || io.sbuffer.forwardMask(i) + forwardData(i) := Mux(io.lsq.forwardMask(i), io.lsq.forwardData(i), io.sbuffer.forwardData(i)) + } XSDebug(io.out.fire(), "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n", s2_uop.cf.pc, @@ -180,8 +193,9 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { ) // data merge - val rdata = VecInit((0 until XLEN / 8).map(j => - Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j)))).asUInt + val rdataVec = VecInit((0 until XLEN / 8).map(j => + Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j)))) + val rdata = rdataVec.asUInt val rdataSel = LookupTree(s2_paddr(2, 
0), List( "b000".U -> rdata(63, 0), "b001".U -> rdata(63, 8), @@ -194,9 +208,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { )) val rdataPartialLoad = rdataHelper(s2_uop, rdataSel) - // TODO: ECC check - - io.out.valid := io.in.valid && !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception) + io.out.valid := io.in.valid && !s2_tlb_miss // Inst will be canceled in store queue / lsq, // so we do not need to care about flush in load / store unit's out.valid io.out.bits := io.in.bits @@ -212,28 +224,16 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { // and dcache query is no longer needed. // Such inst will be writebacked from load queue. io.dataForwarded := s2_cache_miss && fullForward && !s2_exception + // io.out.bits.forwardX will be send to lq + io.out.bits.forwardMask := forwardMask + // data retbrived from dcache is also included in io.out.bits.forwardData + io.out.bits.forwardData := rdataVec io.in.ready := io.out.ready || !io.in.valid - // merge forward result - // lsq has higher priority than sbuffer - io.lsq := DontCare - io.sbuffer := DontCare - // generate XLEN/8 Muxs - for (i <- 0 until XLEN / 8) { - when (io.sbuffer.forwardMask(i)) { - io.out.bits.forwardMask(i) := true.B - io.out.bits.forwardData(i) := io.sbuffer.forwardData(i) - } - when (io.lsq.forwardMask(i)) { - io.out.bits.forwardMask(i) := true.B - io.out.bits.forwardData(i) := io.lsq.forwardData(i) - } - } - XSDebug(io.out.fire(), "[DCACHE LOAD RESP] pc %x rdata %x <- D$ %x + fwd %x(%b)\n", s2_uop.cf.pc, rdataPartialLoad, io.dcacheResp.bits.data, - io.out.bits.forwardData.asUInt, io.out.bits.forwardMask.asUInt + forwardData.asUInt, forwardMask.asUInt ) } @@ -271,13 +271,19 @@ class LoadUnit extends XSModule with HasLoadHelper { PipelineConnect(load_s1.io.out, load_s2.io.in, true.B, load_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush)) - load_s2.io.tlbFeedback <> io.tlbFeedback load_s2.io.dcacheResp <> io.dcache.resp load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask load_s2.io.dataForwarded <> io.lsq.loadDataForwarded + io.tlbFeedback.bits := RegNext(load_s2.io.tlbFeedback.bits) + io.tlbFeedback.valid := RegNext(load_s2.io.tlbFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush)) + io.lsq.needReplayFromRS := load_s2.io.needReplayFromRS + + // pre-calcuate sqIdx mask in s0, then send it to lsq in s1 for forwarding + val sqIdxMaskReg = RegNext(UIntToMask(load_s0.io.in.bits.uop.sqIdx.value, StoreQueueSize)) + io.lsq.forward.sqIdxMask := sqIdxMaskReg // use s2_hit_way to select data received in s1 load_s2.io.dcacheResp.bits.data := Mux1H(io.dcache.s2_hit_way, RegNext(io.dcache.s1_data)) @@ -317,19 +323,26 @@ class LoadUnit extends XSModule with HasLoadHelper { io.ldout.bits := Mux(intHitLoadOut.valid, intHitLoadOut.bits, io.lsq.ldout.bits) io.ldout.valid := intHitLoadOut.valid || io.lsq.ldout.valid && !refillFpLoad - // Fp load, if hit, will be send to recoder at s2, then it will be recoded & writebacked at s3 + // Fp load, if hit, will be stored to reg at s2, then it will be recoded at s3, writebacked at s4 val fpHitLoadOut = Wire(Valid(new ExuOutput)) fpHitLoadOut.valid := s2_wb_valid && load_s2.io.out.bits.uop.ctrl.fpWen fpHitLoadOut.bits := intHitLoadOut.bits - val fpLoadOut = Wire(Valid(new ExuOutput)) - fpLoadOut.bits := Mux(fpHitLoadOut.valid, fpHitLoadOut.bits, 
io.lsq.ldout.bits) - fpLoadOut.valid := fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad + val fpLoadUnRecodedReg = Reg(Valid(new ExuOutput)) + fpLoadUnRecodedReg.valid := fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad + when(fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad){ + fpLoadUnRecodedReg.bits := Mux(fpHitLoadOut.valid, fpHitLoadOut.bits, io.lsq.ldout.bits) + } - val fpLoadOutReg = RegNext(fpLoadOut) - io.fpout.bits := fpLoadOutReg.bits - io.fpout.bits.data := fpRdataHelper(fpLoadOutReg.bits.uop, fpLoadOutReg.bits.data) // recode - io.fpout.valid := RegNext(fpLoadOut.valid) + val fpLoadRecodedReg = Reg(Valid(new ExuOutput)) + when(fpLoadUnRecodedReg.valid){ + fpLoadRecodedReg := fpLoadUnRecodedReg + fpLoadRecodedReg.bits.data := fpRdataHelper(fpLoadUnRecodedReg.bits.uop, fpLoadUnRecodedReg.bits.data) // recode + } + fpLoadRecodedReg.valid := fpLoadUnRecodedReg.valid + + io.fpout.bits := fpLoadRecodedReg.bits + io.fpout.valid := fpLoadRecodedReg.valid io.lsq.ldout.ready := Mux(refillFpLoad, !fpHitLoadOut.valid, !intHitLoadOut.valid) diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index fbd9735531a3a8204c8c02714c6aea3ef7770099..86fe8e003c7cbd6fbf64227ba4da23e9656152fb 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -18,17 +18,14 @@ class StoreUnit_S0 extends XSModule { }) // send req to dtlb - val saddr_old = io.in.bits.src1 + SignExt(ImmUnion.S.toImm32(io.in.bits.uop.ctrl.imm), XLEN) + // val saddr = io.in.bits.src1 + SignExt(io.in.bits.uop.ctrl.imm(11,0), VAddrBits) val imm12 = WireInit(io.in.bits.uop.ctrl.imm(11,0)) val saddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12) - val saddr_hi = Mux(imm12(11), - Mux((saddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12)), - Mux((saddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12)+1.U, io.in.bits.src1(VAddrBits-1, 12)) + val saddr_hi = Mux(saddr_lo(12), + Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+1.U), + Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), io.in.bits.src1(VAddrBits-1, 12)), ) val saddr = Cat(saddr_hi, saddr_lo(11,0)) - when(io.in.fire() && saddr(VAddrBits-1,0) =/= (io.in.bits.src1 + SignExt(ImmUnion.S.toImm32(io.in.bits.uop.ctrl.imm), XLEN))(VAddrBits-1,0)){ - printf("saddr %x saddr_old %x\n", saddr, saddr_old(VAddrBits-1,0)) - } io.dtlbReq.bits.vaddr := saddr io.dtlbReq.valid := io.in.valid diff --git a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala index 2ed57c0dcfa334756f90cae3aa12d16c655807fc..ed0edc6ac8c563c480557ba91d50253ed38ea889 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala @@ -129,6 +129,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { difftestIO <> DontCare val buffer = Mem(StoreBufferSize, new SbufferLine) + val tag = Reg(Vec(StoreBufferSize, UInt(TagWidth.W))) + val mask = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, Bool())))) + val data = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) // TODO: will be replaced by SyncDataModuleTemplate val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(s_invalid))) val cohCount = Reg(Vec(StoreBufferSize, UInt(countBits.W))) /* @@ -165,30 +168,13 @@ class NewSbuffer extends 
XSModule with HasSbufferCst { val validCount = RegInit(0.U((log2Up(StoreBufferSize) + 1).W)) val full = invalidCount === 0.U // full = TODO: validCount(log2Up(StoreBufferSize)) - val bufferRead = VecInit((0 until StoreBufferSize).map(i => buffer(i))) - val stateRead = VecInit((0 until StoreBufferSize).map(i => stateVec(i))) - val dataRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))) - val maskRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))) - val tagRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).tag)) - - val dataUpdate = WireInit(dataRead) - val maskUpdate = WireInit(maskRead) - val tagUpdate = WireInit(tagRead) - val stateUpdate = WireInit(stateRead) - val bufferUpdate = Wire(Vec(StoreBufferSize, new SbufferLine)) - (0 until StoreBufferSize) foreach { i => - bufferUpdate(i).tag := tagUpdate(i) - bufferUpdate(i).data := dataUpdate(i).asUInt() - bufferUpdate(i).mask := maskUpdate(i).asUInt() - } - val lru = Module(new ChooseReplace(StoreBufferSize)) val evictionIdx = lru.io.way - lru.io.mask := stateRead.map(isValid(_)) + lru.io.mask := stateVec.map(isValid(_)) - val tags = io.in.map(in => getTag(in.bits.addr)) - val sameTag = tags(0) === tags(1) + val intags = io.in.map(in => getTag(in.bits.addr)) + val sameTag = intags(0) === intags(1) val firstWord = getWord(io.in(0).bits.addr) val secondWord = getWord(io.in(1).bits.addr) val sameWord = firstWord === secondWord @@ -201,13 +187,14 @@ class NewSbuffer extends XSModule with HasSbufferCst { for(i <- 0 until StorePipelineWidth){ mergeMask(i) := widthMap(j => - Mux(tags(i) === tagRead(j) && isValid(stateRead(j)), true.B, false.B)) + intags(i) === tag(j) && isValid(stateVec(j)) + ) } // insert confition // firstInsert: the first invalid entry // if first entry canMerge or second entry has the same tag with the first entry , secondInsert equal the first invalid entry, otherwise, the second invalid entry - val invalidMask = stateRead.map(s => isInvalid(s)) + val invalidMask = stateVec.map(s => isInvalid(s)) val evenInvalidMask = GetEvenBits(VecInit(invalidMask).asUInt) val oddInvalidMask = GetOddBits(VecInit(invalidMask).asUInt) @@ -232,27 +219,26 @@ class NewSbuffer extends XSModule with HasSbufferCst { Mux(~enbufferSelReg, evenCanInsert, oddCanInsert) ) - io.in(0).ready := firstCanInsert || canMerge(0) - io.in(1).ready := (secondCanInsert || canMerge(1)) && !sameWord && io.in(0).ready - + io.in(0).ready := firstCanInsert + io.in(1).ready := secondCanInsert && !sameWord && io.in(0).ready - def wordReqToBufLine(req: DCacheWordReq, tag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = { - stateUpdate(insertIdx) := s_valid - tagUpdate(insertIdx) := tag + def wordReqToBufLine(req: DCacheWordReq, reqtag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = { + stateVec(insertIdx) := s_valid cohCount(insertIdx) := 0.U + tag(insertIdx) := reqtag when(flushMask){ for(j <- 0 until CacheLineWords){ for(i <- 0 until DataBytes){ - maskUpdate(insertIdx)(j)(i) := false.B + mask(insertIdx)(j)(i) := false.B } } } for(i <- 0 until DataBytes){ when(req.mask(i)){ - maskUpdate(insertIdx)(wordOffset)(i) := true.B - dataUpdate(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8) + mask(insertIdx)(wordOffset)(i) := true.B + data(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8) } } } @@ -261,8 +247,8 @@ class NewSbuffer extends XSModule with HasSbufferCst { 
cohCount(mergeIdx) := 0.U for(i <- 0 until DataBytes){ when(req.mask(i)){ - maskUpdate(mergeIdx)(wordOffset)(i) := true.B - dataUpdate(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8) + mask(mergeIdx)(wordOffset)(i) := true.B + data(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8) } } } @@ -273,7 +259,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { mergeWordReq(io.in(0).bits, mergeIdx(0), firstWord) XSDebug(p"merge req 0 to line [${mergeIdx(0)}]\n") }.otherwise{ - wordReqToBufLine(io.in(0).bits, tags(0), firstInsertIdx, firstWord, true.B) + wordReqToBufLine(io.in(0).bits, intags(0), firstInsertIdx, firstWord, true.B) XSDebug(p"insert req 0 to line[$firstInsertIdx]\n") } } @@ -284,19 +270,14 @@ class NewSbuffer extends XSModule with HasSbufferCst { mergeWordReq(io.in(1).bits, mergeIdx(1), secondWord) XSDebug(p"merge req 1 to line [${mergeIdx(1)}]\n") }.otherwise{ - wordReqToBufLine(io.in(1).bits, tags(1), secondInsertIdx, secondWord, !sameTag) + wordReqToBufLine(io.in(1).bits, intags(1), secondInsertIdx, secondWord, !sameTag) XSDebug(p"insert req 1 to line[$secondInsertIdx]\n") } } - for(i <- 0 until StoreBufferSize){ - buffer.write(i.U, bufferUpdate(i)) - stateVec(i) := stateUpdate(i) - } - for(i <- 0 until StoreBufferSize){ XSDebug(stateVec(i)=/=s_invalid, - p"[$i] timeout:${cohCount(i)(countBits-1)} state:${stateVec(i)} buf:${bufferRead(i)}\n" + p"[$i] timeout:${cohCount(i)(countBits-1)} state:${stateVec(i)}\n" ) } @@ -320,7 +301,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { do_eviction := validCount >= 12.U - io.flush.empty := empty && io.sqempty + io.flush.empty := RegNext(empty && io.sqempty) lru.io.flush := sbuffer_state === x_drain_sbuffer && empty switch(sbuffer_state){ is(x_idle){ @@ -346,11 +327,11 @@ class NewSbuffer extends XSModule with HasSbufferCst { XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n") def noSameBlockInflight(idx: UInt): Bool = { - val tag = tagRead(idx) + val atag = tag(idx) !Cat(widthMap(i => { // stateVec(idx) itself must not be s_inflight* - (isInflight(stateRead(i)) || isPrepare(stateRead(i))) && - tag === tagRead(i) + (isInflight(stateVec(i)) || isPrepare(stateVec(i))) && + atag === tag(i) })).orR() } @@ -384,9 +365,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { dcacheReqValid := false.B } when(prepareEn && (!dcacheReqValid || io.dcache.req.fire())) { - dcacheCandidate.addr := getAddr(tagRead(prepareIdx)) - dcacheCandidate.data := bufferRead(prepareIdx).data - dcacheCandidate.mask := bufferRead(prepareIdx).mask + dcacheCandidate.addr := getAddr(tag(prepareIdx)) + dcacheCandidate.data := data(prepareIdx).asUInt + dcacheCandidate.mask := mask(prepareIdx).asUInt dcacheCandidate.cmd := MemoryOpConstants.M_XWR dcacheCandidate.id := prepareIdx stateVec(prepareIdx) := s_inflight @@ -411,9 +392,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { if (!env.FPGAPlatform) { difftestIO.sbufferResp := WireInit(io.dcache.resp.fire()) - difftestIO.sbufferAddr := WireInit(getAddr(tagRead(respId))) - difftestIO.sbufferData := WireInit(bufferRead(respId).data.asTypeOf(Vec(CacheLineBytes, UInt(8.W)))) - difftestIO.sbufferMask := WireInit(bufferRead(respId).mask) + difftestIO.sbufferAddr := WireInit(getAddr(tag(respId))) + difftestIO.sbufferData := WireInit(data(respId).asTypeOf(Vec(CacheLineBytes, UInt(8.W)))) + difftestIO.sbufferMask := WireInit(mask(respId).asUInt) } val needSpace = (io.in(0).fire && !canMerge(0)) +& (io.in(1).fire && !canMerge(1) && !sameTag) @@ -431,7 +412,7 @@ class 
NewSbuffer extends XSModule with HasSbufferCst { when(isValid(stateVec(i))){ when(cohCount(i)(countBits-1)){ assert(stateVec(i) === s_valid) - stateUpdate(i) := s_prepare + stateVec(i) := s_prepare } cohCount(i) := cohCount(i)+1.U } @@ -440,7 +421,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { // ---------------------- Load Data Forward --------------------- for ((forward, i) <- io.forward.zipWithIndex) { - val tag_matches = widthMap(i => tagRead(i) === getTag(forward.paddr)) + val tag_matches = widthMap(i => tag(i) === getTag(forward.paddr)) val valid_tag_matches = widthMap(i => tag_matches(i) && isValid(stateVec(i))) val inflight_tag_matches = widthMap(i => tag_matches(i) && (isInflight(stateVec(i)) || isPrepare(stateVec(i))) @@ -451,13 +432,11 @@ class NewSbuffer extends XSModule with HasSbufferCst { val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_)) val line_offset_reg = RegNext(line_offset_mask) - val selectedValidLine = Mux1H(valid_tag_match_reg, bufferRead) - val selectedValidMask = Mux1H(line_offset_reg, selectedValidLine.mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) - val selectedValidData = Mux1H(line_offset_reg, selectedValidLine.data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) + val selectedValidMask = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) + val selectedValidData = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) - val selectedInflightLine = Mux1H(inflight_tag_match_reg, bufferRead) - val selectedInflightMask = Mux1H(line_offset_reg, selectedInflightLine.mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) - val selectedInflightData = Mux1H(line_offset_reg, selectedInflightLine.data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) + val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) + val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) for (j <- 0 until DataBytes) { forward.forwardMask(j) := false.B diff --git a/src/main/scala/xiangshan/package.scala b/src/main/scala/xiangshan/package.scala index f20620149a420110ab1e2437c23bf3a972298bf3..e737bb73197dfba64b379cb4161a6a228808c6b8 100644 --- a/src/main/scala/xiangshan/package.scala +++ b/src/main/scala/xiangshan/package.scala @@ -148,7 +148,7 @@ package object xiangshan { def configable_cache(mode: UInt) = mode(7) def strToMode(s: String) = { - var result = 0.U << 8 + var result = 0.U(8.W) if (s.toUpperCase.indexOf("R") >= 0) result = result + R if (s.toUpperCase.indexOf("W") >= 0) result = result + W if (s.toUpperCase.indexOf("X") >= 0) result = result + X
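
Note on the wrap-aware pointer comparison this patch adds: `isBefore` in CircularQueuePtr.scala (and its use for `rollbackValidVecChecked` in LoadQueue.scala) relies on a flag bit that toggles each time a pointer wraps, so equal flags compare values directly while differing flags invert the comparison. Below is a minimal software model in plain Scala; `QPtr` and the 8-entry example are illustrative assumptions, not code from the repo.

```scala
// Plain-Scala model of the circular-queue pointer ordering used by
// isAfter/isBefore. Names here are illustrative only.
object CircularPtrModel extends App {
  final case class QPtr(flag: Boolean, value: Int)

  // "left is strictly after (younger than) right":
  //   same flag  -> larger value is younger
  //   diff flags -> left has wrapped past right, so a smaller value is younger
  def isAfter(left: QPtr, right: QPtr): Boolean =
    if (left.flag == right.flag) left.value > right.value
    else left.value < right.value

  def isBefore(left: QPtr, right: QPtr): Boolean =
    if (left.flag == right.flag) left.value < right.value
    else left.value > right.value

  // Example with an 8-entry queue: A wrapped once (flag set) and sits at
  // index 1; B has not wrapped and sits at index 6. A is younger than B.
  val a = QPtr(flag = true,  value = 1)
  val b = QPtr(flag = false, value = 6)
  assert(isAfter(a, b) && isBefore(b, a))
}
```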
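Note on the new `AddressSpace.checkMemmap()` call from XSCoreImp: it enforces two sanity rules on the PMA map before printing it, namely that any region marked atomic ("A") is also readable and writable, and that region bounds cover whole 4KB pages. A small plain-Scala sketch of those two rules follows; `Region` and the two sample entries are assumptions mirroring entries of `SimpleMemMapList`, not the actual generator code.

```scala
// Plain-Scala sketch of the two checks performed by checkMemmap().
object PmaCheckModel extends App {
  final case class Region(base: String, top: String, mode: String)

  def check(r: Region): Unit = {
    val m = r.mode.toUpperCase
    // Rule 1: an atomic-capable region must also be readable and writable.
    require(!(m.contains("A") && !(m.contains("R") && m.contains("W"))),
      s"atomic region ${r.base} must also be readable and writable")
    // Rule 2: base/top must be 4KB aligned, i.e. the region is >= 4KB.
    require(r.base.endsWith("000") && r.top.endsWith("FFF"),
      s"region ${r.base} must cover whole 4KB pages")
  }

  // Mirrors two entries of SimpleMemMapList; both satisfy the rules.
  Seq(
    Region("h00_3800_0000", "h00_3800_FFFF", "RW"),      // CLINT
    Region("h00_8000_0000", "h1F_FFFF_FFFF", "RWXIDSA")  // DDR
  ).foreach(check)
}
```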