diff --git a/Makefile b/Makefile
index e4551293ef373f1b26ce13431baeb4aef380d7c3..f652d0ecf7c1d43e4cb2ab446a6df360c22511e3 100644
--- a/Makefile
+++ b/Makefile
@@ -28,16 +28,15 @@ help:
 
 $(TOP_V): $(SCALA_FILE)
 	mkdir -p $(@D)
 	mill XiangShan.test.runMain $(SIMTOP) -td $(@D) --full-stacktrace --output-file $(@F) --disable-all --fpga-platform --remove-assert --infer-rw --repl-seq-mem -c:$(SIMTOP):-o:$(@D)/$(@F).conf $(SIM_ARGS)
-	# mill XiangShan.runMain top.$(TOP) -X verilog -td $(@D) --output-file $(@F) --infer-rw $(FPGATOP) --repl-seq-mem -c:$(FPGATOP):-o:$(@D)/$(@F).conf
-	# $(MEM_GEN) $(@D)/$(@F).conf >> $@
+	$(MEM_GEN) $(@D)/$(@F).conf >> $@
 	# sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g' $@
-	# @git log -n 1 >> .__head__
-	# @git diff >> .__diff__
-	# @sed -i 's/^/\/\// ' .__head__
-	# @sed -i 's/^/\/\//' .__diff__
-	# @cat .__head__ .__diff__ $@ > .__out__
-	# @mv .__out__ $@
-	# @rm .__head__ .__diff__
+	@git log -n 1 >> .__head__
+	@git diff >> .__diff__
+	@sed -i 's/^/\/\// ' .__head__
+	@sed -i 's/^/\/\//' .__diff__
+	@cat .__head__ .__diff__ $@ > .__out__
+	@mv .__out__ $@
+	@rm .__head__ .__diff__
 
 deploy: build/top.zip
diff --git a/block-inclusivecache-sifive b/block-inclusivecache-sifive
index ca387163b32f20406d443bdab34bc034d5281b51..cf429e420be6702a2e24b9b91910366187c103b4 160000
--- a/block-inclusivecache-sifive
+++ b/block-inclusivecache-sifive
@@ -1 +1 @@
-Subproject commit ca387163b32f20406d443bdab34bc034d5281b51
+Subproject commit cf429e420be6702a2e24b9b91910366187c103b4
diff --git a/debug/env.sh b/debug/env.sh
new file mode 100644
index 0000000000000000000000000000000000000000..aa7989c3fc983c1a1a13c3f291d1c2ed6d907759
--- /dev/null
+++ b/debug/env.sh
@@ -0,0 +1,2 @@
+export NOOP_HOME=$(pwd)/..
+echo $NOOP_HOME
diff --git a/src/main/scala/system/SoC.scala b/src/main/scala/system/SoC.scala
index 40e00c42cf8847fd5999835c525671303402b0bc..ba5e6efb7a89603469816d58729324e743c0c0c0 100644
--- a/src/main/scala/system/SoC.scala
+++ b/src/main/scala/system/SoC.scala
@@ -61,7 +61,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter {
       cacheName = s"L2"
     ),
     InclusiveCacheMicroParameters(
-      writeBytes = 8
+      writeBytes = 32
     )
   )))
 
@@ -79,7 +79,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter {
       cacheName = "L3"
     ),
     InclusiveCacheMicroParameters(
-      writeBytes = 8
+      writeBytes = 32
     )
   )).node
 
@@ -170,6 +170,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter {
       xs_core(i).module.io.externalInterrupt.msip := clint.module.io.msip(i)
       // xs_core(i).module.io.externalInterrupt.meip := RegNext(RegNext(io.meip(i)))
       xs_core(i).module.io.externalInterrupt.meip := plic.module.io.extra.get.meip(i)
+      xs_core(i).module.io.l2ToPrefetcher <> l2cache(i).module.io
     }
     // do not let dma AXI signals optimized out
     chisel3.dontTouch(dma.out.head._1)
diff --git a/src/main/scala/utils/SRAMTemplate.scala b/src/main/scala/utils/SRAMTemplate.scala
new file mode 100644
index 0000000000000000000000000000000000000000..dee1697fd64c33b046496a772b1609b64a61e8e6
--- /dev/null
+++ b/src/main/scala/utils/SRAMTemplate.scala
@@ -0,0 +1,124 @@
+/**************************************************************************************
+* Copyright (c) 2020 Institute of Computing Technology, CAS
+* Copyright (c) 2020 University of Chinese Academy of Sciences
+*
+* NutShell is licensed under Mulan PSL v2.
+* You can use this software according to the terms and conditions of the Mulan PSL v2.
+* You may obtain a copy of Mulan PSL v2 at: +* http://license.coscl.org.cn/MulanPSL2 +* +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR +* FIT FOR A PARTICULAR PURPOSE. +* +* See the Mulan PSL v2 for more details. +***************************************************************************************/ + +package utils + +import chisel3._ +import chisel3.util._ + +class SRAMBundleA(val set: Int) extends Bundle { + val setIdx = Output(UInt(log2Up(set).W)) + + def apply(setIdx: UInt) = { + this.setIdx := setIdx + this + } +} + +class SRAMBundleAW[T <: Data](private val gen: T, set: Int, val way: Int = 1) extends SRAMBundleA(set) { + val data = Output(gen) + val waymask = if (way > 1) Some(Output(UInt(way.W))) else None + + def apply(data: T, setIdx: UInt, waymask: UInt) = { + super.apply(setIdx) + this.data := data + this.waymask.map(_ := waymask) + this + } +} + +class SRAMBundleR[T <: Data](private val gen: T, val way: Int = 1) extends Bundle { + val data = Output(Vec(way, gen)) +} + +class SRAMReadBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1) extends Bundle { + val req = Decoupled(new SRAMBundleA(set)) + val resp = Flipped(new SRAMBundleR(gen, way)) + + def apply(valid: Bool, setIdx: UInt) = { + this.req.bits.apply(setIdx) + this.req.valid := valid + this + } +} + +class SRAMWriteBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1) extends Bundle { + val req = Decoupled(new SRAMBundleAW(gen, set, way)) + + def apply(valid: Bool, data: T, setIdx: UInt, waymask: UInt) = { + this.req.bits.apply(data = data, setIdx = setIdx, waymask = waymask) + this.req.valid := valid + this + } +} + +class SRAMTemplate[T <: Data](gen: T, set: Int, way: Int = 1, + shouldReset: Boolean = false, holdRead: Boolean = false, singlePort: Boolean = false) extends Module { + val io = IO(new Bundle { + val r = Flipped(new SRAMReadBus(gen, set, way)) + val w = Flipped(new SRAMWriteBus(gen, set, way)) + }) + + val wordType = UInt(gen.getWidth.W) + val array = SyncReadMem(set, Vec(way, wordType)) + val (resetState, resetSet) = (WireInit(false.B), WireInit(0.U)) + + if (shouldReset) { + val _resetState = RegInit(true.B) + val (_resetSet, resetFinish) = Counter(_resetState, set) + when (resetFinish) { _resetState := false.B } + + resetState := _resetState + resetSet := _resetSet + } + + val (ren, wen) = (io.r.req.valid, io.w.req.valid || resetState) + val realRen = (if (singlePort) ren && !wen else ren) + + val setIdx = Mux(resetState, resetSet, io.w.req.bits.setIdx) + val wdataword = Mux(resetState, 0.U.asTypeOf(wordType), io.w.req.bits.data.asUInt) + val waymask = Mux(resetState, Fill(way, "b1".U), io.w.req.bits.waymask.getOrElse("b1".U)) + val wdata = VecInit(Seq.fill(way)(wdataword)) + when (wen) { array.write(setIdx, wdata, waymask.asBools) } + + val rdata = (if (holdRead) ReadAndHold(array, io.r.req.bits.setIdx, realRen) + else array.read(io.r.req.bits.setIdx, realRen)).map(_.asTypeOf(gen)) + io.r.resp.data := VecInit(rdata) + + io.r.req.ready := !resetState && (if (singlePort) !wen else true.B) + io.w.req.ready := true.B + +} + +class SRAMTemplateWithArbiter[T <: Data](nRead: Int, gen: T, set: Int, way: Int = 1, + shouldReset: Boolean = false) extends Module { + val io = IO(new Bundle { + val r = Flipped(Vec(nRead, new SRAMReadBus(gen, set, way))) + val w = Flipped(new SRAMWriteBus(gen, set, way)) + }) + + val ram = Module(new SRAMTemplate(gen, set, 
way, shouldReset, holdRead = false, singlePort = true)) + ram.io.w <> io.w + + val readArb = Module(new Arbiter(chiselTypeOf(io.r(0).req.bits), nRead)) + readArb.io.in <> io.r.map(_.req) + ram.io.r.req <> readArb.io.out + + // latch read results + io.r.map{ case r => { + r.resp.data := HoldUnless(ram.io.r.resp.data, RegNext(r.req.fire())) + }} +} diff --git a/src/main/scala/utils/SRAMWrapper.scala b/src/main/scala/utils/SRAMWrapper.scala deleted file mode 100644 index 598dd027f8272d51bbf2ef82ab89b16cff95af70..0000000000000000000000000000000000000000 --- a/src/main/scala/utils/SRAMWrapper.scala +++ /dev/null @@ -1,193 +0,0 @@ -package utils - -import chisel3._ -import chisel3.experimental.{ChiselAnnotation, annotate} -import chisel3.util._ -import firrtl.annotations.Annotation -import freechips.rocketchip.transforms.naming.OverrideDesiredNameAnnotation - -class SRAMBundleA(val set: Int) extends Bundle { - val setIdx = Output(UInt(log2Up(set).W)) - - def apply(setIdx: UInt) = { - this.setIdx := setIdx - this - } -} - -class SRAMBundleAW[T <: Data](private val gen: T, set: Int, val way: Int = 1) extends SRAMBundleA(set) { - val data = Output(gen) - val waymask = if (way > 1) Some(Output(UInt(way.W))) else None - - def apply(data: T, setIdx: UInt, waymask: UInt) = { - super.apply(setIdx) - this.data := data - this.waymask.foreach(_ := waymask) - this - } -} - -class SRAMBundleR[T <: Data](private val gen: T, val way: Int = 1) extends Bundle { - val data = Output(Vec(way, gen)) -} - -class SRAMReadBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1) extends Bundle { - val req = Decoupled(new SRAMBundleA(set)) - val resp = Flipped(new SRAMBundleR(gen, way)) - - def apply(valid: Bool, setIdx: UInt) = { - this.req.bits.apply(setIdx) - this.req.valid := valid - this - } -} - -class SRAMWriteBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1) extends Bundle { - val req = Decoupled(new SRAMBundleAW(gen, set, way)) - - def apply(valid: Bool, data: T, setIdx: UInt, waymask: UInt) = { - this.req.bits.apply(data = data, setIdx = setIdx, waymask = waymask) - this.req.valid := valid - this - } -} - -abstract class SRAMTemplate extends Module { - def read(addr: UInt, ren: Bool): Vec[UInt] - def write(addr: UInt, wen: Bool, wdata: UInt, wmask: UInt): Unit -} - -class SinglePortSRAM(set: Int, way: Int, width: Int) extends SRAMTemplate { - val io = IO(new Bundle() { - val addr = Input(UInt(log2Up(set).W)) - val ren = Input(Bool()) - val rdata = Output(Vec(way, UInt(width.W))) - val wdata = Input(UInt(width.W)) - val wen = Input(Bool()) - val wmask = Input(UInt(way.W)) - }) - val mem = SyncReadMem(set, Vec(way, UInt(width.W))) - val addr = io.addr - when(io.wen){ - mem.write(addr, VecInit(Seq.fill(way)(io.wdata)), io.wmask.asBools()) - } - io.rdata := mem.read(addr, io.ren && !io.wen) - - override def read(addr: UInt, ren: Bool): Vec[UInt] = { - io.addr := addr - io.ren := ren - io.rdata - } - - override def write(addr: UInt, wen: Bool, wdata: UInt, wmask: UInt): Unit = { - io.addr := addr - io.wen := wen - io.wdata := wdata - io.wmask := wmask - } -} -class DualPortSRAM(set: Int, way: Int, width: Int) extends SRAMTemplate { - val io = IO(new Bundle() { - val raddr = Input(UInt(log2Up(set).W)) - val ren = Input(Bool()) - val rdata = Output(Vec(way, UInt(width.W))) - val waddr = Input(UInt(log2Up(set).W)) - val wdata = Input(UInt(width.W)) - val wen = Input(Bool()) - val wmask = Input(UInt(way.W)) - }) - val mem = SyncReadMem(set, Vec(way, UInt(width.W))) - io.rdata := 
mem.read(io.raddr, io.ren) - when(io.wen){ - mem.write(io.waddr, VecInit(Seq.fill(way)(io.wdata)), io.wmask.asBools()) - } - - override def read(addr: UInt, ren: Bool): Vec[UInt] = { - io.raddr := addr - io.ren := ren - io.rdata - } - - override def write(addr: UInt, wen: Bool, wdata: UInt, wmask: UInt): Unit = { - io.waddr := addr - io.wen := wen - io.wdata := wdata - io.wmask := wmask - } -} - -class SRAMWrapper[T <: Data] -( - sramName: String, - gen: T, set: Int, way: Int = 1, - shouldReset: Boolean = false, - holdRead: Boolean = false, - singlePort: Boolean = false -) extends Module { - val io = IO(new Bundle { - val r = Flipped(new SRAMReadBus(gen, set, way)) - val w = Flipped(new SRAMWriteBus(gen, set, way)) - }) - - val wordType = UInt(gen.getWidth.W) -// val array = SyncReadMem(set, Vec(way, wordType)) - val array: SRAMTemplate = if(singlePort) { - Module(new SinglePortSRAM(set, way, gen.getWidth)) - } else { - Module(new DualPortSRAM(set, way, gen.getWidth)) - } - val (resetState, resetSet) = (WireInit(false.B), WireInit(0.U)) - - if (shouldReset) { - val _resetState = RegInit(true.B) - val (_resetSet, resetFinish) = Counter(_resetState, set) - when (resetFinish) { _resetState := false.B } - - resetState := _resetState - resetSet := _resetSet - } - - val (ren, wen) = (io.r.req.valid, io.w.req.valid || resetState) - val realRen = ren //(if (singlePort) ren && !wen else ren) do mutex inside inner sram - - val setIdx = Mux(resetState, resetSet, - if(singlePort) Mux(io.w.req.valid, io.w.req.bits.setIdx, io.r.req.bits.setIdx) - else io.w.req.bits.setIdx - ) - val wdataword = Mux(resetState, 0.U.asTypeOf(wordType), io.w.req.bits.data.asUInt) - val waymask = Mux(resetState, Fill(way, "b1".U), io.w.req.bits.waymask.getOrElse("b1".U)) - array.write(setIdx, wen, wdataword, waymask) - - val rdataWire = if(singlePort) array.read(setIdx, realRen) else array.read(io.r.req.bits.setIdx, realRen) - - val rdata = (if(holdRead) HoldUnless(rdataWire, RegNext(realRen)) else rdataWire).map(_.asTypeOf(gen)) - io.r.resp.data := VecInit(rdata) - - io.r.req.ready := !resetState && (if (singlePort) !wen else true.B) - io.w.req.ready := true.B - - val prefix = if(singlePort) "SinglePortSRAM_" else "DualPortSRAM_" - annotate(new ChiselAnnotation { - override def toFirrtl: Annotation = OverrideDesiredNameAnnotation(s"$prefix$sramName", array.toAbsoluteTarget) - }) -} - -class SRAMTemplateWithArbiter[T <: Data](sramName: String, nRead: Int, gen: T, set: Int, way: Int = 1, - shouldReset: Boolean = false) extends Module { - val io = IO(new Bundle { - val r = Flipped(Vec(nRead, new SRAMReadBus(gen, set, way))) - val w = Flipped(new SRAMWriteBus(gen, set, way)) - }) - - val ram = Module(new SRAMWrapper(sramName, gen, set, way, shouldReset, holdRead = false, singlePort = true)) - ram.io.w <> io.w - - val readArb = Module(new Arbiter(chiselTypeOf(io.r(0).req.bits), nRead)) - readArb.io.in <> io.r.map(_.req) - ram.io.r.req <> readArb.io.out - - // latch read results - io.r.map { r => { - r.resp.data := HoldUnless(ram.io.r.resp.data, RegNext(r.req.fire())) - }} -} diff --git a/src/main/scala/xiangshan/XSCore.scala b/src/main/scala/xiangshan/XSCore.scala index bcc4bb0d1ecf06ebd6515b2fa25d516f54b2dc8d..e14211f51826b64bedec76af7a73716ef4b205df 100644 --- a/src/main/scala/xiangshan/XSCore.scala +++ b/src/main/scala/xiangshan/XSCore.scala @@ -10,7 +10,7 @@ import xiangshan.backend.exu.Exu._ import xiangshan.frontend._ import xiangshan.mem._ import xiangshan.backend.fu.HasExceptionNO -import 
xiangshan.cache.{DCache,InstrUncache, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache} +import xiangshan.cache.{DCache,InstrUncache, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache, MemoryOpConstants, MissReq} import xiangshan.cache.prefetch._ import chipsalliance.rocketchip.config import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} @@ -19,6 +19,7 @@ import freechips.rocketchip.devices.tilelink.{DevNullParams, TLError} import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} import freechips.rocketchip.amba.axi4.{AXI4Deinterleaver, AXI4Fragmenter, AXI4IdIndexer, AXI4IdentityNode, AXI4ToTL, AXI4UserYanker} import freechips.rocketchip.tile.HasFPUParameters +import sifive.blocks.inclusivecache.PrefetcherIO import utils._ case class XSCoreParameters @@ -236,7 +237,7 @@ trait HasXSParameter { // dcache prefetcher val l2PrefetcherParameters = L2PrefetcherParameters( enable = true, - _type = "stream", + _type = "bop",// "stream" or "bop" streamParams = StreamPrefetchParameters( streamCnt = 4, streamSize = 4, @@ -244,7 +245,16 @@ trait HasXSParameter { blockBytes = L2BlockSize, reallocStreamOnMissInstantly = true, cacheName = "dcache" - ) + ), + bopParams = BOPParameters( + rrTableEntries = 256, + rrTagBits = 12, + scoreBits = 5, + roundMax = 50, + badScore = 1, + blockBytes = L2BlockSize, + nEntries = dcacheParameters.nMissEntries * 2 // TODO: this is too large + ), ) } @@ -337,6 +347,7 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) { val io = IO(new Bundle { val externalInterrupt = new ExternalInterruptIO + val l2ToPrefetcher = Flipped(new PrefetcherIO(PAddrBits)) }) println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}") @@ -451,7 +462,16 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) ptw.io.sfence <> integerBlock.io.fenceio.sfence ptw.io.csr <> integerBlock.io.csrio.tlb - l2Prefetcher.io.in <> memBlock.io.toDCachePrefetch + val l2PrefetcherIn = Wire(Decoupled(new MissReq)) + if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "bop") { + l2PrefetcherIn.valid := io.l2ToPrefetcher.acquire.valid + l2PrefetcherIn.bits := DontCare + l2PrefetcherIn.bits.addr := io.l2ToPrefetcher.acquire.bits.address + l2PrefetcherIn.bits.cmd := Mux(io.l2ToPrefetcher.acquire.bits.write, MemoryOpConstants.M_XWR, MemoryOpConstants.M_XRD) + } else { + l2PrefetcherIn <> memBlock.io.toDCachePrefetch + } + l2Prefetcher.io.in <> l2PrefetcherIn if (!env.FPGAPlatform) { val debugIntReg, debugFpReg = WireInit(VecInit(Seq.fill(32)(0.U(XLEN.W)))) diff --git a/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala b/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala index fbe27ed88b13ea822da6ea0647fe6221e00d19da..654f8b3264f58c3a309743127e1d4f3608e33ab5 100644 --- a/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala +++ b/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala @@ -41,7 +41,7 @@ class Radix2Divider(len: Int) extends AbstractDivider(len) { val uopReg = RegEnable(uop, newReq) val cnt = Counter(len) - when (newReq) { + when (newReq && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn)) { state := s_log2 } .elsewhen (state === s_log2) { // `canSkipShift` is calculated as following: diff --git a/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala b/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala index 
c981982d5225811defbf5c95779275f9b14db5ec..ea8fd75724cc3525e1bd41378cd4459b4533f363 100644 --- a/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala +++ b/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala @@ -37,7 +37,9 @@ class SRT4Divider(len: Int) extends AbstractDivider(len) { switch(state){ is(s_idle){ - when(io.in.fire()){ state := Mux(divZero, s_finish, s_lzd) } + when (io.in.fire() && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn)) { + state := Mux(divZero, s_finish, s_lzd) + } } is(s_lzd){ // leading zero detection state := s_normlize diff --git a/src/main/scala/xiangshan/cache/L1plusCache.scala b/src/main/scala/xiangshan/cache/L1plusCache.scala index 67811004ab3233398cd8b231819ee9720beafca6..e503155e83fe07d0a29a903d90a5afeb89511273 100644 --- a/src/main/scala/xiangshan/cache/L1plusCache.scala +++ b/src/main/scala/xiangshan/cache/L1plusCache.scala @@ -2,7 +2,7 @@ package xiangshan.cache import chisel3._ import chisel3.util._ -import utils.{Code, RandomReplacement, HasTLDump, XSDebug, SRAMWrapper} +import utils.{Code, RandomReplacement, HasTLDump, XSDebug, SRAMTemplate} import xiangshan.{HasXSLog} import chipsalliance.rocketchip.config.Parameters @@ -130,7 +130,7 @@ class L1plusCacheDataArray extends L1plusCacheModule { io.read.ready := !rwhazard for (w <- 0 until nWays) { - val array = Module(new SRAMWrapper("L1Plus_Data", Bits((blockRows * encRowBits).W), set=nSets, way=1, + val array = Module(new SRAMTemplate(Bits((blockRows * encRowBits).W), set=nSets, way=1, shouldReset=false, holdRead=false, singlePort=singlePort)) // data write array.io.w.req.valid := io.write.bits.way_en(w) && io.write.valid @@ -209,7 +209,7 @@ class L1plusCacheMetadataArray extends L1plusCacheModule { val rmask = Mux((nWays == 1).B, (-1).asSInt, io.read.bits.way_en.asSInt).asBools def encTagBits = cacheParams.tagCode.width(tagBits) - val tag_array = Module(new SRAMWrapper("L1Plus_Meta", UInt(encTagBits.W), set=nSets, way=nWays, + val tag_array = Module(new SRAMTemplate(UInt(encTagBits.W), set=nSets, way=nWays, shouldReset=false, holdRead=false, singlePort=true)) val valid_array = Reg(Vec(nSets, UInt(nWays.W))) when (reset.toBool || io.flush) { diff --git a/src/main/scala/xiangshan/cache/dcache.scala b/src/main/scala/xiangshan/cache/dcache.scala index 2e74cefdc70d259db3798c27512af5a616b920fc..6b70b7eae7800abad12651b0950494986861a9c1 100644 --- a/src/main/scala/xiangshan/cache/dcache.scala +++ b/src/main/scala/xiangshan/cache/dcache.scala @@ -3,7 +3,7 @@ package xiangshan.cache import chisel3._ import chisel3.util._ import freechips.rocketchip.tilelink.{ClientMetadata, TLClientParameters, TLEdgeOut} -import utils.{Code, RandomReplacement, XSDebug, SRAMWrapper} +import utils.{Code, RandomReplacement, XSDebug, SRAMTemplate} import scala.math.max @@ -197,8 +197,7 @@ class DuplicatedDataArray extends AbstractDataArray io.resp(j)(w)(r) := Cat((0 until rowWords).reverse map (k => resp(k))) for (k <- 0 until rowWords) { - val array = Module(new SRAMWrapper( - "Dcache_Data", + val array = Module(new SRAMTemplate( Bits(encWordBits.W), set=nSets, way=1, @@ -245,7 +244,7 @@ class L1MetadataArray(onReset: () => L1Metadata) extends DCacheModule { val metaBits = rstVal.getWidth val encMetaBits = cacheParams.tagCode.width(metaBits) - val tag_array = Module(new SRAMWrapper("Dcache_Meta", UInt(encMetaBits.W), set=nSets, way=nWays, + val tag_array = Module(new SRAMTemplate(UInt(encMetaBits.W), set=nSets, way=nWays, shouldReset=false, holdRead=false, singlePort=true)) // tag write diff --git 
a/src/main/scala/xiangshan/cache/icache.scala b/src/main/scala/xiangshan/cache/icache.scala index eabadeea8c647b247031dc5b7b8860f14b5bf5d1..6a5124f9dc24036cf216a61dd6b814f9a73fe54f 100644 --- a/src/main/scala/xiangshan/cache/icache.scala +++ b/src/main/scala/xiangshan/cache/icache.scala @@ -191,8 +191,7 @@ class ICacheMetaArray extends ICachArray val readResp = Output(Vec(nWays,UInt(tagBits.W))) }} - val metaArray = Module(new SRAMWrapper( - "Icache_Meta", + val metaArray = Module(new SRAMTemplate( UInt(metaEntryBits.W), set=nSets, way=nWays, @@ -233,8 +232,7 @@ class ICacheDataArray extends ICachArray }} //dataEntryBits = 144 - val dataArray = List.fill(nWays){List.fill(nBanks){Module(new SRAMWrapper( - "Icache_Data", + val dataArray = List.fill(nWays){List.fill(nBanks){Module(new SRAMTemplate( UInt(dataEntryBits.W), set=nSets, way = 1, diff --git a/src/main/scala/xiangshan/cache/missQueue.scala b/src/main/scala/xiangshan/cache/missQueue.scala index ec285b1337b201105cb0c9af35d4789f9cc77789..23b4fe6bf267d948b6f9e1e59c295b4f20b33dc0 100644 --- a/src/main/scala/xiangshan/cache/missQueue.scala +++ b/src/main/scala/xiangshan/cache/missQueue.scala @@ -495,8 +495,8 @@ class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump if (!env.FPGAPlatform) { ExcitingUtils.addSource( BoolStopWatch( - start = entry.io.req.fire(), - stop = entry.io.resp.fire(), + start = entry.io.block_idx.valid, + stop = !entry.io.block_idx.valid, startHighPriority = true), "perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10), Perf diff --git a/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala b/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala index 4b06617b8245e860bff00062d3a5b00186e1a3fb..7d0e83da500cd2755d230e6f96f200f2c21f05a4 100644 --- a/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala +++ b/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala @@ -12,21 +12,24 @@ case class BOPParameters( scoreBits: Int, roundMax: Int, badScore: Int, - scores: Int = 52, + // TODO: Is 256-offset necessary, which will cross pages? 
offsetList: Seq[Int] = Seq( 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, - 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, + 15, 16/*, 18, 20, 24, 25, 27, 30, 32, 36, 40, 45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, - 250, 256 + 250, 256*/ ), - blockBytes: Int + blockBytes: Int, + nEntries: Int ) { + def scores = offsetList.length def offsetWidth = log2Up(offsetList(scores - 1)) + 1 def rrIdxBits = log2Up(rrTableEntries) def roundBits = log2Up(roundMax) def scoreMax = (1 << scoreBits) - 1 + def totalWidth = log2Up(nEntries) // id's width } class ScoreTableEntry(p: BOPParameters) extends PrefetchBundle { @@ -34,7 +37,7 @@ class ScoreTableEntry(p: BOPParameters) extends PrefetchBundle { val score = UInt(p.scoreBits.W) def apply(offset: UInt, score: UInt) = { - val entry = new ScoreTableEntry(p) + val entry = Wire(new ScoreTableEntry(p)) entry.offset := offset entry.score := score entry @@ -78,9 +81,51 @@ class TestOffsetBundle(p: BOPParameters) extends PrefetchBundle { override def cloneType: this.type = (new TestOffsetBundle(p)).asInstanceOf[this.type] } +class BestOffsetPrefetchReq(p: BOPParameters) extends PrefetchReq { + val id = UInt(p.totalWidth.W) + + override def toPrintable: Printable = { + p"addr=0x${Hexadecimal(addr)} w=${write} id=0x${Hexadecimal(id)}" + } + override def cloneType: this.type = (new BestOffsetPrefetchReq(p)).asInstanceOf[this.type] +} + +class BestOffsetPrefetchResp(p: BOPParameters) extends PrefetchResp { + val id = UInt(p.totalWidth.W) + + override def toPrintable: Printable = { + p"id=0x${Hexadecimal(id)}" + } + override def cloneType: this.type = (new BestOffsetPrefetchResp(p)).asInstanceOf[this.type] +} + +class BestOffsetPrefetchFinish(p: BOPParameters) extends PrefetchFinish { + val id = UInt(p.totalWidth.W) + + override def toPrintable: Printable = { + p"id=0x${Hexadecimal(id)}" + } + override def cloneType: this.type = (new BestOffsetPrefetchFinish(p)).asInstanceOf[this.type] +} + +class BestOffsetPrefetchIO(p: BOPParameters) extends PrefetchBundle { + val train = Flipped(ValidIO(new PrefetchTrain)) + val req = DecoupledIO(new BestOffsetPrefetchReq(p)) + val resp = Flipped(DecoupledIO(new BestOffsetPrefetchResp(p))) + val finish = DecoupledIO(new BestOffsetPrefetchFinish(p)) + + override def toPrintable: Printable = { + p"train: v=${train.valid} ${train.bits} " + + p"req: v=${req.valid} r=${req.ready} ${req.bits} " + + p"resp: v=${resp.valid} r=${resp.ready} ${resp.bits} " + + p"finish: v=${finish.valid} r=${finish.ready} ${finish.bits}" + } + override def cloneType: this.type = (new BestOffsetPrefetchIO(p)).asInstanceOf[this.type] +} + class RecentRequestTable(p: BOPParameters) extends PrefetchModule { val io = IO(new Bundle { - val w = Flipped(ValidIO(UInt(PAddrBits.W))) + val w = Flipped(DecoupledIO(UInt(PAddrBits.W))) val r = Flipped(new TestOffsetBundle(p)) }) def rrIdxBits = p.rrIdxBits @@ -108,10 +153,10 @@ class RecentRequestTable(p: BOPParameters) extends PrefetchModule { } } - val rrTable = Module(new SRAMWrapper("RR_Table", rrTableEntry(), set = rrTableEntries, way = 1, shouldReset = true)) + val rrTable = Module(new SRAMTemplate(rrTableEntry(), set = rrTableEntries, way = 1, shouldReset = true, singlePort = true)) val wAddr = io.w.bits - rrTable.io.w.req.valid := io.w.valid + rrTable.io.w.req.valid := io.w.valid && !io.r.req.valid rrTable.io.w.req.bits.setIdx := idx(wAddr) rrTable.io.w.req.bits.data.valid := true.B rrTable.io.w.req.bits.data.tag := tag(wAddr) @@ -122,32 +167,35 
@@ class RecentRequestTable(p: BOPParameters) extends PrefetchModule { rrTable.io.r.req.bits.setIdx := idx(rAddr) rData := rrTable.io.r.resp.data(0) - val rwConflict = io.w.valid && io.r.req.fire() && idx(wAddr) === idx(rAddr) - when (rwConflict) { - rrTable.io.r.req.valid := false.B - } - when (RegNext(rwConflict)) { - rData.valid := true.B - rData.tag := RegNext(tag(wAddr)) - } + val rwConflict = io.w.fire() && io.r.req.fire() && idx(wAddr) === idx(rAddr) + // when (rwConflict) { + // rrTable.io.r.req.valid := false.B + // } + // when (RegNext(rwConflict)) { + // rData.valid := true.B + // rData.tag := RegNext(tag(wAddr)) + // } + io.w.ready := rrTable.io.w.req.ready && !io.r.req.valid io.r.req.ready := true.B - io.r.resp.valid := RegNext(io.r.req.fire()) + io.r.resp.valid := RegNext(rrTable.io.r.req.fire()) io.r.resp.bits.testOffset := RegNext(io.r.req.bits.testOffset) io.r.resp.bits.ptr := RegNext(io.r.req.bits.ptr) io.r.resp.bits.hit := rData.valid && rData.tag === RegNext(tag(rAddr)) + assert(!RegNext(rwConflict), "single port SRAM should not read and write at the same time") + // debug info - XSDebug(io.w.valid, p"io.write: v=${io.w.valid} addr=0x${Hexadecimal(io.w.bits)}\n") + XSDebug(io.w.fire(), p"io.write: v=${io.w.valid} addr=0x${Hexadecimal(io.w.bits)}\n") XSDebug(p"io.read: ${io.r}\n") - XSDebug(io.w.valid, p"wAddr=0x${Hexadecimal(wAddr)} idx=${Hexadecimal(idx(wAddr))} tag=${Hexadecimal(tag(wAddr))}\n") + XSDebug(io.w.fire(), p"wAddr=0x${Hexadecimal(wAddr)} idx=${Hexadecimal(idx(wAddr))} tag=${Hexadecimal(tag(wAddr))}\n") XSDebug(io.r.req.fire(), p"rAddr=0x${Hexadecimal(rAddr)} idx=${Hexadecimal(idx(rAddr))} rData=${rData}\n") - XSDebug(rwConflict, p"write and read conflict!\n") } class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { val io = IO(new Bundle { + val req = Flipped(DecoupledIO(UInt(PAddrBits.W))) // req addr from L1 val prefetchOffset = Output(UInt(p.offsetWidth.W)) val test = new TestOffsetBundle(p) }) @@ -158,33 +206,34 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { def roundBits = p.roundBits def roundMax = p.roundMax def scoreMax = p.scoreMax + def badScore = p.badScore - val prefetchOffset = RegInit(1.U(offsetWidth)) // best offset is 1, this is, a next-line prefetcher as initialization + val prefetchOffset = RegInit(2.U(offsetWidth.W)) // best offset is 1, that is, a next-line prefetcher as initialization val st = RegInit(VecInit(offsetList.map(off => new ScoreTableEntry(p).apply(off.U, 0.U)))) val ptr = RegInit(0.U(log2Up(scores).W)) val round = RegInit(0.U(roundBits.W)) - val bestOffset = RegInit(new ScoreTableEntry(p).apply(1.U, 0.U)) // the entry with the highest score while traversing - val testOffset = WireInit(0.U(offsetWidth.W)) + val bestOffset = RegInit(new ScoreTableEntry(p).apply(2.U, 0.U)) // the entry with the highest score while traversing + val testOffset = WireInit(st(ptr).offset) def winner(e1: ScoreTableEntry, e2: ScoreTableEntry): ScoreTableEntry = { - val w = new ScoreTableEntry(p) + val w = Wire(new ScoreTableEntry(p)) w := Mux(e1.score > e2.score, e1, e2) w } - val s_idle :: s_learn :: s_finish :: Nil = Enum(3) + val s_idle :: s_learn :: Nil = Enum(2) val state = RegInit(s_idle) // 1. At the start of a learning phase // All the scores are reset to 0. + // At the end of every learning phase, the prefetch offset is updated as the one with the highest score. 
when (state === s_idle) { - when (ptr =/= scores.U) { - st(ptr).score := 0.U - ptr := ptr + 1.U - }.otherwise { - ptr := 0.U - state := s_learn - } + st.foreach(_.score := 0.U) + ptr := 0.U + round := 0.U + bestOffset.score := badScore.U + prefetchOffset := bestOffset.offset + state := s_learn } // 2. During a learning phase @@ -196,16 +245,18 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { // (1) one of the score equals SCOREMAX, or // (2) the number of rounds equals ROUNDMAX. when (state === s_learn) { - testOffset := st(ptr).offset when (io.test.req.fire()) { val roundFinish = ptr === (scores - 1).U ptr := Mux(roundFinish, 0.U, ptr + 1.U) round := Mux(roundFinish, round + 1.U, round) + + XSDebug(p"test offset ${testOffset} req fire\n") } // (2) the number of rounds equals ROUNDMAX. - when (round === roundMax.U) { - state := s_finish + when (round >= roundMax.U) { + state := s_idle + XSDebug(p"round reaches roundMax(${roundMax.U})\n") } when (io.test.resp.fire() && io.test.resp.bits.hit) { @@ -216,25 +267,148 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { st(io.test.resp.bits.ptr).score := newScore bestOffset := winner(new ScoreTableEntry(p).apply(offset, newScore), bestOffset) // (1) one of the score equals SCOREMAX - when (newScore === scoreMax.U) { - state := s_finish + when (newScore >= scoreMax.U) { + state := s_idle + XSDebug(p"newScore reaches scoreMax(${scoreMax.U})\n") } - } - } - // 3. At the end of every learning phase, the prefetch offset is updated as the one with the highest score. - when (state === s_finish) { - prefetchOffset := bestOffset.offset - ptr := 0.U - round := 0.U - bestOffset.offset := 1.U - bestOffset.score := 0.U - state := s_idle + XSDebug(p"test offset ${offset} resp fire and hit. score ${oldScore} -> ${newScore}\n") + } } + io.req.ready := true.B io.prefetchOffset := prefetchOffset - io.test.req.valid := state === s_learn && round =/= roundMax.U - io.test.req.bits.addr := DontCare // assign this outside the score table + io.test.req.valid := state === s_learn && io.req.fire() + io.test.req.bits.addr := io.req.bits io.test.req.bits.testOffset := testOffset io.test.req.bits.ptr := ptr + io.test.resp.ready := true.B + + XSDebug(p"state=${state} prefetchOffset=${prefetchOffset} ptr=${ptr} round=${round} bestOffset=${bestOffset} testOffset=${testOffset}\n") + // score table + XSDebug(p"OffsetScoreTable(idx:offset:score) as follows:\n") + for (i <- 0 until scores) { + if (i % 8 == 0) { XSDebug(p"${i.U}:${st(i)}\t") } + else if (i % 8 == 7 || i == scores - 1) { XSDebug(false, true.B, p"${i.U}:${st(i)}\n") } + else { XSDebug(false, true.B, p"${i.U}:${st(i)}\t") } + } + XSDebug(io.req.fire(), p"receive req from L1. 
io.req.bits=0x${Hexadecimal(io.req.bits)}\n") +} + +class BestOffsetPrefetchEntry(p: BOPParameters) extends PrefetchModule { + val io = IO(new Bundle { + val id = Input(UInt(p.totalWidth.W)) + val prefetchOffset = Input(UInt(p.offsetWidth.W)) + val pft = new BestOffsetPrefetchIO(p) + val inflight = ValidIO(UInt(PAddrBits.W)) + val writeRRTable = DecoupledIO(UInt(PAddrBits.W)) + }) + + def blockBytes = p.blockBytes + def getBlockAddr(addr: UInt) = Cat(addr(PAddrBits - 1, log2Up(blockBytes)), 0.U(log2Up(blockBytes).W)) + + val s_idle :: s_req :: s_resp :: s_write_recent_req :: s_finish :: Nil = Enum(5) + val state = RegInit(s_idle) + val req = RegInit(0.U.asTypeOf(new PrefetchReq)) + val baseAddr = RegInit(0.U(PAddrBits.W)) + + when (state === s_idle) { + when (io.pft.train.valid) { + state := s_req + req.addr := getBlockAddr(io.pft.train.bits.addr) + (io.prefetchOffset << log2Up(blockBytes)) + req.write := io.pft.train.bits.write + baseAddr := getBlockAddr(io.pft.train.bits.addr) + } + } + + when (state === s_req) { + when (io.pft.req.fire()) { + state := s_resp + } + } + + when (state === s_resp) { + when (io.pft.resp.fire()) { + state := s_write_recent_req + } + } + + when (state === s_write_recent_req) { + when (io.writeRRTable.fire()) { + state := s_finish + } + } + + when (state === s_finish) { + when (io.pft.finish.fire()) { + state := s_idle + } + } + + io.pft.req.valid := state === s_req + io.pft.req.bits.addr := req.addr + io.pft.req.bits.write := req.write + io.pft.req.bits.id := io.id + io.pft.resp.ready := state === s_resp + io.pft.finish.valid := state === s_finish + io.pft.finish.bits.id := io.id + io.inflight.valid := state =/= s_idle + io.inflight.bits := req.addr + io.writeRRTable.valid := state === s_write_recent_req + io.writeRRTable.bits := baseAddr // write this into recent request table + + XSDebug(p"bopEntry ${io.id}: state=${state} prefetchOffset=${io.prefetchOffset} inflight=${io.inflight.valid} 0x${Hexadecimal(io.inflight.bits)} writeRRTable: ${io.writeRRTable.valid} 0x${Hexadecimal(io.writeRRTable.bits)} baseAddr=0x${Hexadecimal(baseAddr)} req: ${req}\n") + XSDebug(p"bopEntry ${io.id}: io.pft: ${io.pft}\n") +} + +class BestOffsetPrefetch(p: BOPParameters) extends PrefetchModule { + val io = IO(new BestOffsetPrefetchIO(p)) + + def nEntries = p.nEntries + def blockBytes = p.blockBytes + def getBlockAddr(addr: UInt) = Cat(addr(PAddrBits - 1, log2Up(blockBytes)), 0.U(log2Up(blockBytes).W)) + val scoreTable = Module(new OffsetScoreTable(p)) + val rrTable = Module(new RecentRequestTable(p)) + val reqArb = Module(new Arbiter(new BestOffsetPrefetchReq(p), nEntries)) + val finishArb = Module(new Arbiter(new BestOffsetPrefetchFinish(p), nEntries)) + val writeRRTableArb = Module(new Arbiter(UInt(PAddrBits.W), nEntries)) + + val entryReadyIdx = Wire(UInt(log2Up(nEntries).W)) + val inflightMatchVec = Wire(Vec(nEntries, Bool())) + + val bopEntries = (0 until nEntries).map { i => + val bopEntry = Module(new BestOffsetPrefetchEntry(p)) + + bopEntry.io.id := i.U + bopEntry.io.prefetchOffset := scoreTable.io.prefetchOffset + + bopEntry.io.pft.train.valid := io.train.valid && i.U === entryReadyIdx && !inflightMatchVec.asUInt.orR + bopEntry.io.pft.train.bits := io.train.bits + + reqArb.io.in(i) <> bopEntry.io.pft.req + bopEntry.io.pft.resp.valid := io.resp.valid && i.U === io.resp.bits.id + bopEntry.io.pft.resp.bits := io.resp.bits + finishArb.io.in(i) <> bopEntry.io.pft.finish + + writeRRTableArb.io.in(i) <> bopEntry.io.writeRRTable + + bopEntry + } + + entryReadyIdx := 
PriorityEncoder(bopEntries.map { e => !e.io.inflight.valid }) + (0 until nEntries).foreach(i => + inflightMatchVec(i) := bopEntries(i).io.inflight.valid && bopEntries(i).io.inflight.bits === getBlockAddr(io.train.bits.addr) + ) + + io.req <> reqArb.io.out + io.resp.ready := VecInit(bopEntries.zipWithIndex.map { case (e, i) => i.U === io.resp.bits.id && e.io.pft.resp.ready }).asUInt.orR + io.finish <> finishArb.io.out + rrTable.io.w <> writeRRTableArb.io.out + rrTable.io.r <> scoreTable.io.test + scoreTable.io.req.valid := io.train.valid + scoreTable.io.req.bits := getBlockAddr(io.train.bits.addr) + + XSDebug(p"io: ${io}\n") + XSDebug(p"entryReadyIdx=${entryReadyIdx} inflightMatchVec=${Binary(inflightMatchVec.asUInt)}\n") + } diff --git a/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala b/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala index 7d11547da4f44d2caf825c4672b265eaa027050f..c76b7412b9ca689fc2700208059883c808451a28 100644 --- a/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala +++ b/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala @@ -15,13 +15,30 @@ import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters, TLEdgeOut, TLBundleA, TLBundleD, ClientStates, ClientMetadata, TLHints } +import sifive.blocks.inclusivecache.PrefetcherIO case class L2PrefetcherParameters( enable: Boolean, _type: String, - streamParams: StreamPrefetchParameters + streamParams: StreamPrefetchParameters, + bopParams: BOPParameters ) { - def nEntries: Int = streamParams.streamCnt * streamParams.streamSize + // def nEntries: Int = streamParams.streamCnt * streamParams.streamSize + def nEntries: Int = { + if (enable && _type == "stream") { streamParams.streamCnt * streamParams.streamSize } + else if (enable && _type == "bop") { bopParams.nEntries } + else 1 + } + def totalWidth: Int = { + if (enable && _type == "stream") streamParams.totalWidth + else if (enable && _type == "bop") bopParams.totalWidth + else 1 + } + def blockBytes: Int = { + if (enable && _type == "stream") streamParams.blockBytes + else if (enable && _type == "bop") bopParams.blockBytes + else 64 + } } class L2Prefetcher()(implicit p: Parameters) extends LazyModule with HasPrefetchParameters { @@ -37,18 +54,41 @@ class L2Prefetcher()(implicit p: Parameters) extends LazyModule with HasPrefetch lazy val module = new L2PrefetcherImp(this) } +class L2PrefetcherIO extends XSBundle with HasPrefetchParameters { + val in = Flipped(DecoupledIO(new MissReq)) +} + // prefetch DCache lines in L2 using StreamPrefetch class L2PrefetcherImp(outer: L2Prefetcher) extends LazyModuleImp(outer) with HasPrefetchParameters with HasXSLog { - val io = IO(new Bundle { - val in = Flipped(DecoupledIO(new MissReq)) - // prefetch - // val mem_acquire = Decoupled(new TLBundleA(edge.bundle)) - // val mem_grant = Flipped(Decoupled(new TLBundleD(edge.bundle))) - // val mem_finish = Decoupled(new TLBundleE(edge.bundle)) - }) + val io = IO(new L2PrefetcherIO) val (bus, edge) = outer.clientNode.out.head - if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "stream") { + if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "bop") { + val bopParams = l2PrefetcherParameters.bopParams + val dPrefetch = Module(new BestOffsetPrefetch(bopParams)) + dPrefetch.io.train.valid := io.in.fire() + dPrefetch.io.train.bits.addr := io.in.bits.addr + dPrefetch.io.train.bits.write := MemoryOpConstants.isWrite(io.in.bits.cmd) + dPrefetch.io.train.bits.miss := true.B + io.in.ready := true.B + + bus.a.valid := 
dPrefetch.io.req.valid + bus.a.bits := DontCare + bus.a.bits := edge.Hint( + fromSource = dPrefetch.io.req.bits.id, + toAddress = dPrefetch.io.req.bits.addr, + lgSize = log2Up(bopParams.blockBytes).U, + param = Mux(dPrefetch.io.req.bits.write, TLHints.PREFETCH_WRITE, TLHints.PREFETCH_READ) + )._2 + dPrefetch.io.req.ready := bus.a.ready + + dPrefetch.io.resp.valid := bus.d.valid + dPrefetch.io.resp.bits.id := bus.d.bits.source(bopParams.totalWidth - 1, 0) + bus.d.ready := dPrefetch.io.resp.ready + + dPrefetch.io.finish.ready := true.B + + } else if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "stream") { val streamParams = l2PrefetcherParameters.streamParams val dPrefetch = Module(new StreamPrefetch(streamParams)) dPrefetch.io.train.valid := io.in.fire() @@ -62,49 +102,44 @@ class L2PrefetcherImp(outer: L2Prefetcher) extends LazyModuleImp(outer) with Has bus.a.bits := edge.Hint( fromSource = dPrefetch.io.req.bits.id, toAddress = dPrefetch.io.req.bits.addr, - lgSize = log2Up(streamParams.blockBytes).U, + lgSize = log2Up(l2PrefetcherParameters.blockBytes).U, param = Mux(dPrefetch.io.req.bits.write, TLHints.PREFETCH_WRITE, TLHints.PREFETCH_READ) // TODO )._2 dPrefetch.io.req.ready := bus.a.ready - bus.b.ready := true.B - - bus.c.valid := false.B - bus.c.bits := DontCare - dPrefetch.io.resp.valid := bus.d.valid - dPrefetch.io.resp.bits.id := bus.d.bits.source(streamParams.totalWidth - 1, 0) + dPrefetch.io.resp.bits.id := bus.d.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) bus.d.ready := dPrefetch.io.resp.ready - bus.e.valid := false.B - bus.e.bits := DontCare dPrefetch.io.finish.ready := true.B - if (!env.FPGAPlatform) { - ExcitingUtils.addSource(bus.a.fire(), "perfCntL2PrefetchReqCnt", Perf) - def idWidth = log2Up(l2PrefetcherParameters.nEntries) - (0 until l2PrefetcherParameters.nEntries).foreach(i => - ExcitingUtils.addSource( - BoolStopWatch( - start = bus.a.fire() && dPrefetch.io.req.bits.id(streamParams.totalWidth - 1, 0) === i.U, - stop = bus.d.fire() && bus.d.bits.source(streamParams.totalWidth - 1, 0) === i.U, - startHighPriority = true - ), - "perfCntL2PrefetchPenaltyEntry" + Integer.toString(i, 10), - Perf - ) - ) - } - } else { bus.a.valid := false.B bus.a.bits := DontCare - bus.b.ready := true.B - bus.c.valid := false.B - bus.c.bits := DontCare bus.d.ready := true.B - bus.e.valid := false.B - bus.e.bits := DontCare + } + + bus.b.ready := true.B + + bus.c.valid := false.B + bus.c.bits := DontCare + + bus.e.valid := false.B + bus.e.bits := DontCare + + if (!env.FPGAPlatform) { + ExcitingUtils.addSource(bus.a.fire(), "perfCntL2PrefetchReqCnt", Perf) + (0 until l2PrefetcherParameters.nEntries).foreach(i => + ExcitingUtils.addSource( + BoolStopWatch( + start = bus.a.fire() && bus.a.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) === i.U, + stop = bus.d.fire() && bus.d.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) === i.U, + startHighPriority = true + ), + "perfCntL2PrefetchPenaltyEntry" + Integer.toString(i, 10), + Perf + ) + ) } } diff --git a/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala b/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala index 950f5676d15fef85314fd17a0a44183853f3d3dc..c64fda77ab63fb0adb5a489dfe313c7eb2213c1c 100644 --- a/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala +++ b/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala @@ -40,11 +40,11 @@ class PrefetchTrain extends PrefetchBundle { } } -class PrefetchIO extends PrefetchBundle { - val train = Flipped(ValidIO(new PrefetchTrain)) - val req 
= DecoupledIO(new PrefetchReq) - val resp = Flipped(DecoupledIO(new PrefetchResp)) -} +// class PrefetchIO extends PrefetchBundle { +// val train = Flipped(ValidIO(new PrefetchTrain)) +// val req = DecoupledIO(new PrefetchReq) +// val resp = Flipped(DecoupledIO(new PrefetchResp)) +// } // class FakePrefetcher extends PrefetchModule { // val io = IO(new PrefetchIO) diff --git a/src/main/scala/xiangshan/cache/ptw.scala b/src/main/scala/xiangshan/cache/ptw.scala index 036b149ca966413e8347f0d4f8157899dd5ccaf7..e553bc6d1927b341f1583fc7c480d27f5d274bf5 100644 --- a/src/main/scala/xiangshan/cache/ptw.scala +++ b/src/main/scala/xiangshan/cache/ptw.scala @@ -323,8 +323,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ // two level: l2-tlb-cache && pde/pte-cache // l2-tlb-cache is ram-larger-edition tlb // pde/pte-cache is cache of page-table, speeding up ptw - val tlbl2 = Module(new SRAMWrapper( - "L2TLB", + val tlbl2 = Module(new SRAMTemplate( new L2TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen), set = TlbL2LineNum, singlePort = true @@ -339,8 +338,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ val ptwl1 = Reg(Vec(PtwL1EntrySize, new PtwEntry(tagLen = PtwL1TagLen))) val l1v = RegInit(0.U(PtwL1EntrySize.W)) // valid val l1g = Reg(UInt(PtwL1EntrySize.W)) - val ptwl2 = Module(new SRAMWrapper( - "L2PTW", + val ptwl2 = Module(new SRAMTemplate( new PtwEntries(num = PtwL2LineSize, tagLen = PtwL2TagLen), set = PtwL2LineNum, singlePort = true diff --git a/src/main/scala/xiangshan/frontend/Bim.scala b/src/main/scala/xiangshan/frontend/Bim.scala index f1945d98c3a56a03203e6198204ba6f39f6c2ea8..f15bfff44354bcfa65ee851d5882e1c38a5e34b6 100644 --- a/src/main/scala/xiangshan/frontend/Bim.scala +++ b/src/main/scala/xiangshan/frontend/Bim.scala @@ -34,7 +34,7 @@ class BIM extends BasePredictor with BimParams { val bimAddr = new TableAddr(log2Up(BimSize), BimBanks) val bim = List.fill(BimBanks) { - Module(new SRAMWrapper("Bim", UInt(2.W), set = nRows, shouldReset = false, holdRead = true)) + Module(new SRAMTemplate(UInt(2.W), set = nRows, shouldReset = false, holdRead = true)) } val doing_reset = RegInit(true.B) diff --git a/src/main/scala/xiangshan/frontend/Btb.scala b/src/main/scala/xiangshan/frontend/Btb.scala index ca274a5993a5126804539dda087336d6a0487ae0..66a6eb35a057756dec535ea3a4a75222d9392829 100644 --- a/src/main/scala/xiangshan/frontend/Btb.scala +++ b/src/main/scala/xiangshan/frontend/Btb.scala @@ -78,15 +78,15 @@ class BTB extends BasePredictor with BTBParams{ val data = List.fill(BtbWays) { List.fill(BtbBanks) { - Module(new SRAMWrapper("BTB_Data", new BtbDataEntry, set = nRows, shouldReset = true, holdRead = true)) + Module(new SRAMTemplate(new BtbDataEntry, set = nRows, shouldReset = true, holdRead = true)) } } val meta = List.fill(BtbWays) { List.fill(BtbBanks) { - Module(new SRAMWrapper("BTB_Meta", new BtbMetaEntry, set = nRows, shouldReset = true, holdRead = true)) + Module(new SRAMTemplate(new BtbMetaEntry, set = nRows, shouldReset = true, holdRead = true)) } } - val edata = Module(new SRAMWrapper("BTB_Edata", UInt(VAddrBits.W), set = extendedNRows, shouldReset = true, holdRead = true)) + val edata = Module(new SRAMTemplate(UInt(VAddrBits.W), set = extendedNRows, shouldReset = true, holdRead = true)) val if1_mask = io.inMask val if2_mask = RegEnable(if1_mask, io.pc.valid) diff --git a/src/main/scala/xiangshan/frontend/SC.scala b/src/main/scala/xiangshan/frontend/SC.scala index 5960e8d4abde0cfb719b481b96835dceb99546c1..1f4c396c9dcefcaf25d976ef8668c4b572ba986b 100644 --- 
a/src/main/scala/xiangshan/frontend/SC.scala +++ b/src/main/scala/xiangshan/frontend/SC.scala @@ -44,7 +44,7 @@ class SCTable(val nRows: Int, val ctrBits: Int, val histLen: Int) extends BaseSC val table = List.fill(TageBanks) { List.fill(2) { - Module(new SRAMWrapper("SC_Table", SInt(ctrBits.W), set=nRows, shouldReset=false, holdRead=true, singlePort=false)) + Module(new SRAMTemplate(SInt(ctrBits.W), set=nRows, shouldReset=false, holdRead=true, singlePort=false)) } } diff --git a/src/main/scala/xiangshan/frontend/Tage.scala b/src/main/scala/xiangshan/frontend/Tage.scala index 54afc421dc67a18d71452593727e4baa48a5a305..743e957b76aa6a790ea54524eec734cee36f02f8 100644 --- a/src/main/scala/xiangshan/frontend/Tage.scala +++ b/src/main/scala/xiangshan/frontend/Tage.scala @@ -162,7 +162,7 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio val hi_us = List.fill(TageBanks)(Module(new HL_Bank(nRows))) val lo_us = List.fill(TageBanks)(Module(new HL_Bank(nRows))) - val table = List.fill(TageBanks)(Module(new SRAMWrapper(s"TageTable_H${histLen}_T${tagLen}", new TageEntry, set=nRows, shouldReset=false, holdRead=true, singlePort=false))) + val table = List.fill(TageBanks)(Module(new SRAMTemplate(new TageEntry, set=nRows, shouldReset=false, holdRead=true, singlePort=false))) val if3_hi_us_r = WireInit(0.U.asTypeOf(Vec(TageBanks, Bool()))) val if3_lo_us_r = WireInit(0.U.asTypeOf(Vec(TageBanks, Bool()))) diff --git a/src/main/scala/xiangshan/frontend/jbtac.scala b/src/main/scala/xiangshan/frontend/jbtac.scala index f22f9497ae236cc444d669c382fac34c70ed4f63..9e0735959d4ffd6b7b5ce5f2ecbe120243d2a8e1 100644 --- a/src/main/scala/xiangshan/frontend/jbtac.scala +++ b/src/main/scala/xiangshan/frontend/jbtac.scala @@ -55,7 +55,7 @@ class JBTAC extends XSModule { val isRVC = Bool() } - val jbtac = List.fill(JbtacBanks)(Module(new SRAMWrapper("JBTac", jbtacEntry(), set = JbtacSize / JbtacBanks, shouldReset = true, holdRead = true, singlePort = false))) + val jbtac = List.fill(JbtacBanks)(Module(new SRAMTemplate(jbtacEntry(), set = JbtacSize / JbtacBanks, shouldReset = true, holdRead = true, singlePort = false))) val readEntries = Wire(Vec(JbtacBanks, jbtacEntry())) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index 9040919b2035ab71c42a474e26bf9e8dbea22e57..5329953eccc697d601a4fb6c38ae9b1c5826f46b 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -163,109 +163,43 @@ class LoadQueue extends XSModule io.loadIn(i).bits.forwardData.asUInt, io.loadIn(i).bits.forwardMask.asUInt, io.loadIn(i).bits.mmio - ) - }.otherwise { - XSInfo(io.loadIn(i).valid, "load hit write to cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n", - io.loadIn(i).bits.uop.lqIdx.asUInt, - io.loadIn(i).bits.uop.cf.pc, - io.loadIn(i).bits.vaddr, - io.loadIn(i).bits.paddr, - io.loadIn(i).bits.data, - io.loadIn(i).bits.mask, - io.loadIn(i).bits.forwardData.asUInt, - io.loadIn(i).bits.forwardMask.asUInt, - io.loadIn(i).bits.mmio - ) - } - val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value - datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - - val loadWbData = Wire(new LQDataEntry) - loadWbData.paddr := io.loadIn(i).bits.paddr - loadWbData.mask := io.loadIn(i).bits.mask - loadWbData.data := io.loadIn(i).bits.data // fwd data 
- loadWbData.fwdMask := io.loadIn(i).bits.forwardMask - dataModule.io.wbWrite(i, loadWbIndex, loadWbData) - dataModule.io.wb.wen(i) := true.B - - vaddrModule.io.waddr(i) := loadWbIndex - vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr - vaddrModule.io.wen(i) := true.B - - debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio - - val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - miss(loadWbIndex) := dcacheMissed - pending(loadWbIndex) := io.loadIn(i).bits.mmio - uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime - } + ) + }.otherwise { + XSInfo(io.loadIn(i).valid, "load hit write to cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n", + io.loadIn(i).bits.uop.lqIdx.asUInt, + io.loadIn(i).bits.uop.cf.pc, + io.loadIn(i).bits.vaddr, + io.loadIn(i).bits.paddr, + io.loadIn(i).bits.data, + io.loadIn(i).bits.mask, + io.loadIn(i).bits.forwardData.asUInt, + io.loadIn(i).bits.forwardMask.asUInt, + io.loadIn(i).bits.mmio + )} + val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value + datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + + val loadWbData = Wire(new LQDataEntry) + loadWbData.paddr := io.loadIn(i).bits.paddr + loadWbData.mask := io.loadIn(i).bits.mask + loadWbData.data := io.loadIn(i).bits.data // fwd data + loadWbData.fwdMask := io.loadIn(i).bits.forwardMask + dataModule.io.wbWrite(i, loadWbIndex, loadWbData) + dataModule.io.wb.wen(i) := true.B + + vaddrModule.io.waddr(i) := loadWbIndex + vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr + vaddrModule.io.wen(i) := true.B + + debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio + + val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + miss(loadWbIndex) := dcacheMissed + pending(loadWbIndex) := io.loadIn(i).bits.mmio + uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime } - - /** - * Cache miss request - * - * (1) writeback: miss - * (2) send to dcache: listing - * (3) dcache response: datavalid - * (4) writeback to ROB: writeback - */ - // val inflightReqs = RegInit(VecInit(Seq.fill(cfg.nLoadMissEntries)(0.U.asTypeOf(new InflightBlockInfo)))) - // val inflightReqFull = inflightReqs.map(req => req.valid).reduce(_&&_) - // val reqBlockIndex = PriorityEncoder(~VecInit(inflightReqs.map(req => req.valid)).asUInt) - - // val missRefillSelVec = VecInit( - // (0 until LoadQueueSize).map{ i => - // val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(dataModule.io.rdata(i).paddr)).reduce(_||_) - // allocated(i) && miss(i) && !inflight - // }) - - // val missRefillSel = getFirstOne(missRefillSelVec, deqMask) - // val missRefillBlockAddr = get_block_addr(dataModule.io.rdata(missRefillSel).paddr) - // io.dcache.req.valid := missRefillSelVec.asUInt.orR - // io.dcache.req.bits.cmd := MemoryOpConstants.M_XRD - // io.dcache.req.bits.addr := missRefillBlockAddr - // io.dcache.req.bits.data := DontCare - // io.dcache.req.bits.mask := DontCare - - // io.dcache.req.bits.meta.id := DontCare - // io.dcache.req.bits.meta.vaddr := DontCare // dataModule.io.rdata(missRefillSel).vaddr - // io.dcache.req.bits.meta.paddr := missRefillBlockAddr - // io.dcache.req.bits.meta.uop := uop(missRefillSel) - // io.dcache.req.bits.meta.mmio := false.B // dataModule.io.rdata(missRefillSel).mmio - // io.dcache.req.bits.meta.tlb_miss := false.B - // io.dcache.req.bits.meta.mask := DontCare - // 
io.dcache.req.bits.meta.replay := false.B - - // assert(!(dataModule.io.rdata(missRefillSel).mmio && io.dcache.req.valid)) - - // when(io.dcache.req.fire()) { - // miss(missRefillSel) := false.B - // listening(missRefillSel) := true.B - - // mark this block as inflight - // inflightReqs(reqBlockIndex).valid := true.B - // inflightReqs(reqBlockIndex).block_addr := missRefillBlockAddr - // assert(!inflightReqs(reqBlockIndex).valid) - // } - - // when(io.dcache.resp.fire()) { - // val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)).reduce(_||_) - // assert(inflight) - // for (i <- 0 until cfg.nLoadMissEntries) { - // when (inflightReqs(i).valid && inflightReqs(i).block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)) { - // inflightReqs(i).valid := false.B - // } - // } - // } - - - // when(io.dcache.req.fire()){ - // XSDebug("miss req: pc:0x%x roqIdx:%d lqIdx:%d (p)addr:0x%x vaddr:0x%x\n", - // io.dcache.req.bits.meta.uop.cf.pc, io.dcache.req.bits.meta.uop.roqIdx.asUInt, io.dcache.req.bits.meta.uop.lqIdx.asUInt, - // io.dcache.req.bits.addr, io.dcache.req.bits.meta.vaddr - // ) - // } + } when(io.dcache.valid) { XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data) @@ -303,7 +237,7 @@ class LoadQueue extends XSModule } val loadWbSel = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W))) // index selected last cycle - val loadWbSelV = RegInit(VecInit(List.fill(LoadPipelineWidth)(false.B))) // index selected in last cycle is valid + val loadWbSelV = Wire(Vec(LoadPipelineWidth, Bool())) // index selected in last cycle is valid val loadWbSelVec = VecInit((0 until LoadQueueSize).map(i => { allocated(i) && !writebacked(i) && datavalid(i) @@ -329,17 +263,11 @@ class LoadQueue extends XSModule loadWbSelVGen(1) := loadOddSelVec.asUInt.orR (0 until LoadPipelineWidth).map(i => { - val canGo = io.ldout(i).fire() || !loadWbSelV(i) - val valid = loadWbSelVGen(i) loadWbSel(i) := RegNext(loadWbSelGen(i)) + loadWbSelV(i) := RegNext(loadWbSelVGen(i), init = false.B) when(io.ldout(i).fire()){ // Mark them as writebacked, so they will not be selected in the next cycle writebacked(loadWbSel(i)) := true.B - // update loadWbSelValidReg - loadWbSelV(i) := false.B - } - when(valid && canGo){ - loadWbSelV(i) := true.B } }) @@ -440,7 +368,9 @@ class LoadQueue extends XSModule * Besides, load instructions in LoadUnit_S1 and S2 are also checked. * Cycle 1: Redirect Generation * There're three possible types of violations. Choose the oldest load. - * Set io.redirect according to the detected violation. + * Prepare redirect request according to the detected violation. + * Cycle 2: Redirect Fire + * Fire redirect request (if valid) */ io.load_s1 := DontCare def detectRollback(i: Int) = { @@ -540,18 +470,29 @@ class LoadQueue extends XSModule val rollbackSelected = ParallelOperation(rollback, rollbackSel) val lastCycleRedirect = RegNext(io.brqRedirect) + // S2: select rollback and generate rollback request // Note that we use roqIdx - 1.U to flush the load instruction itself. // Thus, here if last cycle's roqIdx equals to this cycle's roqIdx, it still triggers the redirect. 
- io.rollback.valid := rollbackSelected.valid && + val rollbackGen = Wire(Valid(new Redirect)) + val rollbackReg = Reg(Valid(new Redirect)) + rollbackGen.valid := rollbackSelected.valid && (!lastCycleRedirect.valid || !isAfter(rollbackSelected.bits.roqIdx, lastCycleRedirect.bits.roqIdx)) && !(lastCycleRedirect.valid && lastCycleRedirect.bits.isUnconditional()) - io.rollback.bits.roqIdx := rollbackSelected.bits.roqIdx - io.rollback.bits.level := RedirectLevel.flush - io.rollback.bits.interrupt := DontCare - io.rollback.bits.pc := DontCare - io.rollback.bits.target := rollbackSelected.bits.cf.pc - io.rollback.bits.brTag := rollbackSelected.bits.brTag + rollbackGen.bits.roqIdx := rollbackSelected.bits.roqIdx + rollbackGen.bits.level := RedirectLevel.flush + rollbackGen.bits.interrupt := DontCare + rollbackGen.bits.pc := DontCare + rollbackGen.bits.target := rollbackSelected.bits.cf.pc + rollbackGen.bits.brTag := rollbackSelected.bits.brTag + + rollbackReg := rollbackGen + + // S3: fire rollback request + io.rollback := rollbackReg + io.rollback.valid := rollbackReg.valid && + (!lastCycleRedirect.valid || !isAfter(rollbackReg.bits.roqIdx, lastCycleRedirect.bits.roqIdx)) && + !(lastCycleRedirect.valid && lastCycleRedirect.bits.isUnconditional()) when(io.rollback.valid) { XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.pc, io.rollback.bits.roqIdx.asUInt) diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index 3ef7a6863141207539cd92bb932e0e3429a6ea61..c4868d2783c8fb500ef91192b87ab2d655e5a942 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -102,6 +102,18 @@ class StoreUnit_S1 extends XSModule { } class StoreUnit_S2 extends XSModule { + val io = IO(new Bundle() { + val in = Flipped(Decoupled(new LsPipelineBundle)) + val out = Decoupled(new LsPipelineBundle) + }) + + io.in.ready := true.B + io.out.bits := io.in.bits + io.out.valid := io.in.valid + +} + +class StoreUnit_S3 extends XSModule { val io = IO(new Bundle() { val in = Flipped(Decoupled(new LsPipelineBundle)) val stout = DecoupledIO(new ExuOutput) // writeback store @@ -134,6 +146,7 @@ class StoreUnit extends XSModule { val store_s0 = Module(new StoreUnit_S0) val store_s1 = Module(new StoreUnit_S1) val store_s2 = Module(new StoreUnit_S2) + val store_s3 = Module(new StoreUnit_S3) store_s0.io.in <> io.stin store_s0.io.dtlbReq <> io.dtlb.req @@ -146,7 +159,9 @@ class StoreUnit extends XSModule { PipelineConnect(store_s1.io.out, store_s2.io.in, true.B, store_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect)) - store_s2.io.stout <> io.stout + PipelineConnect(store_s2.io.out, store_s3.io.in, true.B, store_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect)) + + store_s3.io.stout <> io.stout private def printPipeLine(pipeline: LsPipelineBundle, cond: Bool, name: String): Unit = { XSDebug(cond,