diff --git a/.github/workflows/emu.yml b/.github/workflows/emu.yml index e1167de24eaae9583ec7967a7803be4bb8b35390..85c9d8b7fabea9d94bbb4a7316908fde582fc550 100644 --- a/.github/workflows/emu.yml +++ b/.github/workflows/emu.yml @@ -39,7 +39,7 @@ jobs: echo "AM_HOME=/home/ci-runner/xsenv/nexus-am" >> $GITHUB_ENV - name: Build EMU run: - make ./build/emu SIM_ARGS=--disable-all NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME -j220 + make ./build/emu SIM_ARGS=--disable-log NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME B=0 E=0 -j220 - name: Run cputest run: | CPU_TEST_DIR=$AM_HOME/tests/cputest @@ -49,7 +49,7 @@ jobs: do t=${test%.c} echo $t - make -C $CPU_TEST_DIR ALL=$t ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME run 2>/dev/null | grep "HIT GOOD TRAP" + make -C $CPU_TEST_DIR ALL=$t ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME run B=0 E=0 | grep "HIT GOOD TRAP" if [[ $? != 0 ]]; then echo $t fail @@ -59,10 +59,10 @@ jobs: exit $ret - name: Run riscv-tests run: | - make -C $RVTEST_HOME/isa/ SUITES+=rv64ui SUITES+=rv64um SUITES+=rv64ua SUITES+=rv64uf SUITES+=rv64ud NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME noop_run 2> /dev/null + make -C $RVTEST_HOME/isa/ SUITES+=rv64ui SUITES+=rv64um SUITES+=rv64ua SUITES+=rv64uf SUITES+=rv64ud NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME noop_run B=0 E=0 - name: Run microbench run: | - make -C $AM_HOME/apps/microbench ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME mainargs=test run 2> /dev/null + make -C $AM_HOME/apps/microbench ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME mainargs=test run B=0 E=0 - name: Run coremark run: | - make -C $AM_HOME/apps/coremark ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME run 2> /dev/null + make -C $AM_HOME/apps/coremark ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME run B=0 E=0 diff --git a/Makefile b/Makefile index fb0610fc444fc8e44b38469bee1c3316db41b635..f652d0ecf7c1d43e4cb2ab446a6df360c22511e3 100644 --- a/Makefile +++ b/Makefile @@ -27,17 +27,16 @@ help: $(TOP_V): $(SCALA_FILE) mkdir -p $(@D) - mill XiangShan.test.runMain $(SIMTOP) -X verilog -td $(@D) --full-stacktrace --output-file $(@F) --disable-all --fpga-platform --remove-assert $(SIM_ARGS) - # mill XiangShan.runMain top.$(TOP) -X verilog -td $(@D) --output-file $(@F) --infer-rw $(FPGATOP) --repl-seq-mem -c:$(FPGATOP):-o:$(@D)/$(@F).conf - # $(MEM_GEN) $(@D)/$(@F).conf >> $@ + mill XiangShan.test.runMain $(SIMTOP) -td $(@D) --full-stacktrace --output-file $(@F) --disable-all --fpga-platform --remove-assert --infer-rw --repl-seq-mem -c:$(SIMTOP):-o:$(@D)/$(@F).conf $(SIM_ARGS) + $(MEM_GEN) $(@D)/$(@F).conf >> $@ # sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g' $@ - # @git log -n 1 >> .__head__ - # @git diff >> .__diff__ - # @sed -i 's/^/\/\// ' .__head__ - # @sed -i 's/^/\/\//' .__diff__ - # @cat .__head__ .__diff__ $@ > .__out__ - # @mv .__out__ $@ - # @rm .__head__ .__diff__ + @git log -n 1 >> .__head__ + @git diff >> .__diff__ + @sed -i 's/^/\/\// ' .__head__ + @sed -i 's/^/\/\//' .__diff__ + @cat .__head__ .__diff__ $@ > .__out__ + @mv .__out__ $@ + @rm .__head__ .__diff__ deploy: build/top.zip diff --git a/block-inclusivecache-sifive b/block-inclusivecache-sifive index ab2a8e8afd162b601d9f749e6e6af452cccc03a7..cf429e420be6702a2e24b9b91910366187c103b4 160000 --- a/block-inclusivecache-sifive +++ b/block-inclusivecache-sifive @@ -1 +1 @@ -Subproject commit ab2a8e8afd162b601d9f749e6e6af452cccc03a7 +Subproject commit cf429e420be6702a2e24b9b91910366187c103b4 diff --git a/debug/Makefile b/debug/Makefile index 4253105ee2dd321429354b8ed75e3889a5bf863a..b971ac1adbd5f0f2b28d223e19069069a7ae1017 100644 --- a/debug/Makefile +++ b/debug/Makefile @@ -3,7 +3,7 @@ NANOS_HOME ?= $(AM_HOME)/../nanos-lite SINGLETEST = ALL=min3 B ?= 0 -E ?= 0 +E ?= -1 V ?= OFF #V ?= OFF EMU_ARGS = B=$(B) E=$(E) V=$(V) diff --git a/debug/env.sh b/debug/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..aa7989c3fc983c1a1a13c3f291d1c2ed6d907759 --- /dev/null +++ b/debug/env.sh @@ -0,0 +1,2 @@ +export NOOP_HOME=$(pwd)/.. +echo $NOOP_HOME diff --git a/scripts/utils/lock-emu.c b/scripts/utils/lock-emu.c index 57620ffa89812b91ef98b5c39746c6206fc142cd..075f7806034cf4b92d2eaa2041ab1af47c4b0d33 100644 --- a/scripts/utils/lock-emu.c +++ b/scripts/utils/lock-emu.c @@ -23,7 +23,9 @@ int main(int argc, char* argv[]){ fd = tryLock(argv[1]); if(fd > 0){ getlogin_r(user, BUF_SIZE); - write(fd, user, strlen(user)); + int len = strlen(user); + user[len] = '\0'; + write(fd, user, len+1); break; } else { // someone is holding the lock... diff --git a/src/main/scala/device/AXI4Flash.scala b/src/main/scala/device/AXI4Flash.scala index aeee36b7c423d22914771d02d5fe393e35fb0a6d..90b7f3a0a4d034f80c2f08b81a64c7cad451be7a 100644 --- a/src/main/scala/device/AXI4Flash.scala +++ b/src/main/scala/device/AXI4Flash.scala @@ -25,10 +25,12 @@ class AXI4Flash ) def getOffset(addr: UInt) = addr(12,0) - val rdata = Wire(UInt(64.W)) - RegMap.generate(mapping, getOffset(raddr), rdata, - getOffset(waddr), in.w.fire(), in.w.bits.data, MaskExpand(in.w.bits.strb)) + val rdata = Wire(Vec(2,UInt(32.W))) + (0 until 2).map{ i => + RegMap.generate(mapping, getOffset(raddr + (i * 4).U), rdata(i), + getOffset(waddr), in.w.fire(), in.w.bits.data, MaskExpand(in.w.bits.strb)) + } - in.r.bits.data := Fill(2, rdata(31,0)) + in.r.bits.data := rdata.asUInt } } diff --git a/src/main/scala/system/SoC.scala b/src/main/scala/system/SoC.scala index 917ac31abf51ee7756b112214fb49a3ef959d9ce..ba5e6efb7a89603469816d58729324e743c0c0c0 100644 --- a/src/main/scala/system/SoC.scala +++ b/src/main/scala/system/SoC.scala @@ -6,7 +6,7 @@ import chisel3._ import chisel3.util._ import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink.{BankBinder, TLBuffer, TLBundleParameters, TLCacheCork, TLClientNode, TLFilter, TLFuzzer, TLIdentityNode, TLToAXI4, TLWidthWidget, TLXbar} -import utils.DebugIdentityNode +import utils.{DebugIdentityNode, DataDontCareNode} import utils.XSInfo import xiangshan.{HasXSParameter, XSCore, HasXSLog} import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} @@ -61,7 +61,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter { cacheName = s"L2" ), InclusiveCacheMicroParameters( - writeBytes = 8 + writeBytes = 32 ) ))) @@ -79,7 +79,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter { cacheName = "L3" ), InclusiveCacheMicroParameters( - writeBytes = 8 + writeBytes = 32 ) )).node @@ -101,7 +101,8 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter { l2_xbar(i) := TLBuffer() := DebugIdentityNode() := xs_core(i).ptw.node l2_xbar(i) := TLBuffer() := DebugIdentityNode() := xs_core(i).l2Prefetcher.clientNode mmioXbar := TLBuffer() := DebugIdentityNode() := xs_core(i).memBlock.uncache.clientNode - l2cache(i).node := TLBuffer() := DebugIdentityNode() := l2_xbar(i) + mmioXbar := TLBuffer() := DebugIdentityNode() := xs_core(i).frontend.instrUncache.clientNode + l2cache(i).node := DataDontCareNode(a = true, b = true) := TLBuffer() := DebugIdentityNode() := l2_xbar(i) l3_xbar := TLBuffer() := DebugIdentityNode() := l2cache(i).node } @@ -169,6 +170,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter { xs_core(i).module.io.externalInterrupt.msip := clint.module.io.msip(i) // xs_core(i).module.io.externalInterrupt.meip := RegNext(RegNext(io.meip(i))) xs_core(i).module.io.externalInterrupt.meip := plic.module.io.extra.get.meip(i) + xs_core(i).module.io.l2ToPrefetcher <> l2cache(i).module.io } // do not let dma AXI signals optimized out chisel3.dontTouch(dma.out.head._1) diff --git a/src/main/scala/top/XiangShanStage.scala b/src/main/scala/top/XiangShanStage.scala index 8c61e2b671c9dde850240b3c2a620797287b4d72..87852db90d0f80a75e492b17d45ae115d4bbbca1 100644 --- a/src/main/scala/top/XiangShanStage.scala +++ b/src/main/scala/top/XiangShanStage.scala @@ -5,6 +5,7 @@ import firrtl.AnnotationSeq import firrtl.annotations.NoTargetAnnotation import firrtl.options.{HasShellOptions, Shell, ShellOption} import firrtl.stage.{FirrtlCli, RunFirrtlTransformAnnotation} +import freechips.rocketchip.transforms.naming.{OverrideDesiredNameAnnotation, RenameDesiredNames} import xstransforms.ShowPrintTransform import xstransforms.PrintModuleName @@ -93,7 +94,8 @@ object XiangShanStage { args, annotations ++ Seq( RunFirrtlTransformAnnotation(new ShowPrintTransform), - RunFirrtlTransformAnnotation(new PrintModuleName) + RunFirrtlTransformAnnotation(new PrintModuleName), + RunFirrtlTransformAnnotation(new RenameDesiredNames) ) ) } diff --git a/src/main/scala/utils/DataDontCareNode.scala b/src/main/scala/utils/DataDontCareNode.scala new file mode 100644 index 0000000000000000000000000000000000000000..53e7e01bc653c913a54e4bce9a08b9b95d685772 --- /dev/null +++ b/src/main/scala/utils/DataDontCareNode.scala @@ -0,0 +1,44 @@ +package utils + +import chisel3._ +import chipsalliance.rocketchip.config.Parameters +import chisel3.util.DecoupledIO +import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} +import freechips.rocketchip.tilelink.{TLBundle, TLClientNode, TLIdentityNode, TLMasterParameters, TLMasterPortParameters} +import xiangshan.HasXSLog + +class DataDontCareNode(a: Boolean = false, b: Boolean = false, c: Boolean = false, d: Boolean = false)(implicit p: Parameters) extends LazyModule { + + val node = TLIdentityNode() + + val n = TLClientNode(Seq(TLMasterPortParameters.v1( + Seq( + TLMasterParameters.v1("DataDontCareNode") + ) + ))) + + lazy val module = new LazyModuleImp(this) with HasXSLog with HasTLDump{ + val (out, _) = node.out(0) + val (in, _) = node.in(0) + + if (a) { + out.a.bits.data := DontCare + } + if (b) { + in.b.bits.data := DontCare + } + if (c) { + out.c.bits.data := DontCare + } + if (d) { + in.d.bits.data := DontCare + } + } +} + +object DataDontCareNode { + def apply(a: Boolean = false, b: Boolean = false, c: Boolean = false, d: Boolean = false)(implicit p: Parameters): TLIdentityNode = { + val dataDontCareNode = LazyModule(new DataDontCareNode(a, b, c, d)) + dataDontCareNode.node + } +} diff --git a/src/main/scala/utils/ExcitingUtils.scala b/src/main/scala/utils/ExcitingUtils.scala index 2c53d00a9cb68a6abf5181d91463265f52576cd8..4ab83770144a2131def2ace26ab81bdc9ee47492 100644 --- a/src/main/scala/utils/ExcitingUtils.scala +++ b/src/main/scala/utils/ExcitingUtils.scala @@ -23,14 +23,15 @@ object ExcitingUtils { ( var connType: ConnectionType, var sourceModule: Option[String] = None, - var sinkModule: Option[String] = None + var sinkModule: Option[String] = None, + var warned: Boolean = false ){ override def toString: String = s"type:[$connType] source location:[${sourceModule.getOrElse(strToErrorMsg("Not Found"))}]" + s" sink location:[${sinkModule.getOrElse(strToErrorMsg("Not Found"))}]" - def isLeagleConnection: Boolean = sourceModule.nonEmpty && sinkModule.nonEmpty + def isLegalConnection: Boolean = sourceModule.nonEmpty && sinkModule.nonEmpty } private val map = mutable.LinkedHashMap[String, Connection]() @@ -44,6 +45,10 @@ object ExcitingUtils { uniqueName: Boolean = false ): String = { val conn = map.getOrElseUpdate(name, new Connection(connType)) + if (!conn.sourceModule.isEmpty && !conn.warned) { + println(s"[WARN] Signal |$name| has multiple sources") + conn.warned = true + } require(conn.connType == connType) conn.sourceModule = Some(component.parentModName) BoringUtils.addSource(component, name, disableDedup, uniqueName) @@ -58,6 +63,10 @@ object ExcitingUtils { forceExists: Boolean = false ): Unit = { val conn = map.getOrElseUpdate(name, new Connection(connType)) + if (!conn.sinkModule.isEmpty && !conn.warned) { + println(s"[WARN] Signal |$name| has multiple sinks") + conn.warned = true + } require(conn.connType == connType) conn.sinkModule = Some(component.parentModName) BoringUtils.addSink(component, name, disableDedup, forceExists) @@ -77,14 +86,14 @@ object ExcitingUtils { def checkAndDisplay(): Unit = { - var leagle = true + var legal = true val buf = new mutable.StringBuilder() for((id, conn) <- map){ buf ++= s"Connection:[$id] $conn\n" - if(!conn.isLeagleConnection) leagle = false + if(!conn.isLegalConnection) legal = false } print(buf) - require(leagle, strToErrorMsg("Error: Illeagle connection found!")) + require(legal, strToErrorMsg("Error: Illegal connection found!")) } } diff --git a/src/main/scala/utils/SRAMTemplate.scala b/src/main/scala/utils/SRAMTemplate.scala index ce894d41a73bdb1cc2e1ee57a26d4e3512d8fcde..dee1697fd64c33b046496a772b1609b64a61e8e6 100644 --- a/src/main/scala/utils/SRAMTemplate.scala +++ b/src/main/scala/utils/SRAMTemplate.scala @@ -1,3 +1,19 @@ +/************************************************************************************** +* Copyright (c) 2020 Institute of Computing Technology, CAS +* Copyright (c) 2020 University of Chinese Academy of Sciences +* +* NutShell is licensed under Mulan PSL v2. +* You can use this software according to the terms and conditions of the Mulan PSL v2. +* You may obtain a copy of Mulan PSL v2 at: +* http://license.coscl.org.cn/MulanPSL2 +* +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR +* FIT FOR A PARTICULAR PURPOSE. +* +* See the Mulan PSL v2 for more details. +***************************************************************************************/ + package utils import chisel3._ @@ -85,14 +101,6 @@ class SRAMTemplate[T <: Data](gen: T, set: Int, way: Int = 1, io.r.req.ready := !resetState && (if (singlePort) !wen else true.B) io.w.req.ready := true.B - // Debug(false) { - // when (wen) { - // printf("%d: SRAMTemplate: write %x to idx = %d\n", GTimer(), wdata.asUInt, setIdx) - // } - // when (RegNext(realRen)) { - // printf("%d: SRAMTemplate: read %x at idx = %d\n", GTimer(), VecInit(rdata).asUInt, RegNext(io.r.req.bits.setIdx)) - // } - // } } class SRAMTemplateWithArbiter[T <: Data](nRead: Int, gen: T, set: Int, way: Int = 1, diff --git a/src/main/scala/xiangshan/Bundle.scala b/src/main/scala/xiangshan/Bundle.scala index 5c24d9412a6bddc3eda9ff7ee601e6b233f3ebfa..df98b219e8a6ce8f5c07c498eca24554b1d80f08 100644 --- a/src/main/scala/xiangshan/Bundle.scala +++ b/src/main/scala/xiangshan/Bundle.scala @@ -196,7 +196,7 @@ class CfiUpdateInfo extends XSBundle with HasBPUParameter { class CtrlFlow extends XSBundle { val instr = UInt(32.W) val pc = UInt(VAddrBits.W) - val exceptionVec = Vec(16, Bool()) + val exceptionVec = ExceptionVec() val intrVec = Vec(12, Bool()) val brUpdate = new CfiUpdateInfo val crossPageIPFFix = Bool() diff --git a/src/main/scala/xiangshan/PMA.scala b/src/main/scala/xiangshan/PMA.scala new file mode 100644 index 0000000000000000000000000000000000000000..93ec79d27130d18c5edd5b9e1180a08663c15a94 --- /dev/null +++ b/src/main/scala/xiangshan/PMA.scala @@ -0,0 +1,117 @@ +package xiangshan + +import chisel3._ +import chisel3.util._ +import utils._ +import Chisel.experimental.chiselName +import xiangshan.cache.{DCache, HasDCacheParameters, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache} + +object MemMap { + def apply (base: String, top: String, width: String, description: String, mode: String): ((String, String), Map[String, String]) = { + ((base, top) -> Map( + "width" -> width, // 0 means no limitation + "description" -> description, + "mode" -> mode, + )) + } +} + +object AddressSpace { + def MemMapList = List( + // Base address Top address Width Description Mode (RWXIDSAC) + MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved", ""), + MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"), + MemMap("h00_2000_0000", "h00_2FFF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3000_0000", "h00_3000_FFFF", "h0", "DMA", "RW"), + MemMap("h00_3001_0000", "h00_3004_FFFF", "h0", "GPU", "RWC"), + MemMap("h00_3005_0000", "h00_3005_FFFF", "h0", "USB", "RW"), + MemMap("h00_3006_0000", "h00_3006_FFFF", "h0", "SDMMC", "RW"), + MemMap("h00_3007_0000", "h00_30FF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3100_0000", "h00_3100_FFFF", "h0", "QSPI", "RW"), + MemMap("h00_3101_0000", "h00_3101_FFFF", "h0", "GMAC", "RW"), + MemMap("h00_3102_0000", "h00_3102_FFFF", "h0", "HDMI", "RW"), + MemMap("h00_3103_0000", "h00_3103_FFFF", "h0", "HDMI_PHY", "RW"), + MemMap("h00_3104_0000", "h00_3105_FFFF", "h0", "DP", "RW"), + MemMap("h00_3106_0000", "h00_3106_FFFF", "h0", "DDR0", "RW"), + MemMap("h00_3107_0000", "h00_3107_FFFF", "h0", "DDR0_PHY", "RW"), + MemMap("h00_3108_0000", "h00_3108_FFFF", "h0", "DDR1", "RW"), + MemMap("h00_3109_0000", "h00_3109_FFFF", "h0", "DDR1_PHY", "RW"), + MemMap("h00_310A_0000", "h00_310A_FFFF", "h0", "IIS", "RW"), + MemMap("h00_310B_0000", "h00_310B_FFFF", "h0", "UART0", "RW"), + MemMap("h00_310C_0000", "h00_310C_FFFF", "h0", "UART1", "RW"), + MemMap("h00_310D_0000", "h00_310D_FFFF", "h0", "IIC0", "RW"), + MemMap("h00_310E_0000", "h00_310E_FFFF", "h0", "IIC1", "RW"), + MemMap("h00_310F_0000", "h00_310F_FFFF", "h0", "IIC2", "RW"), + MemMap("h00_3110_0000", "h00_3110_FFFF", "h0", "GPIO", "RW"), + MemMap("h00_3111_0000", "h00_3111_FFFF", "h0", "CRU", "RW"), + MemMap("h00_3112_0000", "h00_37FF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3800_0000", "h00_3800_FFFF", "h0", "CLINT", "RW"), + MemMap("h00_3801_0000", "h00_3BFF_FFFF", "h0", "Reserved", ""), + MemMap("h00_3C00_0000", "h00_3FFF_FFFF", "h0", "PLIC", "RW"), + MemMap("h00_4000_0000", "h00_4FFF_FFFF", "h0", "PCIe0", "RW"), + MemMap("h00_5000_0000", "h00_5FFF_FFFF", "h0", "PCIe1", "RW"), + MemMap("h00_6000_0000", "h00_6FFF_FFFF", "h0", "PCIe2", "RW"), + MemMap("h00_7000_0000", "h00_7FFF_FFFF", "h0", "PCIe3", "RW"), + MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR", "RWXIDSA"), + ) + + def printMemmap(){ + println("-------------------- memory map --------------------") + for(i <- MemMapList){ + println(i._1._1 + "->" + i._1._2 + " width " + (if(i._2.get("width").get == "0") "unlimited" else i._2.get("width").get) + " " + i._2.get("description").get + " [" + i._2.get("mode").get + "]") + } + println("----------------------------------------------------") + } + + def genMemmapMatchVec(addr: UInt): UInt = { + VecInit(MemMapList.map(i => { + i._1._1.U <= addr && addr < i._1._2.U + }).toSeq).asUInt + } + + def queryMode(matchVec: UInt): UInt = { + Mux1H(matchVec, VecInit(MemMapList.map(i => { + PMAMode.strToMode(i._2.get("mode").get) + }).toSeq)) + } + + def queryWidth(matchVec: UInt): UInt = { + Mux1H(matchVec, VecInit(MemMapList.map(i => { + i._2.get("width").get.U + }).toSeq)) + } + + def memmapAddrMatch(addr: UInt): (UInt, UInt) = { + val matchVec = genMemmapMatchVec(addr) + (queryMode(matchVec), queryWidth(matchVec)) + } + + def isDMMIO(addr: UInt): Bool = !PMAMode.dcache(memmapAddrMatch(addr)._1) + def isIMMIO(addr: UInt): Bool = !PMAMode.icache(memmapAddrMatch(addr)._1) + + def isConfigableAddr(addr: UInt): Bool = { + VecInit(MemMapList.map(i => { + i._1._1.U <= addr && addr < i._1._2.U && (i._2.get("mode").get.toUpperCase.indexOf("C") >= 0).B + }).toSeq).asUInt.orR + } +} + +class PMAChecker extends XSModule with HasDCacheParameters +{ + val io = IO(new Bundle() { + val paddr = Input(UInt(VAddrBits.W)) + val mode = Output(PMAMode()) + val widthLimit = Output(UInt(8.W)) // TODO: fixme + val updateCConfig = Input(Valid(Bool())) + }) + + val enableConfigableCacheZone = RegInit(false.B) + val updateCConfig = RegNext(RegNext(RegNext(io.updateCConfig))) + when(updateCConfig.valid) { + enableConfigableCacheZone := updateCConfig.bits + } + + val (mode, widthLimit) = AddressSpace.memmapAddrMatch(io.paddr) + io.mode := Mux(AddressSpace.isConfigableAddr(io.paddr) && enableConfigableCacheZone, mode | PMAMode.D, mode) + io.widthLimit := widthLimit +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/XSCore.scala b/src/main/scala/xiangshan/XSCore.scala index 6b8d60b86c617d9b7cbf71ed93553b7bcf9ed4e6..e14211f51826b64bedec76af7a73716ef4b205df 100644 --- a/src/main/scala/xiangshan/XSCore.scala +++ b/src/main/scala/xiangshan/XSCore.scala @@ -10,7 +10,7 @@ import xiangshan.backend.exu.Exu._ import xiangshan.frontend._ import xiangshan.mem._ import xiangshan.backend.fu.HasExceptionNO -import xiangshan.cache.{DCache, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache} +import xiangshan.cache.{DCache,InstrUncache, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache, MemoryOpConstants, MissReq} import xiangshan.cache.prefetch._ import chipsalliance.rocketchip.config import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} @@ -19,6 +19,7 @@ import freechips.rocketchip.devices.tilelink.{DevNullParams, TLError} import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} import freechips.rocketchip.amba.axi4.{AXI4Deinterleaver, AXI4Fragmenter, AXI4IdIndexer, AXI4IdentityNode, AXI4ToTL, AXI4UserYanker} import freechips.rocketchip.tile.HasFPUParameters +import sifive.blocks.inclusivecache.PrefetcherIO import utils._ case class XSCoreParameters @@ -89,7 +90,9 @@ case class XSCoreParameters StoreBufferSize: Int = 16, RefillSize: Int = 512, TlbEntrySize: Int = 32, + TlbSPEntrySize: Int = 4, TlbL2EntrySize: Int = 256, // or 512 + TlbL2SPEntrySize: Int = 16, PtwL1EntrySize: Int = 16, PtwL2EntrySize: Int = 256, NumPerfCounters: Int = 16, @@ -165,7 +168,9 @@ trait HasXSParameter { val RefillSize = core.RefillSize val DTLBWidth = core.LoadPipelineWidth + core.StorePipelineWidth val TlbEntrySize = core.TlbEntrySize + val TlbSPEntrySize = core.TlbSPEntrySize val TlbL2EntrySize = core.TlbL2EntrySize + val TlbL2SPEntrySize = core.TlbL2SPEntrySize val PtwL1EntrySize = core.PtwL1EntrySize val PtwL2EntrySize = core.PtwL2EntrySize val NumPerfCounters = core.NumPerfCounters @@ -183,32 +188,6 @@ trait HasXSParameter { nMissEntries = 8 ) - // icache prefetcher - val l1plusPrefetcherParameters = L1plusPrefetcherParameters( - enable = false, - _type = "stream", - streamParams = StreamPrefetchParameters( - streamCnt = 4, - streamSize = 4, - ageWidth = 4, - blockBytes = l1plusCacheParameters.blockBytes, - reallocStreamOnMissInstantly = true - ) - ) - - // dcache prefetcher - val l2PrefetcherParameters = L2PrefetcherParameters( - enable = true, - _type = "stream", - streamParams = StreamPrefetchParameters( - streamCnt = 4, - streamSize = 4, - ageWidth = 4, - blockBytes = L2BlockSize, - reallocStreamOnMissInstantly = true - ) - ) - val dcacheParameters = DCacheParameters( tagECC = Some("secded"), dataECC = Some("secded"), @@ -240,6 +219,43 @@ trait HasXSParameter { // on chip network configurations val L3BusWidth = 256 + + // icache prefetcher + val l1plusPrefetcherParameters = L1plusPrefetcherParameters( + enable = true, + _type = "stream", + streamParams = StreamPrefetchParameters( + streamCnt = 2, + streamSize = 4, + ageWidth = 4, + blockBytes = l1plusCacheParameters.blockBytes, + reallocStreamOnMissInstantly = true, + cacheName = "icache" + ) + ) + + // dcache prefetcher + val l2PrefetcherParameters = L2PrefetcherParameters( + enable = true, + _type = "bop",// "stream" or "bop" + streamParams = StreamPrefetchParameters( + streamCnt = 4, + streamSize = 4, + ageWidth = 4, + blockBytes = L2BlockSize, + reallocStreamOnMissInstantly = true, + cacheName = "dcache" + ), + bopParams = BOPParameters( + rrTableEntries = 256, + rrTagBits = 12, + scoreBits = 5, + roundMax = 50, + badScore = 1, + blockBytes = L2BlockSize, + nEntries = dcacheParameters.nMissEntries * 2 // TODO: this is too large + ), + ) } trait HasXSLog { this: RawModule => @@ -272,23 +288,23 @@ case class EnviromentParameters ( FPGAPlatform: Boolean = true, EnableDebug: Boolean = false, - EnablePerfDebug: Boolean = true + EnablePerfDebug: Boolean = false ) -object AddressSpace extends HasXSParameter { - // (start, size) - // address out of MMIO will be considered as DRAM - def mmio = List( - (0x00000000L, 0x40000000L), // internal devices, such as CLINT and PLIC - (0x40000000L, 0x40000000L) // external devices - ) +// object AddressSpace extends HasXSParameter { +// // (start, size) +// // address out of MMIO will be considered as DRAM +// def mmio = List( +// (0x00000000L, 0x40000000L), // internal devices, such as CLINT and PLIC +// (0x40000000L, 0x40000000L) // external devices +// ) - def isMMIO(addr: UInt): Bool = mmio.map(range => { - require(isPow2(range._2)) - val bits = log2Up(range._2) - (addr ^ range._1.U)(PAddrBits-1, bits) === 0.U - }).reduce(_ || _) -} +// def isMMIO(addr: UInt): Bool = mmio.map(range => { +// require(isPow2(range._2)) +// val bits = log2Up(range._2) +// (addr ^ range._1.U)(PAddrBits-1, bits) === 0.U +// }).reduce(_ || _) +// } @@ -309,6 +325,7 @@ class XSCore()(implicit p: config.Parameters) extends LazyModule val fpBlockSlowWakeUpInt = fpExuConfigs.filter(intSlowFilter) // outer facing nodes + val frontend = LazyModule(new Frontend()) val l1pluscache = LazyModule(new L1plusCache()) val ptw = LazyModule(new PTW()) val l2Prefetcher = LazyModule(new L2Prefetcher()) @@ -330,9 +347,11 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) { val io = IO(new Bundle { val externalInterrupt = new ExternalInterruptIO + val l2ToPrefetcher = Flipped(new PrefetcherIO(PAddrBits)) }) println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}") + AddressSpace.printMemmap() // to fast wake up fp, mem rs val intBlockFastWakeUpFp = intExuConfigs.filter(fpFastFilter) @@ -345,7 +364,6 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) val fpBlockFastWakeUpInt = fpExuConfigs.filter(intFastFilter) val fpBlockSlowWakeUpInt = fpExuConfigs.filter(intSlowFilter) - val frontend = Module(new Frontend) val ctrlBlock = Module(new CtrlBlock) val integerBlock = Module(new IntegerBlock( fastWakeUpIn = fpBlockFastWakeUpInt, @@ -364,6 +382,7 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) slowIntOut = fpBlockSlowWakeUpInt )) + val frontend = outer.frontend.module val memBlock = outer.memBlock.module val l1pluscache = outer.l1pluscache.module val ptw = outer.ptw.module @@ -443,7 +462,16 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) ptw.io.sfence <> integerBlock.io.fenceio.sfence ptw.io.csr <> integerBlock.io.csrio.tlb - l2Prefetcher.io.in <> memBlock.io.toDCachePrefetch + val l2PrefetcherIn = Wire(Decoupled(new MissReq)) + if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "bop") { + l2PrefetcherIn.valid := io.l2ToPrefetcher.acquire.valid + l2PrefetcherIn.bits := DontCare + l2PrefetcherIn.bits.addr := io.l2ToPrefetcher.acquire.bits.address + l2PrefetcherIn.bits.cmd := Mux(io.l2ToPrefetcher.acquire.bits.write, MemoryOpConstants.M_XWR, MemoryOpConstants.M_XRD) + } else { + l2PrefetcherIn <> memBlock.io.toDCachePrefetch + } + l2Prefetcher.io.in <> l2PrefetcherIn if (!env.FPGAPlatform) { val debugIntReg, debugFpReg = WireInit(VecInit(Seq.fill(32)(0.U(XLEN.W)))) diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 4541d38243a096c2b609b4cfb11535725c96350e..a3f46459e1e7e0e6977757608e6dfc546cdb8f90 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -16,7 +16,7 @@ import xiangshan.mem.LsqEnqIO class CtrlToIntBlockIO extends XSBundle { val enqIqCtrl = Vec(exuParameters.IntExuCnt, DecoupledIO(new MicroOp)) - val readRf = Vec(NRIntReadPorts, Flipped(new RfReadPort(XLEN))) + val readRf = Vec(NRIntReadPorts, Output(UInt(PhyRegIdxWidth.W))) val jumpPc = Output(UInt(VAddrBits.W)) // int block only uses port 0~7 val readPortIndex = Vec(exuParameters.IntExuCnt, Output(UInt(log2Ceil(8 / 2).W))) // TODO parameterize 8 here @@ -25,7 +25,7 @@ class CtrlToIntBlockIO extends XSBundle { class CtrlToFpBlockIO extends XSBundle { val enqIqCtrl = Vec(exuParameters.FpExuCnt, DecoupledIO(new MicroOp)) - val readRf = Vec(NRFpReadPorts, Flipped(new RfReadPort(XLEN + 1))) + val readRf = Vec(NRFpReadPorts, Output(UInt(PhyRegIdxWidth.W))) // fp block uses port 0~11 val readPortIndex = Vec(exuParameters.FpExuCnt, Output(UInt(log2Ceil((NRFpReadPorts - exuParameters.StuCnt) / 3).W))) val redirect = ValidIO(new Redirect) @@ -131,10 +131,8 @@ class CtrlBlock extends XSModule with HasCircularQueuePtrHelper { setPhyRegRdy.valid := wb.valid && wb.bits.uop.ctrl.fpWen setPhyRegRdy.bits := wb.bits.uop.pdest } - intBusyTable.io.rfReadAddr <> dispatch.io.readIntRf.map(_.addr) - intBusyTable.io.pregRdy <> dispatch.io.intPregRdy - fpBusyTable.io.rfReadAddr <> dispatch.io.readFpRf.map(_.addr) - fpBusyTable.io.pregRdy <> dispatch.io.fpPregRdy + intBusyTable.io.read <> dispatch.io.readIntState + fpBusyTable.io.read <> dispatch.io.readFpState roq.io.redirect.valid := brq.io.redirectOut.valid || io.fromLsBlock.replay.valid roq.io.redirect.bits <> redirectArb diff --git a/src/main/scala/xiangshan/backend/FloatBlock.scala b/src/main/scala/xiangshan/backend/FloatBlock.scala index e85af2c993644ceae48a9dae8af32ae6ba470e97..df519dfcf71e03a15eeaaf1cf702b0946affcca3 100644 --- a/src/main/scala/xiangshan/backend/FloatBlock.scala +++ b/src/main/scala/xiangshan/backend/FloatBlock.scala @@ -152,7 +152,7 @@ class FloatBlock // read fp rf from ctrl block - fpRf.io.readPorts <> io.fromCtrlBlock.readRf + fpRf.io.readPorts.zipWithIndex.map{ case (r, i) => r.addr := io.fromCtrlBlock.readRf(i) } (0 until exuParameters.StuCnt).foreach(i => io.toMemBlock.readFpRf(i).data := fpRf.io.readPorts(i + 12).data) // write fp rf arbiter val fpWbArbiter = Module(new Wb( diff --git a/src/main/scala/xiangshan/backend/IntegerBlock.scala b/src/main/scala/xiangshan/backend/IntegerBlock.scala index ad12583c4ee1a37e84536ca27217418869e8bbbd..4e7da59c3924c02c50bcbc930eca3181d8dc5070 100644 --- a/src/main/scala/xiangshan/backend/IntegerBlock.scala +++ b/src/main/scala/xiangshan/backend/IntegerBlock.scala @@ -218,7 +218,7 @@ class IntegerBlock jmpExeUnit.fenceio <> io.fenceio // read int rf from ctrl block - intRf.io.readPorts <> io.fromCtrlBlock.readRf + intRf.io.readPorts.zipWithIndex.map{ case(r, i) => r.addr := io.fromCtrlBlock.readRf(i) } (0 until NRMemReadPorts).foreach(i => io.toMemBlock.readIntRf(i).data := intRf.io.readPorts(i + 8).data) // write int rf arbiter val intWbArbiter = Module(new Wb( @@ -237,4 +237,4 @@ class IntegerBlock rf.addr := wb.bits.uop.pdest rf.data := wb.bits.data } -} \ No newline at end of file +} diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 4418a15dffaab0aec724cd97faf138b4aef6c6f8..0f2572ad379661f4a4149d49d7970bef31cdcf7a 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -250,6 +250,7 @@ class MemBlockImp // LSQ to store buffer lsq.io.sbuffer <> sbuffer.io.in + lsq.io.sqempty <> sbuffer.io.sqempty // Sbuffer sbuffer.io.dcache <> dcache.io.lsu.store diff --git a/src/main/scala/xiangshan/backend/brq/Brq.scala b/src/main/scala/xiangshan/backend/brq/Brq.scala index 91144f32d6d9c0f4d6de6cb3bed020fe3d2f7484..bd2dde54b390ef777d09993f18b38d4dea00fbc7 100644 --- a/src/main/scala/xiangshan/backend/brq/Brq.scala +++ b/src/main/scala/xiangshan/backend/brq/Brq.scala @@ -5,6 +5,7 @@ import chisel3.util._ import xiangshan._ import utils._ import chisel3.ExcitingUtils._ +import xiangshan.backend.JumpOpType import xiangshan.backend.decode.ImmUnion @@ -75,7 +76,7 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { val exuOut = new ExuOutput } - val s_idle :: s_wb :: Nil = Enum(2) + val s_idle :: s_wb :: s_auipc_wb :: Nil = Enum(3) class DecodeEnqBrqData extends Bundle { val cfiUpdateInfo = new CfiUpdateInfo @@ -107,7 +108,9 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { /** * write back */ - val wbValid = stateQueue(writebackIdx) === s_wb + val wbState = stateQueue(writebackIdx) + val wbValid = wbState === s_wb + val wbIsAuipc = wbState === s_auipc_wb val wbEntry = Wire(new ExuOutput) val wbIsMisPred = wbEntry.redirect.target =/= wbEntry.brUpdate.pnpc @@ -115,9 +118,9 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { io.redirectOut.bits := wbEntry.redirect io.redirectOut.bits.brTag := BrqPtr(ptrFlagVec(writebackIdx), writebackIdx) - io.out.valid := wbValid + io.out.valid := wbValid || wbIsAuipc io.out.bits := wbEntry - when (wbValid) { + when (io.out.valid) { stateQueue(writebackIdx) := s_idle writebackPtr_next := writebackPtr + 1.U } @@ -164,7 +167,7 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { /** * exu write back */ - for (exuWb <- io.exuRedirectWb) { + for ((exuWb, i) <- io.exuRedirectWb.zipWithIndex) { when (exuWb.valid) { val wbIdx = exuWb.bits.redirect.brTag.value XSInfo( @@ -174,8 +177,14 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { p"target=${Hexadecimal(exuWb.bits.redirect.target)}\n" ) assert(stateQueue(wbIdx) === s_idle) - - stateQueue(wbIdx) := s_wb + if(i == 0){ // jump + stateQueue(wbIdx) := Mux(JumpOpType.jumpOpisAuipc(exuWb.bits.uop.ctrl.fuOpType), + s_auipc_wb, + s_wb + ) + } else { // alu + stateQueue(wbIdx) := s_wb + } } } diff --git a/src/main/scala/xiangshan/backend/decode/DecodeStage.scala b/src/main/scala/xiangshan/backend/decode/DecodeStage.scala index 8b2a1e49a5c1e1ca35c0233bec8f549d6417d566..fb02934d702ff3d8892118487de55b580ecb7402 100644 --- a/src/main/scala/xiangshan/backend/decode/DecodeStage.scala +++ b/src/main/scala/xiangshan/backend/decode/DecodeStage.scala @@ -5,6 +5,7 @@ import chisel3.util._ import xiangshan._ import xiangshan.backend.brq.BrqEnqIO import utils._ +import xiangshan.backend.decode.Instructions.{AUIPC, MRET, SRET} class DecodeStage extends XSModule { val io = IO(new Bundle() { @@ -31,12 +32,14 @@ class DecodeStage extends XSModule { for (i <- 0 until DecodeWidth) { decoders(i).io.enq.ctrl_flow <> io.in(i).bits - val isMret = io.in(i).bits.instr === BitPat("b001100000010_00000_000_00000_1110011") - val isSret = io.in(i).bits.instr === BitPat("b000100000010_00000_000_00000_1110011") - val thisBrqValid = !io.in(i).bits.brUpdate.pd.notCFI || isMret || isSret + val isMret = io.in(i).bits.instr === MRET + val isSret = io.in(i).bits.instr === SRET + val isAuiPc = io.in(i).bits.instr === AUIPC + val thisBrqValid = !io.in(i).bits.brUpdate.pd.notCFI || isMret || isSret || isAuiPc io.enqBrq.needAlloc(i) := thisBrqValid io.enqBrq.req(i).valid := io.in(i).valid && thisBrqValid && io.out(i).ready - io.enqBrq.req(i).bits := decoders(i).io.deq.cf_ctrl.cf + io.enqBrq.req(i).bits := io.in(i).bits + io.enqBrq.req(i).bits.instr := decoders(i).io.deq.cf_ctrl.cf.instr io.out(i).valid := io.in(i).valid && io.enqBrq.req(i).ready io.out(i).bits := decoders(i).io.deq.cf_ctrl diff --git a/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala b/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala index 76696d2601a22c1df5fb7f98d10740030613da38..b321c427334b1d7d173367170b288edee09a2f2b 100644 --- a/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala +++ b/src/main/scala/xiangshan/backend/decode/DecodeUnit.scala @@ -135,7 +135,7 @@ object XDecode extends DecodeConstants { REMW -> List(SrcType.reg, SrcType.reg, SrcType.DC, FuType.div, MDUOpType.remw, Y, N, N, N, N, N, N, SelImm.IMM_X), REMUW -> List(SrcType.reg, SrcType.reg, SrcType.DC, FuType.div, MDUOpType.remuw, Y, N, N, N, N, N, N, SelImm.IMM_X), - AUIPC -> List(SrcType.pc, SrcType.imm, SrcType.DC, FuType.alu, ALUOpType.add, Y, N, N, N, N, N, N, SelImm.IMM_U), + AUIPC -> List(SrcType.pc , SrcType.imm, SrcType.DC, FuType.jmp, JumpOpType.auipc, Y, N, N, N, N, N, N, SelImm.IMM_U), JAL -> List(SrcType.pc , SrcType.imm, SrcType.DC, FuType.jmp, JumpOpType.jal, Y, N, N, N, N, N, N, SelImm.IMM_UJ), JALR -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.jmp, JumpOpType.jalr, Y, N, N, N, N, N, N, SelImm.IMM_I), BEQ -> List(SrcType.reg, SrcType.reg, SrcType.DC, FuType.alu, ALUOpType.beq, N, N, N, N, N, N, N, SelImm.IMM_SB), @@ -155,9 +155,9 @@ object XDecode extends DecodeConstants { CSRRCI -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.clri, Y, N, N, Y, Y, N, N, SelImm.IMM_Z), SFENCE_VMA->List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.fence, FenceOpType.sfence, N, N, N, Y, Y, Y, N, SelImm.IMM_X), - ECALL -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.jmp, Y, N, N, Y, Y, N, N, SelImm.IMM_X), - SRET -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.jmp, Y, N, N, Y, Y, N, N, SelImm.IMM_X), - MRET -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.jmp, Y, N, N, Y, Y, N, N, SelImm.IMM_X), + ECALL -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.jmp, Y, N, N, Y, Y, N, N, SelImm.IMM_I), + SRET -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.jmp, Y, N, N, Y, Y, N, N, SelImm.IMM_I), + MRET -> List(SrcType.reg, SrcType.imm, SrcType.DC, FuType.csr, CSROpType.jmp, Y, N, N, Y, Y, N, N, SelImm.IMM_I), WFI -> List(SrcType.pc, SrcType.imm, SrcType.DC, FuType.alu, ALUOpType.sll, Y, N, N, N, N, N, N, SelImm.IMM_X), @@ -300,22 +300,6 @@ object XSTrapDecode extends DecodeConstants { ) } -class RVCExpander extends XSModule { - val io = IO(new Bundle { - val in = Input(UInt(32.W)) - val out = Output(new ExpandedInstruction) - val rvc = Output(Bool()) - }) - - if (HasCExtension) { - io.rvc := io.in(1,0) =/= 3.U - io.out := new RVCDecoder(io.in, XLEN).decode - } else { - io.rvc := false.B - io.out := new RVCDecoder(io.in, XLEN).passthrough - } -} - //object Imm32Gen { // def apply(sel: UInt, inst: UInt) = { // val sign = Mux(sel === SelImm.IMM_Z, 0.S, inst(31).asSInt) @@ -425,19 +409,7 @@ class DecodeUnit extends XSModule with DecodeUnitConstants { val ctrl_flow = Wire(new CtrlFlow) // input with RVC Expanded val cf_ctrl = Wire(new CfCtrl) - val exp = Module(new RVCExpander()) - exp.io.in := io.enq.ctrl_flow.instr ctrl_flow := io.enq.ctrl_flow - when (exp.io.rvc) { - ctrl_flow.instr := exp.io.out.bits - } - - // save rvc decode info - // TODO maybe rvc_info are useless? - val rvc_info = Wire(new ExpandedInstruction()) - val is_rvc = Wire(Bool()) - rvc_info := exp.io.out - is_rvc := exp.io.rvc var decode_table = XDecode.table ++ FDecode.table ++ FDivSqrtDecode.table ++ X64Decode.table ++ XSTrapDecode.table @@ -458,10 +430,8 @@ class DecodeUnit extends XSModule with DecodeUnitConstants { cs.ldest := Mux(cs.fpWen || cs.rfWen, ctrl_flow.instr(RD_MSB,RD_LSB), 0.U) // fill in exception vector - cf_ctrl.cf.exceptionVec.map(_ := false.B) + cf_ctrl.cf.exceptionVec := io.enq.ctrl_flow.exceptionVec cf_ctrl.cf.exceptionVec(illegalInstr) := cs.selImm === SelImm.INVALID_INSTR - cf_ctrl.cf.exceptionVec(instrPageFault) := io.enq.ctrl_flow.exceptionVec(instrPageFault) - cf_ctrl.cf.exceptionVec(instrAccessFault) := io.enq.ctrl_flow.exceptionVec(instrAccessFault) // fix frflags // fflags zero csrrs rd csr diff --git a/src/main/scala/xiangshan/backend/decode/FPDecoder.scala b/src/main/scala/xiangshan/backend/decode/FPDecoder.scala index a58735bca1bb1f1739afbb82f1045ecce75fb6ef..5910af21ffbf230249019c5a845d9b2198a84340 100644 --- a/src/main/scala/xiangshan/backend/decode/FPDecoder.scala +++ b/src/main/scala/xiangshan/backend/decode/FPDecoder.scala @@ -27,7 +27,7 @@ class FPDecoder extends XSModule{ FCVT_S_WU-> List(N,s,s,Y,Y,Y,N,N,Y), FCVT_S_L -> List(N,s,s,Y,Y,Y,N,N,Y), FCVT_S_LU-> List(N,s,s,Y,Y,Y,N,N,Y), - FMV_X_W -> List(N,s,X,N,N,N,N,N,N), + FMV_X_W -> List(N,d,X,N,N,N,N,N,N), FCLASS_S -> List(N,s,X,N,N,N,N,N,N), FCVT_W_S -> List(N,s,X,N,Y,N,N,N,Y), FCVT_WU_S-> List(N,s,X,N,Y,N,N,N,Y), diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala index 144b4a5332ecf3e945fd0b59ddecefa7b83e819d..23f7ee32e48a4476f9e19b4a0d41b625f49a7987 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala @@ -7,7 +7,7 @@ import utils._ import xiangshan.backend.regfile.RfReadPort import chisel3.ExcitingUtils._ import xiangshan.backend.roq.{RoqPtr, RoqEnqIO} -import xiangshan.backend.rename.RenameBypassInfo +import xiangshan.backend.rename.{RenameBypassInfo, BusyTableReadIO} import xiangshan.mem.LsqEnqIO case class DispatchParameters @@ -34,11 +34,11 @@ class Dispatch extends XSModule { // enq Lsq val enqLsq = Flipped(new LsqEnqIO) // read regfile - val readIntRf = Vec(NRIntReadPorts, Flipped(new RfReadPort(XLEN))) - val readFpRf = Vec(NRFpReadPorts, Flipped(new RfReadPort(XLEN + 1))) - // read reg status (busy/ready) - val intPregRdy = Vec(NRIntReadPorts, Input(Bool())) - val fpPregRdy = Vec(NRFpReadPorts, Input(Bool())) + val readIntRf = Vec(NRIntReadPorts, Output(UInt(PhyRegIdxWidth.W))) + val readFpRf = Vec(NRFpReadPorts, Output(UInt(PhyRegIdxWidth.W))) + // to busytable: read physical registers' state (busy/ready) + val readIntState= Vec(NRIntReadPorts, Flipped(new BusyTableReadIO)) + val readFpState = Vec(NRFpReadPorts, Flipped(new BusyTableReadIO)) // to reservation stations val numExist = Input(Vec(exuParameters.ExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.ExuCnt, DecoupledIO(new MicroOp)) @@ -82,7 +82,7 @@ class Dispatch extends XSModule { val intDispatch = Module(new Dispatch2Int) intDispatch.io.fromDq <> intDq.io.deq intDispatch.io.readRf.zipWithIndex.map({case (r, i) => r <> io.readIntRf(i)}) - intDispatch.io.regRdy.zipWithIndex.map({case (r, i) => r <> io.intPregRdy(i)}) + intDispatch.io.readState.zipWithIndex.map({case (r, i) => r <> io.readIntState(i)}) intDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := io.numExist(i)}) intDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(i)}) // intDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(i)}) @@ -92,7 +92,7 @@ class Dispatch extends XSModule { val fpDispatch = Module(new Dispatch2Fp) fpDispatch.io.fromDq <> fpDq.io.deq fpDispatch.io.readRf.zipWithIndex.map({case (r, i) => r <> io.readFpRf(i)}) - fpDispatch.io.regRdy.zipWithIndex.map({case (r, i) => r <> io.fpPregRdy(i)}) + fpDispatch.io.readState.zipWithIndex.map({case (r, i) => r <> io.readFpState(i)}) fpDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := io.numExist(i + exuParameters.IntExuCnt)}) fpDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(i + exuParameters.IntExuCnt)}) // fpDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(i + exuParameters.IntExuCnt)}) @@ -103,8 +103,8 @@ class Dispatch extends XSModule { lsDispatch.io.fromDq <> lsDq.io.deq lsDispatch.io.readIntRf.zipWithIndex.map({case (r, i) => r <> io.readIntRf(i + 8)}) lsDispatch.io.readFpRf.zipWithIndex.map({case (r, i) => r <> io.readFpRf(i + 12)}) - lsDispatch.io.intRegRdy.zipWithIndex.map({case (r, i) => r <> io.intPregRdy(i + 8)}) - lsDispatch.io.fpRegRdy.zipWithIndex.map({case (r, i) => r <> io.fpPregRdy(i + 12)}) + lsDispatch.io.readIntState.zipWithIndex.map({case (r, i) => r <> io.readIntState(i + 8)}) + lsDispatch.io.readFpState.zipWithIndex.map({case (r, i) => r <> io.readFpState(i + 12)}) lsDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := io.numExist(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)}) lsDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)}) // lsDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)}) diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala index 782e464a2275bf8b0e1530eae217fe6f6e9b5f84..0e372fff56a929a3db953e493e635102ad8e300b 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala @@ -8,9 +8,10 @@ import utils._ import xiangshan.backend.roq.{RoqPtr, RoqEnqIO} import xiangshan.backend.rename.RenameBypassInfo import xiangshan.mem.LsqEnqIO +import xiangshan.backend.fu.HasExceptionNO // read rob and enqueue -class Dispatch1 extends XSModule { +class Dispatch1 extends XSModule with HasExceptionNO { val io = IO(new Bundle() { // from rename val fromRename = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) @@ -45,7 +46,10 @@ class Dispatch1 extends XSModule { */ // valid bits for different dispatch queues val isInt = VecInit(io.fromRename.map(req => FuType.isIntExu(req.bits.ctrl.fuType))) - val isBranch = VecInit(io.fromRename.map(req => !req.bits.cf.brUpdate.pd.notCFI)) + val isBranch = VecInit(io.fromRename.map(req => + // cover auipc (a fake branch) + !req.bits.cf.brUpdate.pd.notCFI || FuType.isJumpExu(req.bits.ctrl.fuType) + )) val isFp = VecInit(io.fromRename.map(req => FuType.isFpExu (req.bits.ctrl.fuType))) val isLs = VecInit(io.fromRename.map(req => FuType.isMemExu(req.bits.ctrl.fuType))) val isStore = VecInit(io.fromRename.map(req => FuType.isStoreExu(req.bits.ctrl.fuType))) @@ -113,6 +117,7 @@ class Dispatch1 extends XSModule { // thisIsBlocked: this instruction is blocked by itself (based on noSpecExec) // nextCanOut: next instructions can out (based on blockBackward) // notBlockedByPrevious: previous instructions can enqueue + val hasException = VecInit(io.fromRename.map(r => selectFrontend(r.bits.cf.exceptionVec).asUInt.orR)) val thisIsBlocked = VecInit((0 until RenameWidth).map(i => { // for i > 0, when Roq is empty but dispatch1 have valid instructions to enqueue, it's blocked if (i > 0) isNoSpecExec(i) && (!io.enqRoq.isEmpty || Cat(io.fromRename.take(i).map(_.valid)).orR) @@ -153,17 +158,17 @@ class Dispatch1 extends XSModule { // We use notBlockedByPrevious here. io.toIntDq.needAlloc(i) := io.fromRename(i).valid && isInt(i) io.toIntDq.req(i).bits := updatedUop(i) - io.toIntDq.req(i).valid := io.fromRename(i).valid && isInt(i) && thisCanActualOut(i) && + io.toIntDq.req(i).valid := io.fromRename(i).valid && !hasException(i) && isInt(i) && thisCanActualOut(i) && io.enqLsq.canAccept && io.enqRoq.canAccept && io.toFpDq.canAccept && io.toLsDq.canAccept io.toFpDq.needAlloc(i) := io.fromRename(i).valid && isFp(i) io.toFpDq.req(i).bits := updatedUop(i) - io.toFpDq.req(i).valid := io.fromRename(i).valid && isFp(i) && thisCanActualOut(i) && + io.toFpDq.req(i).valid := io.fromRename(i).valid && !hasException(i) && isFp(i) && thisCanActualOut(i) && io.enqLsq.canAccept && io.enqRoq.canAccept && io.toIntDq.canAccept && io.toLsDq.canAccept io.toLsDq.needAlloc(i) := io.fromRename(i).valid && isLs(i) io.toLsDq.req(i).bits := updatedUop(i) - io.toLsDq.req(i).valid := io.fromRename(i).valid && isLs(i) && thisCanActualOut(i) && + io.toLsDq.req(i).valid := io.fromRename(i).valid && !hasException(i) && isLs(i) && thisCanActualOut(i) && io.enqLsq.canAccept && io.enqRoq.canAccept && io.toIntDq.canAccept && io.toFpDq.canAccept XSDebug(io.toIntDq.req(i).valid, p"pc 0x${Hexadecimal(io.toIntDq.req(i).bits.cf.pc)} int index $i\n") diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala index ec6e151a1f98791e6b017a23e89d809daf56a7cc..763abb16403fab382ce778816046a622e3800499 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala @@ -5,13 +5,14 @@ import chisel3.util._ import xiangshan._ import utils._ import xiangshan.backend.regfile.RfReadPort +import xiangshan.backend.rename.BusyTableReadIO import xiangshan.backend.exu.Exu._ class Dispatch2Fp extends XSModule { val io = IO(new Bundle() { val fromDq = Flipped(Vec(dpParams.FpDqDeqWidth, DecoupledIO(new MicroOp))) - val readRf = Vec(NRFpReadPorts - exuParameters.StuCnt, Flipped(new RfReadPort(XLEN + 1))) - val regRdy = Vec(NRFpReadPorts - exuParameters.StuCnt, Input(Bool())) + val readRf = Vec(NRFpReadPorts - exuParameters.StuCnt, Output(UInt(PhyRegIdxWidth.W))) + val readState = Vec(NRFpReadPorts - exuParameters.StuCnt, Flipped(new BusyTableReadIO)) val numExist = Input(Vec(exuParameters.FpExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.FpExuCnt, DecoupledIO(new MicroOp)) val readPortIndex = Vec(exuParameters.FpExuCnt, Output(UInt(log2Ceil((NRFpReadPorts - exuParameters.StuCnt) / 3).W))) @@ -51,14 +52,20 @@ class Dispatch2Fp extends XSModule { val fpDynamicMapped = fpDynamicIndex.map(i => indexVec(i)) for (i <- fpStaticIndex.indices) { val index = WireInit(VecInit(fpStaticMapped(i) +: fpDynamicMapped)) - io.readRf(3*i ).addr := io.fromDq(index(fpReadPortSrc(i))).bits.psrc1 - io.readRf(3*i+1).addr := io.fromDq(index(fpReadPortSrc(i))).bits.psrc2 - io.readRf(3*i+2).addr := io.fromDq(index(fpReadPortSrc(i))).bits.psrc3 + io.readRf(3*i ) := io.fromDq(index(fpReadPortSrc(i))).bits.psrc1 + io.readRf(3*i+1) := io.fromDq(index(fpReadPortSrc(i))).bits.psrc2 + io.readRf(3*i+2) := io.fromDq(index(fpReadPortSrc(i))).bits.psrc3 } val readPortIndex = Wire(Vec(exuParameters.FpExuCnt, UInt(2.W))) fpStaticIndex.zipWithIndex.map({case (index, i) => readPortIndex(index) := i.U}) fpDynamicIndex.zipWithIndex.map({case (index, i) => readPortIndex(index) := fpDynamicExuSrc(i)}) + for (i <- 0 until dpParams.IntDqDeqWidth) { + io.readState(3*i ).req := io.fromDq(i).bits.psrc1 + io.readState(3*i+1).req := io.fromDq(i).bits.psrc2 + io.readState(3*i+2).req := io.fromDq(i).bits.psrc3 + } + /** * Part 3: dispatch to reservation stations */ @@ -74,12 +81,12 @@ class Dispatch2Fp extends XSModule { } enq.bits := io.fromDq(indexVec(i)).bits - val src1Ready = VecInit((0 until 4).map(i => io.regRdy(i * 3))) - val src2Ready = VecInit((0 until 4).map(i => io.regRdy(i * 3 + 1))) - val src3Ready = VecInit((0 until 4).map(i => io.regRdy(i * 3 + 2))) - enq.bits.src1State := src1Ready(readPortIndex(i)) - enq.bits.src2State := src2Ready(readPortIndex(i)) - enq.bits.src3State := src3Ready(readPortIndex(i)) + val src1Ready = VecInit((0 until 4).map(i => io.readState(i * 3).resp)) + val src2Ready = VecInit((0 until 4).map(i => io.readState(i * 3 + 1).resp)) + val src3Ready = VecInit((0 until 4).map(i => io.readState(i * 3 + 2).resp)) + enq.bits.src1State := src1Ready(indexVec(i)) + enq.bits.src2State := src2Ready(indexVec(i)) + enq.bits.src3State := src3Ready(indexVec(i)) XSInfo(enq.fire(), p"pc 0x${Hexadecimal(enq.bits.cf.pc)} with type ${enq.bits.ctrl.fuType} " + p"srcState(${enq.bits.src1State} ${enq.bits.src2State} ${enq.bits.src3State}) " + diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala index d72c86c46ec75ca8a61c0faeacf1d19c61d0b56d..8a6bf53be22a734f1e0dd0909c33e7c0f1d6f845 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala @@ -6,12 +6,13 @@ import xiangshan._ import utils._ import xiangshan.backend.exu.Exu._ import xiangshan.backend.regfile.RfReadPort +import xiangshan.backend.rename.BusyTableReadIO class Dispatch2Int extends XSModule { val io = IO(new Bundle() { val fromDq = Flipped(Vec(dpParams.IntDqDeqWidth, DecoupledIO(new MicroOp))) - val readRf = Vec(NRIntReadPorts - NRMemReadPorts, Flipped(new RfReadPort(XLEN))) - val regRdy = Vec(NRIntReadPorts - NRMemReadPorts, Input(Bool())) + val readRf = Vec(NRIntReadPorts - NRMemReadPorts, Output(UInt(PhyRegIdxWidth.W))) + val readState = Vec(NRIntReadPorts - NRMemReadPorts, Flipped(new BusyTableReadIO)) val numExist = Input(Vec(exuParameters.IntExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.IntExuCnt, DecoupledIO(new MicroOp)) val readPortIndex = Vec(exuParameters.IntExuCnt, Output(UInt(log2Ceil(8 / 2).W))) @@ -58,13 +59,18 @@ class Dispatch2Int extends XSModule { val intDynamicMapped = intDynamicIndex.map(i => indexVec(i)) for (i <- intStaticIndex.indices) { val index = WireInit(VecInit(intStaticMapped(i) +: intDynamicMapped)) - io.readRf(2*i ).addr := io.fromDq(index(intReadPortSrc(i))).bits.psrc1 - io.readRf(2*i+1).addr := io.fromDq(index(intReadPortSrc(i))).bits.psrc2 + io.readRf(2*i ) := io.fromDq(index(intReadPortSrc(i))).bits.psrc1 + io.readRf(2*i+1) := io.fromDq(index(intReadPortSrc(i))).bits.psrc2 } val readPortIndex = Wire(Vec(exuParameters.IntExuCnt, UInt(2.W))) intStaticIndex.zipWithIndex.map({case (index, i) => readPortIndex(index) := i.U}) intDynamicIndex.zipWithIndex.map({case (index, i) => readPortIndex(index) := intDynamicExuSrc(i)}) + for (i <- 0 until dpParams.IntDqDeqWidth) { + io.readState(2*i ).req := io.fromDq(i).bits.psrc1 + io.readState(2*i+1).req := io.fromDq(i).bits.psrc2 + } + /** * Part 3: dispatch to reservation stations */ @@ -84,10 +90,11 @@ class Dispatch2Int extends XSModule { } enq.bits := io.fromDq(indexVec(i)).bits - val src1Ready = VecInit((0 until 4).map(i => io.regRdy(i * 2))) - val src2Ready = VecInit((0 until 4).map(i => io.regRdy(i * 2 + 1))) - enq.bits.src1State := src1Ready(readPortIndex(i)) - enq.bits.src2State := src2Ready(readPortIndex(i)) + val src1Ready = VecInit((0 until 4).map(i => io.readState(i * 2).resp)) + val src2Ready = VecInit((0 until 4).map(i => io.readState(i * 2 + 1).resp)) + enq.bits.src1State := src1Ready(indexVec(i)) + enq.bits.src2State := src2Ready(indexVec(i)) + enq.bits.src3State := DontCare XSInfo(enq.fire(), p"pc 0x${Hexadecimal(enq.bits.cf.pc)} with type ${enq.bits.ctrl.fuType} " + p"srcState(${enq.bits.src1State} ${enq.bits.src2State}) " + diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala index 1ce1b8d8e1704cd255c4492c093fd832f0f2ae64..2c407c7e80812f1be8b61093207a3144b58d6f82 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala @@ -5,17 +5,18 @@ import chisel3.util._ import xiangshan._ import utils._ import xiangshan.backend.regfile.RfReadPort +import xiangshan.backend.rename.BusyTableReadIO import xiangshan.backend.exu.Exu._ class Dispatch2Ls extends XSModule { val io = IO(new Bundle() { val fromDq = Flipped(Vec(dpParams.LsDqDeqWidth, DecoupledIO(new MicroOp))) - val readIntRf = Vec(NRMemReadPorts, Flipped(new RfReadPort(XLEN))) - val readFpRf = Vec(exuParameters.StuCnt, Flipped(new RfReadPort(XLEN + 1))) + val readIntRf = Vec(NRMemReadPorts, Output(UInt(PhyRegIdxWidth.W))) + val readFpRf = Vec(exuParameters.StuCnt, Output(UInt(PhyRegIdxWidth.W))) // val intRegAddr = Vec(NRMemReadPorts, Output(UInt(PhyRegIdxWidth.W))) // val fpRegAddr = Vec(exuParameters.StuCnt, Output(UInt(PhyRegIdxWidth.W))) - val intRegRdy = Vec(NRMemReadPorts, Input(Bool())) - val fpRegRdy = Vec(exuParameters.StuCnt, Input(Bool())) + val readIntState = Vec(NRMemReadPorts, Flipped(new BusyTableReadIO)) + val readFpState = Vec(exuParameters.StuCnt, Flipped(new BusyTableReadIO)) val numExist = Input(Vec(exuParameters.LsExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.LsExuCnt, DecoupledIO(new MicroOp)) }) @@ -52,12 +53,16 @@ class Dispatch2Ls extends XSModule { val readPort = Seq(0, 1, 2, 4) for (i <- 0 until exuParameters.LsExuCnt) { if (i < exuParameters.LduCnt) { - io.readIntRf(readPort(i)).addr := io.fromDq(indexVec(i)).bits.psrc1 + io.readIntRf(readPort(i)) := io.fromDq(indexVec(i)).bits.psrc1 + io.readIntState(readPort(i)).req := io.fromDq(indexVec(i)).bits.psrc1 } else { - io.readFpRf(i - exuParameters.LduCnt).addr := io.fromDq(indexVec(i)).bits.psrc2 - io.readIntRf(readPort(i) ).addr := io.fromDq(indexVec(i)).bits.psrc1 - io.readIntRf(readPort(i)+1).addr := io.fromDq(indexVec(i)).bits.psrc2 + io.readFpRf(i - exuParameters.LduCnt) := io.fromDq(indexVec(i)).bits.psrc2 + io.readIntRf(readPort(i) ) := io.fromDq(indexVec(i)).bits.psrc1 + io.readIntRf(readPort(i)+1) := io.fromDq(indexVec(i)).bits.psrc2 + io.readFpState(i - exuParameters.LduCnt).req := io.fromDq(indexVec(i)).bits.psrc2 + io.readIntState(readPort(i) ).req := io.fromDq(indexVec(i)).bits.psrc1 + io.readIntState(readPort(i)+1).req := io.fromDq(indexVec(i)).bits.psrc2 } } @@ -75,14 +80,15 @@ class Dispatch2Ls extends XSModule { enq.valid := storeIndexGen.io.mapping(i - exuParameters.LduCnt).valid && storeReady } enq.bits := io.fromDq(indexVec(i)).bits - enq.bits.src1State := io.intRegRdy(readPort(i)) + enq.bits.src1State := io.readIntState(readPort(i)).resp if (i < exuParameters.LduCnt) { enq.bits.src2State := DontCare } else { enq.bits.src2State := Mux(io.fromDq(indexVec(i)).bits.ctrl.src2Type === SrcType.fp, - io.fpRegRdy(i - exuParameters.LduCnt), io.intRegRdy(readPort(i) + 1)) + io.readFpState(i - exuParameters.LduCnt).resp, io.readIntState(readPort(i) + 1).resp) } + enq.bits.src3State := DontCare XSInfo(enq.fire(), p"pc 0x${Hexadecimal(enq.bits.cf.pc)} with type ${enq.bits.ctrl.fuType} " + p"srcState(${enq.bits.src1State} ${enq.bits.src2State}) " + diff --git a/src/main/scala/xiangshan/backend/exu/Exu.scala b/src/main/scala/xiangshan/backend/exu/Exu.scala index 4d88adbdad63af836fe3fa61bf6d9eb98f43e92b..4efe16bb122cd7b91cfa5651dacb86c9bc9d52f0 100644 --- a/src/main/scala/xiangshan/backend/exu/Exu.scala +++ b/src/main/scala/xiangshan/backend/exu/Exu.scala @@ -100,7 +100,7 @@ abstract class Exu(val config: ExuConfig) extends XSModule { val src2 = in.bits.src2 val src3 = in.bits.src3 - fu.io.in.valid := in.valid && sel && !in.bits.uop.roqIdx.needFlush(io.redirect) + fu.io.in.valid := in.valid && sel fu.io.in.bits.uop := in.bits.uop fu.io.in.bits.src.foreach(_ <> DontCare) if (fuCfg.srcCnt > 0) { @@ -120,13 +120,21 @@ abstract class Exu(val config: ExuConfig) extends XSModule { def writebackArb(in: Seq[DecoupledIO[FuOutput]], out: DecoupledIO[ExuOutput]): Arbiter[FuOutput] = { if (needArbiter) { - val arb = Module(new Arbiter(new FuOutput(in.head.bits.len), in.size)) - arb.io.in <> in - arb.io.out.ready := out.ready - out.bits.data := arb.io.out.bits.data - out.bits.uop := arb.io.out.bits.uop - out.valid := arb.io.out.valid - arb + if(in.size == 1){ + in.head.ready := out.ready + out.bits.data := in.head.bits.data + out.bits.uop := in.head.bits.uop + out.valid := in.head.valid + null + } else { + val arb = Module(new Arbiter(new FuOutput(in.head.bits.len), in.size)) + arb.io.in <> in + arb.io.out.ready := out.ready + out.bits.data := arb.io.out.bits.data + out.bits.uop := arb.io.out.bits.uop + out.valid := arb.io.out.valid + arb + } } else { in.foreach(_.ready := out.ready) val sel = Mux1H(in.map(x => x.valid -> x)) diff --git a/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala b/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala index 3c067732ca66bec02940e42abe759f81c5cfc9d3..d7a9e6c8a2469472a3a16b98e085874380d1a62b 100644 --- a/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala +++ b/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala @@ -14,8 +14,8 @@ class FmiscExeUnit extends Exu(fmiscExeUnitCfg) { val toFpUnits = Seq(f2f, fdivSqrt) val toIntUnits = Seq(f2i) - assert(fpArb.io.in.length == toFpUnits.size) - assert(intArb.io.in.length == toIntUnits.size) + assert(toFpUnits.size == 1 || fpArb.io.in.length == toFpUnits.size) + assert(toIntUnits.size == 1 || intArb.io.in.length == toIntUnits.size) val input = io.fromFp val isRVF = input.bits.uop.ctrl.isRVF diff --git a/src/main/scala/xiangshan/backend/exu/Wb.scala b/src/main/scala/xiangshan/backend/exu/Wb.scala index 7c7ad5011b57a57cc9cf4b51d05be9a519be22fe..3f6dbe757fad016209b2a2f5d4beebac87ba823e 100644 --- a/src/main/scala/xiangshan/backend/exu/Wb.scala +++ b/src/main/scala/xiangshan/backend/exu/Wb.scala @@ -56,15 +56,20 @@ class Wb(cfgs: Seq[ExuConfig], numOut: Int, isFp: Boolean) extends XSModule { mulReq.size ) - val arbiters = for(i <- mulReq.indices) yield { - val other = arbReq(i).getOrElse(Seq()) - val arb = Module(new Arbiter(new ExuOutput, 1+other.size)) - arb.io.in <> mulReq(i) +: other + for(i <- mulReq.indices) { val out = io.out(directConnect.size + i) - out.valid := arb.io.out.valid - out.bits := arb.io.out.bits - arb.io.out.ready := true.B - arb + val other = arbReq(i).getOrElse(Seq()) + if(other.isEmpty){ + out.valid := mulReq(i).valid + out.bits := mulReq(i).bits + mulReq(i).ready := true.B + } else { + val arb = Module(new Arbiter(new ExuOutput, 1+other.size)) + arb.io.in <> mulReq(i) +: other + out.valid := arb.io.out.valid + out.bits := arb.io.out.bits + arb.io.out.ready := true.B + } } if(portUsed < numOut){ @@ -78,10 +83,11 @@ class Wb(cfgs: Seq[ExuConfig], numOut: Int, isFp: Boolean) extends XSModule { } for(i <- mulReq.indices){ sb.append(s"[ ${cfgs(io.in.indexOf(mulReq(i))).name} ") + val useArb = arbReq(i).nonEmpty for(req <- arbReq(i).getOrElse(Nil)){ sb.append(s"${cfgs(io.in.indexOf(req)).name} ") } - sb.append(s"] -> arb -> out #${directConnect.size + i}\n") + sb.append(s"] -> ${if(useArb) "arb ->" else ""} out #${directConnect.size + i}\n") } println(sb) diff --git a/src/main/scala/xiangshan/backend/fu/CSR.scala b/src/main/scala/xiangshan/backend/fu/CSR.scala index 9897c844fdd17b90f827f2dcbbf62dc68495551c..a5ee9d8b069c65300572626fcf454249a408b3e4 100644 --- a/src/main/scala/xiangshan/backend/fu/CSR.scala +++ b/src/main/scala/xiangshan/backend/fu/CSR.scala @@ -47,6 +47,67 @@ trait HasExceptionNO { storeAddrMisaligned, loadAddrMisaligned ) + val frontendSet = List( + // instrAddrMisaligned, + instrAccessFault, + illegalInstr, + instrPageFault + ) + val csrSet = List( + illegalInstr, + breakPoint, + ecallU, + ecallS, + ecallM + ) + val loadUnitSet = List( + loadAddrMisaligned, + loadAccessFault, + loadPageFault + ) + val storeUnitSet = List( + storeAddrMisaligned, + storeAccessFault, + storePageFault + ) + val atomicsUnitSet = (loadUnitSet ++ storeUnitSet).distinct + val allPossibleSet = (frontendSet ++ csrSet ++ loadUnitSet ++ storeUnitSet).distinct + val csrWbCount = (0 until 16).map(i => if (csrSet.contains(i)) 1 else 0) + val loadWbCount = (0 until 16).map(i => if (loadUnitSet.contains(i)) 1 else 0) + val storeWbCount = (0 until 16).map(i => if (storeUnitSet.contains(i)) 1 else 0) + val atomicsWbCount = (0 until 16).map(i => if (atomicsUnitSet.contains(i)) 1 else 0) + val writebackCount = (0 until 16).map(i => csrWbCount(i) + atomicsWbCount(i) + loadWbCount(i) + 2 * storeWbCount(i)) + def partialSelect(vec: Vec[Bool], select: Seq[Int], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = { + if (dontCareBits) { + val new_vec = Wire(ExceptionVec()) + new_vec := DontCare + select.map(i => new_vec(i) := vec(i)) + return new_vec + } + else if (falseBits) { + val new_vec = Wire(ExceptionVec()) + new_vec.map(_ := false.B) + select.map(i => new_vec(i) := vec(i)) + return new_vec + } + else { + val new_vec = Wire(Vec(select.length, Bool())) + select.zipWithIndex.map{ case(s, i) => new_vec(i) := vec(s) } + return new_vec + } + } + def selectFrontend(vec: Vec[Bool], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = + partialSelect(vec, frontendSet, dontCareBits, falseBits) + def selectCSR(vec: Vec[Bool], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = + partialSelect(vec, csrSet, dontCareBits, falseBits) + def selectLoad(vec: Vec[Bool], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = + partialSelect(vec, loadUnitSet, dontCareBits, falseBits) + def selectStore(vec: Vec[Bool], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = + partialSelect(vec, storeUnitSet, dontCareBits, falseBits) + def selectAtomics(vec: Vec[Bool], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = + partialSelect(vec, atomicsUnitSet, dontCareBits, falseBits) + def selectAll(vec: Vec[Bool], dontCareBits: Boolean = true, falseBits: Boolean = false): Vec[Bool] = + partialSelect(vec, allPossibleSet, dontCareBits, falseBits) } class FpuCsrIO extends XSBundle { @@ -166,7 +227,7 @@ class CSR extends FunctionUnit with HasCSRConst if (HasFPU) { extList = extList ++ List('f', 'd') } val misaInitVal = getMisaMxl(2) | extList.foldLeft(0.U)((sum, i) => sum | getMisaExt(i)) //"h8000000000141105".U val misa = RegInit(UInt(XLEN.W), misaInitVal) - + // MXL = 2 | 0 | EXT = b 00 0000 0100 0001 0001 0000 0101 // (XLEN-1, XLEN-2) | |(25, 0) ZY XWVU TSRQ PONM LKJI HGFE DCBA @@ -175,8 +236,8 @@ class CSR extends FunctionUnit with HasCSRConst val mimpid = RegInit(UInt(XLEN.W), 0.U) // provides a unique encoding of the version of the processor implementation val mhartNo = hartId() val mhartid = RegInit(UInt(XLEN.W), mhartNo.asUInt) // the hardware thread running the code - val mstatus = RegInit(UInt(XLEN.W), "h00001800".U) // another option: "h8000c0100".U - + val mstatus = RegInit(UInt(XLEN.W), 0.U) + // mstatus Value Table // | sd | // | pad1 | @@ -196,7 +257,7 @@ class CSR extends FunctionUnit with HasCSRConst // | spp | 0 | // | pie | 0000 | pie.h is used as UBE // | ie | 0000 | uie hardlinked to 0, as N ext is not implemented - + val mstatusStruct = mstatus.asTypeOf(new MstatusStruct) def mstatusUpdateSideEffect(mstatus: UInt): UInt = { val mstatusOld = WireInit(mstatus.asTypeOf(new MstatusStruct)) @@ -318,11 +379,11 @@ class CSR extends FunctionUnit with HasCSRConst // Emu perfcnt val hasEmuPerfCnt = !env.FPGAPlatform val nrEmuPerfCnts = if (hasEmuPerfCnt) 0x80 else 0x3 - + val emuPerfCnts = List.fill(nrEmuPerfCnts)(RegInit(0.U(XLEN.W))) val emuPerfCntCond = List.fill(nrEmuPerfCnts)(WireInit(false.B)) (emuPerfCnts zip emuPerfCntCond).map { case (c, e) => when (e) { c := c + 1.U } } - + val emuPerfCntsLoMapping = (0 until nrEmuPerfCnts).map(i => MaskedRegMap(0x1000 + i, emuPerfCnts(i))) val emuPerfCntsHiMapping = (0 until nrEmuPerfCnts).map(i => MaskedRegMap(0x1080 + i, emuPerfCnts(i)(63, 32))) println(s"CSR: hasEmuPerfCnt:${hasEmuPerfCnt}") @@ -336,7 +397,7 @@ class CSR extends FunctionUnit with HasCSRConst mcycle := mcycle + 1.U val minstret = RegInit(0.U(XLEN.W)) minstret := minstret + RegNext(csrio.perf.retiredInstr) - + // CSR reg map val basicPrivMapping = Map( @@ -424,13 +485,13 @@ class CSR extends FunctionUnit with HasCSRConst val mapping = basicPrivMapping ++ perfCntMapping ++ - pmpMapping ++ - emuPerfCntsLoMapping ++ + pmpMapping ++ + emuPerfCntsLoMapping ++ (if (XLEN == 32) emuPerfCntsHiMapping else Nil) ++ (if (HasFPU) fcsrMapping else Nil) - + val addr = src2(11, 0) - val csri = src2(16, 12) + val csri = ZeroExt(src2(16, 12), XLEN) val rdata = Wire(UInt(XLEN.W)) val wdata = LookupTree(func, List( CSROpType.wrt -> src1, @@ -441,14 +502,17 @@ class CSR extends FunctionUnit with HasCSRConst CSROpType.clri -> (rdata & (~csri).asUInt()) )) - csrio.isPerfCnt := (addr >= Mcycle.U) && (addr <= Mhpmcounter31.U) + val addrInPerfCnt = (addr >= Mcycle.U) && (addr <= Mhpmcounter31.U) + csrio.isPerfCnt := addrInPerfCnt // satp wen check val satpLegalMode = (wdata.asTypeOf(new SatpStruct).mode===0.U) || (wdata.asTypeOf(new SatpStruct).mode===8.U) // general CSR wen check val wen = valid && func =/= CSROpType.jmp && (addr=/=Satp.U || satpLegalMode) - val permitted = csrAccessPermissionCheck(addr, false.B, priviledgeMode) + val modePermitted = csrAccessPermissionCheck(addr, false.B, priviledgeMode) + val perfcntPermitted = perfcntPermissionCheck(addr, priviledgeMode, mcounteren, scounteren) + val permitted = Mux(addrInPerfCnt, perfcntPermitted, modePermitted) // Writeable check is ingored. // Currently, write to illegal csr addr will be ignored MaskedRegMap.generate(mapping, addr, rdata, wen && permitted, wdata) @@ -580,6 +644,17 @@ class CSR extends FunctionUnit with HasCSRConst io.in.ready := true.B io.out.valid := valid + val csrExceptionVec = WireInit(cfIn.exceptionVec) + csrExceptionVec(breakPoint) := io.in.valid && isEbreak + csrExceptionVec(ecallM) := priviledgeMode === ModeM && io.in.valid && isEcall + csrExceptionVec(ecallS) := priviledgeMode === ModeS && io.in.valid && isEcall + csrExceptionVec(ecallU) := priviledgeMode === ModeU && io.in.valid && isEcall + // Trigger an illegal instr exception when: + // * unimplemented csr is being read/written + // * csr access is illegal + csrExceptionVec(illegalInstr) := (isIllegalAddr || isIllegalAccess) && wen + cfOut.exceptionVec := csrExceptionVec + /** * Exception and Intr */ @@ -609,24 +684,11 @@ class CSR extends FunctionUnit with HasCSRConst val hasStorePageFault = csrio.exception.bits.cf.exceptionVec(storePageFault) && raiseException val hasStoreAddrMisaligned = csrio.exception.bits.cf.exceptionVec(storeAddrMisaligned) && raiseException val hasLoadAddrMisaligned = csrio.exception.bits.cf.exceptionVec(loadAddrMisaligned) && raiseException + val hasInstrAccessFault = csrio.exception.bits.cf.exceptionVec(instrAccessFault) && raiseException + val hasLoadAccessFault = csrio.exception.bits.cf.exceptionVec(loadAccessFault) && raiseException + val hasStoreAccessFault = csrio.exception.bits.cf.exceptionVec(storeAccessFault) && raiseException - val csrExceptionVec = Wire(Vec(16, Bool())) - csrExceptionVec.map(_ := false.B) - csrExceptionVec(breakPoint) := io.in.valid && isEbreak - csrExceptionVec(ecallM) := priviledgeMode === ModeM && io.in.valid && isEcall - csrExceptionVec(ecallS) := priviledgeMode === ModeS && io.in.valid && isEcall - csrExceptionVec(ecallU) := priviledgeMode === ModeU && io.in.valid && isEcall - // Trigger an illegal instr exception when: - // * unimplemented csr is being read/written - // * csr access is illegal - csrExceptionVec(illegalInstr) := (isIllegalAddr || isIllegalAccess) && wen - csrExceptionVec(loadPageFault) := hasLoadPageFault - csrExceptionVec(storePageFault) := hasStorePageFault - val iduExceptionVec = cfIn.exceptionVec - val exceptionVec = csrExceptionVec.asUInt() | iduExceptionVec.asUInt() - cfOut.exceptionVec.zipWithIndex.map{case (e, i) => e := exceptionVec(i) } - - val raiseExceptionVec = csrio.exception.bits.cf.exceptionVec.asUInt() + val raiseExceptionVec = csrio.exception.bits.cf.exceptionVec val exceptionNO = ExcPriority.foldRight(0.U)((i: Int, sum: UInt) => Mux(raiseExceptionVec(i), i.U, sum)) val causeNO = (raiseIntr << (XLEN-1)).asUInt() | Mux(raiseIntr, intrNO, exceptionNO) @@ -738,6 +800,11 @@ class CSR extends FunctionUnit with HasCSRConst "PtwL2TlbHit" -> (0x1027, "perfCntPtwL2TlbHit" ), "ICacheReq" -> (0x1028, "perfCntIcacheReqCnt" ), "ICacheMiss" -> (0x1029, "perfCntIcacheMissCnt"), + "ICacheMMIO" -> (0x102a, "perfCntIcacheMMIOCnt"), + // "FetchFromLoopBuffer" -> (0x102b, "CntFetchFromLoopBuffer"), + // "ExitLoop1" -> (0x102c, "CntExitLoop1"), + // "ExitLoop2" -> (0x102d, "CntExitLoop2"), + // "ExitLoop3" -> (0x102e, "CntExitLoop3") "ubtbRight" -> (0x1030, "perfCntubtbRight"), "ubtbWrong" -> (0x1031, "perfCntubtbWrong"), diff --git a/src/main/scala/xiangshan/backend/fu/Fence.scala b/src/main/scala/xiangshan/backend/fu/Fence.scala index 5230e3429737d4a4b7693216fade2fc6ea571d8f..640f8889d77ef2077b5046f96e53f7d2df7e34d2 100644 --- a/src/main/scala/xiangshan/backend/fu/Fence.scala +++ b/src/main/scala/xiangshan/backend/fu/Fence.scala @@ -20,41 +20,51 @@ class Fence extends FunctionUnit{ // TODO: check it val fencei = IO(Output(Bool())) val toSbuffer = IO(new FenceToSbuffer) - val (valid, src1, uop, func, lsrc1, lsrc2) = ( + val (valid, src1) = ( io.in.valid, - io.in.bits.src(0), - io.in.bits.uop, - io.in.bits.uop.ctrl.fuOpType, - io.in.bits.uop.ctrl.lsrc1, - io.in.bits.uop.ctrl.lsrc2 + io.in.bits.src(0) ) - val s_sb :: s_tlb :: s_icache :: s_none :: Nil = Enum(4) - val state = RegInit(s_sb) + val s_idle :: s_wait :: s_tlb :: s_icache :: s_fence :: Nil = Enum(5) + val state = RegInit(s_idle) + /* fsm + * s_idle : init state, send sbflush + * s_wait : send sbflush, wait for sbEmpty + * s_tlb : flush tlb, just hold one cycle + * s_icache: flush icache, just hold one cycle + * s_fence : do nothing, for timing optimiaztion + */ val sbuffer = toSbuffer.flushSb val sbEmpty = toSbuffer.sbIsEmpty + val uop = RegEnable(io.in.bits.uop, io.in.fire()) + val func = uop.ctrl.fuOpType + val lsrc1 = uop.ctrl.lsrc1 + val lsrc2 = uop.ctrl.lsrc2 // NOTE: icache & tlb & sbuffer must receive flush signal at any time - sbuffer := valid && state === s_sb && !sbEmpty - fencei := (state === s_icache && sbEmpty) || (state === s_sb && valid && sbEmpty && func === FenceOpType.fencei) - sfence.valid := (state === s_tlb && sbEmpty) || (state === s_sb && valid && sbEmpty && func === FenceOpType.sfence) - sfence.bits.rs1 := Mux(state === s_sb, lsrc1 === 0.U, RegEnable(lsrc1 === 0.U, io.in.fire())) - sfence.bits.rs2 := Mux(state === s_sb, lsrc2 === 0.U, RegEnable(lsrc2 === 0.U, io.in.fire())) - sfence.bits.addr := Mux(state === s_sb, src1, RegEnable(src1, io.in.fire())) - - when (state === s_sb && valid && func === FenceOpType.fencei && !sbEmpty) { state := s_icache } - when (state === s_sb && valid && func === FenceOpType.sfence && !sbEmpty) { state := s_tlb } - when (state === s_sb && valid && func === FenceOpType.fence && !sbEmpty) { state := s_none } - when (state =/= s_sb && sbEmpty) { state := s_sb } + sbuffer := state === s_wait + fencei := state === s_icache + sfence.valid := state === s_tlb + sfence.bits.rs1 := lsrc1 === 0.U + sfence.bits.rs2 := lsrc2 === 0.U + sfence.bits.addr := RegEnable(src1, io.in.fire()) - assert(!(io.out.valid && io.out.bits.uop.ctrl.rfWen)) - io.in.ready := state === s_sb - io.out.valid := (state =/= s_sb && sbEmpty) || (state === s_sb && sbEmpty && valid) + when (state === s_idle && valid) { state := s_wait } + when (state === s_wait && func === FenceOpType.fencei && sbEmpty) { state := s_icache } + when (state === s_wait && func === FenceOpType.sfence && sbEmpty) { state := s_tlb } + when (state === s_wait && func === FenceOpType.fence && sbEmpty) { state := s_fence } + when (state =/= s_idle && state =/= s_wait) { state := s_idle } + + io.in.ready := state === s_idle + io.out.valid := state =/= s_idle && state =/= s_wait io.out.bits.data := DontCare - io.out.bits.uop := Mux(state === s_sb, uop, RegEnable(uop, io.in.fire())) + io.out.bits.uop := uop - assert(!(valid || state =/= s_sb) || io.out.ready) // NOTE: fence instr must be the first(only one) instr, so io.out.ready must be true + XSDebug(valid, p"In(${io.in.valid} ${io.in.ready}) state:${state} Inpc:0x${Hexadecimal(io.in.bits.uop.cf.pc)} InroqIdx:${io.in.bits.uop.roqIdx}\n") + XSDebug(state =/= s_idle, p"state:${state} sbuffer(flush:${sbuffer} empty:${sbEmpty}) fencei:${fencei} sfence:${sfence}\n") + XSDebug(io.out.valid, p" Out(${io.out.valid} ${io.out.ready}) state:${state} Outpc:0x${Hexadecimal(io.out.bits.uop.cf.pc)} OutroqIdx:${io.out.bits.uop.roqIdx}\n") - XSDebug(valid || state=/=s_sb || io.out.valid, p"In(${io.in.valid} ${io.in.ready}) Out(${io.out.valid} ${io.out.ready}) state:${state} sbuffer(flush:${sbuffer} empty:${sbEmpty}) fencei:${fencei} sfence:${sfence} Inpc:0x${Hexadecimal(io.in.bits.uop.cf.pc)} InroqIdx:${io.in.bits.uop.roqIdx} Outpc:0x${Hexadecimal(io.out.bits.uop.cf.pc)} OutroqIdx:${io.out.bits.uop.roqIdx}\n") + assert(!(io.out.valid && io.out.bits.uop.ctrl.rfWen)) + assert(!io.out.valid || io.out.ready, "when fence is out valid, out ready should always be true") } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/fu/Jump.scala b/src/main/scala/xiangshan/backend/fu/Jump.scala index 56fed86aaff0f5b375b3e7431400a1772d16671f..64a77242eddd4f650d705788ddabb6ab3810b9f6 100644 --- a/src/main/scala/xiangshan/backend/fu/Jump.scala +++ b/src/main/scala/xiangshan/backend/fu/Jump.scala @@ -25,10 +25,13 @@ class Jump extends FunctionUnit with HasRedirectOut { io.in.bits.uop ) - val offset = SignExt(Mux(JumpOpType.jumpOpIsJal(func), - ImmUnion.J.toImm32(immMin), - ImmUnion.I.toImm32(immMin) - ), XLEN) + val isJalr = JumpOpType.jumpOpisJalr(func) + val isAuipc = JumpOpType.jumpOpisAuipc(func) + val offset = SignExt(Mux1H(Seq( + isJalr -> ImmUnion.I.toImm32(immMin), + isAuipc -> ImmUnion.U.toImm32(immMin), + !(isJalr || isAuipc) -> ImmUnion.J.toImm32(immMin) + )), XLEN) val redirectHit = uop.roqIdx.needFlush(io.redirectIn) val valid = io.in.valid @@ -53,7 +56,7 @@ class Jump extends FunctionUnit with HasRedirectOut { brUpdate.taken := true.B // Output - val res = snpc + val res = Mux(JumpOpType.jumpOpisAuipc(func), target, snpc) io.in.ready := io.out.ready io.out.valid := valid diff --git a/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala b/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala index 67fd4a6fe156d0d3b6a5a784c730146ed4207ee1..654f8b3264f58c3a309743127e1d4f3608e33ab5 100644 --- a/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala +++ b/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala @@ -41,7 +41,7 @@ class Radix2Divider(len: Int) extends AbstractDivider(len) { val uopReg = RegEnable(uop, newReq) val cnt = Counter(len) - when (newReq) { + when (newReq && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn)) { state := s_log2 } .elsewhen (state === s_log2) { // `canSkipShift` is calculated as following: @@ -85,6 +85,6 @@ class Radix2Divider(len: Int) extends AbstractDivider(len) { io.out.bits.data := Mux(ctrlReg.isW, SignExt(res(31,0),xlen), res) io.out.bits.uop := uopReg - io.out.valid := state === s_finish && !kill + io.out.valid := state === s_finish io.in.ready := state === s_idle } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala b/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala index 4d3806672f3bfd235fa97d3dbacb6d83509a58be..ea8fd75724cc3525e1bd41378cd4459b4533f363 100644 --- a/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala +++ b/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala @@ -37,7 +37,9 @@ class SRT4Divider(len: Int) extends AbstractDivider(len) { switch(state){ is(s_idle){ - when(io.in.fire()){ state := Mux(divZero, s_finish, s_lzd) } + when (io.in.fire() && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn)) { + state := Mux(divZero, s_finish, s_lzd) + } } is(s_lzd){ // leading zero detection state := s_normlize @@ -220,7 +222,7 @@ class SRT4Divider(len: Int) extends AbstractDivider(len) { ) io.in.ready := state===s_idle - io.out.valid := state===s_finish && !kill + io.out.valid := state===s_finish io.out.bits.data := Mux(ctrlReg.isW, SignExt(res(31, 0), len), res diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala b/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala index 0e57bd851b385ced46ecfa4ba3257346d8900b3c..0b5ff0f66836e3dca9beda5f84c3f4ee0c332db1 100644 --- a/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FDivSqrt.scala @@ -47,7 +47,7 @@ class FDivSqrt extends FPUSubModule { val src1 = unbox(io.in.bits.src(0), tag, None) val src2 = unbox(io.in.bits.src(1), tag, None) - divSqrt.io.inValid := io.in.fire() + divSqrt.io.inValid := io.in.fire() && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn) divSqrt.io.sqrtOp := fpCtrl.sqrt divSqrt.io.a := src1 divSqrt.io.b := src2 @@ -74,7 +74,7 @@ class FDivSqrt extends FPUSubModule { val flags = Mux(single, round32.io.exceptionFlags, round64.io.exceptionFlags) io.in.ready := state===s_idle - io.out.valid := state===s_finish && !(killReg || kill) + io.out.valid := state===s_finish && !killReg io.out.bits.uop := uopReg io.out.bits.data := RegNext(data, divSqrtRawValid) fflags := RegNext(flags, divSqrtRawValid) diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala b/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala index 2d434bb169c73271468c1c7d0d0ef3ba25f1e536..0b1e11fbbff9bc61cf80d532c35ab121819b1092 100644 --- a/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FPToFP.scala @@ -12,14 +12,16 @@ class FPToFP extends FPUPipelineModule{ override def latency: Int = FunctionUnit.f2iCfg.latency.latencyVal.get - val ctrl = io.in.bits.uop.ctrl.fpu + val ctrlIn = io.in.bits.uop.ctrl.fpu + val ctrl = S1Reg(ctrlIn) val inTag = ctrl.typeTagIn val outTag = ctrl.typeTagOut - val src1 = unbox(io.in.bits.src(0), inTag, None) - val src2 = unbox(io.in.bits.src(1), inTag, None) val wflags = ctrl.wflags + val src1 = S1Reg(unbox(io.in.bits.src(0), ctrlIn.typeTagIn, None)) + val src2 = S1Reg(unbox(io.in.bits.src(1), ctrlIn.typeTagIn, None)) + val rmReg = S1Reg(rm) - val signNum = Mux(rm(1), src1 ^ src2, Mux(rm(0), ~src2, src2)) + val signNum = Mux(rmReg(1), src1 ^ src2, Mux(rmReg(0), ~src2, src2)) val fsgnj = Cat(signNum(fLen), src1(fLen-1, 0)) val fsgnjMux = Wire(new Bundle() { @@ -32,7 +34,7 @@ class FPToFP extends FPUPipelineModule{ val dcmp = Module(new CompareRecFN(maxExpWidth, maxSigWidth)) dcmp.io.a := src1 dcmp.io.b := src2 - dcmp.io.signaling := !rm(1) + dcmp.io.signaling := !rmReg(1) val lt = dcmp.io.lt || (dcmp.io.a.asSInt() < 0.S && dcmp.io.b.asSInt() >= 0.S) @@ -41,7 +43,7 @@ class FPToFP extends FPUPipelineModule{ val isnan2 = maxType.isNaN(src2) val isInvalid = maxType.isSNaN(src1) || maxType.isSNaN(src2) val isNaNOut = isnan1 && isnan2 - val isLHS = isnan2 || rm(0) =/= lt && !isnan1 + val isLHS = isnan2 || rmReg(0) =/= lt && !isnan1 fsgnjMux.exc := isInvalid << 4 fsgnjMux.data := Mux(isNaNOut, maxType.qNaN, Mux(isLHS, src1, src2)) } @@ -67,7 +69,7 @@ class FPToFP extends FPUPipelineModule{ when(outTag === typeTag(outType).U && (typeTag(outType) == 0).B || (outTag < inTag)){ val narrower = Module(new hardfloat.RecFNToRecFN(maxType.exp, maxType.sig, outType.exp, outType.sig)) narrower.io.in := src1 - narrower.io.roundingMode := rm + narrower.io.roundingMode := rmReg narrower.io.detectTininess := hardfloat.consts.tininess_afterRounding val narrowed = sanitizeNaN(narrower.io.out, outType) mux.data := Cat(fsgnjMux.data >> narrowed.getWidth, narrowed) @@ -77,11 +79,6 @@ class FPToFP extends FPUPipelineModule{ } } - var resVec = Seq(mux) - for(i <- 1 to latency){ - resVec = resVec :+ PipelineReg(i)(resVec(i-1)) - } - - io.out.bits.data := resVec.last.data - fflags := resVec.last.exc + io.out.bits.data := S2Reg(mux.data) + fflags := S2Reg(mux.exc) } diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala b/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala index 405afc4ce2c2b45c61f1456c03863b5e6d97a400..9a1e0d269c5c292f28969abb4503caf5ad6843e3 100644 --- a/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FPToInt.scala @@ -18,30 +18,37 @@ class FPToInt extends FPUPipelineModule { val ctrl = io.in.bits.uop.ctrl.fpu - val src1_s = unbox(src1, S, Some(FType.S)) - val src1_d = unbox(src1, ctrl.typeTagIn, None) - val src2_d = unbox(src2, ctrl.typeTagIn, None) - - val src1_ieee = ieee(src1) - val move_out = Mux(ctrl.typeTagIn === S, src1_ieee(31, 0), src1_ieee) + // stage 1: unbox inputs + val src1_d = S1Reg(unbox(src1, ctrl.typeTagIn, None)) + val src2_d = S1Reg(unbox(src2, ctrl.typeTagIn, None)) + val ctrl_reg = S1Reg(ctrl) + val rm_reg = S1Reg(rm) + + // stage2 + + val src1_ieee = ieee(src1_d) + val move_out = Mux(ctrl_reg.typeTagIn === S, + src1_ieee(FType.S.ieeeWidth - 1, 0), + src1_ieee + ) - val classify_out = Mux(ctrl.typeTagIn === S, - FType.S.classify(src1_s), - FType.D.classify(src1) + val classify_out = Mux(ctrl_reg.typeTagIn === S, + FType.S.classify(maxType.unsafeConvert(src1_d, FType.S)), + FType.D.classify(src1_d) ) val dcmp = Module(new hardfloat.CompareRecFN(maxExpWidth, maxSigWidth)) dcmp.io.a := src1_d dcmp.io.b := src2_d - dcmp.io.signaling := !rm(1) + dcmp.io.signaling := !rm_reg(1) - val dcmp_out = ((~rm).asUInt() & Cat(dcmp.io.lt, dcmp.io.eq)).orR() + val dcmp_out = ((~rm_reg).asUInt() & Cat(dcmp.io.lt, dcmp.io.eq)).orR() val dcmp_exc = dcmp.io.exceptionFlags val conv = Module(new RecFNToIN(maxExpWidth, maxSigWidth, XLEN)) conv.io.in := src1_d - conv.io.roundingMode := rm - conv.io.signedOut := ~ctrl.typ(0) + conv.io.roundingMode := rm_reg + conv.io.signedOut := ~ctrl_reg.typ(0) val conv_out = WireInit(conv.io.out) val conv_exc = WireInit(Cat( @@ -52,10 +59,10 @@ class FPToInt extends FPUPipelineModule { val narrow = Module(new RecFNToIN(maxExpWidth, maxSigWidth, 32)) narrow.io.in := src1_d - narrow.io.roundingMode := rm - narrow.io.signedOut := ~ctrl.typ(0) + narrow.io.roundingMode := rm_reg + narrow.io.signedOut := ~ctrl_reg.typ(0) - when(!ctrl.typ(1)) { // fcvt.w/wu.fp + when(!ctrl_reg.typ(1)) { // fcvt.w/wu.fp val excSign = src1_d(maxExpWidth + maxSigWidth) && !maxType.isNaN(src1_d) val excOut = Cat(conv.io.signedOut === excSign, Fill(32 - 1, !excSign)) val invalid = conv.io.intExceptionFlags(2) || narrow.io.intExceptionFlags(1) @@ -67,26 +74,18 @@ class FPToInt extends FPUPipelineModule { val intData = Wire(UInt(XLEN.W)) - intData := Mux(ctrl.wflags, - Mux(ctrl.fcvt, conv_out, dcmp_out), - Mux(rm(0), classify_out, move_out) + intData := Mux(ctrl_reg.wflags, + Mux(ctrl_reg.fcvt, conv_out, dcmp_out), + Mux(rm_reg(0), classify_out, move_out) ) - val doubleOut = Mux(ctrl.fcvt, ctrl.typ(1), ctrl.fmt(0)) - val intValue = Mux(doubleOut, + val doubleOut = Mux(ctrl_reg.fcvt, ctrl_reg.typ(1), ctrl_reg.fmt(0)) + val intValue = S2Reg(Mux(doubleOut, SignExt(intData, XLEN), SignExt(intData(31, 0), XLEN) - ) - - val exc = Mux(ctrl.fcvt, conv_exc, dcmp_exc) - - var dataVec = Seq(intValue) - var excVec = Seq(exc) + )) - for (i <- 1 to latency) { - dataVec = dataVec :+ PipelineReg(i)(dataVec(i - 1)) - excVec = excVec :+ PipelineReg(i)(excVec(i - 1)) - } + val exc = S2Reg(Mux(ctrl_reg.fcvt, conv_exc, dcmp_exc)) - io.out.bits.data := dataVec.last - fflags := excVec.last + io.out.bits.data := intValue + fflags := exc } diff --git a/src/main/scala/xiangshan/backend/fu/util/CSRConst.scala b/src/main/scala/xiangshan/backend/fu/util/CSRConst.scala index d84f0a66d36ea9f2b2d0b2731d6815443dee6bf3..80b6c16ad8c7d4993429ecb78f8fef13c775fa96 100644 --- a/src/main/scala/xiangshan/backend/fu/util/CSRConst.scala +++ b/src/main/scala/xiangshan/backend/fu/util/CSRConst.scala @@ -185,4 +185,9 @@ trait HasCSRConst { val lowestAccessPrivilegeLevel = addr(9,8) mode >= lowestAccessPrivilegeLevel && !(wen && readOnly) } + + def perfcntPermissionCheck(addr: UInt, mode: UInt, mmask: UInt, smask: UInt): Bool = { + val index = UIntToOH(addr & 31.U) + Mux(mode === ModeM, true.B, Mux(mode === ModeS, (index & mmask) =/= 0.U, (index & mmask & smask) =/= 0.U)) + } } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala b/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala index ab43642ed657ab99fabb8feb76a604f62a7fb90b..3c25d7084cf0b028b949d31d1dda656df42d0da1 100644 --- a/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala +++ b/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala @@ -4,7 +4,8 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ -import xiangshan.backend.decode.ImmUnion +import xiangshan.backend.SelImm +import xiangshan.backend.decode.{ImmUnion, Imm_U} import xiangshan.backend.exu.{Exu, ExuConfig} import xiangshan.backend.regfile.RfReadPort @@ -22,12 +23,13 @@ class BypassQueue(number: Int) extends XSModule { } else if(number == 0) { io.in <> io.out io.out.valid := io.in.valid + // NOTE: no delay bypass don't care redirect } else { val queue = Seq.fill(number)(RegInit(0.U.asTypeOf(new Bundle{ val valid = Bool() val bits = new MicroOp }))) - queue(0).valid := io.in.valid + queue(0).valid := io.in.valid && !io.in.bits.roqIdx.needFlush(io.redirect) queue(0).bits := io.in.bits (0 until (number-1)).map{i => queue(i+1) := queue(i) @@ -50,7 +52,7 @@ class RSCtrlDataIO(srcNum: Int) extends XSBundle { val fuReady = Input(Bool()) val srcUpdate = Input(Vec(IssQueSize+1, Vec(srcNum, Bool()))) // Note: the last one for enq - val redVec = Input(UInt(IssQueSize.W)) + val redirectVec = Input(Vec(IssQueSize, Bool())) val feedback = Input(Vec(IssQueSize+1, Bool())) // Note: the last one for hit override def cloneType: RSCtrlDataIO.this.type = new RSCtrlDataIO(srcNum).asInstanceOf[this.type] @@ -100,16 +102,14 @@ class ReservationStationCtrl * valid queue : from state queue, valid or not * empty queue : from state queue, empty or not(not valid and not replay) * src queue : record rdy or not - * cnt queue : record replay cycle + * count queue : record replay cycle */ - val s_idle :: s_valid :: s_selected :: s_bubble :: s_wait :: s_replay :: Nil = Enum(6) + val s_idle :: s_valid :: s_wait :: s_replay :: Nil = Enum(4) /* state machine * s_idle : empty slot, init state, set when deq * s_valid : ready to be secleted - * s_selected : the not bubble that selected - * s_bubble : the bubble that selected * s_wait : wait for feedback * s_replay : replay after some particular cycle */ @@ -117,64 +117,69 @@ class ReservationStationCtrl val validQueue = VecInit(stateQueue.map(_ === s_valid)) val emptyQueue = VecInit(stateQueue.map(_ === s_idle)) val srcQueue = Reg(Vec(iqSize, Vec(srcNum, Bool()))) - val cntQueue = Reg(Vec(iqSize, UInt(log2Up(replayDelay).W))) + val countQueue = Reg(Vec(iqSize, UInt(log2Up(replayDelay).W))) // rs queue part: // val tailPtr = RegInit(0.U((iqIdxWidth+1).W)) val tailPtr = RegInit(0.U.asTypeOf(new CircularQueuePtr(iqSize))) - val idxQueue = RegInit(VecInit((0 until iqSize).map(_.U(iqIdxWidth.W)))) + val indexQueue = RegInit(VecInit((0 until iqSize).map(_.U(iqIdxWidth.W)))) // turn to indexed index def widthMap[T <: Data](f: Int => T) = VecInit((0 until iqSize).map(f)) - val stateIdxQue = widthMap(i => stateQueue(idxQueue(i))) // NOTE: only use for debug, remove it later - val validIdxQue = widthMap(i => validQueue(idxQueue(i))) - val emptyIdxQue = widthMap(i => emptyQueue(idxQueue(i))) - val srcIdxQue = widthMap(i => srcQueue(idxQueue(i))) - val cntIdxQue = widthMap(i => cntQueue(idxQueue(i))) // NOTE: only use for debug, remove it later + val stateIdxQue = widthMap(i => stateQueue(indexQueue(i))) // NOTE: only use for debug, remove it later + val validIdxQue = widthMap(i => validQueue(indexQueue(i))) + val emptyIdxQue = widthMap(i => emptyQueue(indexQueue(i))) + val srcIdxQue = widthMap(i => srcQueue(indexQueue(i))) + val cntIdxQue = widthMap(i => countQueue(indexQueue(i))) // NOTE: only use for debug, remove it later val readyIdxQue = VecInit(srcIdxQue.zip(validIdxQue).map{ case (a,b) => Cat(a).andR & b }) // redirect - val redVec = io.data.redVec - val redVecPtr = widthMap(i => io.data.redVec(idxQueue(i))) - val fbMatchVec = Wire(UInt(iqSize.W)) + val redirectVec = io.data.redirectVec + val redirectVecPtr = widthMap(i => io.data.redirectVec(indexQueue(i))) + val feedbackMatchVec = Wire(UInt(iqSize.W)) if (feedback) { - fbMatchVec := widthMap(i => io.data.feedback(i) && (stateQueue(i) === s_wait || stateQueue(i)===s_valid)).asUInt + feedbackMatchVec := widthMap(i => io.data.feedback(i) && (stateQueue(i) === s_wait || stateQueue(i)===s_valid)).asUInt } else { - fbMatchVec := 0.U + feedbackMatchVec := 0.U } - val fbHit = io.data.feedback(IssQueSize) + val feedbackHit = io.data.feedback(IssQueSize) // select ready // for no replay, select just equal to deq (attached) // with replay, select is just two stage with deq. - val issFire = Wire(Bool()) + val issueFire = Wire(Bool()) val moveMask = WireInit(0.U(iqSize.W)) - val selectMask = WireInit(VecInit((0 until iqSize).map(i => readyIdxQue(i)))) - // val selIdx = ParallelMux(selectMask zip idxQueue) // NOTE: the idx in the idxQueue - val (selPtr, haveReady) = PriorityEncoderWithFlag(selectMask) // NOTE: the idx of idxQueue - val selIdx = idxQueue(selPtr) - val selIdxReg = RegNext(selIdx) // NOTE: may dup with other signal, fix it later - val redSel = redVec(selIdx) - val selValid = !redSel && haveReady - val selReg = RegNext(selValid) - val selPtrReg = RegNext(Mux(moveMask(selPtr), selPtr-1.U, selPtr)) + val lastSelMask = Wire(UInt(iqSize.W)) + val selectMask = WireInit(VecInit((0 until iqSize).map(i => readyIdxQue(i)))).asUInt & lastSelMask + val selectIndex = ParallelPriorityMux(selectMask.asBools zip indexQueue) // NOTE: the idx in the indexQueue + val selectPtr = ParallelPriorityMux(selectMask.asBools.zipWithIndex.map{ case (a,i) => (a, i.U)}) // NOTE: the idx of indexQueue + val haveReady = Cat(selectMask).orR + val selectIndexReg = RegNext(selectIndex) + val selectValid = haveReady + val selectReg = RegNext(selectValid) + val selectPtrReg = RegNext(Mux(moveMask(selectPtr), selectPtr-1.U, selectPtr)) + lastSelMask := ~Mux(selectReg, UIntToOH(selectPtrReg), 0.U) + assert(RegNext(!(haveReady && selectPtr >= tailPtr.asUInt)), "bubble should not have valid state like s_valid or s_wait") // sel bubble - val bubMask = WireInit(VecInit((0 until iqSize).map(i => emptyIdxQue(i)))) - // val bubIdx = ParallelMux(bubMask zip idxQueue) // NOTE: the idx in the idxQueue - val (bubPtr, findBubble) = PriorityEncoderWithFlag(bubMask) // NOTE: the idx of the idxQueue - val haveBubble = findBubble && (bubPtr < tailPtr.asUInt) - val bubIdx = idxQueue(bubPtr) - val bubIdxReg = RegNext(bubIdx) // NOTE: may dup with other signal, fix it later - val bubValid = haveBubble && (if (feedback) true.B else !selValid) - val bubReg = RegNext(bubValid) - val bubPtrReg = RegNext(Mux(moveMask(bubPtr), bubPtr-1.U, bubPtr)) + val lastbubbleMask = Wire(UInt(iqSize.W)) + val bubbleMask = WireInit(VecInit((0 until iqSize).map(i => emptyIdxQue(i)))).asUInt & lastbubbleMask + // val bubbleIndex = ParallelMux(bubbleMask zip indexQueue) // NOTE: the idx in the indexQueue + val bubblePtr= ParallelPriorityMux(bubbleMask.asBools.zipWithIndex.map{ case (a,i) => (a, i.U)}) // NOTE: the idx of the indexQueue + val findBubble = Cat(bubbleMask).orR + val haveBubble = findBubble && (bubblePtr < tailPtr.asUInt) + val bubbleIndex = indexQueue(bubblePtr) + val bubbleValid = haveBubble && (if (feedback) true.B else !selectValid) + val bubbleReg = RegNext(bubbleValid) + val bubblePtrReg = RegNext(Mux(moveMask(bubblePtr), bubblePtr-1.U, bubblePtr)) + lastbubbleMask := ~Mux(bubbleReg, UIntToOH(bubblePtrReg), 0.U) & (if(feedback) ~(0.U(iqSize.W)) + else Mux(RegNext(selectValid && io.redirect.valid), 0.U, ~(0.U(iqSize.W)))) // deq - val dequeue = if (feedback) bubReg - else bubReg || issFire - val deqPtr = if (feedback) bubPtrReg - else Mux(selReg, selPtrReg, bubPtrReg) + val dequeue = if (feedback) bubbleReg + else bubbleReg || issueFire + val deqPtr = if (feedback) bubblePtrReg + else Mux(selectReg, selectPtrReg, bubblePtrReg) moveMask := { (Fill(iqSize, 1.U(1.W)) << deqPtr)(iqSize-1, 0) } & Fill(iqSize, dequeue) @@ -182,31 +187,30 @@ class ReservationStationCtrl // move, move happens when deq for(i <- 0 until iqSize-1){ when(moveMask(i)){ - idxQueue(i) := idxQueue(i+1) + indexQueue(i) := indexQueue(i+1) } } when(dequeue){ - idxQueue.last := idxQueue(deqPtr) + indexQueue.last := indexQueue(deqPtr) } - when (selValid) { - stateQueue(selIdx) := s_selected - } - when (bubValid) { - stateQueue(bubIdx) := s_bubble + + when (issueFire) { + if (feedback) { when (stateQueue(selectIndexReg) === s_valid) { stateQueue(selectIndexReg) := s_wait } } + else { stateQueue(selectIndexReg) := s_idle } // NOTE: reset the state for seclectMask timing to avoid operaion '<' } // redirect and feedback && wakeup for (i <- 0 until iqSize) { // replay - val cnt = cntQueue(i) + val count = countQueue(i) when (stateQueue(i) === s_replay) { - cnt := cnt - 1.U - when (cnt === 0.U) { stateQueue(i) := s_valid } + count := count - 1.U + when (count === 0.U) { stateQueue(i) := s_valid } } // feedback - when (fbMatchVec(i)) { - stateQueue(i) := Mux(fbHit, s_idle, s_replay) - cntQueue(i) := Mux(fbHit, cnt, (replayDelay-1).U) + when (feedbackMatchVec(i)) { + stateQueue(i) := Mux(!feedbackHit && (stateQueue(i) === s_wait || stateQueue(i) === s_valid), s_replay, s_idle) + countQueue(i) := Mux(feedbackHit, count, (replayDelay-1).U) } // wakeup val hitVec = io.data.srcUpdate(i) @@ -216,33 +220,19 @@ class ReservationStationCtrl XSDebug(p"srcHit: i:${i.U} j:${j.U} src:${srcQueue(i)(j)}\n") } } - // mask last selectet slot and deal with the mask - // TODO: state queu change may have long 'when' chain -> long latency - when (stateQueue(i) === s_selected) { - when (io.data.fuReady) { - if (feedback) { - stateQueue(i) := s_wait - } else { - stateQueue(i) := s_idle - } - }.otherwise { stateQueue(i) := s_valid } - } - when (stateQueue(i) === s_bubble) { - stateQueue(i) := s_idle - } // redirect - when (redVec(i) && stateQueue(i) =/= s_idle) { + when (redirectVec(i) && stateQueue(i) =/= s_idle) { stateQueue(i) := s_idle } } // output - val issValid = selReg && !redVecPtr(selPtrReg) + val issueValid = selectReg if (nonBlocked) { - issFire := issValid + issueFire := issueValid assert(RegNext(io.data.fuReady), "if fu wanna fast wakeup, it should not block") } else { - issFire := issValid && io.data.fuReady + issueFire := issueValid && io.data.fuReady } // enq @@ -253,14 +243,14 @@ class ReservationStationCtrl val tailDec = tailPtr-1.U tailPtr := Mux(dequeue === enqueue, tailPtr, Mux(dequeue, tailDec, tailInc)) - io.enqCtrl.ready := !isFull || dequeue + io.enqCtrl.ready := !isFull || (if(feedback || nonBlocked) dequeue else false.B) val enqUop = io.enqCtrl.bits val srcSeq = Seq(enqUop.psrc1, enqUop.psrc2, enqUop.psrc3) val srcTypeSeq = Seq(enqUop.ctrl.src1Type, enqUop.ctrl.src2Type, enqUop.ctrl.src3Type) val srcStateSeq = Seq(enqUop.src1State, enqUop.src2State, enqUop.src3State) val enqPtr = Mux(tailPtr.flag, deqPtr, tailPtr.value) - val enqIdx = idxQueue(enqPtr) + val enqIdx = indexQueue(enqPtr) val enqBpVec = io.data.srcUpdate(IssQueSize) def stateCheck(src: UInt, srcType: UInt): Bool = { @@ -280,8 +270,8 @@ class ReservationStationCtrl // other to Data io.data.enqPtr := enqIdx - io.data.deqPtr.valid := selValid - io.data.deqPtr.bits := selIdx + io.data.deqPtr.valid := selectValid + io.data.deqPtr.bits := selectIndex io.data.enqCtrl.valid := enqueue io.data.enqCtrl.bits := io.enqCtrl.bits @@ -292,20 +282,20 @@ class ReservationStationCtrl assert(RegNext(Mux(tailPtr.flag, tailPtr.value===0.U, true.B))) val print = !(tailPtr.asUInt===0.U) || io.enqCtrl.valid || enqueue || dequeue - XSDebug(print || true.B, p"In(${io.enqCtrl.valid} ${io.enqCtrl.ready}) Out(${issValid} ${io.data.fuReady}) nonBlocked:${nonBlocked.B} needfb:${feedback.B}\n") - XSDebug(print , p"tailPtr:${tailPtr} enq:${enqueue} deq:${dequeue} isFull:${isFull} " + + XSDebug(print || true.B, p"In(${io.enqCtrl.valid} ${io.enqCtrl.ready}) Out(${issueValid} ${io.data.fuReady}) nonBlocked:${nonBlocked.B} needfb:${feedback.B}\n") + XSDebug(print || true.B, p"tailPtr:${tailPtr} enq:${enqueue} deq:${dequeue} isFull:${isFull} " + p"vIdxQue:${Binary(validIdxQue.asUInt)} rIdxQue:${Binary(readyIdxQue.asUInt)}\n") - XSDebug(print && Cat(redVecPtr).orR, p"Redirect: ${Hexadecimal(redVecPtr.asUInt)}\n") - XSDebug(print && Cat(fbMatchVec).orR, p"Feedback: ${Hexadecimal(fbMatchVec.asUInt)} Hit:${fbHit}\n") - XSDebug(print, p"moveMask:${Binary(moveMask)} selMask:${Binary(selectMask.asUInt)} bubMask:${Binary(bubMask.asUInt)}\n") - XSDebug(print, p"selIdxWire:${selPtr} haveReady:${haveReady} redSel:${redSel}" + - p"selV:${selValid} selReg:${selReg} selPtrReg:${selPtrReg} selIdx:${selIdx} selIdxReg:${selIdxReg}\n") - XSDebug(print, p"bubValid:${bubValid} haveBub:${haveBubble} bubPtr:${bubPtr} findBub:${findBubble} " + - p"bubReg:${bubReg} bubPtrReg:${bubPtrReg} bubIdx:${bubIdx} bubIdxReg:${bubIdxReg}\n") - XSDebug(print, p"issValid:${issValid} issueFire:${issFire} dequeue:${dequeue} deqPtr:${deqPtr}\n") - XSDebug(p" :Idx|v|r|s |cnt|s1:s2:s3\n") + XSDebug(print && Cat(redirectVecPtr).orR, p"Redirect: ${Hexadecimal(redirectVecPtr.asUInt)}\n") + XSDebug(print && Cat(feedbackMatchVec).orR, p"Feedback: ${Hexadecimal(feedbackMatchVec.asUInt)} Hit:${feedbackHit}\n") + XSDebug(print || true.B, p"moveMask:${Binary(moveMask)} selMask:${Binary(selectMask.asUInt)} bubbleMask:${Binary(bubbleMask.asUInt)}\n") + XSDebug(print || true.B, p"selectPtr:${selectPtr} haveReady:${haveReady} " + + p"selV:${selectValid} selectReg:${selectReg} selectPtrReg:${selectPtrReg} selectIndex:${selectIndex} lastSelMask:${Hexadecimal(lastSelMask)}\n") + XSDebug(print || true.B, p"bubbleValid:${bubbleValid} haveBub:${haveBubble} bubblePtr:${bubblePtr} findBub:${findBubble} " + + p"bubbleReg:${bubbleReg} bubblePtrReg:${bubblePtrReg} bubbleIndex:${bubbleIndex} lastbubbleMask:${Hexadecimal(lastbubbleMask)}\n") + XSDebug(print || true.B, p"issueValid:${issueValid} issueFire:${issueFire} dequeue:${dequeue} deqPtr:${deqPtr}\n") + XSDebug(p" :Idx|v|r|s |count|s1:s2:s3\n") for(i <- srcQueue.indices) { - XSDebug(p"${i.U}: ${idxQueue(i)}|${validIdxQue(i)}|${readyIdxQue(i)}|${stateIdxQue(i)}|${cntIdxQue(i)}|" + + XSDebug(p"${i.U}: ${indexQueue(i)}|${validIdxQue(i)}|${readyIdxQue(i)}|${stateIdxQue(i)}|${cntIdxQue(i)}|" + List.tabulate(srcNum)(j => p"${srcIdxQue(i)(j)}").reduce(_ + ":" + _) + "\n") } } @@ -387,7 +377,7 @@ class ReservationStationData val uopMem = Module(new SyncDataModuleTemplate(new MicroOp, iqSize, iqSize, 1)) uopMem.io <> DontCare uopMem.io.wen.foreach(_ := false.B) - + // uop -- read = iqSize write = 1 // uopMem 's read ports have fixed values uopMem.io.raddr.zipWithIndex.foreach{ case(r, i) => r := i.U } @@ -402,6 +392,7 @@ class ReservationStationData val uop = WireInit(VecInit((0 until iqSize).map(i => uopRead(i.U)))) + val redirectHit = WireInit(false.B) val enq = io.ctrl.enqPtr val sel = io.ctrl.deqPtr val deq = RegEnable(sel.bits, sel.valid) @@ -435,10 +426,10 @@ class ReservationStationData io.srcRegValue(0) ) dataWrite(enqPtrReg, 0, src1Mux) - // TODO: opt this, a full map is not necesscary here - val imm32 = LookupTree( - enqUopReg.ctrl.selImm, - ImmUnion.immSelMap.map(x => x._1 -> x._2.toImm32(enqUopReg.ctrl.imm)) + // alu only need U type and I type imm + val imm32 = Mux(enqUopReg.ctrl.selImm === SelImm.IMM_U, + ImmUnion.U.toImm32(enqUopReg.ctrl.imm), + ImmUnion.I.toImm32(enqUopReg.ctrl.imm) ) val imm64 = SignExt(imm32, XLEN) val src2Mux = Mux(enqUopReg.ctrl.src2Type === SrcType.imm, @@ -476,23 +467,36 @@ class ReservationStationData (hit, RegNext(hit), ParallelMux(hitVec.map(RegNext(_)) zip io.writeBackedData)) } + // NOTE: special case that bypass(fast) when enq for bypass's uop will arrive one cylce later + val lastFastUops = Reg(Vec(wakeupCnt, Valid(new MicroOp))) + for (i <- 0 until wakeupCnt) { + lastFastUops(i) := io.broadcastedUops(i) + } + def lastBypass(src: UInt, srcType: UInt, valid: Bool = true.B) : (Bool, Bool, UInt) = { + val hitVec = lastFastUops.map(port => wbHit(port.bits, src, srcType) && port.valid && valid) + assert(RegNext(PopCount(hitVec)===0.U || PopCount(hitVec)===1.U)) + + val hit = ParallelOR(hitVec) + (hit, RegNext(hit), RegNext(ParallelMux(hitVec zip io.writeBackedData))) + } + io.ctrl.srcUpdate.map(a => a.map(_ := false.B)) for (i <- 0 until iqSize) { val srcSeq = Seq(uop(i).psrc1, uop(i).psrc2, uop(i).psrc3) val srcTypeSeq = Seq(uop(i).ctrl.src1Type, uop(i).ctrl.src2Type, uop(i).ctrl.src3Type) for (j <- 0 until srcNum) { - val (wuHit, wuData) = wakeup(srcSeq(j), srcTypeSeq(j)) - val (bpHit, bpHitReg, bpData) = bypass(srcSeq(j), srcTypeSeq(j)) - when (wuHit || bpHit) { io.ctrl.srcUpdate(i)(j) := true.B } - when (wuHit) { /* data(i)(j) := wuData */dataWrite(i.U, j, wuData) } - when (bpHitReg && !(enqPtrReg===i.U && enqEnReg)) { /* data(i)(j) := bpData */dataWrite(i.U, j, bpData) } + val (wakeupHit, wakeupData) = wakeup(srcSeq(j), srcTypeSeq(j)) + val (bypassHit, bypassHitReg, bypassData) = bypass(srcSeq(j), srcTypeSeq(j)) + when (wakeupHit || bypassHit) { io.ctrl.srcUpdate(i)(j) := true.B } + when (wakeupHit) { dataWrite(i.U, j, wakeupData) } + when (bypassHitReg && !(enqPtrReg===i.U && enqEnReg)) { dataWrite(i.U, j, bypassData) } // NOTE: the hit is from data's info, so there is an erro that: // when enq, hit use last instr's info not the enq info. // it will be long latency to add correct here, so add it to ctrl or somewhere else // enq bp is done at below - XSDebug(wuHit, p"WUHit: (${i.U})(${j.U}) Data:0x${Hexadecimal(wuData)}\n") - XSDebug(bpHit, p"BPHit: (${i.U})(${j.U})\n") - XSDebug(bpHitReg, p"BPHitData: (${i.U})(${j.U}) Data:0x${Hexadecimal(bpData)}\n") + XSDebug(wakeupHit, p"wakeupHit: (${i.U})(${j.U}) Data:0x${Hexadecimal(wakeupData)}\n") + XSDebug(bypassHit, p"bypassHit: (${i.U})(${j.U})\n") + XSDebug(bypassHitReg, p"bypassHitData: (${i.U})(${j.U}) Data:0x${Hexadecimal(bypassData)}\n") } } @@ -500,29 +504,41 @@ class ReservationStationData val exuInput = io.deq.bits exuInput := DontCare exuInput.uop := uop(deq) + exuInput.uop.cf.exceptionVec := 0.U.asTypeOf(ExceptionVec()) val regValues = List.tabulate(srcNum)(i => dataRead(Mux(sel.valid, sel.bits, deq), i)) XSDebug(io.deq.fire(), p"[regValues] " + List.tabulate(srcNum)(idx => p"reg$idx: ${Hexadecimal(regValues(idx))}").reduce((p1, p2) => p1 + " " + p2) + "\n") exuInput.src1 := regValues(0) if (srcNum > 1) exuInput.src2 := regValues(1) if (srcNum > 2) exuInput.src3 := regValues(2) - io.deq.valid := RegNext(sel.valid) + io.deq.valid := RegNext(sel.valid && ~redirectHit) if (nonBlocked) { assert(RegNext(io.deq.ready), s"${name} if fu wanna fast wakeup, it should not block")} // to ctrl val srcSeq = Seq(enqUop.psrc1, enqUop.psrc2, enqUop.psrc3) val srcTypeSeq = Seq(enqUop.ctrl.src1Type, enqUop.ctrl.src2Type, enqUop.ctrl.src3Type) - io.ctrl.srcUpdate(IssQueSize).zipWithIndex.map{ case (h, i) => - val (bpHit, bpHitReg, bpData)= bypass(srcSeq(i), srcTypeSeq(i), enqCtrl.valid) - when (bpHitReg) { /* data(enqPtrReg)(i) := bpData */dataWrite(enqPtrReg, i, bpData) } - h := bpHit + io.ctrl.srcUpdate(IssQueSize).zipWithIndex.map{ case (h, i) => // h: port, i: 0~srcNum-1 + val (bypassHit, bypassHitReg, bypassData) = bypass(srcSeq(i), srcTypeSeq(i), enqCtrl.valid) + val (wakeupHit, wakeupData) = wakeup(srcSeq(i), srcTypeSeq(i), enqCtrl.valid) + val (lastBypassHit, lastBypassHitReg, lastBypassDataReg) = lastBypass(srcSeq(i), srcTypeSeq(i), enqCtrl.valid) + val wakeupHitReg = RegNext(wakeupHit) + val wakeupDataReg = RegNext(wakeupData) + when (bypassHitReg) { dataWrite(enqPtrReg, i, bypassData) } + when (wakeupHitReg) { dataWrite(enqPtrReg, i, wakeupDataReg) } + when (lastBypassHitReg) { dataWrite(enqPtrReg, i, lastBypassDataReg) } + h := bypassHit || wakeupHit || lastBypassHit // NOTE: enq bp is done here - XSDebug(bpHit, p"EnqBPHit: (${i.U})\n") - XSDebug(bpHitReg, p"EnqBPHitData: (${i.U}) data:${Hexadecimal(bpData)}\n") + XSDebug(bypassHit, p"EnqbypassHit: (${i.U})\n") + XSDebug(lastBypassHit, p"EnqLbypassHit: (${i.U})\n") + XSDebug(wakeupHit, p"EnqwakeupHit: (${Binary(io.ctrl.srcUpdate(iqSize).asUInt())})\n") + XSDebug(bypassHitReg, p"EnqbypassHitData: (${i.U}) data:${Hexadecimal(bypassData)}\n") + XSDebug(lastBypassHitReg, p"EnqLbypassHitData: (${i.U}) data:${Hexadecimal(lastBypassDataReg)}\n") + XSDebug(wakeupHitReg, p"EnqwakeupHitData: (${i.U}) data:${Hexadecimal(wakeupDataReg)}\n") } if (nonBlocked) { io.ctrl.fuReady := true.B } else { io.ctrl.fuReady := io.deq.ready } - io.ctrl.redVec := VecInit(uop.map(_.roqIdx.needFlush(io.redirect))).asUInt + io.ctrl.redirectVec := uop.map(_.roqIdx.needFlush(io.redirect)) + redirectHit := io.ctrl.redirectVec(sel.bits) io.ctrl.feedback := DontCare if (feedback) { @@ -535,12 +551,20 @@ class ReservationStationData // bypass send io.selectedUop <> DontCare if (fastWakeup) { - val bpQueue = Module(new BypassQueue(fixedDelay)) - bpQueue.io.in.valid := sel.valid // FIXME: error when function is blocked => fu should not be blocked - bpQueue.io.in.bits := uop(sel.bits) - bpQueue.io.redirect := io.redirect - io.selectedUop.valid := bpQueue.io.out.valid - io.selectedUop.bits := bpQueue.io.out.bits + if (fixedDelay == 0) { + io.selectedUop.valid := sel.valid + io.selectedUop.bits := uop(sel.bits) + io.selectedUop.bits.cf.exceptionVec := 0.U.asTypeOf(ExceptionVec()) + } else { + val bpQueue = Module(new BypassQueue(fixedDelay)) + bpQueue.io.in.valid := sel.valid // FIXME: error when function is blocked => fu should not be blocked + bpQueue.io.in.bits := uop(sel.bits) + bpQueue.io.redirect := io.redirect + io.selectedUop.valid := bpQueue.io.out.valid + io.selectedUop.bits := bpQueue.io.out.bits + io.selectedUop.bits.cf.exceptionVec := 0.U.asTypeOf(ExceptionVec()) + } + XSDebug(io.selectedUop.valid, p"SelUop: pc:0x${Hexadecimal(io.selectedUop.bits.cf.pc)}" + p" roqIdx:${io.selectedUop.bits.roqIdx} pdest:${io.selectedUop.bits.pdest} " + @@ -549,7 +573,7 @@ class ReservationStationData // log - XSDebug(io.ctrl.redVec.orR, p"Red: ${Binary(io.ctrl.redVec)}\n") + XSDebug(Cat(io.ctrl.redirectVec).orR, p"Red: ${io.ctrl.redirectVec}\n") XSDebug(io.feedback.valid && feedback.B, p"feedback: roqIdx:${io.feedback.bits.roqIdx} hit:${io.feedback.bits.hit}\n") XSDebug(true.B, p"out(${io.deq.valid} ${io.deq.ready})\n") XSDebug(io.deq.valid, p"Deq(${io.deq.valid} ${io.deq.ready}): deqPtr:${deq} pc:${Hexadecimal(io.deq.bits.uop.cf.pc)}" + diff --git a/src/main/scala/xiangshan/backend/package.scala b/src/main/scala/xiangshan/backend/package.scala index e61d932ee87faafd70396e192e7bc971ee741657..3f05b446a0ff01507f7f153cdf012ed5af8a9185 100644 --- a/src/main/scala/xiangshan/backend/package.scala +++ b/src/main/scala/xiangshan/backend/package.scala @@ -17,12 +17,13 @@ package object backend { // jump object JumpOpType { - def jal = "b11_000".U - def jalr = "b11_010".U + def jal = "b00".U + def jalr = "b01".U + def auipc = "b10".U // def call = "b11_011".U // def ret = "b11_100".U - def jumpOpIsJal(op: UInt) = !op(1) - def jumpOpisJalr(op: UInt) = op(1) + def jumpOpisJalr(op: UInt) = op(0) + def jumpOpisAuipc(op: UInt) = op(1) } object FenceOpType { diff --git a/src/main/scala/xiangshan/backend/regfile/Regfile.scala b/src/main/scala/xiangshan/backend/regfile/Regfile.scala index 033523f4e158fc53755ee81600319dcbaa5831b9..86f5e2254ad5bb13e1da98e8baff83b2fd611a41 100644 --- a/src/main/scala/xiangshan/backend/regfile/Regfile.scala +++ b/src/main/scala/xiangshan/backend/regfile/Regfile.scala @@ -35,9 +35,8 @@ class Regfile if (!useBlackBox) { val mem = Mem(NRPhyRegs, UInt(len.W)) for (r <- io.readPorts) { - val raddr_reg = RegNext(r.addr) - val rdata = if (hasZero) Mux(raddr_reg === 0.U, 0.U, mem(raddr_reg)) else mem(raddr_reg) - r.data := rdata + val rdata = if (hasZero) Mux(r.addr === 0.U, 0.U, mem(r.addr)) else mem(r.addr) + r.data := RegNext(rdata) } for (w <- io.writePorts) { when(w.wen) { diff --git a/src/main/scala/xiangshan/backend/rename/BusyTable.scala b/src/main/scala/xiangshan/backend/rename/BusyTable.scala index 4ad32d8ebe912dbaecca7c3e38d2ac66e5c420b7..0b85a8a3b87d0419ddc957b6289ad7d3699d2fa2 100644 --- a/src/main/scala/xiangshan/backend/rename/BusyTable.scala +++ b/src/main/scala/xiangshan/backend/rename/BusyTable.scala @@ -5,6 +5,11 @@ import chisel3.util._ import xiangshan._ import utils.{ParallelOR, XSDebug} +class BusyTableReadIO extends XSBundle { + val req = Input(UInt(PhyRegIdxWidth.W)) + val resp = Output(Bool()) +} + class BusyTable(numReadPorts: Int, numWritePorts: Int) extends XSModule { val io = IO(new Bundle() { val flush = Input(Bool()) @@ -13,8 +18,7 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int) extends XSModule { // set preg state to ready (write back regfile + roq walk) val wbPregs = Vec(numWritePorts, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) // read preg state - val rfReadAddr = Vec(numReadPorts, Input(UInt(PhyRegIdxWidth.W))) - val pregRdy = Vec(numReadPorts, Output(Bool())) + val read = Vec(numReadPorts, new BusyTableReadIO) }) val table = RegInit(0.U(NRPhyRegs.W)) @@ -29,27 +33,10 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int) extends XSModule { val tableAfterWb = table & (~wbMask).asUInt val tableAfterAlloc = tableAfterWb | allocMask - for((raddr, rdy) <- io.rfReadAddr.zip(io.pregRdy)){ - rdy := !tableAfterWb(raddr) - } + io.read.map(r => r.resp := !table(r.req)) table := tableAfterAlloc -// for((alloc, i) <- io.allocPregs.zipWithIndex){ -// when(alloc.valid){ -// table(alloc.bits) := true.B -// } -// XSDebug(alloc.valid, "Allocate %d\n", alloc.bits) -// } - - -// for((wb, i) <- io.wbPregs.zipWithIndex){ -// when(wb.valid){ -// table(wb.bits) := false.B -// } -// XSDebug(wb.valid, "writeback %d\n", wb.bits) -// } - when(io.flush){ table := 0.U(NRPhyRegs.W) } diff --git a/src/main/scala/xiangshan/backend/roq/Roq.scala b/src/main/scala/xiangshan/backend/roq/Roq.scala index df859f46ae817841812e826f56d75bc78ee27419..f758fc3878ae3582aa68b41451f274d75cb4eae0 100644 --- a/src/main/scala/xiangshan/backend/roq/Roq.scala +++ b/src/main/scala/xiangshan/backend/roq/Roq.scala @@ -54,12 +54,9 @@ class RoqEnqIO extends XSBundle { class RoqDispatchData extends RoqCommitInfo { val crossPageIPFFix = Bool() - val exceptionVec = Vec(16, Bool()) } class RoqWbData extends XSBundle { - // mostly for exceptions - val exceptionVec = Vec(16, Bool()) val fflags = UInt(5.W) val flushPipe = Bool() } @@ -70,7 +67,7 @@ class RoqDeqPtrWrapper extends XSModule with HasCircularQueuePtrHelper { val state = Input(UInt(2.W)) val deq_v = Vec(CommitWidth, Input(Bool())) val deq_w = Vec(CommitWidth, Input(Bool())) - val deq_exceptionVec = Vec(CommitWidth, Input(UInt(16.W))) + val deq_exceptionVec = Vec(CommitWidth, Input(ExceptionVec())) val deq_flushPipe = Vec(CommitWidth, Input(Bool())) // for flush: when exception occurs, reset deqPtrs to range(0, CommitWidth) val intrBitSetReg = Input(Bool()) @@ -83,19 +80,21 @@ class RoqDeqPtrWrapper extends XSModule with HasCircularQueuePtrHelper { val deqPtrVec = RegInit(VecInit((0 until CommitWidth).map(_.U.asTypeOf(new RoqPtr)))) + val possibleException = VecInit(io.deq_exceptionVec.map(selectAll(_, false))) // for exceptions (flushPipe included) and interrupts: // only consider the first instruction val intrEnable = io.intrBitSetReg && !io.hasNoSpecExec && !CommitType.isLoadStore(io.commitType) - val exceptionEnable = io.deq_w(0) && (io.deq_exceptionVec(0).orR || io.deq_flushPipe(0)) + val exceptionEnable = io.deq_w(0) && (possibleException(0).asUInt.orR || io.deq_flushPipe(0)) val redirectOutValid = io.state === 0.U && io.deq_v(0) && (intrEnable || exceptionEnable) // for normal commits: only to consider when there're no exceptions // we don't need to consider whether the first instruction has exceptions since it wil trigger exceptions. - val commitBlocked = VecInit((0 until CommitWidth).map(i => if (i == 0) false.B else io.deq_exceptionVec(i).orR || io.deq_flushPipe(i))) - val canCommit = VecInit((0 until CommitWidth).map(i => io.deq_v(i) && io.deq_w(i) && !commitBlocked(i))) + val commitBlocked = VecInit((0 until CommitWidth).map(i => if (i == 0) false.B else possibleException(i).asUInt.orR || io.deq_flushPipe(i))) + val canCommit = VecInit((0 until CommitWidth).map(i => io.deq_v(i) && io.deq_w(i) /*&& !commitBlocked(i)*/)) val normalCommitCnt = PriorityEncoder(canCommit.map(c => !c) :+ true.B) - // when io.intrBitSetReg, only one instruction is allowed to commit - val commitCnt = Mux(io.intrBitSetReg, io.deq_v(0) && io.deq_w(0), normalCommitCnt) + // when io.intrBitSetReg or there're possible exceptions in these instructions, only one instruction is allowed to commit + val allowOnlyOne = VecInit(commitBlocked.drop(1)).asUInt.orR || io.intrBitSetReg + val commitCnt = Mux(allowOnlyOne, io.deq_v(0) && io.deq_w(0), normalCommitCnt) val resetDeqPtrVec = VecInit((0 until CommitWidth).map(_.U.asTypeOf(new RoqPtr))) val commitDeqPtrVec = VecInit(deqPtrVec.map(_ + commitCnt)) @@ -118,7 +117,7 @@ class RoqEnqPtrWrapper extends XSModule with HasCircularQueuePtrHelper { val state = Input(UInt(2.W)) val deq_v = Input(Bool()) val deq_w = Input(Bool()) - val deq_exceptionVec = Input(UInt(16.W)) + val deq_exceptionVec = Input(ExceptionVec()) val deq_flushPipe = Input(Bool()) val intrBitSetReg = Input(Bool()) val hasNoSpecExec = Input(Bool()) @@ -137,7 +136,7 @@ class RoqEnqPtrWrapper extends XSModule with HasCircularQueuePtrHelper { // for exceptions (flushPipe included) and interrupts: // only consider the first instruction val intrEnable = io.intrBitSetReg && !io.hasNoSpecExec && !CommitType.isLoadStore(io.commitType) - val exceptionEnable = io.deq_w && (io.deq_exceptionVec.orR || io.deq_flushPipe) + val exceptionEnable = io.deq_w && (selectAll(io.deq_exceptionVec, false).asUInt.orR || io.deq_flushPipe) val redirectOutValid = io.state === 0.U && io.deq_v && (intrEnable || exceptionEnable) // enqueue @@ -264,28 +263,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { val writebackData = Module(new SyncDataModuleTemplate(new RoqWbData, RoqSize, CommitWidth, numWbPorts)) val writebackDataRead = writebackData.io.rdata - def mergeExceptionVec(dpData: RoqDispatchData, wbData: RoqWbData) = { - // these exceptions can be determined before dispatch. - // by default, let all exceptions be determined by dispatch. - // mergeVec(instrAddrMisaligned) := dpData(instrAddrMisaligned) - // mergeVec(instrAccessFault) := dpData(instrAccessFault) - // mergeVec(instrPageFault) := dpData(instrPageFault) - val mergeVec = WireInit(dpData.exceptionVec) - // these exceptions are determined in execution units - mergeVec(illegalInstr) := wbData.exceptionVec(illegalInstr) - mergeVec(breakPoint) := wbData.exceptionVec(breakPoint) - mergeVec(loadAddrMisaligned) := wbData.exceptionVec(loadAddrMisaligned) - mergeVec(loadAccessFault) := wbData.exceptionVec(loadAccessFault) - mergeVec(storeAddrMisaligned) := wbData.exceptionVec(storeAddrMisaligned) - mergeVec(storeAccessFault) := wbData.exceptionVec(storeAccessFault) - mergeVec(ecallU) := wbData.exceptionVec(ecallU) - mergeVec(ecallS) := wbData.exceptionVec(ecallS) - mergeVec(ecallM) := wbData.exceptionVec(ecallM) - mergeVec(loadPageFault) := wbData.exceptionVec(loadPageFault) - mergeVec(storePageFault) := wbData.exceptionVec(storePageFault) - // returns the merged exception vector - mergeVec - } + val exceptionDataRead = Wire(Vec(CommitWidth, ExceptionVec())) io.roqDeqPtr := deqPtr @@ -303,8 +281,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { when (io.commits.valid.asUInt.orR && state =/= s_extrawalk) { hasNoSpecExec:= false.B } io.enq.canAccept := allowEnqueue && !hasBlockBackward - io.enq.isEmpty := isEmpty - io.enq.resp := enqPtrVec + io.enq.resp := enqPtrVec val canEnqueue = VecInit(io.enq.req.map(_.valid && io.enq.canAccept)) for (i <- 0 until RenameWidth) { // we don't check whether io.redirect is valid here since redirect has higher priority @@ -319,9 +296,10 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { } } } + val dispatchNum = Mux(io.enq.canAccept, PopCount(Cat(io.enq.req.map(_.valid))), 0.U) + io.enq.isEmpty := RegNext(isEmpty && dispatchNum === 0.U) // debug info for enqueue (dispatch) - val dispatchNum = Mux(io.enq.canAccept, PopCount(Cat(io.enq.req.map(_.valid))), 0.U) XSDebug(p"(ready, valid): ${io.enq.canAccept}, ${Binary(Cat(io.enq.req.map(_.valid)))}\n") XSInfo(dispatchNum =/= 0.U, p"dispatched $dispatchNum insts\n") @@ -359,7 +337,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { val deqWritebackData = writebackDataRead(0) val debug_deqUop = debug_microOp(deqPtr.value) - val deqExceptionVec = mergeExceptionVec(deqDispatchData, deqWritebackData) + val deqExceptionVec = exceptionDataRead(0) // For MMIO instructions, they should not trigger interrupts since they may be sent to lower level before it writes back. // However, we cannot determine whether a load/store instruction is MMIO. // Thus, we don't allow load/store instructions to trigger an interrupt. @@ -369,9 +347,9 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { val isFlushPipe = writebacked(deqPtr.value) && deqWritebackData.flushPipe io.redirectOut := DontCare io.redirectOut.valid := (state === s_idle) && valid(deqPtr.value) && (intrEnable || exceptionEnable || isFlushPipe) - io.redirectOut.bits.level := Mux(isFlushPipe, RedirectLevel.flushAll, RedirectLevel.exception) + io.redirectOut.bits.level := Mux(intrEnable || exceptionEnable, RedirectLevel.exception, RedirectLevel.flushAll) io.redirectOut.bits.interrupt := intrEnable - io.redirectOut.bits.target := Mux(isFlushPipe, deqDispatchData.pc + 4.U, io.csr.trapTarget) + io.redirectOut.bits.target := Mux(intrEnable || exceptionEnable, io.csr.trapTarget, deqDispatchData.pc + 4.U) io.exception := debug_deqUop io.exception.ctrl.commitType := deqDispatchData.commitType @@ -421,13 +399,15 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { io.commits.isWalk := state =/= s_idle val commit_v = Mux(state === s_idle, VecInit(deqPtrVec.map(ptr => valid(ptr.value))), VecInit(walkPtrVec.map(ptr => valid(ptr.value)))) val commit_w = VecInit(deqPtrVec.map(ptr => writebacked(ptr.value))) - val commit_exception = dispatchDataRead.zip(writebackDataRead).map{ case (d, w) => mergeExceptionVec(d, w).asUInt.orR } - val commit_block = VecInit((0 until CommitWidth).map(i => !commit_w(i) || commit_exception(i) || writebackDataRead(i).flushPipe)) + val commit_exception = exceptionDataRead.zip(writebackDataRead.map(_.flushPipe)).map{ case (e, f) => e.asUInt.orR || f } + val commit_block = VecInit((0 until CommitWidth).map(i => !commit_w(i))) + val allowOnlyOneCommit = VecInit(commit_exception.drop(1)).asUInt.orR || intrBitSetReg + // for instructions that may block others, we don't allow them to commit for (i <- 0 until CommitWidth) { // defaults: state === s_idle and instructions commit // when intrBitSetReg, allow only one instruction to commit at each clock cycle - val isBlocked = if (i != 0) Cat(commit_block.take(i)).orR || intrBitSetReg else intrEnable - io.commits.valid(i) := commit_v(i) && commit_w(i) && !isBlocked && !commit_exception(i) + val isBlocked = if (i != 0) Cat(commit_block.take(i)).orR || allowOnlyOneCommit else intrEnable || commit_exception(0) + io.commits.valid(i) := commit_v(i) && commit_w(i) && !isBlocked io.commits.info(i) := dispatchDataRead(i) when (state === s_walk) { @@ -495,7 +475,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { deqPtrGenModule.io.state := state deqPtrGenModule.io.deq_v := commit_v deqPtrGenModule.io.deq_w := commit_w - deqPtrGenModule.io.deq_exceptionVec := VecInit(dispatchDataRead.zip(writebackDataRead).map{ case (d, w) => mergeExceptionVec(d, w).asUInt }) + deqPtrGenModule.io.deq_exceptionVec := exceptionDataRead deqPtrGenModule.io.deq_flushPipe := writebackDataRead.map(_.flushPipe) deqPtrGenModule.io.intrBitSetReg := intrBitSetReg deqPtrGenModule.io.hasNoSpecExec := hasNoSpecExec @@ -507,7 +487,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { enqPtrGenModule.io.state := state enqPtrGenModule.io.deq_v := commit_v(0) enqPtrGenModule.io.deq_w := commit_w(0) - enqPtrGenModule.io.deq_exceptionVec := deqExceptionVec.asUInt + enqPtrGenModule.io.deq_exceptionVec := deqExceptionVec enqPtrGenModule.io.deq_flushPipe := writebackDataRead(0).flushPipe enqPtrGenModule.io.intrBitSetReg := intrBitSetReg enqPtrGenModule.io.hasNoSpecExec := hasNoSpecExec @@ -600,7 +580,7 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { // enqueue logic set 6 writebacked to false for (i <- 0 until RenameWidth) { when (canEnqueue(i)) { - writebacked(enqPtrVec(i).value) := false.B + writebacked(enqPtrVec(i).value) := selectFrontend(io.enq.req(i).bits.cf.exceptionVec, false).asUInt.orR } } // writeback logic set numWbPorts writebacked to true @@ -641,19 +621,59 @@ class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { wdata.sqIdx := req.sqIdx wdata.pc := req.cf.pc wdata.crossPageIPFFix := req.cf.crossPageIPFFix - wdata.exceptionVec := req.cf.exceptionVec + // wdata.exceptionVec := req.cf.exceptionVec } dispatchData.io.raddr := commitReadAddr_next writebackData.io.wen := io.exeWbResults.map(_.valid) writebackData.io.waddr := io.exeWbResults.map(_.bits.uop.roqIdx.value) writebackData.io.wdata.zip(io.exeWbResults.map(_.bits)).map{ case (wdata, wb) => - wdata.exceptionVec := wb.uop.cf.exceptionVec wdata.fflags := wb.fflags wdata.flushPipe := wb.uop.ctrl.flushPipe } writebackData.io.raddr := commitReadAddr_next + for (i <- 0 until 16) { + val exceptionData = Module(new SyncDataModuleTemplate(Bool(), RoqSize, CommitWidth, RenameWidth + writebackCount(i))) + var wPortIdx = 0 + for (j <- 0 until RenameWidth) { + exceptionData.io.wen (wPortIdx) := canEnqueue(j) + exceptionData.io.waddr(wPortIdx) := enqPtrVec(j).value + exceptionData.io.wdata(wPortIdx) := (if (allPossibleSet.contains(i)) io.enq.req(j).bits.cf.exceptionVec(i) else false.B) + wPortIdx = wPortIdx + 1 + } + if (csrWbCount(i) > 0) { + exceptionData.io.wen (wPortIdx) := io.exeWbResults(6).valid + exceptionData.io.waddr(wPortIdx) := io.exeWbResults(6).bits.uop.roqIdx.value + exceptionData.io.wdata(wPortIdx) := io.exeWbResults(6).bits.uop.cf.exceptionVec(i) + wPortIdx = wPortIdx + 1 + } + if (atomicsWbCount(i) > 0) { + exceptionData.io.wen (wPortIdx) := io.exeWbResults(4).valid + exceptionData.io.waddr(wPortIdx) := io.exeWbResults(4).bits.uop.roqIdx.value + exceptionData.io.wdata(wPortIdx) := io.exeWbResults(4).bits.uop.cf.exceptionVec(i) + wPortIdx = wPortIdx + 1 + } + if (loadWbCount(i) > 0) { + exceptionData.io.wen (wPortIdx) := io.exeWbResults(5).valid + exceptionData.io.waddr(wPortIdx) := io.exeWbResults(5).bits.uop.roqIdx.value + exceptionData.io.wdata(wPortIdx) := io.exeWbResults(5).bits.uop.cf.exceptionVec(i) + wPortIdx = wPortIdx + 1 + } + if (storeWbCount(i) > 0) { + exceptionData.io.wen (wPortIdx) := io.exeWbResults(16).valid + exceptionData.io.waddr(wPortIdx) := io.exeWbResults(16).bits.uop.roqIdx.value + exceptionData.io.wdata(wPortIdx) := io.exeWbResults(16).bits.uop.cf.exceptionVec(i) + wPortIdx = wPortIdx + 1 + exceptionData.io.wen (wPortIdx) := io.exeWbResults(17).valid + exceptionData.io.waddr(wPortIdx) := io.exeWbResults(17).bits.uop.roqIdx.value + exceptionData.io.wdata(wPortIdx) := io.exeWbResults(17).bits.uop.cf.exceptionVec(i) + wPortIdx = wPortIdx + 1 + } + + exceptionData.io.raddr := VecInit(deqPtrVec_next.map(_.value)) + exceptionDataRead.zip(exceptionData.io.rdata).map{ case (d, r) => d(i) := r } + } /** * debug info diff --git a/src/main/scala/xiangshan/cache/InstrUncache.scala b/src/main/scala/xiangshan/cache/InstrUncache.scala new file mode 100644 index 0000000000000000000000000000000000000000..9e7779bfbc46da6ecc2cee5e1c1b443d190a7df5 --- /dev/null +++ b/src/main/scala/xiangshan/cache/InstrUncache.scala @@ -0,0 +1,206 @@ +package xiangshan.cache + +import chisel3._ +import chisel3.util._ +import utils._ +import chipsalliance.rocketchip.config.Parameters +import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes} +import freechips.rocketchip.tilelink.{TLArbiter, TLBundleA, TLBundleD, TLClientNode, TLEdgeOut, TLMasterParameters, TLMasterPortParameters} +import xiangshan._ +import xiangshan.frontend._ + +class InsUncacheReq extends ICacheBundle +{ + val addr = UInt(PAddrBits.W) + val id = UInt(3.W) +} + +class InsUncacheResp extends ICacheBundle +{ + val data = UInt((mmioBeats * mmioBusWidth).W) + val id = UInt(3.W) +} + +// One miss entry deals with one mmio request +class InstrMMIOEntry(edge: TLEdgeOut) extends XSModule with HasICacheParameters +{ + val io = IO(new Bundle { + val id = Input(UInt(log2Up(cacheParams.nMMIOs).W)) + // client requests + val req = Flipped(DecoupledIO(new InsUncacheReq )) + val resp = DecoupledIO(new InsUncacheResp) + + val mmio_acquire = DecoupledIO(new TLBundleA(edge.bundle)) + val mmio_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle))) + + val flush = Input(Bool()) + }) + + + val s_invalid :: s_refill_req :: s_refill_resp :: s_send_resp :: Nil = Enum(4) + + val state = RegInit(s_invalid) + + val req = Reg(new InsUncacheReq ) + val respDataReg = Reg(Vec(mmioBeats,UInt(mmioBusWidth.W))) + val beatCounter = Counter(mmioBeats) + + + // assign default values to output signals + io.req.ready := false.B + io.resp.valid := false.B + io.resp.bits := DontCare + + io.mmio_acquire.valid := false.B + io.mmio_acquire.bits := DontCare + + io.mmio_grant.ready := false.B + + val needFlush = RegInit(false.B) + + XSDebug("[ICache MMIO]entry: %d state: %d needFlush%d flush:%d\n", io.id, state, needFlush,io.flush) + XSDebug("[ICache MMIO]req.addr: %x req.id \n", req.addr) + XSDebug("[ICache MMIO]mmio_acquire:(v:%d r:%d) mmio_grant:(v:%d r:%d)\n", io.mmio_acquire.valid, io.mmio_acquire.ready, io.mmio_grant.valid, io.mmio_grant.ready) + XSDebug("[ICache MMIO]mmio_acquire:(v:%d r:%d) mmio_grant:(v:%d r:%d)\n", io.mmio_acquire.valid, io.mmio_acquire.ready, io.mmio_grant.valid, io.mmio_grant.ready) + + XSDebug("[ICache MMIO]respReg: %x\n",respDataReg.asUInt) + + + when(io.flush && (state =/= s_invalid) && (state =/= s_send_resp)){ needFlush := true.B } + .elsewhen((state=== s_send_resp) && needFlush){ needFlush := false.B } + + // -------------------------------------------- + // s_invalid: receive requests + when (state === s_invalid) { + io.req.ready := true.B + beatCounter.value := 0.U + + when (io.req.fire()) { + req := io.req.bits + state := s_refill_req + } + } + + + when (state === s_refill_req) { + io.mmio_acquire.valid := true.B + io.mmio_acquire.bits := edge.Get( + fromSource = io.id, + toAddress = req.addr + (beatCounter.value << log2Ceil(mmioBusBytes).U), + lgSize = log2Ceil(mmioBusBytes).U + )._2 + + when (io.mmio_acquire.fire()) { + state := s_refill_resp + } + } + + val (_, _, refill_done, _) = edge.addr_inc(io.mmio_grant) + + when (state === s_refill_resp) { + io.mmio_grant.ready := true.B + + when (io.mmio_grant.fire()) { + respDataReg(beatCounter.value) := io.mmio_grant.bits.data + state := Mux(needFlush || io.flush, s_invalid,Mux(beatCounter.value === (mmioBeats - 1).U,s_send_resp,s_refill_req)) + beatCounter.inc() + } + } + + // -------------------------------------------- + when (state === s_send_resp) { + io.resp.valid := true.B + io.resp.bits.data := respDataReg.asUInt + io.resp.bits.id := req.id + // meta data should go with the response + when (io.resp.fire() || needFlush) { + state := s_invalid + beatCounter.value := 0.U + } + } +} + +class icacheUncacheIO extends DCacheBundle { + val req = Flipped(DecoupledIO(new InsUncacheReq )) + val resp = DecoupledIO(new InsUncacheResp) + val flush = Input(Bool()) + +} + +// convert DCacheIO to TileLink +// for Now, we only deal with TL-UL + +class InstrUncache()(implicit p: Parameters) extends LazyModule with HasICacheParameters { + + val clientParameters = TLMasterPortParameters.v1( + clients = Seq(TLMasterParameters.v1( + "InstrUncache", + sourceId = IdRange(0, cacheParams.nMMIOs) + )) + ) + val clientNode = TLClientNode(Seq(clientParameters)) + + lazy val module = new icacheUncacheImp(this) + +} + +class icacheUncacheImp(outer: InstrUncache) + extends LazyModuleImp(outer) + with HasICacheParameters + with HasXSLog + with HasTLDump +{ + val io = IO(new icacheUncacheIO) + + val (bus, edge) = outer.clientNode.out.head + require(bus.d.bits.data.getWidth == wordBits, "Uncache: tilelink width does not match") + + val resp_arb = Module(new Arbiter(new InsUncacheResp, cacheParams.nMMIOs)) + + val req = io.req + val resp = io.resp + val mmio_acquire = bus.a + val mmio_grant = bus.d + + val entry_alloc_idx = Wire(UInt()) + val req_ready = WireInit(false.B) + + // assign default values to output signals + bus.b.ready := false.B + bus.c.valid := false.B + bus.c.bits := DontCare + bus.d.ready := false.B + bus.e.valid := false.B + bus.e.bits := DontCare + + val entries = (0 until cacheParams.nMMIOs) map { i => + val entry = Module(new InstrMMIOEntry(edge)) + + entry.io.id := i.U(log2Up(cacheParams.nMMIOs).W) + entry.io.flush := io.flush + + // entry req + entry.io.req.valid := (i.U === entry_alloc_idx) && req.valid + entry.io.req.bits := req.bits + when (i.U === entry_alloc_idx) { + req_ready := entry.io.req.ready + } + + // entry resp + resp_arb.io.in(i) <> entry.io.resp + + entry.io.mmio_grant.valid := false.B + entry.io.mmio_grant.bits := DontCare + when (mmio_grant.bits.source === i.U) { + entry.io.mmio_grant <> mmio_grant + } + entry + } + + entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.req.ready)) + + req.ready := req_ready + resp <> resp_arb.io.out + TLArbiter.lowestFromSeq(edge, mmio_acquire, entries.map(_.io.mmio_acquire)) + +} diff --git a/src/main/scala/xiangshan/cache/L1plusCache.scala b/src/main/scala/xiangshan/cache/L1plusCache.scala index a4083297b72c9ca5e95b0ec77a77891c4b16478d..e503155e83fe07d0a29a903d90a5afeb89511273 100644 --- a/src/main/scala/xiangshan/cache/L1plusCache.scala +++ b/src/main/scala/xiangshan/cache/L1plusCache.scala @@ -130,23 +130,35 @@ class L1plusCacheDataArray extends L1plusCacheModule { io.read.ready := !rwhazard for (w <- 0 until nWays) { + val array = Module(new SRAMTemplate(Bits((blockRows * encRowBits).W), set=nSets, way=1, + shouldReset=false, holdRead=false, singlePort=singlePort)) + // data write + array.io.w.req.valid := io.write.bits.way_en(w) && io.write.valid + array.io.w.req.bits.apply( + setIdx=waddr, + data=io.write.bits.data.asUInt, + waymask=1.U) + + // data read + array.io.r.req.valid := io.read.bits.way_en(w) && io.read.valid + array.io.r.req.bits.apply(setIdx=raddr) for (r <- 0 until blockRows) { - val array = Module(new SRAMTemplate(Bits(encRowBits.W), set=nSets, way=1, - shouldReset=false, holdRead=false, singlePort=singlePort)) - // data write - array.io.w.req.valid := io.write.bits.way_en(w) && io.write.bits.wmask(r).asBool && io.write.valid - array.io.w.req.bits.apply( - setIdx=waddr, - data=io.write.bits.data(r), - waymask=1.U) - - // data read - array.io.r.req.valid := io.read.bits.way_en(w) && io.read.bits.rmask(r) && io.read.valid - array.io.r.req.bits.apply(setIdx=raddr) - io.resp(w)(r) := RegNext(array.io.r.resp.data(0)) + io.resp(w)(r) := RegNext(array.io.r.resp.data(0)((r + 1) * encRowBits - 1, r * encRowBits)) } } + // since we use a RAM of block width + // we must do full read and write + when (io.write.valid) { + assert (io.write.bits.wmask.andR) + } + + // since we use a RAM of block width + // we must do full read and write + when (io.read.valid) { + assert (io.read.bits.rmask.andR) + } + // debug output def dumpRead() = { when (io.read.valid) { @@ -230,7 +242,7 @@ class L1plusCacheMetadataArray extends L1plusCacheModule { cacheParams.tagCode.decode(rdata).corrected) for (i <- 0 until nWays) { - io.resp(i).valid := RegNext(valid_array(io.read.bits.idx)(i)) + io.resp(i).valid := valid_array(RegNext(io.read.bits.idx))(i) io.resp(i).tag := rtags(i) } diff --git a/src/main/scala/xiangshan/cache/dcache.scala b/src/main/scala/xiangshan/cache/dcache.scala index 9c9bef19cc80bbcfd4b235cf61c44d050ca2e9ba..6b70b7eae7800abad12651b0950494986861a9c1 100644 --- a/src/main/scala/xiangshan/cache/dcache.scala +++ b/src/main/scala/xiangshan/cache/dcache.scala @@ -197,15 +197,22 @@ class DuplicatedDataArray extends AbstractDataArray io.resp(j)(w)(r) := Cat((0 until rowWords).reverse map (k => resp(k))) for (k <- 0 until rowWords) { - val array = Module(new SRAMTemplate(Bits(encWordBits.W), set=nSets, way=1, - shouldReset=false, holdRead=false, singlePort=singlePort)) + val array = Module(new SRAMTemplate( + Bits(encWordBits.W), + set=nSets, + way=1, + shouldReset=false, + holdRead=false, + singlePort=singlePort + )) // data write val wen = io.write.valid && io.write.bits.way_en(w) && io.write.bits.wmask(r)(k) array.io.w.req.valid := wen array.io.w.req.bits.apply( setIdx=waddr, data=io.write.bits.data(r)(encWordBits*(k+1)-1,encWordBits*k), - waymask=1.U) + waymask=1.U + ) // data read val ren = io.read(j).valid && io.read(j).bits.way_en(w) && io.read(j).bits.rmask(r) diff --git a/src/main/scala/xiangshan/cache/dtlb.scala b/src/main/scala/xiangshan/cache/dtlb.scala index af7353ddfd788e9daab3cffdba2d1066e525544d..96ae97294d45c315d653e9124a8c3828199ea127 100644 --- a/src/main/scala/xiangshan/cache/dtlb.scala +++ b/src/main/scala/xiangshan/cache/dtlb.scala @@ -43,7 +43,7 @@ trait HasTlbConst extends HasXSParameter { abstract class TlbBundle extends XSBundle with HasTlbConst abstract class TlbModule extends XSModule with HasTlbConst -class PermBundle(val hasV: Boolean = true) extends TlbBundle { +class PtePermBundle extends TlbBundle { val d = Bool() val a = Bool() val g = Bool() @@ -51,7 +51,6 @@ class PermBundle(val hasV: Boolean = true) extends TlbBundle { val x = Bool() val w = Bool() val r = Bool() - if (hasV) { val v = Bool() } override def toPrintable: Printable = { p"d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}"// + @@ -59,6 +58,27 @@ class PermBundle(val hasV: Boolean = true) extends TlbBundle { } } +class TlbPermBundle extends TlbBundle { + val pf = Bool() // NOTE: if this is true, just raise pf + val d = Bool() + val a = Bool() + val g = Bool() + val u = Bool() + val x = Bool() + val w = Bool() + val r = Bool() + + // pma perm check + // val at = Bool() // Access Type + // val as = Bool() // Atomic Swap + // val al = Bool() // Atomic Logical + // val aa = Bool() // Atomic Arithmetic + // TODO: add pma check + override def toPrintable: Printable = { + p"pf:${pf} d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}" + } +} + class comBundle extends TlbBundle with HasCircularQueuePtrHelper{ val roqIdx = new RoqPtr val valid = Bool() @@ -73,111 +93,94 @@ object Compare { } } -class TlbEntry extends TlbBundle { - val vpn = UInt(vpnLen.W) // tag is vpn - val ppn = UInt(ppnLen.W) - val level = UInt(log2Up(Level).W) // 2 for 4KB, 1 for 2MB, 0 for 1GB - // val asid = UInt(asidLen.W), asid maybe expensive to support, but useless - // val v = Bool() // v&g is special, may need sperate storage? - val perm = new PermBundle(hasV = false) - - def vpnHit(vpn: UInt):Bool = { - val fullMask = VecInit((Seq.fill(vpnLen)(true.B))).asUInt - val maskLevel = VecInit((Level-1 to 0 by -1).map{i => // NOTE: level 2 for 4KB, 1 for 2MB, 0 for 1GB - Reverse(VecInit(Seq.fill(vpnLen-i*vpnnLen)(true.B) ++ Seq.fill(i*vpnnLen)(false.B)).asUInt)}) - val mask = maskLevel(level) - (mask&this.vpn) === (mask&vpn) - } - - // def asidHit(asid: UInt) = { - // this.asid === asid - // } +// multi-read && single-write +// input is data, output is hot-code(not one-hot) +class CAMTemplate[T <: Data](val gen: T, val set: Int, val readWidth: Int) extends TlbModule { + val io = IO(new Bundle { + val r = new Bundle { + val req = Input(Vec(readWidth, gen)) + val resp = Output(Vec(readWidth, UInt(set.W))) + } + val w = Flipped(ValidIO(new Bundle { + val index = UInt(log2Up(set).W) + val data = gen + })) + }) - def hit(vpn: UInt/*, asid: UInt*/):Bool = { - vpnHit(vpn) // && asidHit(asid) - } + val wordType = UInt(gen.getWidth.W) + val array = Reg(Vec(set, wordType)) - def genTlbEntry(pte: UInt, level: UInt, vpn: UInt/*, asid: UInt*/) = { - val e = Wire(new TlbEntry) - e.ppn := pte.asTypeOf(pteBundle).ppn - e.level := level - e.vpn := vpn - e.perm := pte.asTypeOf(pteBundle).perm - // e.asid := asid - e + io.r.resp.zipWithIndex.map{ case (a,i) => + a := VecInit(array.map(io.r.req(i).asUInt === _)).asUInt } - override def toPrintable: Printable = { - p"vpn:0x${Hexadecimal(vpn)} ppn:0x${Hexadecimal(ppn)} level:${level} perm:${perm}" + when (io.w.valid) { + array(io.w.bits.index) := io.w.bits.data } } -class TlbEntires(num: Int, tagLen: Int) extends TlbBundle { - require(log2Up(num)==log2Down(num)) - /* vpn can be divide into three part */ - // vpn: tagPart(17bit) + addrPart(8bit) + cutLenPart(2bit) - val cutLen = log2Up(num) - - val tag = UInt(tagLen.W) // NOTE: high part of vpn - val level = UInt(log2Up(Level).W) - val ppns = Vec(num, UInt(ppnLen.W)) - val perms = Vec(num, new PermBundle(hasV = false)) - val vs = Vec(num, Bool()) - - def tagClip(vpn: UInt, level: UInt) = { // full vpn => tagLen - val tmp = Mux(level===0.U, Cat(vpn(vpnLen-1, vpnnLen*2+cutLen), 0.U(vpnnLen*2)), - Mux(level===1.U, Cat(vpn(vpnLen-1, vpnnLen*1+cutLen), 0.U(vpnnLen*1)), - Cat(vpn(vpnLen-1, vpnnLen*0+cutLen), 0.U(vpnnLen*0)))) - tmp(tmp.getWidth-1, tmp.getWidth-tagLen) - } +class TlbEntryData extends TlbBundle { + val ppn = UInt(ppnLen.W) + val perm = new TlbPermBundle + // TODO: change perm to every kinds of pf check - // NOTE: get insize idx - def idxClip(vpn: UInt, level: UInt) = { - Mux(level===0.U, vpn(vpnnLen*2+cutLen-1, vpnnLen*2), - Mux(level===1.U, vpn(vpnnLen*1+cutLen-1, vpnnLen*1), - vpn(vpnnLen*0+cutLen-1, vpnnLen*0))) + override def toPrintable: Printable = { + p"ppn:0x${Hexadecimal(ppn)} perm:${perm}" } +} - def hit(vpn: UInt) = { - (tag === tagClip(vpn, level)) && vs(idxClip(vpn, level)) && (level === 2.U) +class TlbEntry(superpage: Boolean = false) extends TlbBundle { + val tag = UInt(vpnLen.W) // tag is vpn + val level = if(superpage) Some(UInt(1.W)) else None // /*2 for 4KB,*/ 1 for 2MB, 0 for 1GB + val data = new TlbEntryData + + + def hit(vpn: UInt): Bool = { + if (superpage) { + val insideLevel = level.getOrElse(0.U) + val a = tag(vpnnLen*3-1, vpnnLen*2) === vpn(vpnnLen*3-1, vpnnLen*2) + val b = tag(vpnnLen*2-1, vpnnLen*1) === vpn(vpnnLen*2-1, vpnnLen*1) + XSDebug(Mux(insideLevel.asBool, a&b, a), p"Hit superpage: hit:${Mux(insideLevel.asBool, a&b, a)} tag:${Hexadecimal(tag)} level:${insideLevel} data:${data} a:${a} b:${b} vpn:${Hexadecimal(vpn)}\n")("TlbEntrySuperpage") + Mux(insideLevel.asBool, a&b, a) + } else { + XSDebug(tag === vpn, p"Hit normalpage: hit:${tag === vpn} tag:${Hexadecimal(tag)} data:${data} vpn:${Hexadecimal(vpn)}\n")("TlbEntryNormalpage") + tag === vpn + } } - def genEntries(data: UInt, level: UInt, vpn: UInt): TlbEntires = { - require((data.getWidth / XLEN) == num, - "input data length must be multiple of pte length") - assert(level=/=3.U, "level should not be 3") - - val ts = Wire(new TlbEntires(num, tagLen)) - ts.tag := tagClip(vpn, level) - ts.level := level - for (i <- 0 until num) { - val pte = data((i+1)*XLEN-1, i*XLEN).asTypeOf(new PteBundle) - ts.ppns(i) := pte.ppn - ts.perms(i):= pte.perm // this.perms has no v - ts.vs(i) := !pte.isPf(level) && pte.isLeaf() // legal and leaf, store to l2Tlb + def ppn(vpn: UInt): UInt = { + if (superpage) { + val insideLevel = level.getOrElse(0.U) + Mux(insideLevel.asBool, Cat(data.ppn(data.ppn.getWidth-1, vpnnLen*1), vpn(vpnnLen*1-1, 0)), + Cat(data.ppn(data.ppn.getWidth-1, vpnnLen*2), vpn(vpnnLen*2-1, 0))) + } else { + data.ppn } - - ts } - def get(vpn: UInt): TlbEntry = { - val t = Wire(new TlbEntry()) - val idx = idxClip(vpn, level) - t.vpn := vpn // Note: Use input vpn, not vpn in TlbL2 - t.ppn := ppns(idx) - t.level := level - t.perm := perms(idx) - t + def apply(vpn: UInt, ppn: UInt, level: UInt, perm: UInt, pf: Bool) = { + this.tag := vpn + this.level.map(_ := level(0)) + this.data.ppn := ppn + val ptePerm = perm.asTypeOf(new PtePermBundle) + this.data.perm.pf:= pf + this.data.perm.d := ptePerm.d + this.data.perm.a := ptePerm.a + this.data.perm.g := ptePerm.g + this.data.perm.u := ptePerm.u + this.data.perm.x := ptePerm.x + this.data.perm.w := ptePerm.w + this.data.perm.r := ptePerm.r + + this } - override def cloneType: this.type = (new TlbEntires(num, tagLen)).asInstanceOf[this.type] override def toPrintable: Printable = { - require(num == 4, "if num is not 4, please comment this toPrintable") - // NOTE: if num is not 4, please comment this toPrintable - p"tag:${Hexadecimal(tag)} level:${level} ppn(0):${Hexadecimal(ppns(0))} ppn(1):${Hexadecimal(ppns(1))}" + - p"ppn(2):${Hexadecimal(ppns(2))} ppn(3):${Hexadecimal(ppns(3))} " + - p"perms(0):${perms(0)} perms(1):${perms(1)} perms(2):${perms(2)} perms(3):${perms(3)} vs:${Binary(vs.asUInt)}" + val insideLevel = level.getOrElse(0.U) + p"vpn:0x${Hexadecimal(tag)} level:${insideLevel} data:${data}" } + + override def cloneType: this.type = (new TlbEntry(superpage)).asInstanceOf[this.type] } object TlbCmd { @@ -185,10 +188,15 @@ object TlbCmd { def write = "b01".U def exec = "b10".U - def apply() = UInt(2.W) - def isRead(a: UInt) = a===read - def isWrite(a: UInt) = a===write - def isExec(a: UInt) = a===exec + def atom_read = "b100".U // lr + def atom_write = "b101".U // sc / amo + + def apply() = UInt(3.W) + def isRead(a: UInt) = a(1,0)===read + def isWrite(a: UInt) = a(1,0)===write + def isExec(a: UInt) = a(1,0)===exec + + def isAtom(a: UInt) = a(2) } class TlbReq extends TlbBundle { @@ -207,12 +215,18 @@ class TlbReq extends TlbBundle { class TlbResp extends TlbBundle { val paddr = UInt(PAddrBits.W) val miss = Bool() + val mmio = Bool() val excp = new Bundle { val pf = new Bundle { val ld = Bool() val st = Bool() val instr = Bool() } + val af = new Bundle { + val ld = Bool() + val st = Bool() + val instr = Bool() + } } override def toPrintable: Printable = { p"paddr:0x${Hexadecimal(paddr)} miss:${miss} excp.pf: ld:${excp.pf.ld} st:${excp.pf.st} instr:${excp.pf.instr}" @@ -267,34 +281,63 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ def widthMapSeq[T <: Seq[Data]](f: Int => T) = (0 until Width).map(f) def widthMap[T <: Data](f: Int => T) = (0 until Width).map(f) - val v = RegInit(0.U(TlbEntrySize.W)) - val pf = RegInit(0.U(TlbEntrySize.W)) // TODO: when ptw resp a pf(now only page not found), store here - val entry = Reg(Vec(TlbEntrySize, new TlbEntry)) - val g = VecInit(entry.map(_.perm.g)).asUInt // TODO: need check if reverse is needed + // Normal page && Super page + val nv = RegInit(VecInit(Seq.fill(TlbEntrySize)(false.B))) + val nentry = Reg(Vec(TlbEntrySize, new TlbEntry(false))) + val sv = RegInit(VecInit(Seq.fill(TlbSPEntrySize)(false.B))) + val sentry = Reg(Vec(TlbSPEntrySize, new TlbEntry(true))) + val v = nv ++ sv + val entry = nentry ++ sentry + val g = VecInit(entry.map(_.data.perm.g)) + val pf = VecInit(entry.zip(v).map{ case(e, vi) => e.data.perm.pf & vi }) /** * PTW refill */ val refill = ptw.resp.fire() - val randIdx = LFSR64()(log2Up(TlbEntrySize)-1,0) - val priorIdx = PriorityEncoder(~(v|pf)) - val tlbfull = ParallelAND((v|pf).asBools) - val refillIdx = Mux(tlbfull, randIdx, priorIdx) - val refillIdxOH = UIntToOH(refillIdx) + def randReplace(v: UInt) = { + val width = v.getWidth + val randIdx = LFSR64()(log2Up(width)-1, 0) + val priorIdx = PriorityEncoder(~(v)) + val full = Cat(v).andR + Mux(full, randIdx, priorIdx) + } + when (refill) { - v := Mux(ptw.resp.bits.pf, v & ~refillIdxOH, v | refillIdxOH) - entry(refillIdx) := ptw.resp.bits.entry - XSDebug(p"Refill: idx:${refillIdx} entry:${ptw.resp.bits.entry}\n") + val resp = ptw.resp.bits + when (resp.entry.level === 2.U) { + val refillIdx = randReplace(nv.asUInt) + nv(refillIdx) := true.B + nentry(refillIdx).apply( + vpn = resp.entry.tag, + ppn = resp.entry.ppn, + level = resp.entry.level, + perm = VecInit(resp.entry.perm).asUInt, + pf = resp.pf + ) + XSDebug(p"Refill normal: idx:${refillIdx} entry:${resp.entry} pf:${resp.pf}\n") + }.otherwise { + val refillIdx = randReplace(sv.asUInt) + sv(refillIdx) := true.B + sentry(refillIdx).apply( + vpn = resp.entry.tag, + ppn = resp.entry.ppn, + level = resp.entry.level, + perm = VecInit(resp.entry.perm).asUInt, + pf = resp.pf + ) + XSDebug(p"Refill superpage: idx:${refillIdx} entry:${resp.entry} pf:${resp.pf}\n") + } } /** * L1 TLB read */ - val tlb_read_mask = Mux(refill, refillIdxOH, 0.U(TlbEntrySize.W)) - def TLBRead(i: Int) = { + // val tlb_read_mask = Mux(refill, ((1<<(TlbEntrySize+TlbSPEntrySize))-1).U, 0.U((TlbEntrySize+TlbSPEntrySize).W)) + def TLBNormalRead(i: Int) = { val entryHitVec = ( if (isDtlb) - VecInit((tlb_read_mask.asBools zip entry).map{ case (r, e) => !r && e.hit(reqAddr(i).vpn/*, satp.asid*/)}) + VecInit(entry.map{ e => ~refill && e.hit(reqAddr(i).vpn/*, satp.asid*/)}) else VecInit(entry.map(_.hit(reqAddr(i).vpn/*, satp.asid*/))) ) @@ -304,26 +347,24 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ val validReg = if (isDtlb) RegNext(valid(i)) else valid(i) val entryHitVecReg = if (isDtlb) RegNext(entryHitVec) else entryHitVec - val hitVec = (v.asBools zip entryHitVecReg).map{ case (a,b) => a&b } - val pfHitVec = (pf.asBools zip entryHitVecReg).map{ case (a,b) => a&b } + val hitVec = (v zip entryHitVecReg).map{ case (a,b) => a&b } + val pfHitVec = (pf zip entryHitVecReg).map{ case (a,b) => a&b } val pfArray = ParallelOR(pfHitVec).asBool && validReg && vmEnable val hit = ParallelOR(hitVec).asBool && validReg && vmEnable && ~pfArray val miss = !hit && validReg && vmEnable && ~pfArray - val hitppn = ParallelMux(hitVec zip entry.map(_.ppn)) - val hitPerm = ParallelMux(hitVec zip entry.map(_.perm)) - val hitLevel= ParallelMux(hitVec zip entry.map(_.level)) + val hitppn = ParallelMux(hitVec zip entry.map(_.ppn(reqAddrReg.vpn))) + val hitPerm = ParallelMux(hitVec zip entry.map(_.data.perm)) + + XSDebug(valid(i), p"(${i.U}) entryHit:${Hexadecimal(entryHitVec.asUInt)}\n") + XSDebug(validReg, p"(${i.U}) entryHitReg:${Hexadecimal(entryHitVecReg.asUInt)} hitVec:${Hexadecimal(VecInit(hitVec).asUInt)} pfHitVec:${Hexadecimal(VecInit(pfHitVec).asUInt)} pfArray:${Hexadecimal(pfArray.asUInt)} hit:${hit} miss:${miss} hitppn:${Hexadecimal(hitppn)} hitPerm:${hitPerm}\n") + val multiHit = { val hitSum = PopCount(hitVec) - val pfHitSum = PopCount(pfHitVec) - !(hitSum===0.U || hitSum===1.U) || !(pfHitSum===0.U || pfHitSum===1.U) + !(hitSum===0.U || hitSum===1.U) } // resp // TODO: A/D has not being concerned - val paddr = LookupTreeDefault(hitLevel, Cat(hitppn, reqAddrReg.off), List( - 0.U -> Cat(hitppn(ppnLen - 1, 2*vpnnLen), reqAddrReg.vpn(2*vpnnLen - 1, 0), reqAddrReg.off), - 1.U -> Cat(hitppn(ppnLen - 1, vpnnLen), reqAddrReg.vpn(vpnnLen - 1, 0), reqAddrReg.off), - 2.U -> Cat(hitppn, reqAddrReg.off) - )) + val paddr = Cat(hitppn, reqAddrReg.off) val vaddr = SignExt(req(i).bits.vaddr, PAddrBits) req(i).ready := resp(i).ready @@ -341,10 +382,16 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ resp(i).bits.excp.pf.st := stPf || update resp(i).bits.excp.pf.instr := instrPf || update + val (pmaMode, accessWidth) = AddressSpace.memmapAddrMatch(resp(i).bits.paddr) + resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode)) + resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg) + resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg) + resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode)) + (hit, miss, pfHitVec, multiHit) } - val readResult = (0 until Width).map(TLBRead(_)) + val readResult = (0 until Width).map(TLBNormalRead(_)) val hitVec = readResult.map(res => res._1) val missVec = readResult.map(res => res._2) val pfHitVecVec = readResult.map(res => res._3) @@ -352,12 +399,15 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ val hasMissReq = Cat(missVec).orR // ptw - val state_idle :: state_wait :: Nil = Enum(2) - val state = RegInit(state_idle) - - ptw <> DontCare // TODO: need check it - ptw.req.valid := hasMissReq && state===state_idle && !sfence.valid - ptw.resp.ready := state===state_wait + val waiting = RegInit(false.B) + when (ptw.req.fire()) { + waiting := true.B + }.elsewhen (sfence.valid || ptw.resp.valid) { + waiting := false.B + } + // ptw <> DontCare // TODO: need check it + ptw.req.valid := hasMissReq && !sfence.valid && !waiting && !RegNext(refill) + ptw.resp.ready := waiting // val ptwReqSeq = Wire(Seq.fill(Width)(new comBundle())) val ptwReqSeq = Seq.fill(Width)(Wire(new comBundle())) @@ -368,82 +418,49 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ } ptw.req.bits := Compare(ptwReqSeq).bits - switch (state) { - is (state_idle) { - when (hasMissReq && ptw.req.fire()) { - state := state_wait - } - assert(!ptw.resp.valid) - } - - is (state_wait) { - when (ptw.resp.fire()) { - state := state_idle - } - } - } - - // reset pf when pf hit - val pfHitReset = ParallelOR(widthMap{i => Mux(resp(i).fire(), VecInit(pfHitVecVec(i)).asUInt, 0.U) }) - val pfHitRefill = false.B//ParallelOR(pfHitReset.asBools) - - // pf update - when (refill) { - when (pfHitRefill) { - pf := Mux(ptw.resp.bits.pf, pf | refillIdxOH, pf & ~refillIdxOH) & ~pfHitReset - } .otherwise { - pf := Mux(ptw.resp.bits.pf, pf | refillIdxOH, pf & ~refillIdxOH) - } - } .otherwise { - when (pfHitRefill) { - pf := pf & ~pfHitReset - } - } - when (PopCount(pf) > 10.U) { // when too much pf, just clear - pf := Mux(refill && ptw.resp.bits.pf, refillIdxOH, 0.U) + val tooManyPf = PopCount(pf) > 5.U + when (tooManyPf) { // when too much pf, just clear + XSDebug(p"Too many pf just flush all the pf v:${Hexadecimal(VecInit(v).asUInt)} pf:${Hexadecimal(pf.asUInt)}\n") + v.zipWithIndex.map{ case (a, i) => a := a & !pf(i) } } // sfence (flush) when (sfence.valid) { - state := state_idle ptw.req.valid := false.B when (sfence.bits.rs1) { // virtual address *.rs1 <- (rs1===0.U) when (sfence.bits.rs2) { // asid, but i do not want to support asid, *.rs2 <- (rs2===0.U) // all addr and all asid - v := 0.U - pf := 0.U + v.map(_ := false.B) }.otherwise { // all addr but specific asid - v := v & g // TODO: need check if reverse is needed - pf := pf & g + v.zipWithIndex.map{ case (a,i) => a := a & g(i) } } }.otherwise { + val sfenceVpn = sfence.bits.addr.asTypeOf(vaBundle).vpn when (sfence.bits.rs2) { // specific addr but all asid - v := v & ~VecInit(entry.map(_.hit(sfence.bits.addr.asTypeOf(vaBundle).vpn))).asUInt - pf := pf & ~VecInit(entry.map(_.hit(sfence.bits.addr.asTypeOf(vaBundle).vpn))).asUInt + v.zipWithIndex.map{ case (a,i) => a := a & !entry(i).hit(sfenceVpn) } }.otherwise { // specific addr and specific asid - v := v & ~VecInit(entry.map(e => e.hit(sfence.bits.addr.asTypeOf(vaBundle).vpn) && (/*e.asid === sfence.bits.asid && */!e.perm.g))).asUInt - pf := pf & ~VecInit(entry.map(e => e.hit(sfence.bits.addr.asTypeOf(vaBundle).vpn) && (/*e.asid === sfence.bits.asid && */!e.perm.g))).asUInt + v.zipWithIndex.map{ case (a,i) => a := a & !(entry(i).hit(sfenceVpn) && !g(i))} } } } if (!env.FPGAPlatform && isDtlb) { - ExcitingUtils.addSource(valid(0)/* && vmEnable*/, "perfCntDtlbReqCnt0", Perf) - ExcitingUtils.addSource(valid(1)/* && vmEnable*/, "perfCntDtlbReqCnt1", Perf) - ExcitingUtils.addSource(valid(2)/* && vmEnable*/, "perfCntDtlbReqCnt2", Perf) - ExcitingUtils.addSource(valid(3)/* && vmEnable*/, "perfCntDtlbReqCnt3", Perf) - ExcitingUtils.addSource(valid(0)/* && vmEnable*/ && missVec(0), "perfCntDtlbMissCnt0", Perf) - ExcitingUtils.addSource(valid(1)/* && vmEnable*/ && missVec(1), "perfCntDtlbMissCnt1", Perf) - ExcitingUtils.addSource(valid(2)/* && vmEnable*/ && missVec(2), "perfCntDtlbMissCnt2", Perf) - ExcitingUtils.addSource(valid(3)/* && vmEnable*/ && missVec(3), "perfCntDtlbMissCnt3", Perf) + ExcitingUtils.addSource(valid(0) && vmEnable, "perfCntDtlbReqCnt0", Perf) + ExcitingUtils.addSource(valid(1) && vmEnable, "perfCntDtlbReqCnt1", Perf) + ExcitingUtils.addSource(valid(2) && vmEnable, "perfCntDtlbReqCnt2", Perf) + ExcitingUtils.addSource(valid(3) && vmEnable, "perfCntDtlbReqCnt3", Perf) + ExcitingUtils.addSource(valid(0) && vmEnable && missVec(0), "perfCntDtlbMissCnt0", Perf) + ExcitingUtils.addSource(valid(1) && vmEnable && missVec(1), "perfCntDtlbMissCnt1", Perf) + ExcitingUtils.addSource(valid(2) && vmEnable && missVec(2), "perfCntDtlbMissCnt2", Perf) + ExcitingUtils.addSource(valid(3) && vmEnable && missVec(3), "perfCntDtlbMissCnt3", Perf) } if (!env.FPGAPlatform && !isDtlb) { - ExcitingUtils.addSource(valid(0)/* && vmEnable*/, "perfCntItlbReqCnt0", Perf) - ExcitingUtils.addSource(valid(0)/* && vmEnable*/ && missVec(0), "perfCntItlbMissCnt0", Perf) + ExcitingUtils.addSource(valid(0) && vmEnable, "perfCntItlbReqCnt0", Perf) + ExcitingUtils.addSource(valid(0) && vmEnable && missVec(0), "perfCntItlbMissCnt0", Perf) } // Log @@ -454,35 +471,20 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ XSDebug(sfence.valid, p"Sfence: ${sfence}\n") XSDebug(ParallelOR(valid)|| ptw.resp.valid, p"CSR: ${csr}\n") - XSDebug(ParallelOR(valid) || ptw.resp.valid, p"vmEnable:${vmEnable} hit:${Binary(VecInit(hitVec).asUInt)} miss:${Binary(VecInit(missVec).asUInt)} v:${Hexadecimal(v)} pf:${Hexadecimal(pf)} state:${state}\n") + XSDebug(ParallelOR(valid) || ptw.resp.valid, p"vmEnable:${vmEnable} hit:${Binary(VecInit(hitVec).asUInt)} miss:${Binary(VecInit(missVec).asUInt)} v:${Hexadecimal(VecInit(v).asUInt)} pf:${Hexadecimal(pf.asUInt)}\n") XSDebug(ptw.req.fire(), p"PTW req:${ptw.req.bits}\n") XSDebug(ptw.resp.valid, p"PTW resp:${ptw.resp.bits} (v:${ptw.resp.valid}r:${ptw.resp.ready}) \n") - // // assert check, can be remove when tlb can work - // for(i <- 0 until Width) { - // assert((hit(i)&pfArray(i))===false.B, "hit(%d):%d pfArray(%d):%d v:0x%x pf:0x%x", i.U, hit(i), i.U, pfArray(i), v, pf) - // } - // for(i <- 0 until Width) { - // XSDebug(multiHit, p"vpn:0x${Hexadecimal(reqAddr(i).vpn)} hitVec:0x${Hexadecimal(VecInit(hitVec(i)).asUInt)} pfHitVecVec:0x${Hexadecimal(VecInit(pfHitVecVec(i)).asUInt)}\n") - // } - // for(i <- 0 until TlbEntrySize) { - // XSDebug(multiHit, p"entry(${i.U}): v:${v(i)} ${entry(i)}\n") - // } - // assert(!multiHit) // add multiHit here, later it should be removed (maybe), turn to miss and flush - - // for (i <- 0 until Width) { - // XSDebug(resp(i).valid && hit(i) && !(req(i).bits.vaddr===resp(i).bits.paddr), p"vaddr:0x${Hexadecimal(req(i).bits.vaddr)} paddr:0x${Hexadecimal(resp(i).bits.paddr)} hitVec:0x${Hexadecimal(VecInit(hitVec(i)).asUInt)}}\n") - // when (resp(i).valid && hit(i) && !(req(i).bits.vaddr===resp(i).bits.paddr)) { - // for (j <- 0 until TlbEntrySize) { - // XSDebug(true.B, p"TLBEntry(${j.U}): v:${v(j)} ${entry(j)}\n") - // } - // } // FIXME: remove me when tlb may be ok - // when(resp(i).valid && hit(i)) { - // assert(req(i).bits.vaddr===resp(i).bits.paddr, "vaddr:0x%x paddr:0x%x hitVec:%x ", req(i).bits.vaddr, resp(i).bits.paddr, VecInit(hitVec(i)).asUInt) - // } // FIXME: remove me when tlb may be ok - // } - - // assert((v&pf)===0.U, "v and pf can't be true at same time: v:0x%x pf:0x%x", v, pf) +// // NOTE: just for simple tlb debug, comment it after tlb's debug +// for (i <- 0 until Width) { +// if(isDtlb) { +// XSDebug(!(!vmEnable || RegNext(req(i).bits.vaddr)===resp(i).bits.paddr || !resp(i).valid || resp(i).bits.miss || Cat(VecInit(resp(i).bits.excp.pf).asUInt).orR), p"Dtlb: vaddr:${Hexadecimal(RegNext(req(i).bits.vaddr))} paddr:${Hexadecimal(resp(i).bits.paddr)} should be equal\n") +// assert(!vmEnable || RegNext(req(i).bits.vaddr)===resp(i).bits.paddr || !resp(i).valid || resp(i).bits.miss || Cat(VecInit(resp(i).bits.excp.pf).asUInt).orR) +// } else { +// XSDebug(!(!vmEnable || req(i).bits.vaddr===resp(i).bits.paddr || !resp(i).valid || resp(i).bits.miss || Cat(VecInit(resp(i).bits.excp.pf).asUInt).orR), p"Itlb: vaddr:${Hexadecimal(RegNext(req(i).bits.vaddr))} paddr:${Hexadecimal(resp(i).bits.paddr)} should be equal\n") +// assert(!vmEnable || req(i).bits.vaddr===resp(i).bits.paddr || !resp(i).valid || resp(i).bits.miss || Cat(VecInit(resp(i).bits.excp.pf).asUInt).orR) +// } +// } } object TLB { diff --git a/src/main/scala/xiangshan/cache/icache.scala b/src/main/scala/xiangshan/cache/icache.scala index 03a49e38891a3a3fc358a0be11f43df2de433a19..6a5124f9dc24036cf216a61dd6b814f9a73fe54f 100644 --- a/src/main/scala/xiangshan/cache/icache.scala +++ b/src/main/scala/xiangshan/cache/icache.scala @@ -7,6 +7,7 @@ import xiangshan._ import xiangshan.frontend._ import utils._ import chisel3.ExcitingUtils._ +import bus.tilelink.TLParameters case class ICacheParameters( nSets: Int = 64, @@ -27,22 +28,34 @@ case class ICacheParameters( def replacement = new RandomReplacement(nWays) } -trait HasICacheParameters extends HasL1CacheParameters with HasIFUConst { +trait HasICacheParameters extends HasL1CacheParameters with HasIFUConst with HasInstrMMIOConst { val cacheParams = icacheParameters val groupAlign = log2Up(cacheParams.blockBytes) val packetInstNum = packetBytes/instBytes val packetInstNumBit = log2Up(packetInstNum) val ptrHighBit = log2Up(groupBytes) - 1 val ptrLowBit = log2Up(packetBytes) + val encUnitBits = 8 + val bankRows = 2 + val bankBits = bankRows * rowBits + val nBanks = blockRows/bankRows + val bankUnitNum = (bankBits / encUnitBits) - - def accessBorder = 0x80000000L def cacheID = 0 def insLen = if (HasCExtension) 16 else 32 def RVCInsLen = 16 def groupPC(pc: UInt): UInt = Cat(pc(PAddrBits-1, groupAlign), 0.U(groupAlign.W)) - def encRowBits = cacheParams.dataCode.width(rowBits) - def encTagBits = cacheParams.tagCode.width(tagBits) + // def encRowBits = cacheParams.dataCode.width(rowBits) + // def encTagBits = cacheParams.tagCode.width(tagBits) + + // + def encMetaBits = cacheParams.tagCode.width(tagBits) + def metaEntryBits = encMetaBits + def encDataBits = cacheParams.dataCode.width(encUnitBits) + def dataEntryBits = encDataBits * bankUnitNum + // def encDataBits + // def encCacheline + require(isPow2(nSets), s"nSets($nSets) must be pow2") require(isPow2(nWays), s"nWays($nWays) must be pow2") @@ -52,12 +65,18 @@ trait HasICacheParameters extends HasL1CacheParameters with HasIFUConst { require(pgIdxBits >= untagBits, s"page aliasing problem: pgIdxBits($pgIdxBits) < untagBits($untagBits)") } +trait HasFrontEndExceptionNo { + def accessFault = 0 + def pageFault = 1 +} + abstract class ICacheBundle extends XSBundle with HasICacheParameters abstract class ICacheModule extends XSModule with HasICacheParameters with ICacheBase + with HasFrontEndExceptionNo abstract class ICacheArray extends XSModule with HasICacheParameters @@ -65,15 +84,6 @@ abstract class ICacheArray extends XSModule abstract class ICachArray extends XSModule with HasICacheParameters -// sealed class ICacheMetaBundle extends ICacheBundle -// { -// val tag = UInt(tagBits.W) -// } - -// sealed class ICacheDataBundle extends ICacheBundle -// { -// val data = UInt(encRowBits.W) -// } class ICacheReq extends ICacheBundle { @@ -85,6 +95,7 @@ class ICacheResp extends ICacheBundle { val pc = UInt(VAddrBits.W) val data = UInt((FetchWidth * 32).W) + val mmio = Bool() val mask = UInt(PredictWidth.W) val ipf = Bool() val acf = Bool() @@ -97,6 +108,9 @@ class ICacheIO extends ICacheBundle val resp = DecoupledIO(new ICacheResp) val mem_acquire = DecoupledIO(new L1plusCacheReq) val mem_grant = Flipped(DecoupledIO(new L1plusCacheResp)) + val mmio_acquire = DecoupledIO(new InsUncacheReq) + val mmio_grant = Flipped(DecoupledIO(new InsUncacheResp)) + val mmio_flush = Output(Bool()) val prefetchTrainReq = ValidIO(new IcacheMissReq) val tlb = new BlockTlbRequestIO val flush = Input(UInt(2.W)) @@ -177,23 +191,35 @@ class ICacheMetaArray extends ICachArray val readResp = Output(Vec(nWays,UInt(tagBits.W))) }} - val metaArray = Module(new SRAMTemplate(UInt(encTagBits.W), set=nSets, way=nWays, shouldReset = true)) - - //read + val metaArray = Module(new SRAMTemplate( + UInt(metaEntryBits.W), + set=nSets, + way=nWays, + shouldReset = true, + singlePort = true + )) + + // read + //do Parity decoding after way choose + // do not read and write in the same cycle: when write SRAM disable read + val readNextReg = RegNext(io.read.fire()) + val rtags = metaArray.io.r.resp.asTypeOf(Vec(nWays,UInt(encMetaBits.W))) + val rtags_decoded = rtags.map{ wtag =>cacheParams.dataCode.decode(wtag)} + val rtags_wrong = rtags_decoded.map{ wtag_decoded => wtag_decoded.uncorrectable} + //assert(readNextReg && !ParallelOR(rtags_wrong)) + val rtags_corrected = VecInit(rtags_decoded.map{ wtag_decoded => wtag_decoded.corrected}) metaArray.io.r.req.valid := io.read.valid - io.read.ready := metaArray.io.r.req.ready - io.write.ready := DontCare metaArray.io.r.req.bits.apply(setIdx=io.read.bits) + io.read.ready := !io.write.valid + io.readResp := rtags_corrected.asTypeOf(Vec(nWays,UInt(tagBits.W))) - val rtag = metaArray.io.r.resp.asTypeOf(Vec(nWays,UInt(encTagBits.W))) - val tag_encoded = VecInit(rtag.map(wtag => cacheParams.tagCode.decode(wtag).corrected)) - io.readResp :=tag_encoded.asTypeOf(Vec(nWays,UInt(tagBits.W))) //write val write = io.write.bits - val wdata_encoded = cacheParams.tagCode.encode(write.phyTag.asUInt) + val wtag_encoded = cacheParams.tagCode.encode(write.phyTag.asUInt) metaArray.io.w.req.valid := io.write.valid - metaArray.io.w.req.bits.apply(data=wdata_encoded, setIdx=write.virIdx, waymask=write.waymask) + metaArray.io.w.req.bits.apply(data=wtag_encoded, setIdx=write.virIdx, waymask=write.waymask) + io.write.ready := DontCare } @@ -202,36 +228,77 @@ class ICacheDataArray extends ICachArray val io=IO{new Bundle{ val write = Flipped(DecoupledIO(new ICacheDataWriteBundle)) val read = Flipped(DecoupledIO(UInt(idxBits.W))) - val readResp = Output(Vec(blockWords,Vec(nWays,UInt(encRowBits.W)))) + val readResp = Output(Vec(nWays,Vec(blockRows,UInt(rowBits.W)))) }} - val dataArray = List.fill(blockWords){ Module(new SRAMTemplate(UInt(encRowBits.W), set=nSets, way = nWays))} - - //read - //do ECC decoding after way choose - for(b <- 0 until blockWords){ - dataArray(b).io.r.req.valid := io.read.valid - dataArray(b).io.r.req.bits.apply(setIdx=io.read.bits) + //dataEntryBits = 144 + val dataArray = List.fill(nWays){List.fill(nBanks){Module(new SRAMTemplate( + UInt(dataEntryBits.W), + set=nSets, + way = 1, + singlePort = true + ))}} + + // read + // do Parity decoding after way choose + // do not read and write in the same cycle: when write SRAM disable read + val readNextReg = RegNext(io.read.fire()) + val rdatas = VecInit((0 until nWays).map( w => + VecInit( (0 until nBanks).map( b => + dataArray(w)(b).io.r.resp.asTypeOf(Vec( bankUnitNum, UInt(encDataBits.W))) + )) + )) + for(w <- 0 until nWays){ + for(b <- 0 until nBanks){ + dataArray(w)(b).io.r.req.valid := io.read.valid + dataArray(w)(b).io.r.req.bits.apply(setIdx=io.read.bits) + } + } + val rdatas_decoded = rdatas.map{wdata => wdata.map{ bdata => bdata.map{ unit => cacheParams.dataCode.decode(unit)}}} + val rdata_corrected = VecInit((0 until nWays).map{ w => + VecInit((0 until nBanks).map{ b => + VecInit((0 until bankUnitNum).map{ i => + rdatas_decoded(w)(b)(i).corrected + }) + }) + }) + + (0 until nWays).map{ w => + (0 until blockRows).map{ r => + io.readResp(w)(r) := Cat( + (0 until bankUnitNum/2).map{ i => + //println("result: ",r,i) + rdata_corrected(w)(r >> 1)((r%2) * 8 + i).asUInt + }.reverse ) + } } - val dataArrayReadyVec = dataArray.map(b => b.io.r.req.ready) - io.read.ready := ParallelOR(dataArrayReadyVec) - io.write.ready := DontCare - io.readResp := VecInit(dataArray.map(b => b.io.r.resp.asTypeOf(Vec(nWays,UInt(encRowBits.W))))) + io.read.ready := !io.write.valid //write val write = io.write.bits - val write_data = write.data.asTypeOf(Vec(blockWords,UInt(rowBits.W))) - val write_data_encoded = write_data.map(wdata => cacheParams.tagCode.encode(wdata)) + val write_way = OHToUInt(write.waymask) + val write_data = write.data.asTypeOf(Vec(nBanks,Vec( bankUnitNum, UInt(encUnitBits.W)))) + val write_data_encoded = write_data.map(b => b.map{ unit => cacheParams.dataCode.encode(unit) } ) + val write_bank_data = Wire(Vec(nBanks,UInt((dataEntryBits).W))) + + (0 until nBanks).map{ b => + write_bank_data(b) := Cat( + (0 until bankUnitNum).map{ i => + write_data_encoded(b)(i).asUInt + }.reverse ) + } - for(b <- 0 until blockWords){ - dataArray(b).io.w.req.valid := io.write.valid - dataArray(b).io.w.req.bits.apply( setIdx=write.virIdx, - data=write_data_encoded(b), - waymask=write.waymask) + for(w <- 0 until nWays){ + for(b <- 0 until nBanks){ + dataArray(w)(b).io.w.req.valid := io.write.valid && w.U === write_way + dataArray(w)(b).io.w.req.bits.setIdx := write.virIdx + dataArray(w)(b).io.w.req.bits.data := write_bank_data(b) + } } + io.write.ready := DontCare } /* ------------------------------------------------------------ @@ -243,10 +310,10 @@ class ICache extends ICacheModule { // cut a cacheline into a fetch packet def cutHelper(sourceVec: Vec[UInt], pc: UInt, mask: UInt): UInt = { - val sourceVec_inst = Wire(Vec(blockWords*wordBytes/instBytes,UInt(insLen.W))) - (0 until blockWords).foreach{ i => - (0 until wordBytes/instBytes).foreach{ j => - sourceVec_inst(i*wordBytes/instBytes + j) := sourceVec(i)(j*insLen+insLen-1, j*insLen) + val sourceVec_inst = Wire(Vec(blockRows*rowBytes/instBytes,UInt(insLen.W))) + (0 until blockRows).foreach{ i => + (0 until rowBytes/instBytes).foreach{ j => + sourceVec_inst(i*rowBytes/instBytes + j) := sourceVec(i)(j*insLen+insLen-1, j*insLen) } } val cutPacket = WireInit(VecInit(Seq.fill(PredictWidth){0.U(insLen.W)})) @@ -257,6 +324,23 @@ class ICache extends ICacheModule cutPacket.asUInt } + def cutHelperMMIO(sourceVec: Vec[UInt], pc: UInt, mask: UInt) = { + val sourceVec_inst = Wire(Vec(mmioBeats * mmioBusBytes/instBytes,UInt(insLen.W))) + (0 until mmioBeats).foreach{ i => + (0 until mmioBusBytes/instBytes).foreach{ j => + sourceVec_inst(i*mmioBusBytes/instBytes + j) := sourceVec(i)(j*insLen+insLen-1, j*insLen) + } + } + val cutPacket = WireInit(VecInit(Seq.fill(PredictWidth){0.U(insLen.W)})) + val insLenLog = log2Ceil(insLen) + val start = (pc >> insLenLog.U)(log2Ceil(mmioBeats * mmioBusBytes/instBytes) -1, 0) + val outMask = mask >> start + (0 until PredictWidth ).foreach{ i => + cutPacket(i) := Mux(outMask(i).asBool,sourceVec_inst(start + i.U),0.U) + } + (cutPacket.asUInt, outMask.asUInt) + } + // generate the one hot code according to a UInt between 0-8 def PriorityMask(sourceVec: UInt) : UInt = { val oneHot = Mux(sourceVec >= 8.U, "b1000".U, @@ -299,25 +383,25 @@ class ICache extends ICacheModule //---------------------------- - // Stage 2 + // Stage 2 //---------------------------- val s2_idx = get_idx(s2_req_pc) val s2_tlb_resp = WireInit(io.tlb.resp.bits) val s2_tag = get_tag(s2_tlb_resp.paddr) val s2_hit = WireInit(false.B) - val s2_access_fault = WireInit(false.B) val s2_allValid = s2_valid && io.tlb.resp.valid + val s2_mmio = WireInit(false.B) + s2_fire := s2_allValid && s3_ready + s2_ready := s3_ready || !s2_valid when(s1_fire) { s2_valid := true.B } .elsewhen(s2_flush) { s2_valid := false.B } .elsewhen(s2_fire) { s2_valid := false.B } - //physical address < 0x80000000 - //TODO: May have bugs - s2_access_fault := (s2_tlb_resp.paddr < accessBorder.U) && s2_valid - // SRAM(Meta and Data) read reseponse - val metas = metaArray.io.readResp + // TODO :Parity wrong excetion + val metas = metaArray.io.readResp + val datas =RegEnable(next=dataArray.io.readResp, enable=s2_fire) val validMeta = Cat((0 until nWays).map{w => validArray(Cat(s2_idx, w.U(log2Ceil(nWays).W)))}.reverse).asUInt @@ -329,15 +413,26 @@ class ICache extends ICacheModule val hasInvalidWay = invalidVec.orR val refillInvalidWaymask = PriorityMask(invalidVec) - val waymask = Mux(s2_hit, hitVec.asUInt, Mux(hasInvalidWay, refillInvalidWaymask, victimWayMask)) - s2_hit := ParallelOR(hitVec) || s2_tlb_resp.excp.pf.instr || s2_access_fault - s2_ready := s3_ready || !s2_valid + //deal with icache exception + val icacheExceptionVec = Wire(Vec(8,Bool())) + val hasIcacheException = icacheExceptionVec.asUInt().orR() + icacheExceptionVec := DontCare + icacheExceptionVec(accessFault) := s2_tlb_resp.excp.af.instr && s2_allValid + icacheExceptionVec(pageFault) := s2_tlb_resp.excp.pf.instr && s2_allValid + + s2_mmio := s2_valid && io.tlb.resp.valid && s2_tlb_resp.mmio && !hasIcacheException + s2_hit := s2_valid && ParallelOR(hitVec) + + val waymask = Mux(hasIcacheException,1.U(nWays.W),Mux(s2_hit, hitVec.asUInt, Mux(hasInvalidWay, refillInvalidWaymask, victimWayMask))) - XSDebug("[Stage 2] v : r : f (%d %d %d) pc: 0x%x mask: %b acf:%d\n",s2_valid,s3_ready,s2_fire,s2_req_pc,s2_req_mask,s2_access_fault) + assert(!(s2_hit && s2_mmio),"MMIO address should not hit in icache") + + XSDebug("[Stage 2] v : r : f (%d %d %d) pc: 0x%x mask: %b mmio:%d \n",s2_valid,s3_ready,s2_fire,s2_req_pc,s2_req_mask,s2_mmio) + XSDebug("[Stage 2] exception: af:%d pf:%d \n",icacheExceptionVec(accessFault),icacheExceptionVec(pageFault)) XSDebug(p"[Stage 2] tlb req: v ${io.tlb.req.valid} r ${io.tlb.req.ready} ${io.tlb.req.bits}\n") XSDebug(p"[Stage 2] tlb resp: v ${io.tlb.resp.valid} r ${io.tlb.resp.ready} ${s2_tlb_resp}\n") - XSDebug("[Stage 2] tag: %x hit:%d\n",s2_tag,s2_hit) + XSDebug("[Stage 2] tag: %x hit:%d mmio:%d\n",s2_tag,s2_hit,s2_mmio) XSDebug("[Stage 2] validMeta: %b victimWayMaks:%b invalidVec:%b hitVec:%b waymask:%b \n",validMeta,victimWayMask,invalidVec.asUInt,hitVec.asUInt,waymask.asUInt) @@ -348,28 +443,21 @@ class ICache extends ICacheModule val s3_data = datas val s3_tag = RegEnable(s2_tag, s2_fire) val s3_hit = RegEnable(next=s2_hit,init=false.B,enable=s2_fire) + val s3_mmio = RegEnable(next=s2_mmio,init=false.B,enable=s2_fire) val s3_wayMask = RegEnable(next=waymask,init=0.U,enable=s2_fire) - val s3_miss = s3_valid && !s3_hit val s3_idx = get_idx(s3_req_pc) - val s3_access_fault = RegEnable(s2_access_fault,init=false.B,enable=s2_fire) + val s3_exception_vec = RegEnable(next= icacheExceptionVec,init=0.U.asTypeOf(Vec(8,Bool())), enable=s2_fire) + val s3_has_exception = s3_exception_vec.asUInt.orR + val s3_miss = s3_valid && !s3_hit && !s3_mmio && !s3_has_exception when(s3_flush) { s3_valid := false.B } .elsewhen(s2_fire && !s2_flush) { s3_valid := true.B } .elsewhen(io.resp.fire()) { s3_valid := false.B } - val refillDataReg = Reg(Vec(refillCycles,UInt(beatBits.W))) // icache hit - // data ECC encoding + // data Parity encoding // simply cut the hit cacheline - val dataHitWay = VecInit(s3_data.map(b => Mux1H(s3_wayMask,b).asUInt)) + val dataHitWay = Mux1H(s3_wayMask,s3_data) val outPacket = Wire(UInt((FetchWidth * 32).W)) - val dataHitWayDecoded = VecInit( - (0 until blockWords).map{r => - val row = dataHitWay.asTypeOf(Vec(blockWords,UInt(encRowBits.W)))(r) - val decodedRow = cacheParams.dataCode.decode(row) - assert(!(s3_valid && s3_hit && decodedRow.uncorrectable)) - decodedRow.corrected - } - ) outPacket := cutHelper(dataHitWay,s3_req_pc.asUInt,s3_req_mask.asUInt) @@ -378,13 +466,13 @@ class ICache extends ICacheModule val icacheMissQueue = Module(new IcacheMissQueue) val blocking = RegInit(false.B) val isICacheResp = icacheMissQueue.io.resp.valid && icacheMissQueue.io.resp.bits.clientID === cacheID.U(2.W) - icacheMissQueue.io.req.valid := s3_miss && !s3_flush && !blocking//TODO: specificate flush condition + icacheMissQueue.io.req.valid := s3_miss && !s3_has_exception && !s3_flush && !blocking//TODO: specificate flush condition icacheMissQueue.io.req.bits.apply(missAddr=groupPC(s3_tlb_resp.paddr),missIdx=s3_idx,missWaymask=s3_wayMask,source=cacheID.U(2.W)) icacheMissQueue.io.resp.ready := io.resp.ready icacheMissQueue.io.flush := s3_flush - when(icacheMissQueue.io.req.fire()){blocking := true.B} - .elsewhen(blocking && ((icacheMissQueue.io.resp.fire() && isICacheResp) || s3_flush) ){blocking := false.B} + when(icacheMissQueue.io.req.fire() || io.mmio_acquire.fire()){blocking := true.B} + .elsewhen(blocking && ((icacheMissQueue.io.resp.fire() && isICacheResp) || io.mmio_grant.fire() || s3_flush) ){blocking := false.B} XSDebug(blocking && s3_flush,"check for icache non-blocking") //cache flush register @@ -421,66 +509,74 @@ class ICache extends ICacheModule //icache flush: only flush valid Array register when(icacheFlush){ validArray := 0.U } - val refillDataVec = icacheMissQueue.io.resp.bits.data.asTypeOf(Vec(blockWords,UInt(wordBits.W))) + val refillDataVec = icacheMissQueue.io.resp.bits.data.asTypeOf(Vec(blockRows,UInt(wordBits.W))) val refillDataOut = cutHelper(refillDataVec, s3_req_pc,s3_req_mask ) - s3_ready := ((io.resp.ready && s3_hit || !s3_valid) && !blocking) || (blocking && icacheMissQueue.io.resp.valid && io.resp.ready) + val is_same_cacheline = s3_miss && s2_valid && (groupAligned(s2_req_pc) ===groupAligned(s3_req_pc)) + val useRefillReg = RegNext(is_same_cacheline && icacheMissQueue.io.resp.fire()) + val refillDataVecReg = RegEnable(next=refillDataVec, enable= (is_same_cacheline && icacheMissQueue.io.resp.fire())) + + //FIXME!! + val mmioDataVec = io.mmio_grant.bits.data.asTypeOf(Vec(mmioBeats,UInt(mmioBusWidth.W))) + val mmio_packet = cutHelperMMIO(mmioDataVec, s3_req_pc, mmioMask)._1 + val mmio_mask = cutHelperMMIO(mmioDataVec, s3_req_pc, mmioMask)._2 + + XSDebug("mmio data %x\n", mmio_packet) + + + s3_ready := ((io.resp.ready && s3_hit || !s3_valid) && !blocking) || (blocking && ((icacheMissQueue.io.resp.fire()) || io.mmio_grant.fire())) val pds = Seq.fill(nWays)(Module(new PreDecode)) for (i <- 0 until nWays) { val wayResp = Wire(new ICacheResp) - val wayData = cutHelper(VecInit(s3_data.map(b => b(i).asUInt)), s3_req_pc, s3_req_mask) - val refillData = cutHelper(refillDataVec, s3_req_pc,s3_req_mask) + val wayData = cutHelper(s3_data(i), s3_req_pc, s3_req_mask) + val refillData = Mux(useRefillReg,cutHelper(refillDataVecReg, s3_req_pc,s3_req_mask),cutHelper(refillDataVec, s3_req_pc,s3_req_mask)) wayResp.pc := s3_req_pc - wayResp.data := Mux(s3_valid && s3_hit, wayData, refillData) - wayResp.mask := s3_req_mask - wayResp.ipf := s3_tlb_resp.excp.pf.instr - wayResp.acf := s3_access_fault + wayResp.data := Mux(s3_valid && s3_hit, wayData, Mux(s3_mmio ,mmio_packet ,refillData)) + wayResp.mask := Mux(s3_mmio,mmio_mask,s3_req_mask) + wayResp.ipf := s3_exception_vec(pageFault) + wayResp.acf := s3_exception_vec(accessFault) + wayResp.mmio := s3_mmio pds(i).io.in := wayResp pds(i).io.prev <> io.prev pds(i).io.prev_pc := io.prev_pc - // if a fetch packet triggers page fault, set the pf instruction to nop - when ((!(HasCExtension.B) || io.prev.valid) && s3_tlb_resp.excp.pf.instr ) { - val instrs = Wire(Vec(FetchWidth, UInt(32.W))) - (0 until FetchWidth).foreach(i => instrs(i) := ZeroExt("b0010011".U, 32)) // nop - pds(i).io.in.data := instrs.asUInt - }.elsewhen (HasCExtension.B && io.prev.valid && (io.prev_ipf || s3_tlb_resp.excp.pf.instr)) { - pds(i).io.prev.bits := ZeroExt("b0010011".U, 16) - val instrs = Wire(Vec(FetchWidth, UInt(32.W))) - (0 until FetchWidth).foreach(i => instrs(i) := Cat(ZeroExt("b0010011".U, 16), Fill(16, 0.U(1.W)))) - pds(i).io.in.data := instrs.asUInt - } } + + + // if a fetch packet triggers page fault, at least send a valid instruction io.pd_out := Mux1H(s3_wayMask, pds.map(_.io.out)) + val s3_noHit = s3_wayMask === 0.U //TODO: coherence - XSDebug("[Stage 3] valid:%d pc: 0x%x mask: %b ipf:%d acf:%d \n",s3_valid,s3_req_pc,s3_req_mask,s3_tlb_resp.excp.pf.instr,s3_access_fault) + XSDebug("[Stage 3] valid:%d miss:%d pc: 0x%x mmio :%d mask: %b ipf:%d\n",s3_valid, s3_miss,s3_req_pc,s3_req_mask,s3_tlb_resp.excp.pf.instr, s3_mmio) XSDebug("[Stage 3] hit:%d miss:%d waymask:%x blocking:%d\n",s3_hit,s3_miss,s3_wayMask.asUInt,blocking) XSDebug("[Stage 3] tag: %x idx: %d\n",s3_tag,get_idx(s3_req_pc)) XSDebug(p"[Stage 3] tlb resp: ${s3_tlb_resp}\n") XSDebug("[mem_acquire] valid:%d ready:%d\n",io.mem_acquire.valid,io.mem_acquire.ready) XSDebug("[mem_grant] valid:%d ready:%d data:%x id:%d \n",io.mem_grant.valid,io.mem_grant.ready,io.mem_grant.bits.data,io.mem_grant.bits.id) XSDebug("[Stage 3] ---------Hit Way--------- \n") - for(i <- 0 until blockWords){ + for(i <- 0 until blockRows){ XSDebug("[Stage 3] %x\n",dataHitWay(i)) } XSDebug("[Stage 3] outPacket :%x\n",outPacket) XSDebug("[Stage 3] refillDataOut :%x\n",refillDataOut) + XSDebug("[Stage 3] refillDataOutVec :%x startPtr:%d\n",refillDataVec.asUInt, s3_req_pc(5,1).asUInt) //---------------------------- // Out Put //---------------------------- //icache request - io.req.ready := s2_ready + io.req.ready := s2_ready && metaArray.io.read.ready && dataArray.io.read.ready //icache response: to pre-decoder - io.resp.valid := s3_valid && (s3_hit || icacheMissQueue.io.resp.valid) - io.resp.bits.data := Mux((s3_valid && s3_hit),outPacket,refillDataOut) - io.resp.bits.mask := s3_req_mask + io.resp.valid := s3_valid && (s3_hit || s3_has_exception || icacheMissQueue.io.resp.valid || io.mmio_grant.valid) + io.resp.bits.data := Mux(s3_mmio,mmio_packet,Mux((s3_valid && s3_hit),outPacket,refillDataOut)) + io.resp.bits.mask := Mux(s3_mmio,mmio_mask,s3_req_mask) io.resp.bits.pc := s3_req_pc io.resp.bits.ipf := s3_tlb_resp.excp.pf.instr - io.resp.bits.acf := s3_access_fault + io.resp.bits.acf := s3_exception_vec(accessFault) + io.resp.bits.mmio := s3_mmio //to itlb io.tlb.resp.ready := true.B // DontCare @@ -499,6 +595,15 @@ class ICache extends ICacheModule io.prefetchTrainReq.bits := DontCare io.prefetchTrainReq.bits.addr := groupPC(s3_tlb_resp.paddr) + //To icache Uncache + io.mmio_acquire.valid := s3_mmio && s3_valid + io.mmio_acquire.bits.addr := mmioBusAligned(s3_tlb_resp.paddr) + io.mmio_acquire.bits.id := cacheID.U + + io.mmio_grant.ready := io.resp.ready + + io.mmio_flush := io.flush(1) + io.l1plusflush := icacheFlush XSDebug("[flush] flush_0:%d flush_1:%d\n",s2_flush,s3_flush) @@ -507,6 +612,6 @@ class ICache extends ICacheModule if (!env.FPGAPlatform ) { ExcitingUtils.addSource( s3_valid && !blocking, "perfCntIcacheReqCnt", Perf) ExcitingUtils.addSource( s3_miss && blocking && io.resp.fire(), "perfCntIcacheMissCnt", Perf) + ExcitingUtils.addSource( s3_mmio && blocking && io.resp.fire(), "perfCntIcacheMMIOCnt", Perf) } -} - +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/cache/icacheMissQueue.scala b/src/main/scala/xiangshan/cache/icacheMissQueue.scala index 7f7bcbbea64f0c80a7597d446eae99ac487dcf19..f02c051926693fdcf9db5727992b53110038c06c 100644 --- a/src/main/scala/xiangshan/cache/icacheMissQueue.scala +++ b/src/main/scala/xiangshan/cache/icacheMissQueue.scala @@ -135,7 +135,7 @@ class IcacheMissEntry extends ICacheMissQueueModule //TODO: Maybe this sate is noe necessary so we don't need respDataReg is(s_write_back){ - when(io.refill.fire() && io.meta_write.fire()){ + when((io.refill.fire() && io.meta_write.fire()) || needFlush || io.flush){ state := s_wait_resp } } @@ -150,10 +150,10 @@ class IcacheMissEntry extends ICacheMissQueueModule //refill write and meta write //WARNING: Maybe could not finish refill in 1 cycle - io.meta_write.valid := (state === s_write_back) && !needFlush + io.meta_write.valid := (state === s_write_back) && !needFlush && !io.flush io.meta_write.bits.apply(tag=req_tag, setIdx=req_idx, waymask=req_waymask) - io.refill.valid := (state === s_write_back) && !needFlush + io.refill.valid := (state === s_write_back) && !needFlush && !io.flush io.refill.bits.apply(data=respDataReg.asUInt, setIdx=req_idx, waymask=req_waymask) diff --git a/src/main/scala/xiangshan/cache/missQueue.scala b/src/main/scala/xiangshan/cache/missQueue.scala index ec285b1337b201105cb0c9af35d4789f9cc77789..23b4fe6bf267d948b6f9e1e59c295b4f20b33dc0 100644 --- a/src/main/scala/xiangshan/cache/missQueue.scala +++ b/src/main/scala/xiangshan/cache/missQueue.scala @@ -495,8 +495,8 @@ class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump if (!env.FPGAPlatform) { ExcitingUtils.addSource( BoolStopWatch( - start = entry.io.req.fire(), - stop = entry.io.resp.fire(), + start = entry.io.block_idx.valid, + stop = !entry.io.block_idx.valid, startHighPriority = true), "perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10), Perf diff --git a/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala b/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala index 489d294c43061fc3291698f48597e8e333b7c4da..7d0e83da500cd2755d230e6f96f200f2c21f05a4 100644 --- a/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala +++ b/src/main/scala/xiangshan/cache/prefetch/BestOffsetPrefetch.scala @@ -12,21 +12,24 @@ case class BOPParameters( scoreBits: Int, roundMax: Int, badScore: Int, - scores: Int = 52, + // TODO: Is 256-offset necessary, which will cross pages? offsetList: Seq[Int] = Seq( 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, - 15, 16, 18, 20, 24, 25, 27, 30, 32, 36, + 15, 16/*, 18, 20, 24, 25, 27, 30, 32, 36, 40, 45, 48, 50, 54, 60, 64, 72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135, 144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, - 250, 256 + 250, 256*/ ), - blockBytes: Int + blockBytes: Int, + nEntries: Int ) { + def scores = offsetList.length def offsetWidth = log2Up(offsetList(scores - 1)) + 1 def rrIdxBits = log2Up(rrTableEntries) def roundBits = log2Up(roundMax) def scoreMax = (1 << scoreBits) - 1 + def totalWidth = log2Up(nEntries) // id's width } class ScoreTableEntry(p: BOPParameters) extends PrefetchBundle { @@ -34,7 +37,7 @@ class ScoreTableEntry(p: BOPParameters) extends PrefetchBundle { val score = UInt(p.scoreBits.W) def apply(offset: UInt, score: UInt) = { - val entry = new ScoreTableEntry(p) + val entry = Wire(new ScoreTableEntry(p)) entry.offset := offset entry.score := score entry @@ -78,9 +81,51 @@ class TestOffsetBundle(p: BOPParameters) extends PrefetchBundle { override def cloneType: this.type = (new TestOffsetBundle(p)).asInstanceOf[this.type] } +class BestOffsetPrefetchReq(p: BOPParameters) extends PrefetchReq { + val id = UInt(p.totalWidth.W) + + override def toPrintable: Printable = { + p"addr=0x${Hexadecimal(addr)} w=${write} id=0x${Hexadecimal(id)}" + } + override def cloneType: this.type = (new BestOffsetPrefetchReq(p)).asInstanceOf[this.type] +} + +class BestOffsetPrefetchResp(p: BOPParameters) extends PrefetchResp { + val id = UInt(p.totalWidth.W) + + override def toPrintable: Printable = { + p"id=0x${Hexadecimal(id)}" + } + override def cloneType: this.type = (new BestOffsetPrefetchResp(p)).asInstanceOf[this.type] +} + +class BestOffsetPrefetchFinish(p: BOPParameters) extends PrefetchFinish { + val id = UInt(p.totalWidth.W) + + override def toPrintable: Printable = { + p"id=0x${Hexadecimal(id)}" + } + override def cloneType: this.type = (new BestOffsetPrefetchFinish(p)).asInstanceOf[this.type] +} + +class BestOffsetPrefetchIO(p: BOPParameters) extends PrefetchBundle { + val train = Flipped(ValidIO(new PrefetchTrain)) + val req = DecoupledIO(new BestOffsetPrefetchReq(p)) + val resp = Flipped(DecoupledIO(new BestOffsetPrefetchResp(p))) + val finish = DecoupledIO(new BestOffsetPrefetchFinish(p)) + + override def toPrintable: Printable = { + p"train: v=${train.valid} ${train.bits} " + + p"req: v=${req.valid} r=${req.ready} ${req.bits} " + + p"resp: v=${resp.valid} r=${resp.ready} ${resp.bits} " + + p"finish: v=${finish.valid} r=${finish.ready} ${finish.bits}" + } + override def cloneType: this.type = (new BestOffsetPrefetchIO(p)).asInstanceOf[this.type] +} + class RecentRequestTable(p: BOPParameters) extends PrefetchModule { val io = IO(new Bundle { - val w = Flipped(ValidIO(UInt(PAddrBits.W))) + val w = Flipped(DecoupledIO(UInt(PAddrBits.W))) val r = Flipped(new TestOffsetBundle(p)) }) def rrIdxBits = p.rrIdxBits @@ -108,10 +153,10 @@ class RecentRequestTable(p: BOPParameters) extends PrefetchModule { } } - val rrTable = Module(new SRAMTemplate(rrTableEntry(), set = rrTableEntries, way = 1, shouldReset = true)) + val rrTable = Module(new SRAMTemplate(rrTableEntry(), set = rrTableEntries, way = 1, shouldReset = true, singlePort = true)) val wAddr = io.w.bits - rrTable.io.w.req.valid := io.w.valid + rrTable.io.w.req.valid := io.w.valid && !io.r.req.valid rrTable.io.w.req.bits.setIdx := idx(wAddr) rrTable.io.w.req.bits.data.valid := true.B rrTable.io.w.req.bits.data.tag := tag(wAddr) @@ -122,32 +167,35 @@ class RecentRequestTable(p: BOPParameters) extends PrefetchModule { rrTable.io.r.req.bits.setIdx := idx(rAddr) rData := rrTable.io.r.resp.data(0) - val rwConflict = io.w.valid && io.r.req.fire() && idx(wAddr) === idx(rAddr) - when (rwConflict) { - rrTable.io.r.req.valid := false.B - } - when (RegNext(rwConflict)) { - rData.valid := true.B - rData.tag := RegNext(tag(wAddr)) - } + val rwConflict = io.w.fire() && io.r.req.fire() && idx(wAddr) === idx(rAddr) + // when (rwConflict) { + // rrTable.io.r.req.valid := false.B + // } + // when (RegNext(rwConflict)) { + // rData.valid := true.B + // rData.tag := RegNext(tag(wAddr)) + // } + io.w.ready := rrTable.io.w.req.ready && !io.r.req.valid io.r.req.ready := true.B - io.r.resp.valid := RegNext(io.r.req.fire()) + io.r.resp.valid := RegNext(rrTable.io.r.req.fire()) io.r.resp.bits.testOffset := RegNext(io.r.req.bits.testOffset) io.r.resp.bits.ptr := RegNext(io.r.req.bits.ptr) io.r.resp.bits.hit := rData.valid && rData.tag === RegNext(tag(rAddr)) + assert(!RegNext(rwConflict), "single port SRAM should not read and write at the same time") + // debug info - XSDebug(io.w.valid, p"io.write: v=${io.w.valid} addr=0x${Hexadecimal(io.w.bits)}\n") + XSDebug(io.w.fire(), p"io.write: v=${io.w.valid} addr=0x${Hexadecimal(io.w.bits)}\n") XSDebug(p"io.read: ${io.r}\n") - XSDebug(io.w.valid, p"wAddr=0x${Hexadecimal(wAddr)} idx=${Hexadecimal(idx(wAddr))} tag=${Hexadecimal(tag(wAddr))}\n") + XSDebug(io.w.fire(), p"wAddr=0x${Hexadecimal(wAddr)} idx=${Hexadecimal(idx(wAddr))} tag=${Hexadecimal(tag(wAddr))}\n") XSDebug(io.r.req.fire(), p"rAddr=0x${Hexadecimal(rAddr)} idx=${Hexadecimal(idx(rAddr))} rData=${rData}\n") - XSDebug(rwConflict, p"write and read conflict!\n") } class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { val io = IO(new Bundle { + val req = Flipped(DecoupledIO(UInt(PAddrBits.W))) // req addr from L1 val prefetchOffset = Output(UInt(p.offsetWidth.W)) val test = new TestOffsetBundle(p) }) @@ -158,33 +206,34 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { def roundBits = p.roundBits def roundMax = p.roundMax def scoreMax = p.scoreMax + def badScore = p.badScore - val prefetchOffset = RegInit(1.U(offsetWidth)) // best offset is 1, this is, a next-line prefetcher as initialization + val prefetchOffset = RegInit(2.U(offsetWidth.W)) // best offset is 1, that is, a next-line prefetcher as initialization val st = RegInit(VecInit(offsetList.map(off => new ScoreTableEntry(p).apply(off.U, 0.U)))) val ptr = RegInit(0.U(log2Up(scores).W)) val round = RegInit(0.U(roundBits.W)) - val bestOffset = RegInit(new ScoreTableEntry(p).apply(1.U, 0.U)) // the entry with the highest score while traversing - val testOffset = WireInit(0.U(offsetWidth.W)) + val bestOffset = RegInit(new ScoreTableEntry(p).apply(2.U, 0.U)) // the entry with the highest score while traversing + val testOffset = WireInit(st(ptr).offset) def winner(e1: ScoreTableEntry, e2: ScoreTableEntry): ScoreTableEntry = { - val w = new ScoreTableEntry(p) + val w = Wire(new ScoreTableEntry(p)) w := Mux(e1.score > e2.score, e1, e2) w } - val s_idle :: s_learn :: s_finish :: Nil = Enum(3) + val s_idle :: s_learn :: Nil = Enum(2) val state = RegInit(s_idle) // 1. At the start of a learning phase // All the scores are reset to 0. + // At the end of every learning phase, the prefetch offset is updated as the one with the highest score. when (state === s_idle) { - when (ptr =/= scores.U) { - st(ptr).score := 0.U - ptr := ptr + 1.U - }.otherwise { - ptr := 0.U - state := s_learn - } + st.foreach(_.score := 0.U) + ptr := 0.U + round := 0.U + bestOffset.score := badScore.U + prefetchOffset := bestOffset.offset + state := s_learn } // 2. During a learning phase @@ -196,16 +245,18 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { // (1) one of the score equals SCOREMAX, or // (2) the number of rounds equals ROUNDMAX. when (state === s_learn) { - testOffset := st(ptr).offset when (io.test.req.fire()) { val roundFinish = ptr === (scores - 1).U ptr := Mux(roundFinish, 0.U, ptr + 1.U) round := Mux(roundFinish, round + 1.U, round) + + XSDebug(p"test offset ${testOffset} req fire\n") } // (2) the number of rounds equals ROUNDMAX. - when (round === roundMax.U) { - state := s_finish + when (round >= roundMax.U) { + state := s_idle + XSDebug(p"round reaches roundMax(${roundMax.U})\n") } when (io.test.resp.fire() && io.test.resp.bits.hit) { @@ -216,25 +267,148 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule { st(io.test.resp.bits.ptr).score := newScore bestOffset := winner(new ScoreTableEntry(p).apply(offset, newScore), bestOffset) // (1) one of the score equals SCOREMAX - when (newScore === scoreMax.U) { - state := s_finish + when (newScore >= scoreMax.U) { + state := s_idle + XSDebug(p"newScore reaches scoreMax(${scoreMax.U})\n") } - } - } - // 3. At the end of every learning phase, the prefetch offset is updated as the one with the highest score. - when (state === s_finish) { - prefetchOffset := bestOffset.offset - ptr := 0.U - round := 0.U - bestOffset.offset := 1.U - bestOffset.score := 0.U - state := s_idle + XSDebug(p"test offset ${offset} resp fire and hit. score ${oldScore} -> ${newScore}\n") + } } + io.req.ready := true.B io.prefetchOffset := prefetchOffset - io.test.req.valid := state === s_learn && round =/= roundMax.U - io.test.req.bits.addr := DontCare // assign this outside the score table + io.test.req.valid := state === s_learn && io.req.fire() + io.test.req.bits.addr := io.req.bits io.test.req.bits.testOffset := testOffset io.test.req.bits.ptr := ptr + io.test.resp.ready := true.B + + XSDebug(p"state=${state} prefetchOffset=${prefetchOffset} ptr=${ptr} round=${round} bestOffset=${bestOffset} testOffset=${testOffset}\n") + // score table + XSDebug(p"OffsetScoreTable(idx:offset:score) as follows:\n") + for (i <- 0 until scores) { + if (i % 8 == 0) { XSDebug(p"${i.U}:${st(i)}\t") } + else if (i % 8 == 7 || i == scores - 1) { XSDebug(false, true.B, p"${i.U}:${st(i)}\n") } + else { XSDebug(false, true.B, p"${i.U}:${st(i)}\t") } + } + XSDebug(io.req.fire(), p"receive req from L1. io.req.bits=0x${Hexadecimal(io.req.bits)}\n") +} + +class BestOffsetPrefetchEntry(p: BOPParameters) extends PrefetchModule { + val io = IO(new Bundle { + val id = Input(UInt(p.totalWidth.W)) + val prefetchOffset = Input(UInt(p.offsetWidth.W)) + val pft = new BestOffsetPrefetchIO(p) + val inflight = ValidIO(UInt(PAddrBits.W)) + val writeRRTable = DecoupledIO(UInt(PAddrBits.W)) + }) + + def blockBytes = p.blockBytes + def getBlockAddr(addr: UInt) = Cat(addr(PAddrBits - 1, log2Up(blockBytes)), 0.U(log2Up(blockBytes).W)) + + val s_idle :: s_req :: s_resp :: s_write_recent_req :: s_finish :: Nil = Enum(5) + val state = RegInit(s_idle) + val req = RegInit(0.U.asTypeOf(new PrefetchReq)) + val baseAddr = RegInit(0.U(PAddrBits.W)) + + when (state === s_idle) { + when (io.pft.train.valid) { + state := s_req + req.addr := getBlockAddr(io.pft.train.bits.addr) + (io.prefetchOffset << log2Up(blockBytes)) + req.write := io.pft.train.bits.write + baseAddr := getBlockAddr(io.pft.train.bits.addr) + } + } + + when (state === s_req) { + when (io.pft.req.fire()) { + state := s_resp + } + } + + when (state === s_resp) { + when (io.pft.resp.fire()) { + state := s_write_recent_req + } + } + + when (state === s_write_recent_req) { + when (io.writeRRTable.fire()) { + state := s_finish + } + } + + when (state === s_finish) { + when (io.pft.finish.fire()) { + state := s_idle + } + } + + io.pft.req.valid := state === s_req + io.pft.req.bits.addr := req.addr + io.pft.req.bits.write := req.write + io.pft.req.bits.id := io.id + io.pft.resp.ready := state === s_resp + io.pft.finish.valid := state === s_finish + io.pft.finish.bits.id := io.id + io.inflight.valid := state =/= s_idle + io.inflight.bits := req.addr + io.writeRRTable.valid := state === s_write_recent_req + io.writeRRTable.bits := baseAddr // write this into recent request table + + XSDebug(p"bopEntry ${io.id}: state=${state} prefetchOffset=${io.prefetchOffset} inflight=${io.inflight.valid} 0x${Hexadecimal(io.inflight.bits)} writeRRTable: ${io.writeRRTable.valid} 0x${Hexadecimal(io.writeRRTable.bits)} baseAddr=0x${Hexadecimal(baseAddr)} req: ${req}\n") + XSDebug(p"bopEntry ${io.id}: io.pft: ${io.pft}\n") +} + +class BestOffsetPrefetch(p: BOPParameters) extends PrefetchModule { + val io = IO(new BestOffsetPrefetchIO(p)) + + def nEntries = p.nEntries + def blockBytes = p.blockBytes + def getBlockAddr(addr: UInt) = Cat(addr(PAddrBits - 1, log2Up(blockBytes)), 0.U(log2Up(blockBytes).W)) + val scoreTable = Module(new OffsetScoreTable(p)) + val rrTable = Module(new RecentRequestTable(p)) + val reqArb = Module(new Arbiter(new BestOffsetPrefetchReq(p), nEntries)) + val finishArb = Module(new Arbiter(new BestOffsetPrefetchFinish(p), nEntries)) + val writeRRTableArb = Module(new Arbiter(UInt(PAddrBits.W), nEntries)) + + val entryReadyIdx = Wire(UInt(log2Up(nEntries).W)) + val inflightMatchVec = Wire(Vec(nEntries, Bool())) + + val bopEntries = (0 until nEntries).map { i => + val bopEntry = Module(new BestOffsetPrefetchEntry(p)) + + bopEntry.io.id := i.U + bopEntry.io.prefetchOffset := scoreTable.io.prefetchOffset + + bopEntry.io.pft.train.valid := io.train.valid && i.U === entryReadyIdx && !inflightMatchVec.asUInt.orR + bopEntry.io.pft.train.bits := io.train.bits + + reqArb.io.in(i) <> bopEntry.io.pft.req + bopEntry.io.pft.resp.valid := io.resp.valid && i.U === io.resp.bits.id + bopEntry.io.pft.resp.bits := io.resp.bits + finishArb.io.in(i) <> bopEntry.io.pft.finish + + writeRRTableArb.io.in(i) <> bopEntry.io.writeRRTable + + bopEntry + } + + entryReadyIdx := PriorityEncoder(bopEntries.map { e => !e.io.inflight.valid }) + (0 until nEntries).foreach(i => + inflightMatchVec(i) := bopEntries(i).io.inflight.valid && bopEntries(i).io.inflight.bits === getBlockAddr(io.train.bits.addr) + ) + + io.req <> reqArb.io.out + io.resp.ready := VecInit(bopEntries.zipWithIndex.map { case (e, i) => i.U === io.resp.bits.id && e.io.pft.resp.ready }).asUInt.orR + io.finish <> finishArb.io.out + rrTable.io.w <> writeRRTableArb.io.out + rrTable.io.r <> scoreTable.io.test + scoreTable.io.req.valid := io.train.valid + scoreTable.io.req.bits := getBlockAddr(io.train.bits.addr) + + XSDebug(p"io: ${io}\n") + XSDebug(p"entryReadyIdx=${entryReadyIdx} inflightMatchVec=${Binary(inflightMatchVec.asUInt)}\n") + } diff --git a/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala b/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala index 7d11547da4f44d2caf825c4672b265eaa027050f..c76b7412b9ca689fc2700208059883c808451a28 100644 --- a/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala +++ b/src/main/scala/xiangshan/cache/prefetch/L2Prefetcher.scala @@ -15,13 +15,30 @@ import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters, TLEdgeOut, TLBundleA, TLBundleD, ClientStates, ClientMetadata, TLHints } +import sifive.blocks.inclusivecache.PrefetcherIO case class L2PrefetcherParameters( enable: Boolean, _type: String, - streamParams: StreamPrefetchParameters + streamParams: StreamPrefetchParameters, + bopParams: BOPParameters ) { - def nEntries: Int = streamParams.streamCnt * streamParams.streamSize + // def nEntries: Int = streamParams.streamCnt * streamParams.streamSize + def nEntries: Int = { + if (enable && _type == "stream") { streamParams.streamCnt * streamParams.streamSize } + else if (enable && _type == "bop") { bopParams.nEntries } + else 1 + } + def totalWidth: Int = { + if (enable && _type == "stream") streamParams.totalWidth + else if (enable && _type == "bop") bopParams.totalWidth + else 1 + } + def blockBytes: Int = { + if (enable && _type == "stream") streamParams.blockBytes + else if (enable && _type == "bop") bopParams.blockBytes + else 64 + } } class L2Prefetcher()(implicit p: Parameters) extends LazyModule with HasPrefetchParameters { @@ -37,18 +54,41 @@ class L2Prefetcher()(implicit p: Parameters) extends LazyModule with HasPrefetch lazy val module = new L2PrefetcherImp(this) } +class L2PrefetcherIO extends XSBundle with HasPrefetchParameters { + val in = Flipped(DecoupledIO(new MissReq)) +} + // prefetch DCache lines in L2 using StreamPrefetch class L2PrefetcherImp(outer: L2Prefetcher) extends LazyModuleImp(outer) with HasPrefetchParameters with HasXSLog { - val io = IO(new Bundle { - val in = Flipped(DecoupledIO(new MissReq)) - // prefetch - // val mem_acquire = Decoupled(new TLBundleA(edge.bundle)) - // val mem_grant = Flipped(Decoupled(new TLBundleD(edge.bundle))) - // val mem_finish = Decoupled(new TLBundleE(edge.bundle)) - }) + val io = IO(new L2PrefetcherIO) val (bus, edge) = outer.clientNode.out.head - if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "stream") { + if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "bop") { + val bopParams = l2PrefetcherParameters.bopParams + val dPrefetch = Module(new BestOffsetPrefetch(bopParams)) + dPrefetch.io.train.valid := io.in.fire() + dPrefetch.io.train.bits.addr := io.in.bits.addr + dPrefetch.io.train.bits.write := MemoryOpConstants.isWrite(io.in.bits.cmd) + dPrefetch.io.train.bits.miss := true.B + io.in.ready := true.B + + bus.a.valid := dPrefetch.io.req.valid + bus.a.bits := DontCare + bus.a.bits := edge.Hint( + fromSource = dPrefetch.io.req.bits.id, + toAddress = dPrefetch.io.req.bits.addr, + lgSize = log2Up(bopParams.blockBytes).U, + param = Mux(dPrefetch.io.req.bits.write, TLHints.PREFETCH_WRITE, TLHints.PREFETCH_READ) + )._2 + dPrefetch.io.req.ready := bus.a.ready + + dPrefetch.io.resp.valid := bus.d.valid + dPrefetch.io.resp.bits.id := bus.d.bits.source(bopParams.totalWidth - 1, 0) + bus.d.ready := dPrefetch.io.resp.ready + + dPrefetch.io.finish.ready := true.B + + } else if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "stream") { val streamParams = l2PrefetcherParameters.streamParams val dPrefetch = Module(new StreamPrefetch(streamParams)) dPrefetch.io.train.valid := io.in.fire() @@ -62,49 +102,44 @@ class L2PrefetcherImp(outer: L2Prefetcher) extends LazyModuleImp(outer) with Has bus.a.bits := edge.Hint( fromSource = dPrefetch.io.req.bits.id, toAddress = dPrefetch.io.req.bits.addr, - lgSize = log2Up(streamParams.blockBytes).U, + lgSize = log2Up(l2PrefetcherParameters.blockBytes).U, param = Mux(dPrefetch.io.req.bits.write, TLHints.PREFETCH_WRITE, TLHints.PREFETCH_READ) // TODO )._2 dPrefetch.io.req.ready := bus.a.ready - bus.b.ready := true.B - - bus.c.valid := false.B - bus.c.bits := DontCare - dPrefetch.io.resp.valid := bus.d.valid - dPrefetch.io.resp.bits.id := bus.d.bits.source(streamParams.totalWidth - 1, 0) + dPrefetch.io.resp.bits.id := bus.d.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) bus.d.ready := dPrefetch.io.resp.ready - bus.e.valid := false.B - bus.e.bits := DontCare dPrefetch.io.finish.ready := true.B - if (!env.FPGAPlatform) { - ExcitingUtils.addSource(bus.a.fire(), "perfCntL2PrefetchReqCnt", Perf) - def idWidth = log2Up(l2PrefetcherParameters.nEntries) - (0 until l2PrefetcherParameters.nEntries).foreach(i => - ExcitingUtils.addSource( - BoolStopWatch( - start = bus.a.fire() && dPrefetch.io.req.bits.id(streamParams.totalWidth - 1, 0) === i.U, - stop = bus.d.fire() && bus.d.bits.source(streamParams.totalWidth - 1, 0) === i.U, - startHighPriority = true - ), - "perfCntL2PrefetchPenaltyEntry" + Integer.toString(i, 10), - Perf - ) - ) - } - } else { bus.a.valid := false.B bus.a.bits := DontCare - bus.b.ready := true.B - bus.c.valid := false.B - bus.c.bits := DontCare bus.d.ready := true.B - bus.e.valid := false.B - bus.e.bits := DontCare + } + + bus.b.ready := true.B + + bus.c.valid := false.B + bus.c.bits := DontCare + + bus.e.valid := false.B + bus.e.bits := DontCare + + if (!env.FPGAPlatform) { + ExcitingUtils.addSource(bus.a.fire(), "perfCntL2PrefetchReqCnt", Perf) + (0 until l2PrefetcherParameters.nEntries).foreach(i => + ExcitingUtils.addSource( + BoolStopWatch( + start = bus.a.fire() && bus.a.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) === i.U, + stop = bus.d.fire() && bus.d.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) === i.U, + startHighPriority = true + ), + "perfCntL2PrefetchPenaltyEntry" + Integer.toString(i, 10), + Perf + ) + ) } } diff --git a/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala b/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala index 950f5676d15fef85314fd17a0a44183853f3d3dc..c64fda77ab63fb0adb5a489dfe313c7eb2213c1c 100644 --- a/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala +++ b/src/main/scala/xiangshan/cache/prefetch/Prefetcher.scala @@ -40,11 +40,11 @@ class PrefetchTrain extends PrefetchBundle { } } -class PrefetchIO extends PrefetchBundle { - val train = Flipped(ValidIO(new PrefetchTrain)) - val req = DecoupledIO(new PrefetchReq) - val resp = Flipped(DecoupledIO(new PrefetchResp)) -} +// class PrefetchIO extends PrefetchBundle { +// val train = Flipped(ValidIO(new PrefetchTrain)) +// val req = DecoupledIO(new PrefetchReq) +// val resp = Flipped(DecoupledIO(new PrefetchResp)) +// } // class FakePrefetcher extends PrefetchModule { // val io = IO(new PrefetchIO) diff --git a/src/main/scala/xiangshan/cache/prefetch/StreamPrefetch.scala b/src/main/scala/xiangshan/cache/prefetch/StreamPrefetch.scala index 0d22af8ae27f05a7d91f8376d74741a517eba9f6..5daceb104836fe8b464a0977f385314ec63b2469 100644 --- a/src/main/scala/xiangshan/cache/prefetch/StreamPrefetch.scala +++ b/src/main/scala/xiangshan/cache/prefetch/StreamPrefetch.scala @@ -11,7 +11,8 @@ case class StreamPrefetchParameters( streamSize: Int, ageWidth: Int, blockBytes: Int, - reallocStreamOnMissInstantly: Boolean + reallocStreamOnMissInstantly: Boolean, + cacheName: String // distinguish between different prefetchers ) { def streamWidth = log2Up(streamCnt) def idxWidth = log2Up(streamSize) @@ -107,7 +108,7 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { val buf = RegInit(VecInit(Seq.fill(streamSize)(0.U.asTypeOf(new PrefetchReq)))) val valid = RegInit(VecInit(Seq.fill(streamSize)(false.B))) val head = RegInit(0.U(log2Up(streamSize).W)) - val tail = RegInit(0.U(log2Up(streamCnt).W)) + val tail = RegInit(0.U(log2Up(streamSize).W)) val s_idle :: s_req :: s_resp :: s_finish :: Nil = Enum(4) val state = RegInit(VecInit(Seq.fill(streamSize)(s_idle))) @@ -122,7 +123,7 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { // dequeue val hitIdx = io.update.bits.hitIdx - when (io.update.valid && !empty && valid(hitIdx)) { + when (io.update.valid && !empty && (isPrefetching(hitIdx) || valid(hitIdx))) { val headBeforehitIdx = head <= hitIdx && (hitIdx < tail || tail <= head) val hitIdxBeforeHead = hitIdx < tail && tail <= head when (headBeforehitIdx) { @@ -132,6 +133,8 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { when (hitIdxBeforeHead) { (0 until streamSize).foreach(i => deqLater(i) := Mux(i.U >= head || i.U <= hitIdx, true.B, deqLater(i))) } + + XSDebug(io.update.valid && !empty && (isPrefetching(hitIdx) || valid(hitIdx)), p"hitIdx=${hitIdx} headBeforehitIdx=${headBeforehitIdx} hitIdxBeforeHead=${hitIdxBeforeHead}\n") } val deqValid = WireInit(VecInit(Seq.fill(streamSize)(false.B))) @@ -143,8 +146,15 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { deqValid(idx) := deq } - (0 until streamSize).foreach(i => valid(i) := valid(i) && !deqValid(i)) - (0 until streamSize).foreach(i => deqLater(i) := deqLater(i) && !deqValid(i)) + // (0 until streamSize).foreach(i => valid(i) := valid(i) && !deqValid(i)) + // (0 until streamSize).foreach(i => deqLater(i) := deqLater(i) && !deqValid(i)) + for (i <- 0 until streamSize) { + when (deqValid(i)) { + valid(i) := false.B + deqLater(i) := false.B + } + } + val nextHead = head + PopCount(deqValid) when (deqValid.asUInt.orR) { head := nextHead @@ -198,13 +208,17 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { val finishArb = Module(new Arbiter(new StreamPrefetchFinish(p), streamSize)) for (i <- 0 until streamSize) { prefetchPrior(i) := head + i.U - reqs(i).ready := false.B reqArb.io.in(i) <> reqs(prefetchPrior(i)) - finishs(i).ready := false.B + reqs(i).ready := DontCare finishArb.io.in(i) <> finishs(prefetchPrior(i)) + finishs(i).ready := DontCare resps(i).bits := io.resp.bits resps(i).valid := io.resp.valid && io.resp.bits.idx === i.U } + for (i <- 0 until streamSize) { + reqs(prefetchPrior(i)).ready := reqArb.io.in(i).ready + finishs(prefetchPrior(i)).ready := finishArb.io.in(i).ready + } io.req <> reqArb.io.out io.finish <> finishArb.io.out io.resp.ready := VecInit(resps.zipWithIndex.map{ case (r, i) => @@ -225,6 +239,7 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { needRealloc := false.B state.foreach(_ := s_idle) valid.foreach(_ := false.B) + deqLater.foreach(_ := false.B) } for (i <- 0 until streamSize) { @@ -233,20 +248,20 @@ class StreamBuffer(p: StreamPrefetchParameters) extends PrefetchModule { } // debug info - XSDebug(p"StreamBuf ${io.streamBufId} io.req: v=${io.req.valid} r=${io.req.ready} ${io.req.bits}\n") - XSDebug(p"StreamBuf ${io.streamBufId} io.resp: v=${io.resp.valid} r=${io.resp.ready} ${io.resp.bits}\n") - XSDebug(p"StreamBuf ${io.streamBufId} io.finish: v=${io.finish.valid} r=${io.finish.ready} ${io.finish.bits}") - XSDebug(p"StreamBuf ${io.streamBufId} io.update: v=${io.update.valid} ${io.update.bits}\n") - XSDebug(p"StreamBuf ${io.streamBufId} io.alloc: v=${io.alloc.valid} ${io.alloc.bits}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} io.req: v=${io.req.valid} r=${io.req.ready} ${io.req.bits}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} io.resp: v=${io.resp.valid} r=${io.resp.ready} ${io.resp.bits}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} io.finish: v=${io.finish.valid} r=${io.finish.ready} ${io.finish.bits}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} io.update: v=${io.update.valid} ${io.update.bits}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} io.alloc: v=${io.alloc.valid} ${io.alloc.bits}\n") for (i <- 0 until streamSize) { - XSDebug(p"StreamBuf ${io.streamBufId} [${i.U}] io.addrs: ${io.addrs(i).valid} 0x${Hexadecimal(io.addrs(i).bits)} " + + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} [${i.U}] io.addrs: ${io.addrs(i).valid} 0x${Hexadecimal(io.addrs(i).bits)} " + p"buf: ${buf(i)} valid: ${valid(i)} state: ${state(i)} isPfting: ${isPrefetching(i)} " + p"deqLater: ${deqLater(i)} deqValid: ${deqValid(i)}\n") } - XSDebug(p"StreamBuf ${io.streamBufId} head: ${head} tail: ${tail} full: ${full} empty: ${empty} nextHead: ${nextHead}\n") - XSDebug(p"StreamBuf ${io.streamBufId} baseReq: v=${baseReq.valid} ${baseReq.bits} nextReq: ${nextReq}\n") - XSDebug(needRealloc, p"StreamBuf ${io.streamBufId} needRealloc: ${needRealloc} reallocReq: ${reallocReq}\n") - XSDebug(p"StreamBuf ${io.streamBufId} prefetchPrior: ") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} head: ${head} tail: ${tail} full: ${full} empty: ${empty} nextHead: ${nextHead} blockBytes: ${blockBytes.U}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} baseReq: v=${baseReq.valid} ${baseReq.bits} nextReq: ${nextReq}\n") + XSDebug(needRealloc, s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} needRealloc: ${needRealloc} reallocReq: ${reallocReq}\n") + XSDebug(s"${p.cacheName} " + p"StreamBuf ${io.streamBufId} prefetchPrior: ") (0 until streamSize).foreach(i => XSDebug(false, true.B, p"${prefetchPrior(i)} ")) XSDebug(false, true.B, "\n") } @@ -266,6 +281,8 @@ object ParallelMin { class StreamPrefetch(p: StreamPrefetchParameters) extends PrefetchModule { val io = IO(new StreamPrefetchIO(p)) + + require(p.blockBytes > 0) // TODO: implement this def streamCnt = p.streamCnt @@ -352,8 +369,8 @@ class StreamPrefetch(p: StreamPrefetchParameters) extends PrefetchModule { i.U === io.resp.bits.stream && buf.io.resp.ready}).asUInt.orR // debug info - XSDebug(p"io: ${io}\n") - XSDebug(p"bufValids: ${Binary(bufValids.asUInt)} hit: ${hit} ages: ") + XSDebug(s"${p.cacheName} " + p"io: ${io}\n") + XSDebug(s"${p.cacheName} " + p"bufValids: ${Binary(bufValids.asUInt)} hit: ${hit} ages: ") (0 until streamCnt).foreach(i => XSDebug(false, true.B, p"${Hexadecimal(ages(i))} ")) XSDebug(false, true.B, "\n") } diff --git a/src/main/scala/xiangshan/cache/ptw.scala b/src/main/scala/xiangshan/cache/ptw.scala index 4ad2628bad526596a8562aa62df1cbc37059d14a..e553bc6d1927b341f1583fc7c480d27f5d274bf5 100644 --- a/src/main/scala/xiangshan/cache/ptw.scala +++ b/src/main/scala/xiangshan/cache/ptw.scala @@ -156,6 +156,94 @@ class PtwEntries(num: Int, tagLen: Int) extends PtwBundle { } } +class L2TlbEntry extends TlbBundle { + val tag = UInt(vpnLen.W) // tag is vpn + val level = UInt(log2Up(Level).W) // 2 for 4KB, 1 for 2MB, 0 for 1GB + val ppn = UInt(ppnLen.W) + val perm = new PtePermBundle + + def hit(vpn: UInt):Bool = { + val fullMask = VecInit((Seq.fill(vpnLen)(true.B))).asUInt + val maskLevel = VecInit((Level-1 to 0 by -1).map{i => // NOTE: level 2 for 4KB, 1 for 2MB, 0 for 1GB + Reverse(VecInit(Seq.fill(vpnLen-i*vpnnLen)(true.B) ++ Seq.fill(i*vpnnLen)(false.B)).asUInt)}) + val mask = maskLevel(level) + (mask&this.tag) === (mask&vpn) + } + + def apply(pte: UInt, level: UInt, vpn: UInt) = { + this.tag := vpn + this.level := level + this.ppn := pte.asTypeOf(pteBundle).ppn + this.perm := pte.asTypeOf(pteBundle).perm + this + } + + override def toPrintable: Printable = { + p"vpn:0x${Hexadecimal(tag)} level:${level} ppn:${Hexadecimal(ppn)} perm:${perm}" + } +} + +class L2TlbEntires(num: Int, tagLen: Int) extends TlbBundle { + require(log2Up(num)==log2Down(num)) + /* vpn can be divide into three part */ + // vpn: tagPart(17bit) + addrPart(8bit) + cutLenPart(2bit) + val cutLen = log2Up(num) + + val tag = UInt(tagLen.W) // NOTE: high part of vpn + val ppns = Vec(num, UInt(ppnLen.W)) + val perms = Vec(num, new PtePermBundle) + val vs = Vec(num, Bool()) + + def tagClip(vpn: UInt) = { // full vpn => tagLen + vpn(vpn.getWidth-1, vpn.getWidth-tagLen) + } + + // NOTE: get insize idx + def idxClip(vpn: UInt) = { + vpn(cutLen-1, 0) + } + + def hit(vpn: UInt) = { + (tag === tagClip(vpn)) && vs(idxClip(vpn)) + } + + def genEntries(data: UInt, level: UInt, vpn: UInt): L2TlbEntires = { + require((data.getWidth / XLEN) == num, + "input data length must be multiple of pte length") + assert(level===2.U, "tlb entries only support 4K pages") + + val ts = Wire(new L2TlbEntires(num, tagLen)) + ts.tag := tagClip(vpn) + for (i <- 0 until num) { + val pte = data((i+1)*XLEN-1, i*XLEN).asTypeOf(new PteBundle) + ts.ppns(i) := pte.ppn + ts.perms(i):= pte.perm // this.perms has no v + ts.vs(i) := !pte.isPf(level) && pte.isLeaf() // legal and leaf, store to l2Tlb + } + + ts + } + + def get(vpn: UInt): L2TlbEntry = { + val t = Wire(new L2TlbEntry) + val idx = idxClip(vpn) + t.tag := vpn // Note: Use input vpn, not vpn in TlbL2 + t.level := 2.U // L2TlbEntries only support 4k page + t.ppn := ppns(idx) + t.perm := perms(idx) + t + } + + override def cloneType: this.type = (new L2TlbEntires(num, tagLen)).asInstanceOf[this.type] + override def toPrintable: Printable = { + require(num == 4, "if num is not 4, please comment this toPrintable") + // NOTE: if num is not 4, please comment this toPrintable + p"tag:${Hexadecimal(tag)} ppn(0):${Hexadecimal(ppns(0))} ppn(1):${Hexadecimal(ppns(1))}" + + p"ppn(2):${Hexadecimal(ppns(2))} ppn(3):${Hexadecimal(ppns(3))} " + + p"perms(0):${perms(0)} perms(1):${perms(1)} perms(2):${perms(2)} perms(3):${perms(3)} vs:${Binary(vs.asUInt)}" + } +} + class PtwReq extends PtwBundle { val vpn = UInt(vpnLen.W) @@ -165,8 +253,8 @@ class PtwReq extends PtwBundle { } class PtwResp extends PtwBundle { - val entry = new TlbEntry - val pf = Bool() // simple pf no matter cmd + val entry = new L2TlbEntry + val pf = Bool() override def toPrintable: Printable = { p"entry:${entry} pf:${pf}" @@ -235,13 +323,26 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ // two level: l2-tlb-cache && pde/pte-cache // l2-tlb-cache is ram-larger-edition tlb // pde/pte-cache is cache of page-table, speeding up ptw - val tlbl2 = Module(new SRAMTemplate(new TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen), set = TlbL2LineNum)) // (total 256, one line is 4 => 64 lines) + val tlbl2 = Module(new SRAMTemplate( + new L2TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen), + set = TlbL2LineNum, + singlePort = true + )) // (total 256, one line is 4 => 64 lines) val tlbv = RegInit(0.U(TlbL2LineNum.W)) // valid val tlbg = Reg(UInt(TlbL2LineNum.W)) // global + + val sp = Reg(Vec(TlbL2SPEntrySize, new L2TlbEntry)) // (total 16, one is 4M or 1G) + val spv = RegInit(0.U(TlbL2SPEntrySize.W)) + val spg = Reg(UInt(TlbL2SPEntrySize.W)) + val ptwl1 = Reg(Vec(PtwL1EntrySize, new PtwEntry(tagLen = PtwL1TagLen))) val l1v = RegInit(0.U(PtwL1EntrySize.W)) // valid val l1g = Reg(UInt(PtwL1EntrySize.W)) - val ptwl2 = Module(new SRAMTemplate(new PtwEntries(num = PtwL2LineSize, tagLen = PtwL2TagLen), set = PtwL2LineNum)) // (total 256, one line is 4 => 64 lines) + val ptwl2 = Module(new SRAMTemplate( + new PtwEntries(num = PtwL2LineSize, tagLen = PtwL2TagLen), + set = PtwL2LineNum, + singlePort = true + )) // (total 256, one line is 4 => 64 lines) val l2v = RegInit(0.U(PtwL2LineNum.W)) // valid val l2g = Reg(UInt(PtwL2LineNum.W)) // global @@ -268,7 +369,6 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ * tlbl2 */ val (tlbHit, tlbHitData) = { - assert(tlbl2.io.r.req.ready) val ridx = genTlbL2Idx(req.vpn) val vidx = RegEnable(tlbv(ridx), validOneCycle) @@ -276,10 +376,22 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ tlbl2.io.r.req.bits.apply(setIdx = ridx) val ramData = tlbl2.io.r.resp.data(0) + assert(tlbl2.io.r.req.ready || !tlbl2.io.r.req.valid) XSDebug(tlbl2.io.r.req.valid, p"tlbl2 Read rIdx:${Hexadecimal(ridx)}\n") XSDebug(RegNext(tlbl2.io.r.req.valid), p"tlbl2 RamData:${ramData}\n") XSDebug(RegNext(tlbl2.io.r.req.valid), p"tlbl2 v:${vidx} hit:${ramData.hit(req.vpn)} tlbPte:${ramData.get(req.vpn)}\n") - (ramData.hit(req.vpn) && vidx, ramData.get(req.vpn)) + + val spHitVec = sp.zipWithIndex.map{ case (a,i) => + RegEnable(a.hit(req.vpn) && spv(i), validOneCycle) + } + val spHitData = ParallelMux(spHitVec zip sp) + val spHit = Cat(spHitVec).orR + + XSDebug(RegNext(validOneCycle), p"tlbl2 sp: spHit:${spHit} spPte:${spHitData}\n") + + assert(RegNext(!(ramData.hit(req.vpn) && vidx && spHit && RegNext(validOneCycle))), "pages should not be normal page and super page as well") + + (ramData.hit(req.vpn) && vidx || spHit, Mux(spHit, spHitData, ramData.get(req.vpn))) } /* @@ -306,7 +418,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ val idx = RegEnable(l2addr(log2Up(PtwL2LineSize)+log2Up(XLEN/8)-1, log2Up(XLEN/8)), readRam) val vidx = RegEnable(l2v(ridx), readRam) - assert(ptwl2.io.r.req.ready) + assert(ptwl2.io.r.req.ready || !readRam) ptwl2.io.r.req.valid := readRam ptwl2.io.r.req.bits.apply(setIdx = ridx) val ramData = ptwl2.io.r.resp.data(0) @@ -361,7 +473,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ state := state_idle }.otherwise { state := state_wait_ready - latch.entry := new TlbEntry().genTlbEntry(memRdata, level, req.vpn) + latch.entry := Wire(new L2TlbEntry()).apply(memRdata, level, req.vpn) latch.pf := memPte.isPf(level) } }.otherwise { @@ -418,7 +530,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ for(i <- 0 until PtwWidth) { resp(i).valid := valid && arbChosen===i.U && ptwFinish // TODO: add resp valid logic resp(i).bits.entry := Mux(tlbHit, tlbHitData, - Mux(state===state_wait_ready, latch.entry, new TlbEntry().genTlbEntry(memSelData, Mux(level===3.U, 2.U, level), req.vpn))) + Mux(state===state_wait_ready, latch.entry, Wire(new L2TlbEntry()).apply(memSelData, Mux(level===3.U, 2.U, level), req.vpn))) resp(i).bits.pf := Mux(level===3.U || notFound, true.B, Mux(tlbHit, false.B, Mux(state===state_wait_ready, latch.pf, memPte.isPf(level)))) // TODO: the pf must not be correct, check it } @@ -434,13 +546,15 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ when (memRespFire && !memPte.isPf(level) && !sfenceLatch) { when (level===0.U && !memPte.isLeaf) { val refillIdx = LFSR64()(log2Up(PtwL1EntrySize)-1,0) // TODO: may be LRU + val rfOH = UIntToOH(refillIdx) ptwl1(refillIdx).refill(l1addr, memSelData) - l1v := l1v | UIntToOH(refillIdx) - l1g := (l1g & ~UIntToOH(refillIdx)) | Mux(memPte.perm.g, UIntToOH(refillIdx), 0.U) + l1v := l1v | rfOH + l1g := (l1g & ~rfOH) | Mux(memPte.perm.g, rfOH, 0.U) } when (level===1.U && !memPte.isLeaf) { val l2addrStore = RegEnable(l2addr, memReqFire && state===state_req && level===1.U) val refillIdx = genPtwL2Idx(l2addrStore) //getVpnn(req.vpn, 1)(log2Up(PtwL2EntrySize)-1, 0) + val rfOH = UIntToOH(refillIdx) //TODO: check why the old refillIdx is right assert(ptwl2.io.w.req.ready) @@ -451,26 +565,34 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ data = ps, waymask = -1.S.asUInt ) - l2v := l2v | UIntToOH(refillIdx) - l2g := (l2g & ~UIntToOH(refillIdx)) | Mux(Cat(memPtes.map(_.perm.g)).andR, UIntToOH(refillIdx), 0.U) + l2v := l2v | rfOH + l2g := (l2g & ~rfOH) | Mux(Cat(memPtes.map(_.perm.g)).andR, rfOH, 0.U) XSDebug(p"ptwl2 RefillIdx:${Hexadecimal(refillIdx)} ps:${ps}\n") } when (memPte.isLeaf() && (level===2.U)) { val refillIdx = genTlbL2Idx(req.vpn)//getVpnn(req.vpn, 0)(log2Up(TlbL2EntrySize)-1, 0) + val rfOH = UIntToOH(refillIdx) //TODO: check why the old refillIdx is right assert(tlbl2.io.w.req.ready) - val ts = new TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen).genEntries(memRdata, level, req.vpn) + val ts = new L2TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen).genEntries(memRdata, level, req.vpn) tlbl2.io.w.apply( valid = true.B, setIdx = refillIdx, data = ts, waymask = -1.S.asUInt ) - tlbv := tlbv | UIntToOH(refillIdx) - tlbg := (tlbg & ~UIntToOH(refillIdx)) | Mux(Cat(memPtes.map(_.perm.g)).andR, UIntToOH(refillIdx), 0.U) + tlbv := tlbv | rfOH + tlbg := (tlbg & ~rfOH) | Mux(Cat(memPtes.map(_.perm.g)).andR, rfOH, 0.U) XSDebug(p"tlbl2 refillIdx:${Hexadecimal(refillIdx)} ts:${ts}\n") } + when (memPte.isLeaf() && (level===1.U || level===0.U)) { + val refillIdx = LFSR64()(log2Up(TlbL2SPEntrySize)-1,0) // TODO: may be LRU + val rfOH = UIntToOH(refillIdx) + sp(refillIdx) := Wire(new L2TlbEntry()).apply(memSelData, Mux(level===3.U, 2.U, level), req.vpn) + spv := spv | rfOH + spg := (spg & ~rfOH) | Mux(memPte.perm.g, rfOH, 0.U) + } } /* sfence @@ -488,25 +610,29 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ when (sfence.bits.rs2) { // all va && all asid tlbv := 0.U - tlbg := 0.U + spv := 0.U + // tlbg := 0.U l1v := 0.U l2v := 0.U - l2g := 0.U + // l2g := 0.U } .otherwise { // all va && specific asid except global tlbv := tlbv & tlbg + spv := spv & spg l1v := l1v & l1g l2v := l2v & l2g } } .otherwise { + val sfenceTlbL2IdxOH = UIntToOH(genTlbL2Idx(sfence.bits.addr(sfence.bits.addr.getWidth-1, offLen))) when (sfence.bits.rs2) { // specific leaf of addr && all asid - tlbv := tlbv & ~UIntToOH(genTlbL2Idx(sfence.bits.addr(sfence.bits.addr.getWidth-1, offLen))) - tlbg := tlbg & ~UIntToOH(genTlbL2Idx(sfence.bits.addr(sfence.bits.addr.getWidth-1, offLen))) + tlbv := tlbv & ~sfenceTlbL2IdxOH + tlbg := tlbg & ~sfenceTlbL2IdxOH } .otherwise { // specific leaf of addr && specific asid - tlbv := tlbv & (~UIntToOH(genTlbL2Idx(sfence.bits.addr(sfence.bits.addr.getWidth-1, offLen)))| tlbg) + tlbv := tlbv & (~sfenceTlbL2IdxOH| tlbg) } + spv := 0.U } } diff --git a/src/main/scala/xiangshan/frontend/Frontend.scala b/src/main/scala/xiangshan/frontend/Frontend.scala index 8458eadd84d2af40a90a8115845f469328c8f589..d163ed8f9c5d82789bcc45e8b9d985fc556d7547 100644 --- a/src/main/scala/xiangshan/frontend/Frontend.scala +++ b/src/main/scala/xiangshan/frontend/Frontend.scala @@ -2,13 +2,28 @@ package xiangshan.frontend import utils.XSInfo import chisel3._ import chisel3.util._ +import chipsalliance.rocketchip.config.Parameters +import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} import utils.PipelineConnect import xiangshan._ import xiangshan.cache._ import xiangshan.cache.prefetch.L1plusPrefetcher +import xiangshan.backend.fu.HasExceptionNO +class Frontend()(implicit p: Parameters) extends LazyModule with HasXSParameter{ -class Frontend extends XSModule with HasL1plusCacheParameters { + val instrUncache = LazyModule(new InstrUncache()) + + lazy val module = new FrontendImp(this) +} + + +class FrontendImp (outer: Frontend) extends LazyModuleImp(outer) + with HasL1plusCacheParameters + with HasXSParameter + with HasExceptionNO + with HasXSLog +{ val io = IO(new Bundle() { val icacheMemAcq = DecoupledIO(new L1plusCacheReq) val icacheMemGrant = Flipped(DecoupledIO(new L1plusCacheResp)) @@ -23,7 +38,7 @@ class Frontend extends XSModule with HasL1plusCacheParameters { val ifu = Module(new IFU) val ibuffer = Module(new Ibuffer) val l1plusPrefetcher = Module(new L1plusPrefetcher) - + val instrUncache = outer.instrUncache.module val needFlush = io.backend.redirect.valid @@ -43,6 +58,11 @@ class Frontend extends XSModule with HasL1plusCacheParameters { ifu.io.icacheMemGrant.ready, l1plusPrefetcher.io.mem_grant.ready) ifu.io.fencei := io.fencei + + + instrUncache.io.req <> ifu.io.mmio_acquire + instrUncache.io.resp <> ifu.io.mmio_grant + instrUncache.io.flush <> ifu.io.mmio_flush // to tlb ifu.io.sfence := io.sfence ifu.io.tlbCsr := io.tlbCsr diff --git a/src/main/scala/xiangshan/frontend/IFU.scala b/src/main/scala/xiangshan/frontend/IFU.scala index 2420a0fc9f7b17143b9499f35d939eb8741bcd35..0bfb343243b916750c24a9e490d746e3c6ec2f55 100644 --- a/src/main/scala/xiangshan/frontend/IFU.scala +++ b/src/main/scala/xiangshan/frontend/IFU.scala @@ -10,8 +10,16 @@ import chisel3.experimental.chiselName import freechips.rocketchip.tile.HasLazyRoCC import chisel3.ExcitingUtils._ +trait HasInstrMMIOConst extends HasXSParameter with HasIFUConst{ + def mmioBusWidth = 64 + def mmioBusBytes = mmioBusWidth /8 + def mmioBeats = FetchWidth * 4 * 8 / mmioBusWidth + def mmioMask = VecInit(List.fill(PredictWidth)(true.B)).asUInt + def mmioBusAligned(pc :UInt): UInt = align(pc, mmioBusBytes) +} + trait HasIFUConst extends HasXSParameter { - val resetVector = 0x80000000L//TODO: set reset vec + val resetVector = 0x10000000L//TODO: set reset vec def align(pc: UInt, bytes: Int): UInt = Cat(pc(VAddrBits-1, log2Ceil(bytes)), 0.U(log2Ceil(bytes).W)) val instBytes = if (HasCExtension) 2 else 4 val instOffsetBits = log2Ceil(instBytes) @@ -71,6 +79,10 @@ class IFUIO extends XSBundle val tlbCsr = Input(new TlbCsrBundle) // from tlb val ptw = new TlbPtwIO + // icache uncache + val mmio_acquire = DecoupledIO(new InsUncacheReq) + val mmio_grant = Flipped(DecoupledIO(new InsUncacheResp)) + val mmio_flush = Output(Bool()) } class PrevHalfInstr extends XSBundle { @@ -119,7 +131,7 @@ class IFU extends XSModule with HasIFUConst val if2_valid = RegInit(init = false.B) val if2_allReady = WireInit(if2_ready && icache.io.req.ready) val if1_fire = (if1_valid && if2_allReady) && (icache.io.tlb.resp.valid || !if2_valid) - val if1_can_go = if1_fire || if2_flush + val if1_can_go = if1_fire || if3_flush val if1_gh, if2_gh, if3_gh, if4_gh = Wire(new GlobalHistory) val if2_predicted_gh, if3_predicted_gh, if4_predicted_gh = Wire(new GlobalHistory) @@ -418,6 +430,9 @@ class IFU extends XSModule with HasIFUConst icache.io.prev.bits := if3_prevHalfInstr.bits.instr icache.io.prev_ipf := if3_prevHalfInstr.bits.ipf icache.io.prev_pc := if3_prevHalfInstr.bits.pc + icache.io.mmio_acquire <> io.mmio_acquire + icache.io.mmio_grant <> io.mmio_grant + icache.io.mmio_flush <> io.mmio_flush io.icacheMemAcq <> icache.io.mem_acquire io.l1plusFlush := icache.io.l1plusflush io.prefetchTrainReq := icache.io.prefetchTrainReq @@ -443,10 +458,18 @@ class IFU extends XSModule with HasIFUConst crossPageIPF := true.B // higher 16 bits page fault } + //RVC expand + val expandedInstrs = Wire(Vec(PredictWidth, UInt(32.W))) + for(i <- 0 until PredictWidth){ + val expander = Module(new RVCExpander) + expander.io.in := if4_pd.instrs(i) + expandedInstrs(i) := expander.io.out.bits + } + val fetchPacketValid = if4_valid && !io.redirect.valid val fetchPacketWire = Wire(new FetchPacket) - fetchPacketWire.instrs := if4_pd.instrs + fetchPacketWire.instrs := expandedInstrs fetchPacketWire.mask := if4_pd.mask & (Fill(PredictWidth, !if4_bp.taken) | (Fill(PredictWidth, 1.U(1.W)) >> (~if4_bp.jmpIdx))) fetchPacketWire.pdmask := if4_pd.mask diff --git a/src/main/scala/xiangshan/frontend/PreDecode.scala b/src/main/scala/xiangshan/frontend/PreDecode.scala index 8251611d4b1089fe00894a286054d24358d8c6c3..b790bdc0dc8668b7c3128230aa92081b84d5b586 100644 --- a/src/main/scala/xiangshan/frontend/PreDecode.scala +++ b/src/main/scala/xiangshan/frontend/PreDecode.scala @@ -3,6 +3,7 @@ package xiangshan.frontend import chisel3._ import chisel3.util._ import utils._ +import freechips.rocketchip.rocket.{RVCDecoder, ExpandedInstruction} import xiangshan._ import xiangshan.backend.decode.isa.predecode.PreDecodeInst import xiangshan.cache._ @@ -134,3 +135,16 @@ class PreDecode extends XSModule with HasPdconst with HasIFUConst { ) } } + +class RVCExpander extends XSModule { + val io = IO(new Bundle { + val in = Input(UInt(32.W)) + val out = Output(new ExpandedInstruction) + }) + + if (HasCExtension) { + io.out := new RVCDecoder(io.in, XLEN).decode + } else { + io.out := new RVCDecoder(io.in, XLEN).passthrough + } +} diff --git a/src/main/scala/xiangshan/frontend/Tage.scala b/src/main/scala/xiangshan/frontend/Tage.scala index 7dd07212a1f0d3fe4c99869397f30cbd60307dc3..4bb01f932fd1622880001db6221ef66e766741d6 100644 --- a/src/main/scala/xiangshan/frontend/Tage.scala +++ b/src/main/scala/xiangshan/frontend/Tage.scala @@ -5,6 +5,10 @@ import chisel3.util._ import xiangshan._ import utils._ import chisel3.experimental.chiselName +import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage} +import firrtl.stage.RunFirrtlTransformAnnotation +import firrtl.transforms.RenameModules +import freechips.rocketchip.transforms.naming.RenameDesiredNames import scala.math.min import scala.util.matching.Regex @@ -373,14 +377,13 @@ class FakeTage extends BaseTage { class Tage extends BaseTage { val tables = TableInfo.map { - case (nRows, histLen, tagLen) => { + case (nRows, histLen, tagLen) => val t = if(EnableBPD) Module(new TageTable(nRows, histLen, tagLen, UBitPeriod)) else Module(new FakeTageTable) t.io.req.valid := io.pc.valid t.io.req.bits.pc := io.pc.bits t.io.req.bits.hist := io.hist t.io.req.bits.mask := io.inMask t - } } val scTables = SCTableInfo.map { @@ -658,4 +661,13 @@ class Tage extends BaseTage { XSDebug(io.update.valid && updateIsBr, p"update: sc: ${updateSCMeta}\n") XSDebug(true.B, p"scThres: use(${useThreshold}), update(${updateThreshold})\n") } +} + +object TageTest extends App { + override def main(args: Array[String]): Unit = { + (new ChiselStage).execute(args, Seq( + ChiselGeneratorAnnotation(() => new Tage), + RunFirrtlTransformAnnotation(new RenameDesiredNames) + )) + } } \ No newline at end of file diff --git a/src/main/scala/xiangshan/mem/Memend.scala b/src/main/scala/xiangshan/mem/Memend.scala index fa30d9f8917959366a9289ce22649119ef4e54da..b750cc30ad2f61a98c8c3f3904bc87a7fa11ef07 100644 --- a/src/main/scala/xiangshan/mem/Memend.scala +++ b/src/main/scala/xiangshan/mem/Memend.scala @@ -41,7 +41,6 @@ class LsPipelineBundle extends XSBundle { val miss = Bool() val tlbMiss = Bool() val mmio = Bool() - val rollback = Bool() val forwardMask = Vec(8, Bool()) val forwardData = Vec(8, UInt(8.W)) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index 95c9c5bcb44e92cb557bc84ed4f9cfd4dab9c884..b65155d674d53411e678e4a83a1d5fcd43f030e6 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -51,6 +51,7 @@ class LsqWrappper extends XSModule with HasDCacheParameters { val uncache = new DCacheWordIO val roqDeqPtr = Input(new RoqPtr) val exceptionAddr = new ExceptionAddrIO + val sqempty = Output(Bool()) }) val loadQueue = Module(new LoadQueue) @@ -103,6 +104,8 @@ class LsqWrappper extends XSModule with HasDCacheParameters { loadQueue.io.load_s1 <> io.forward storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE + storeQueue.io.sqempty <> io.sqempty + io.exceptionAddr.vaddr := Mux(io.exceptionAddr.isStore, storeQueue.io.exceptionAddr.vaddr, loadQueue.io.exceptionAddr.vaddr) // naive uncache arbiter diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index 0a09492fc33f272767f30c7a32a4463bfbd6737f..5329953eccc697d601a4fb6c38ae9b1c5826f46b 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -10,6 +10,7 @@ import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants, TlbReques import xiangshan.backend.LSUOpType import xiangshan.mem._ import xiangshan.backend.roq.RoqPtr +import xiangshan.backend.fu.HasExceptionNO class LqPtr extends CircularQueuePtr(LqPtr.LoadQueueSize) { } @@ -58,6 +59,7 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper with HasLoadHelper + with HasExceptionNO { val io = IO(new Bundle() { val enq = new LqEnqIO @@ -92,6 +94,7 @@ class LoadQueue extends XSModule val enqPtrExt = RegInit(VecInit((0 until RenameWidth).map(_.U.asTypeOf(new LqPtr)))) val deqPtrExt = RegInit(0.U.asTypeOf(new LqPtr)) + val deqPtrExtNext = Wire(new LqPtr) val validCounter = RegInit(0.U(log2Ceil(LoadQueueSize + 1).W)) val allowEnqueue = RegInit(true.B) @@ -150,7 +153,7 @@ class LoadQueue extends XSModule vaddrModule.io.wen(i) := false.B when(io.loadIn(i).fire()) { when(io.loadIn(i).bits.miss) { - XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", + XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n", io.loadIn(i).bits.uop.lqIdx.asUInt, io.loadIn(i).bits.uop.cf.pc, io.loadIn(i).bits.vaddr, @@ -159,116 +162,44 @@ class LoadQueue extends XSModule io.loadIn(i).bits.mask, io.loadIn(i).bits.forwardData.asUInt, io.loadIn(i).bits.forwardMask.asUInt, - io.loadIn(i).bits.mmio, - io.loadIn(i).bits.rollback, - io.loadIn(i).bits.uop.cf.exceptionVec.asUInt - ) - }.otherwise { - XSInfo(io.loadIn(i).valid, "load hit write to cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", - io.loadIn(i).bits.uop.lqIdx.asUInt, - io.loadIn(i).bits.uop.cf.pc, - io.loadIn(i).bits.vaddr, - io.loadIn(i).bits.paddr, - io.loadIn(i).bits.data, - io.loadIn(i).bits.mask, - io.loadIn(i).bits.forwardData.asUInt, - io.loadIn(i).bits.forwardMask.asUInt, - io.loadIn(i).bits.mmio, - io.loadIn(i).bits.rollback, - io.loadIn(i).bits.uop.cf.exceptionVec.asUInt - ) - } - val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value - datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - - val loadWbData = Wire(new LQDataEntry) - loadWbData.paddr := io.loadIn(i).bits.paddr - loadWbData.mask := io.loadIn(i).bits.mask - loadWbData.data := io.loadIn(i).bits.data // fwd data - loadWbData.fwdMask := io.loadIn(i).bits.forwardMask - loadWbData.exception := io.loadIn(i).bits.uop.cf.exceptionVec.asUInt - dataModule.io.wbWrite(i, loadWbIndex, loadWbData) - dataModule.io.wb.wen(i) := true.B - - vaddrModule.io.waddr(i) := loadWbIndex - vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr - vaddrModule.io.wen(i) := true.B - - debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio - - val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - miss(loadWbIndex) := dcacheMissed && !io.loadIn(i).bits.uop.cf.exceptionVec.asUInt.orR - // listening(loadWbIndex) := dcacheMissed - pending(loadWbIndex) := io.loadIn(i).bits.mmio && !io.loadIn(i).bits.uop.cf.exceptionVec.asUInt.orR - uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime - } + io.loadIn(i).bits.mmio + ) + }.otherwise { + XSInfo(io.loadIn(i).valid, "load hit write to cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n", + io.loadIn(i).bits.uop.lqIdx.asUInt, + io.loadIn(i).bits.uop.cf.pc, + io.loadIn(i).bits.vaddr, + io.loadIn(i).bits.paddr, + io.loadIn(i).bits.data, + io.loadIn(i).bits.mask, + io.loadIn(i).bits.forwardData.asUInt, + io.loadIn(i).bits.forwardMask.asUInt, + io.loadIn(i).bits.mmio + )} + val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value + datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + + val loadWbData = Wire(new LQDataEntry) + loadWbData.paddr := io.loadIn(i).bits.paddr + loadWbData.mask := io.loadIn(i).bits.mask + loadWbData.data := io.loadIn(i).bits.data // fwd data + loadWbData.fwdMask := io.loadIn(i).bits.forwardMask + dataModule.io.wbWrite(i, loadWbIndex, loadWbData) + dataModule.io.wb.wen(i) := true.B + + vaddrModule.io.waddr(i) := loadWbIndex + vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr + vaddrModule.io.wen(i) := true.B + + debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio + + val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + miss(loadWbIndex) := dcacheMissed + pending(loadWbIndex) := io.loadIn(i).bits.mmio + uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime } - - /** - * Cache miss request - * - * (1) writeback: miss - * (2) send to dcache: listing - * (3) dcache response: datavalid - * (4) writeback to ROB: writeback - */ - // val inflightReqs = RegInit(VecInit(Seq.fill(cfg.nLoadMissEntries)(0.U.asTypeOf(new InflightBlockInfo)))) - // val inflightReqFull = inflightReqs.map(req => req.valid).reduce(_&&_) - // val reqBlockIndex = PriorityEncoder(~VecInit(inflightReqs.map(req => req.valid)).asUInt) - - // val missRefillSelVec = VecInit( - // (0 until LoadQueueSize).map{ i => - // val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(dataModule.io.rdata(i).paddr)).reduce(_||_) - // allocated(i) && miss(i) && !inflight - // }) - - // val missRefillSel = getFirstOne(missRefillSelVec, deqMask) - // val missRefillBlockAddr = get_block_addr(dataModule.io.rdata(missRefillSel).paddr) - // io.dcache.req.valid := missRefillSelVec.asUInt.orR - // io.dcache.req.bits.cmd := MemoryOpConstants.M_XRD - // io.dcache.req.bits.addr := missRefillBlockAddr - // io.dcache.req.bits.data := DontCare - // io.dcache.req.bits.mask := DontCare - - // io.dcache.req.bits.meta.id := DontCare - // io.dcache.req.bits.meta.vaddr := DontCare // dataModule.io.rdata(missRefillSel).vaddr - // io.dcache.req.bits.meta.paddr := missRefillBlockAddr - // io.dcache.req.bits.meta.uop := uop(missRefillSel) - // io.dcache.req.bits.meta.mmio := false.B // dataModule.io.rdata(missRefillSel).mmio - // io.dcache.req.bits.meta.tlb_miss := false.B - // io.dcache.req.bits.meta.mask := DontCare - // io.dcache.req.bits.meta.replay := false.B - - // assert(!(dataModule.io.rdata(missRefillSel).mmio && io.dcache.req.valid)) - - // when(io.dcache.req.fire()) { - // miss(missRefillSel) := false.B - // listening(missRefillSel) := true.B - - // mark this block as inflight - // inflightReqs(reqBlockIndex).valid := true.B - // inflightReqs(reqBlockIndex).block_addr := missRefillBlockAddr - // assert(!inflightReqs(reqBlockIndex).valid) - // } - - // when(io.dcache.resp.fire()) { - // val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)).reduce(_||_) - // assert(inflight) - // for (i <- 0 until cfg.nLoadMissEntries) { - // when (inflightReqs(i).valid && inflightReqs(i).block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)) { - // inflightReqs(i).valid := false.B - // } - // } - // } - - - // when(io.dcache.req.fire()){ - // XSDebug("miss req: pc:0x%x roqIdx:%d lqIdx:%d (p)addr:0x%x vaddr:0x%x\n", - // io.dcache.req.bits.meta.uop.cf.pc, io.dcache.req.bits.meta.uop.roqIdx.asUInt, io.dcache.req.bits.meta.uop.lqIdx.asUInt, - // io.dcache.req.bits.addr, io.dcache.req.bits.meta.vaddr - // ) - // } + } when(io.dcache.valid) { XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data) @@ -295,47 +226,57 @@ class LoadQueue extends XSModule // Stage 0 // Generate writeback indexes + + def getEvenBits(input: UInt): UInt = { + require(input.getWidth == LoadQueueSize) + VecInit((0 until LoadQueueSize/2).map(i => {input(2*i)})).asUInt + } + def getOddBits(input: UInt): UInt = { + require(input.getWidth == LoadQueueSize) + VecInit((0 until LoadQueueSize/2).map(i => {input(2*i+1)})).asUInt + } + + val loadWbSel = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W))) // index selected last cycle + val loadWbSelV = Wire(Vec(LoadPipelineWidth, Bool())) // index selected in last cycle is valid + val loadWbSelVec = VecInit((0 until LoadQueueSize).map(i => { allocated(i) && !writebacked(i) && datavalid(i) })).asUInt() // use uint instead vec to reduce verilog lines - val loadEvenSelVec = VecInit((0 until LoadQueueSize/2).map(i => {loadWbSelVec(2*i)})) - val loadOddSelVec = VecInit((0 until LoadQueueSize/2).map(i => {loadWbSelVec(2*i+1)})) - val evenDeqMask = VecInit((0 until LoadQueueSize/2).map(i => {deqMask(2*i)})).asUInt - val oddDeqMask = VecInit((0 until LoadQueueSize/2).map(i => {deqMask(2*i+1)})).asUInt + val evenDeqMask = getEvenBits(deqMask) + val oddDeqMask = getOddBits(deqMask) + // generate lastCycleSelect mask + val evenSelectMask = Mux(io.ldout(0).fire(), getEvenBits(UIntToOH(loadWbSel(0))), 0.U) + val oddSelectMask = Mux(io.ldout(1).fire(), getOddBits(UIntToOH(loadWbSel(1))), 0.U) + // generate real select vec + val loadEvenSelVec = getEvenBits(loadWbSelVec) & ~evenSelectMask + val loadOddSelVec = getOddBits(loadWbSelVec) & ~oddSelectMask + + def toVec(a: UInt): Vec[Bool] = { + VecInit(a.asBools) + } val loadWbSelGen = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W))) val loadWbSelVGen = Wire(Vec(LoadPipelineWidth, Bool())) - loadWbSelGen(0) := Cat(getFirstOne(loadEvenSelVec, evenDeqMask), 0.U(1.W)) + loadWbSelGen(0) := Cat(getFirstOne(toVec(loadEvenSelVec), evenDeqMask), 0.U(1.W)) loadWbSelVGen(0):= loadEvenSelVec.asUInt.orR - loadWbSelGen(1) := Cat(getFirstOne(loadOddSelVec, oddDeqMask), 1.U(1.W)) + loadWbSelGen(1) := Cat(getFirstOne(toVec(loadOddSelVec), oddDeqMask), 1.U(1.W)) loadWbSelVGen(1) := loadOddSelVec.asUInt.orR - val loadWbSel = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W))) - val loadWbSelV = RegInit(VecInit(List.fill(LoadPipelineWidth)(false.B))) (0 until LoadPipelineWidth).map(i => { - val canGo = io.ldout(i).fire() || !loadWbSelV(i) - val valid = loadWbSelVGen(i) - // store selected index in pipeline reg - loadWbSel(i) := RegEnable(loadWbSelGen(i), valid && canGo) - // Mark them as writebacked, so they will not be selected in the next cycle - when(valid && canGo){ - writebacked(loadWbSelGen(i)) := true.B - } - // update loadWbSelValidReg + loadWbSel(i) := RegNext(loadWbSelGen(i)) + loadWbSelV(i) := RegNext(loadWbSelVGen(i), init = false.B) when(io.ldout(i).fire()){ - loadWbSelV(i) := false.B - } - when(valid && canGo){ - loadWbSelV(i) := true.B + // Mark them as writebacked, so they will not be selected in the next cycle + writebacked(loadWbSel(i)) := true.B } }) - + // Stage 1 // Use indexes generated in cycle 0 to read data // writeback data to cdb (0 until LoadPipelineWidth).map(i => { // data select - dataModule.io.wb.raddr(i) := loadWbSel(i) + dataModule.io.wb.raddr(i) := loadWbSelGen(i) val rdata = dataModule.io.wb.rdata(i).data val seluop = uop(loadWbSel(i)) val func = seluop.ctrl.fuOpType @@ -353,10 +294,9 @@ class LoadQueue extends XSModule val rdataPartialLoad = rdataHelper(seluop, rdataSel) // writeback missed int/fp load - // + // // Int load writeback will finish (if not blocked) in one cycle io.ldout(i).bits.uop := seluop - io.ldout(i).bits.uop.cf.exceptionVec := dataModule.io.wb.rdata(i).exception.asBools io.ldout(i).bits.uop.lqIdx := loadWbSel(i).asTypeOf(new LqPtr) io.ldout(i).bits.data := rdataPartialLoad io.ldout(i).bits.redirectValid := false.B @@ -368,12 +308,10 @@ class LoadQueue extends XSModule io.ldout(i).valid := loadWbSelV(i) when(io.ldout(i).fire()) { - XSInfo("int load miss write to cbd roqidx %d lqidx %d pc 0x%x paddr %x data %x mmio %x\n", + XSInfo("int load miss write to cbd roqidx %d lqidx %d pc 0x%x mmio %x\n", io.ldout(i).bits.uop.roqIdx.asUInt, io.ldout(i).bits.uop.lqIdx.asUInt, io.ldout(i).bits.uop.cf.pc, - dataModule.io.debug(loadWbSel(i)).paddr, - dataModule.io.debug(loadWbSel(i)).data, debug_mmio(loadWbSel(i)) ) } @@ -430,7 +368,9 @@ class LoadQueue extends XSModule * Besides, load instructions in LoadUnit_S1 and S2 are also checked. * Cycle 1: Redirect Generation * There're three possible types of violations. Choose the oldest load. - * Set io.redirect according to the detected violation. + * Prepare redirect request according to the detected violation. + * Cycle 2: Redirect Fire + * Fire redirect request (if valid) */ io.load_s1 := DontCare def detectRollback(i: Int) = { @@ -530,18 +470,29 @@ class LoadQueue extends XSModule val rollbackSelected = ParallelOperation(rollback, rollbackSel) val lastCycleRedirect = RegNext(io.brqRedirect) + // S2: select rollback and generate rollback request // Note that we use roqIdx - 1.U to flush the load instruction itself. // Thus, here if last cycle's roqIdx equals to this cycle's roqIdx, it still triggers the redirect. - io.rollback.valid := rollbackSelected.valid && + val rollbackGen = Wire(Valid(new Redirect)) + val rollbackReg = Reg(Valid(new Redirect)) + rollbackGen.valid := rollbackSelected.valid && (!lastCycleRedirect.valid || !isAfter(rollbackSelected.bits.roqIdx, lastCycleRedirect.bits.roqIdx)) && !(lastCycleRedirect.valid && lastCycleRedirect.bits.isUnconditional()) - io.rollback.bits.roqIdx := rollbackSelected.bits.roqIdx - io.rollback.bits.level := RedirectLevel.flush - io.rollback.bits.interrupt := DontCare - io.rollback.bits.pc := DontCare - io.rollback.bits.target := rollbackSelected.bits.cf.pc - io.rollback.bits.brTag := rollbackSelected.bits.brTag + rollbackGen.bits.roqIdx := rollbackSelected.bits.roqIdx + rollbackGen.bits.level := RedirectLevel.flush + rollbackGen.bits.interrupt := DontCare + rollbackGen.bits.pc := DontCare + rollbackGen.bits.target := rollbackSelected.bits.cf.pc + rollbackGen.bits.brTag := rollbackSelected.bits.brTag + + rollbackReg := rollbackGen + + // S3: fire rollback request + io.rollback := rollbackReg + io.rollback.valid := rollbackReg.valid && + (!lastCycleRedirect.valid || !isAfter(rollbackReg.bits.roqIdx, lastCycleRedirect.bits.roqIdx)) && + !(lastCycleRedirect.valid && lastCycleRedirect.bits.isUnconditional()) when(io.rollback.valid) { XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.pc, io.rollback.bits.roqIdx.asUInt) @@ -556,7 +507,7 @@ class LoadQueue extends XSModule io.roqDeqPtr === uop(deqPtr).roqIdx && !io.commits.isWalk - dataModule.io.uncache.raddr := deqPtr + dataModule.io.uncache.raddr := deqPtrExtNext.value io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD io.uncache.req.bits.addr := dataModule.io.uncache.rdata.paddr @@ -623,7 +574,8 @@ class LoadQueue extends XSModule } val commitCount = PopCount(loadCommit) - deqPtrExt := deqPtrExt + commitCount + deqPtrExtNext := deqPtrExt + commitCount + deqPtrExt := deqPtrExtNext val lastLastCycleRedirect = RegNext(lastCycleRedirect.valid) val trueValidCounter = distanceBetween(enqPtrExt(0), deqPtrExt) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala index 1e4cecb154e3bba703ad02e2dc720357a9249f9d..a054fc2d4ef79c68d8554fe05bf884052dcf4d3b 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueData.scala @@ -15,7 +15,6 @@ class LQDataEntry extends XSBundle { val paddr = UInt(PAddrBits.W) val mask = UInt(8.W) val data = UInt(XLEN.W) - val exception = UInt(16.W) // TODO: opt size val fwdMask = Vec(8, Bool()) } @@ -38,7 +37,7 @@ class PaddrModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule // read ports for (i <- 0 until numRead) { - io.rdata(i) := data(io.raddr(i)) + io.rdata(i) := data(RegNext(io.raddr(i))) } // below is the write ports (with priorities) @@ -82,7 +81,7 @@ class MaskModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule // read ports for (i <- 0 until numRead) { - io.rdata(i) := data(io.raddr(i)) + io.rdata(i) := data(RegNext(io.raddr(i))) } // below is the write ports (with priorities) @@ -138,7 +137,7 @@ class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSMod // read ports for (i <- 0 until numRead) { - io.rdata(i) := data(io.raddr(i)) + io.rdata(i) := data(RegNext(io.raddr(i))) } // below is the write ports (with priorities) @@ -236,7 +235,6 @@ class LoadQueueData(size: Int, wbNumRead: Int, wbNumWrite: Int) extends XSModule // data module val paddrModule = Module(new PaddrModule(size, numRead = 3, numWrite = 2)) val maskModule = Module(new MaskModule(size, numRead = 3, numWrite = 2)) - val exceptionModule = Module(new AsyncDataModuleTemplate(UInt(16.W), size, numRead = 3, numWrite = 2)) val coredataModule = Module(new CoredataModule(size, numRead = 3, numWrite = 3)) // read data @@ -244,26 +242,22 @@ class LoadQueueData(size: Int, wbNumRead: Int, wbNumWrite: Int) extends XSModule (0 until wbNumRead).map(i => { paddrModule.io.raddr(i) := io.wb.raddr(i) maskModule.io.raddr(i) := io.wb.raddr(i) - exceptionModule.io.raddr(i) := io.wb.raddr(i) coredataModule.io.raddr(i) := io.wb.raddr(i) io.wb.rdata(i).paddr := paddrModule.io.rdata(i) io.wb.rdata(i).mask := maskModule.io.rdata(i) io.wb.rdata(i).data := coredataModule.io.rdata(i) - io.wb.rdata(i).exception := exceptionModule.io.rdata(i) io.wb.rdata(i).fwdMask := DontCare }) // read port wbNumRead paddrModule.io.raddr(wbNumRead) := io.uncache.raddr maskModule.io.raddr(wbNumRead) := io.uncache.raddr - exceptionModule.io.raddr(wbNumRead) := io.uncache.raddr coredataModule.io.raddr(wbNumRead) := io.uncache.raddr io.uncache.rdata.paddr := paddrModule.io.rdata(wbNumRead) io.uncache.rdata.mask := maskModule.io.rdata(wbNumRead) - io.uncache.rdata.data := exceptionModule.io.rdata(wbNumRead) - io.uncache.rdata.exception := coredataModule.io.rdata(wbNumRead) + io.uncache.rdata.data := coredataModule.io.rdata(wbNumRead) io.uncache.rdata.fwdMask := DontCare // write data @@ -271,19 +265,16 @@ class LoadQueueData(size: Int, wbNumRead: Int, wbNumWrite: Int) extends XSModule (0 until wbNumWrite).map(i => { paddrModule.io.wen(i) := false.B maskModule.io.wen(i) := false.B - exceptionModule.io.wen(i) := false.B coredataModule.io.wen(i) := false.B coredataModule.io.fwdMaskWen(i) := false.B coredataModule.io.paddrWen(i) := false.B paddrModule.io.waddr(i) := io.wb.waddr(i) maskModule.io.waddr(i) := io.wb.waddr(i) - exceptionModule.io.waddr(i) := io.wb.waddr(i) coredataModule.io.waddr(i) := io.wb.waddr(i) paddrModule.io.wdata(i) := io.wb.wdata(i).paddr maskModule.io.wdata(i) := io.wb.wdata(i).mask - exceptionModule.io.wdata(i) := io.wb.wdata(i).exception coredataModule.io.wdata(i) := io.wb.wdata(i).data coredataModule.io.fwdMaskWdata(i) := io.wb.wdata(i).fwdMask.asUInt coredataModule.io.paddrWdata(i) := io.wb.wdata(i).paddr @@ -291,7 +282,6 @@ class LoadQueueData(size: Int, wbNumRead: Int, wbNumWrite: Int) extends XSModule when(io.wb.wen(i)){ paddrModule.io.wen(i) := true.B maskModule.io.wen(i) := true.B - exceptionModule.io.wen(i) := true.B coredataModule.io.wen(i) := true.B coredataModule.io.fwdMaskWen(i) := true.B coredataModule.io.paddrWen(i) := true.B diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index c80bd70eaf0af0147e6a86094745ba1d8c17b0d4..da58ce688794e14823808d31255ca9461374bb6b 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -43,6 +43,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue val roqDeqPtr = Input(new RoqPtr) // val refill = Flipped(Valid(new DCacheLineReq )) val exceptionAddr = new ExceptionAddrIO + val sqempty = Output(Bool()) }) // data modules @@ -52,8 +53,6 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue dataModule.io := DontCare val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth)) vaddrModule.io := DontCare - val exceptionModule = Module(new AsyncDataModuleTemplate(UInt(16.W), StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth)) - exceptionModule.io := DontCare // state & misc val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been allocated @@ -77,13 +76,21 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue val headMask = UIntToMask(enqPtr, StoreQueueSize) // Read dataModule - // deqPtr and deqPtr+1 entry will be read from dataModule + // deqPtrExtNext and deqPtrExtNext+1 entry will be read from dataModule + // if !sbuffer.fire(), read the same ptr + // if sbuffer.fire(), read next + val deqPtrExtNext = WireInit(Mux(io.sbuffer(1).fire(), + VecInit(deqPtrExt.map(_ + 2.U)), + Mux(io.sbuffer(0).fire() || io.mmioStout.fire(), + VecInit(deqPtrExt.map(_ + 1.U)), + deqPtrExt + ) + )) val dataModuleRead = dataModule.io.rdata for (i <- 0 until StorePipelineWidth) { - dataModule.io.raddr(i) := deqPtrExt(i).value + dataModule.io.raddr(i) := deqPtrExtNext(i).value } vaddrModule.io.raddr(0) := io.exceptionAddr.lsIdx.sqIdx.value - exceptionModule.io.raddr(0) := deqPtr // read exception /** * Enqueue at dispatch @@ -123,14 +130,11 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue for (i <- 0 until StorePipelineWidth) { dataModule.io.wen(i) := false.B vaddrModule.io.wen(i) := false.B - exceptionModule.io.wen(i) := false.B - when(io.storeIn(i).fire()) { + when (io.storeIn(i).fire()) { val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value - val hasException = io.storeIn(i).bits.uop.cf.exceptionVec.asUInt.orR - val hasWritebacked = !io.storeIn(i).bits.mmio || hasException - datavalid(stWbIndex) := hasWritebacked - writebacked(stWbIndex) := hasWritebacked - pending(stWbIndex) := !hasWritebacked // valid mmio require + datavalid(stWbIndex) := !io.storeIn(i).bits.mmio + writebacked(stWbIndex) := !io.storeIn(i).bits.mmio + pending(stWbIndex) := io.storeIn(i).bits.mmio val storeWbData = Wire(new SQDataEntry) storeWbData := DontCare @@ -145,21 +149,15 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue vaddrModule.io.wdata(i) := io.storeIn(i).bits.vaddr vaddrModule.io.wen(i) := true.B - exceptionModule.io.waddr(i) := stWbIndex - exceptionModule.io.wdata(i) := io.storeIn(i).bits.uop.cf.exceptionVec.asUInt - exceptionModule.io.wen(i) := true.B - mmio(stWbIndex) := io.storeIn(i).bits.mmio - XSInfo("store write to sq idx %d pc 0x%x vaddr %x paddr %x data %x mmio %x roll %x exc %x\n", + XSInfo("store write to sq idx %d pc 0x%x vaddr %x paddr %x data %x mmio %x\n", io.storeIn(i).bits.uop.sqIdx.value, io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.vaddr, io.storeIn(i).bits.paddr, io.storeIn(i).bits.data, - io.storeIn(i).bits.mmio, - io.storeIn(i).bits.rollback, - io.storeIn(i).bits.uop.cf.exceptionVec.asUInt + io.storeIn(i).bits.mmio ) } } @@ -258,7 +256,6 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue io.mmioStout.valid := allocated(deqPtr) && datavalid(deqPtr) && !writebacked(deqPtr) io.mmioStout.bits.uop := uop(deqPtr) io.mmioStout.bits.uop.sqIdx := deqPtrExt(0) - io.mmioStout.bits.uop.cf.exceptionVec := exceptionModule.io.rdata(0).asBools io.mmioStout.bits.data := dataModuleRead(0).data // dataModuleRead.read(deqPtr) io.mmioStout.bits.redirectValid := false.B io.mmioStout.bits.redirect := DontCare @@ -288,9 +285,11 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue // Commited stores will not be cancelled and can be sent to lower level. // remove retired insts from sq, add retired store to sbuffer for (i <- 0 until StorePipelineWidth) { + // We use RegNext to prepare data for sbuffer val ptr = deqPtrExt(i).value - val ismmio = mmio(ptr) - io.sbuffer(i).valid := allocated(ptr) && commited(ptr) && !ismmio + // if !sbuffer.fire(), read the same ptr + // if sbuffer.fire(), read next + io.sbuffer(i).valid := allocated(ptr) && commited(ptr) && !mmio(ptr) io.sbuffer(i).bits.cmd := MemoryOpConstants.M_XWR io.sbuffer(i).bits.addr := dataModuleRead(i).paddr io.sbuffer(i).bits.data := dataModuleRead(i).data @@ -299,7 +298,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue io.sbuffer(i).bits.meta.tlb_miss := false.B io.sbuffer(i).bits.meta.uop := DontCare io.sbuffer(i).bits.meta.mmio := false.B - io.sbuffer(i).bits.meta.mask := dataModuleRead(i).mask + io.sbuffer(i).bits.meta.mask := io.sbuffer(i).bits.mask when (io.sbuffer(i).fire()) { allocated(ptr) := false.B @@ -349,13 +348,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue enqPtrExt := VecInit(enqPtrExt.map(_ + enqNumber)) } - deqPtrExt := Mux(io.sbuffer(1).fire(), - VecInit(deqPtrExt.map(_ + 2.U)), - Mux(io.sbuffer(0).fire() || io.mmioStout.fire(), - VecInit(deqPtrExt.map(_ + 1.U)), - deqPtrExt - ) - ) + deqPtrExt := deqPtrExtNext val lastLastCycleRedirect = RegNext(lastCycleRedirect) val dequeueCount = Mux(io.sbuffer(1).fire(), 2.U, Mux(io.sbuffer(0).fire() || io.mmioStout.fire(), 1.U, 0.U)) @@ -373,6 +366,12 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue ) ) + // io.sqempty will be used by sbuffer + // We delay it for 1 cycle for better timing + // When sbuffer need to check if it is empty, the pipeline is blocked, which means delay io.sqempty + // for 1 cycle will also promise that sq is empty in that cycle + io.sqempty := RegNext(enqPtrExt(0).value === deqPtrExt(0).value && enqPtrExt(0).flag === deqPtrExt(0).flag) + // debug info XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt(0).flag, deqPtr) diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala index 2edc6f3188d93b29fa8ea1d1dc7a4da2b2abfaf4..32b86eb40411efb4cf438fabfce474e08100ed64 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueueData.scala @@ -51,7 +51,7 @@ class StoreQueueData(size: Int, numRead: Int, numWrite: Int, numForward: Int) ex // destorequeue read data (0 until numRead).map(i => { - io.rdata(i) := data(io.raddr(i)) + io.rdata(i) := data(RegNext(io.raddr(i))) }) // DataModuleTemplate should not be used when there're any write conflicts diff --git a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala index 573e99bfbeea507369da24ac3dc6f75963ad22be..f0b0003b197649a38c82267646e3e1110815dd26 100644 --- a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala @@ -25,9 +25,11 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ val s_invalid :: s_tlb :: s_flush_sbuffer_req :: s_flush_sbuffer_resp :: s_cache_req :: s_cache_resp :: s_finish :: Nil = Enum(7) val state = RegInit(s_invalid) val in = Reg(new ExuInput()) + val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec())) val atom_override_xtval = RegInit(false.B) // paddr after translation val paddr = Reg(UInt()) + val is_mmio = Reg(Bool()) // dcache response data val resp_data = Reg(UInt()) val is_lrsc_valid = Reg(Bool()) @@ -68,7 +70,6 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ io.tlbFeedback.bits.hit := true.B io.tlbFeedback.bits.roqIdx := in.uop.roqIdx - // tlb translation, manipulating signals && deal with exception when (state === s_tlb) { // send req to dtlb @@ -78,7 +79,7 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ io.dtlb.req.bits.roqIdx := in.uop.roqIdx io.dtlb.resp.ready := true.B val is_lr = in.uop.ctrl.fuOpType === LSUOpType.lr_w || in.uop.ctrl.fuOpType === LSUOpType.lr_d - io.dtlb.req.bits.cmd := Mux(is_lr, TlbCmd.read, TlbCmd.write) + io.dtlb.req.bits.cmd := Mux(is_lr, TlbCmd.atom_read, TlbCmd.atom_write) io.dtlb.req.bits.debug.pc := in.uop.cf.pc when(io.dtlb.resp.fire && !io.dtlb.resp.bits.miss){ @@ -89,10 +90,17 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ "b10".U -> (in.src1(1,0) === 0.U), //w "b11".U -> (in.src1(2,0) === 0.U) //d )) - in.uop.cf.exceptionVec(storeAddrMisaligned) := !addrAligned - in.uop.cf.exceptionVec(storePageFault) := io.dtlb.resp.bits.excp.pf.st - in.uop.cf.exceptionVec(loadPageFault) := io.dtlb.resp.bits.excp.pf.ld - val exception = !addrAligned || io.dtlb.resp.bits.excp.pf.st || io.dtlb.resp.bits.excp.pf.ld + exceptionVec(storeAddrMisaligned) := !addrAligned + exceptionVec(storePageFault) := io.dtlb.resp.bits.excp.pf.st + exceptionVec(loadPageFault) := io.dtlb.resp.bits.excp.pf.ld + exceptionVec(storeAccessFault) := io.dtlb.resp.bits.excp.af.st + exceptionVec(loadAccessFault) := io.dtlb.resp.bits.excp.af.ld + val exception = !addrAligned || + io.dtlb.resp.bits.excp.pf.st || + io.dtlb.resp.bits.excp.pf.ld || + io.dtlb.resp.bits.excp.af.st || + io.dtlb.resp.bits.excp.af.ld + is_mmio := io.dtlb.resp.bits.mmio when (exception) { // check for exceptions // if there are exceptions, no need to execute it @@ -208,12 +216,13 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ when (state === s_finish) { io.out.valid := true.B io.out.bits.uop := in.uop + io.out.bits.uop.cf.exceptionVec := exceptionVec io.out.bits.uop.diffTestDebugLrScValid := is_lrsc_valid io.out.bits.data := resp_data io.out.bits.redirectValid := false.B io.out.bits.redirect := DontCare io.out.bits.brUpdate := DontCare - io.out.bits.debug.isMMIO := AddressSpace.isMMIO(paddr) + io.out.bits.debug.isMMIO := is_mmio when (io.out.fire()) { XSDebug("atomics writeback: pc %x data %x\n", io.out.bits.uop.cf.pc, io.dcache.resp.bits.data) state := s_invalid diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 133176d4c6d69b35a33a1dda0f1a8485887bd48f..ffe7c0732e0d5acda6516cf6e423f65c42d01fd5 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -91,9 +91,9 @@ class LoadUnit_S1 extends XSModule { val s1_uop = io.in.bits.uop val s1_paddr = io.dtlbResp.bits.paddr - val s1_exception = io.out.bits.uop.cf.exceptionVec.asUInt.orR + val s1_exception = selectLoad(io.out.bits.uop.cf.exceptionVec, false).asUInt.orR val s1_tlb_miss = io.dtlbResp.bits.miss - val s1_mmio = !s1_tlb_miss && AddressSpace.isMMIO(s1_paddr) + val s1_mmio = !s1_tlb_miss && io.dtlbResp.bits.mmio val s1_mask = io.in.bits.mask io.out.bits := io.in.bits // forwardXX field will be updated in s1 @@ -124,6 +124,7 @@ class LoadUnit_S1 extends XSModule { io.out.bits.mmio := s1_mmio && !s1_exception io.out.bits.tlbMiss := s1_tlb_miss io.out.bits.uop.cf.exceptionVec(loadPageFault) := io.dtlbResp.bits.excp.pf.ld + io.out.bits.uop.cf.exceptionVec(loadAccessFault) := io.dtlbResp.bits.excp.af.ld io.in.ready := !io.in.valid || io.out.ready @@ -147,7 +148,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { val s2_paddr = io.in.bits.paddr val s2_tlb_miss = io.in.bits.tlbMiss val s2_mmio = io.in.bits.mmio - val s2_exception = io.in.bits.uop.cf.exceptionVec.asUInt.orR + val s2_exception = selectLoad(io.in.bits.uop.cf.exceptionVec, false).asUInt.orR val s2_cache_miss = io.dcacheResp.bits.miss val s2_cache_replay = io.dcacheResp.bits.replay @@ -192,7 +193,9 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { // so we do not need to care about flush in load / store unit's out.valid io.out.bits := io.in.bits io.out.bits.data := rdataPartialLoad - io.out.bits.miss := s2_cache_miss && !fullForward + // when exception occurs, set it to not miss and let it write back to roq (via int port) + io.out.bits.miss := s2_cache_miss && !fullForward && !s2_exception + io.out.bits.uop.ctrl.fpWen := io.in.bits.uop.ctrl.fpWen && !s2_exception io.out.bits.mmio := s2_mmio io.in.ready := io.out.ready || !io.in.valid @@ -269,12 +272,14 @@ class LoadUnit extends XSModule with HasLoadHelper { // Load queue will be updated at s2 for both hit/miss int/fp load io.lsq.loadIn.valid := load_s2.io.out.valid io.lsq.loadIn.bits := load_s2.io.out.bits - val s2Valid = load_s2.io.out.valid && (!load_s2.io.out.bits.miss || load_s2.io.out.bits.uop.cf.exceptionVec.asUInt.orR) + + // write to rob and writeback bus + val s2_wb_valid = load_s2.io.out.valid && !load_s2.io.out.bits.miss val refillFpLoad = io.lsq.ldout.bits.uop.ctrl.fpWen // Int load, if hit, will be writebacked at s2 val intHitLoadOut = Wire(Valid(new ExuOutput)) - intHitLoadOut.valid := s2Valid && !load_s2.io.out.bits.uop.ctrl.fpWen + intHitLoadOut.valid := s2_wb_valid && !load_s2.io.out.bits.uop.ctrl.fpWen intHitLoadOut.bits.uop := load_s2.io.out.bits.uop intHitLoadOut.bits.data := load_s2.io.out.bits.data intHitLoadOut.bits.redirectValid := false.B @@ -288,10 +293,10 @@ class LoadUnit extends XSModule with HasLoadHelper { io.ldout.bits := Mux(intHitLoadOut.valid, intHitLoadOut.bits, io.lsq.ldout.bits) io.ldout.valid := intHitLoadOut.valid || io.lsq.ldout.valid && !refillFpLoad - + // Fp load, if hit, will be send to recoder at s2, then it will be recoded & writebacked at s3 val fpHitLoadOut = Wire(Valid(new ExuOutput)) - fpHitLoadOut.valid := s2Valid && load_s2.io.out.bits.uop.ctrl.fpWen + fpHitLoadOut.valid := s2_wb_valid && load_s2.io.out.bits.uop.ctrl.fpWen fpHitLoadOut.bits := intHitLoadOut.bits val fpLoadOut = Wire(Valid(new ExuOutput)) diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index 5c1403ea635752f35ab7f16cacc1ca1ea53866cd..c4868d2783c8fb500ef91192b87ab2d655e5a942 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -85,11 +85,12 @@ class StoreUnit_S1 extends XSModule { io.lsq.bits := io.in.bits io.lsq.bits.paddr := s1_paddr io.lsq.bits.miss := false.B - io.lsq.bits.mmio := AddressSpace.isMMIO(s1_paddr) + io.lsq.bits.mmio := io.dtlbResp.bits.mmio io.lsq.bits.uop.cf.exceptionVec(storePageFault) := io.dtlbResp.bits.excp.pf.st + io.lsq.bits.uop.cf.exceptionVec(storeAccessFault) := io.dtlbResp.bits.excp.af.st // mmio inst with exception will be writebacked immediately - val hasException = io.out.bits.uop.cf.exceptionVec.asUInt.orR + val hasException = selectStore(io.out.bits.uop.cf.exceptionVec, false).asUInt.orR io.out.valid := io.in.valid && (!io.out.bits.mmio || hasException) && !s1_tlb_miss io.out.bits := io.lsq.bits @@ -101,6 +102,18 @@ class StoreUnit_S1 extends XSModule { } class StoreUnit_S2 extends XSModule { + val io = IO(new Bundle() { + val in = Flipped(Decoupled(new LsPipelineBundle)) + val out = Decoupled(new LsPipelineBundle) + }) + + io.in.ready := true.B + io.out.bits := io.in.bits + io.out.valid := io.in.valid + +} + +class StoreUnit_S3 extends XSModule { val io = IO(new Bundle() { val in = Flipped(Decoupled(new LsPipelineBundle)) val stout = DecoupledIO(new ExuOutput) // writeback store @@ -133,6 +146,7 @@ class StoreUnit extends XSModule { val store_s0 = Module(new StoreUnit_S0) val store_s1 = Module(new StoreUnit_S1) val store_s2 = Module(new StoreUnit_S2) + val store_s3 = Module(new StoreUnit_S3) store_s0.io.in <> io.stin store_s0.io.dtlbReq <> io.dtlb.req @@ -145,7 +159,9 @@ class StoreUnit extends XSModule { PipelineConnect(store_s1.io.out, store_s2.io.in, true.B, store_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect)) - store_s2.io.stout <> io.stout + PipelineConnect(store_s2.io.out, store_s3.io.in, true.B, store_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect)) + + store_s3.io.stout <> io.stout private def printPipeLine(pipeline: LsPipelineBundle, cond: Bool, name: String): Unit = { XSDebug(cond, diff --git a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala index 5d56705a417dc39d9dbd5f89ac4917185d6d7c99..37b9d9a45749bf6df15412020c17af3723d06e45 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala @@ -13,9 +13,9 @@ trait HasSbufferCst extends HasXSParameter { def s_prepare = 2.U(2.W) def s_inflight = 3.U(2.W) - val evictCycle = 8192 + val evictCycle = 1 << 20 require(isPow2(evictCycle)) - val countBits = 1 + log2Up(evictCycle) + val countBits = log2Up(evictCycle+1) val SbufferIndexWidth: Int = log2Up(StoreBufferSize) // paddr = tag + offset @@ -108,6 +108,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { val in = Vec(StorePipelineWidth, Flipped(Decoupled(new DCacheWordReq))) //Todo: store logic only support Width == 2 now val dcache = new DCacheLineIO val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) + val sqempty = Input(Bool()) val flush = new Bundle { val valid = Input(Bool()) val empty = Output(Bool()) @@ -291,7 +292,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { do_eviction := validCount >= 12.U - io.flush.empty := empty + io.flush.empty := empty && io.sqempty lru.io.flush := sbuffer_state === x_drain_sbuffer && empty switch(sbuffer_state){ is(x_idle){ diff --git a/src/main/scala/xiangshan/package.scala b/src/main/scala/xiangshan/package.scala index 58ab2181c3f2658770d8f3bfdff976355e4ed920..da03c1b122674be599df51e37111cd4726155748 100644 --- a/src/main/scala/xiangshan/package.scala +++ b/src/main/scala/xiangshan/package.scala @@ -48,10 +48,11 @@ package object xiangshan { def apply() = UInt(log2Up(num).W) - def isIntExu(fuType: UInt) = !fuType(3) + def isIntExu(fuType: UInt) = !fuType(3) + def isJumpExu(fuType: UInt) = fuType === jmp def isFpExu(fuType: UInt) = fuType(3, 2) === "b10".U def isMemExu(fuType: UInt) = fuType(3, 2) === "b11".U - def isLoadExu(fuType: UInt) = fuType === ldu || fuType===mou + def isLoadExu(fuType: UInt) = fuType === ldu || fuType === mou def isStoreExu(fuType: UInt) = fuType === stu val functionNameMap = Map( @@ -108,4 +109,44 @@ package object xiangshan { def flushItself(level: UInt) = level(0) def isException(level: UInt) = level(1) && level(0) } + + object ExceptionVec { + def apply() = Vec(16, Bool()) + } + + object PMAMode { + def R = "b1".U << 0 //readable + def W = "b1".U << 1 //writeable + def X = "b1".U << 2 //executable + def I = "b1".U << 3 //cacheable: icache + def D = "b1".U << 4 //cacheable: dcache + def S = "b1".U << 5 //enable speculative access + def A = "b1".U << 6 //enable atomic operation, A imply R & W + def C = "b1".U << 7 //if it is cacheable is configable + def Reserved = "b0".U + + def apply() = UInt(7.W) + + def read(mode: UInt) = mode(0) + def write(mode: UInt) = mode(1) + def execute(mode: UInt) = mode(2) + def icache(mode: UInt) = mode(3) + def dcache(mode: UInt) = mode(4) + def speculate(mode: UInt) = mode(5) + def atomic(mode: UInt) = mode(6) + def configable_cache(mode: UInt) = mode(7) + + def strToMode(s: String) = { + var result = 0.U << 8 + if (s.toUpperCase.indexOf("R") >= 0) result = result + R + if (s.toUpperCase.indexOf("W") >= 0) result = result + W + if (s.toUpperCase.indexOf("X") >= 0) result = result + X + if (s.toUpperCase.indexOf("I") >= 0) result = result + I + if (s.toUpperCase.indexOf("D") >= 0) result = result + D + if (s.toUpperCase.indexOf("S") >= 0) result = result + S + if (s.toUpperCase.indexOf("A") >= 0) result = result + A + if (s.toUpperCase.indexOf("C") >= 0) result = result + C + result + } + } } diff --git a/src/test/csrc/ram.cpp b/src/test/csrc/ram.cpp index d52f8964b7e19d9b39b9bd067bae35ab82d82af2..d7192983ad7be7fadb21c1605d53469a436b91cc 100644 --- a/src/test/csrc/ram.cpp +++ b/src/test/csrc/ram.cpp @@ -4,6 +4,7 @@ #include "ram.h" #include "compress.h" +// #define TLB_UNITTEST #ifdef WITH_DRAMSIM3 #include "cosimulation.h" @@ -84,8 +85,8 @@ void addpageSv39() { //pdde[2] = ((0x80000000&0xc0000000) >> 2) | 0xf; for(int i = 0; i < PTENUM ;i++) { - pde[i] = ((PTEADDR(i)&0xfffff000)>>2) | 0x1; - //pde[i] = (((0x8000000+i*2*1024*1024)&0xffe00000)>>2) | 0xf; + // pde[i] = ((PTEADDR(i)&0xfffff000)>>2) | 0x1; + pde[i] = (((0x80000000+i*2*1024*1024)&0xffe00000)>>2) | 0xf; } for(int outidx = 0; outidx < PTENUM; outidx++ ) { @@ -94,6 +95,7 @@ void addpageSv39() { } } + printf("try to add identical tlb page to ram\n"); memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM+PDEDEVNUM+PTEDEVNUM)),ptedev,PAGESIZE*PTEDEVNUM); memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM+PDEDEVNUM)),pdedev,PAGESIZE*PDEDEVNUM); memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM)),ptemmio, PAGESIZE*PTEMMIONUM); @@ -117,6 +119,12 @@ void init_ram(const char *img) { assert(0); } +#ifdef TLB_UNITTEST + //new add + addpageSv39(); + //new end +#endif + int ret; if (isGzFile(img)) { printf("Gzip file detected and loading image from extracted gz file\n"); @@ -143,12 +151,6 @@ void init_ram(const char *img) { fclose(fp); } -#ifdef TLB_UNITTEST - //new add - addpageSv39(); - //new end -#endif - #ifdef WITH_DRAMSIM3 #if !defined(DRAMSIM3_CONFIG) || !defined(DRAMSIM3_OUTDIR) #error DRAMSIM3_CONFIG or DRAMSIM3_OUTDIR is not defined