......@@ -28,16 +28,15 @@ help:
mkdir -p $(@D)
mill XiangShan.test.runMain $(SIMTOP) -td $(@D) --full-stacktrace --output-file $(@F) --disable-all --fpga-platform --remove-assert --infer-rw --repl-seq-mem -c:$(SIMTOP):-o:$(@D)/$(@F).conf $(SIM_ARGS)
# mill XiangShan.runMain top.$(TOP) -X verilog -td $(@D) --output-file $(@F) --infer-rw $(FPGATOP) --repl-seq-mem -c:$(FPGATOP):-o:$(@D)/$(@F).conf
# $(MEM_GEN) $(@D)/$(@F).conf >> $@
$(MEM_GEN) $(@D)/$(@F).conf >> $@
# sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g' $@
# @git log -n 1 >> .__head__
# @git diff >> .__diff__
# @sed -i 's/^/\/\// ' .__head__
# @sed -i 's/^/\/\//' .__diff__
# @cat .__head__ .__diff__ $@ > .__out__
# @mv .__out__ $@
# @rm .__head__ .__diff__
@git log -n 1 >> .__head__
@git diff >> .__diff__
@sed -i 's/^/\/\// ' .__head__
@sed -i 's/^/\/\//' .__diff__
@cat .__head__ .__diff__ $@ > .__out__
@mv .__out__ $@
@rm .__head__ .__diff__
deploy: build/top.zip
Subproject commit ca387163b32f20406d443bdab34bc034d5281b51
Subproject commit cf429e420be6702a2e24b9b91910366187c103b4
export NOOP_HOME=$(pwd)/..
......@@ -61,7 +61,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter {
cacheName = s"L2"
writeBytes = 8
writeBytes = 32
......@@ -79,7 +79,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter {
cacheName = "L3"
writeBytes = 8
writeBytes = 32
......@@ -170,6 +170,7 @@ class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter {
xs_core(i).module.io.externalInterrupt.msip := clint.module.io.msip(i)
// xs_core(i).module.io.externalInterrupt.meip := RegNext(RegNext(io.meip(i)))
xs_core(i).module.io.externalInterrupt.meip := plic.module.io.extra.get.meip(i)
xs_core(i).module.io.l2ToPrefetcher <> l2cache(i).module.io
// do not let dma AXI signals be optimized out
* Copyright (c) 2020 Institute of Computing Technology, CAS
* Copyright (c) 2020 University of Chinese Academy of Sciences
* NutShell is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* See the Mulan PSL v2 for more details.
package utils
import chisel3._
import chisel3.experimental.{ChiselAnnotation, annotate}
import chisel3.util._
import firrtl.annotations.Annotation
import freechips.rocketchip.transforms.naming.OverrideDesiredNameAnnotation
class SRAMBundleA(val set: Int) extends Bundle {
val setIdx = Output(UInt(log2Up(set).W))
......@@ -22,7 +35,7 @@ class SRAMBundleAW[T <: Data](private val gen: T, set: Int, val way: Int = 1) ex
def apply(data: T, setIdx: UInt, waymask: UInt) = {
this.data := data
this.waymask.foreach(_ := waymask)
this.waymask.map(_ := waymask)
......@@ -52,90 +65,15 @@ class SRAMWriteBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1
abstract class SRAMTemplate extends Module {
def read(addr: UInt, ren: Bool): Vec[UInt]
def write(addr: UInt, wen: Bool, wdata: UInt, wmask: UInt): Unit
class SinglePortSRAM(set: Int, way: Int, width: Int) extends SRAMTemplate {
val io = IO(new Bundle() {
val addr = Input(UInt(log2Up(set).W))
val ren = Input(Bool())
val rdata = Output(Vec(way, UInt(width.W)))
val wdata = Input(UInt(width.W))
val wen = Input(Bool())
val wmask = Input(UInt(way.W))
val mem = SyncReadMem(set, Vec(way, UInt(width.W)))
val addr = io.addr
mem.write(addr, VecInit(Seq.fill(way)(io.wdata)), io.wmask.asBools())
io.rdata := mem.read(addr, io.ren && !io.wen)
override def read(addr: UInt, ren: Bool): Vec[UInt] = {
io.addr := addr
io.ren := ren
override def write(addr: UInt, wen: Bool, wdata: UInt, wmask: UInt): Unit = {
io.addr := addr
io.wen := wen
io.wdata := wdata
io.wmask := wmask
class DualPortSRAM(set: Int, way: Int, width: Int) extends SRAMTemplate {
val io = IO(new Bundle() {
val raddr = Input(UInt(log2Up(set).W))
val ren = Input(Bool())
val rdata = Output(Vec(way, UInt(width.W)))
val waddr = Input(UInt(log2Up(set).W))
val wdata = Input(UInt(width.W))
val wen = Input(Bool())
val wmask = Input(UInt(way.W))
val mem = SyncReadMem(set, Vec(way, UInt(width.W)))
io.rdata := mem.read(io.raddr, io.ren)
mem.write(io.waddr, VecInit(Seq.fill(way)(io.wdata)), io.wmask.asBools())
override def read(addr: UInt, ren: Bool): Vec[UInt] = {
io.raddr := addr
io.ren := ren
override def write(addr: UInt, wen: Bool, wdata: UInt, wmask: UInt): Unit = {
io.waddr := addr
io.wen := wen
io.wdata := wdata
io.wmask := wmask
class SRAMWrapper[T <: Data]
sramName: String,
gen: T, set: Int, way: Int = 1,
shouldReset: Boolean = false,
holdRead: Boolean = false,
singlePort: Boolean = false
) extends Module {
class SRAMTemplate[T <: Data](gen: T, set: Int, way: Int = 1,
shouldReset: Boolean = false, holdRead: Boolean = false, singlePort: Boolean = false) extends Module {
val io = IO(new Bundle {
val r = Flipped(new SRAMReadBus(gen, set, way))
val w = Flipped(new SRAMWriteBus(gen, set, way))
val wordType = UInt(gen.getWidth.W)
// val array = SyncReadMem(set, Vec(way, wordType))
val array: SRAMTemplate = if(singlePort) {
Module(new SinglePortSRAM(set, way, gen.getWidth))
} else {
Module(new DualPortSRAM(set, way, gen.getWidth))
val array = SyncReadMem(set, Vec(way, wordType))
val (resetState, resetSet) = (WireInit(false.B), WireInit(0.U))
if (shouldReset) {
......@@ -148,38 +86,31 @@ class SRAMWrapper[T <: Data]
val (ren, wen) = (io.r.req.valid, io.w.req.valid || resetState)
val realRen = ren //(if (singlePort) ren && !wen else ren) do mutex inside inner sram
val realRen = (if (singlePort) ren && !wen else ren)
val setIdx = Mux(resetState, resetSet,
if(singlePort) Mux(io.w.req.valid, io.w.req.bits.setIdx, io.r.req.bits.setIdx)
else io.w.req.bits.setIdx
val setIdx = Mux(resetState, resetSet, io.w.req.bits.setIdx)
val wdataword = Mux(resetState, 0.U.asTypeOf(wordType), io.w.req.bits.data.asUInt)
val waymask = Mux(resetState, Fill(way, "b1".U), io.w.req.bits.waymask.getOrElse("b1".U))
array.write(setIdx, wen, wdataword, waymask)
val wdata = VecInit(Seq.fill(way)(wdataword))
when (wen) { array.write(setIdx, wdata, waymask.asBools) }
val rdataWire = if(singlePort) array.read(setIdx, realRen) else array.read(io.r.req.bits.setIdx, realRen)
val rdata = (if(holdRead) HoldUnless(rdataWire, RegNext(realRen)) else rdataWire).map(_.asTypeOf(gen))
val rdata = (if (holdRead) ReadAndHold(array, io.r.req.bits.setIdx, realRen)
else array.read(io.r.req.bits.setIdx, realRen)).map(_.asTypeOf(gen))
io.r.resp.data := VecInit(rdata)
io.r.req.ready := !resetState && (if (singlePort) !wen else true.B)
io.w.req.ready := true.B
val prefix = if(singlePort) "SinglePortSRAM_" else "DualPortSRAM_"
annotate(new ChiselAnnotation {
override def toFirrtl: Annotation = OverrideDesiredNameAnnotation(s"$prefix$sramName", array.toAbsoluteTarget)
class SRAMTemplateWithArbiter[T <: Data](sramName: String, nRead: Int, gen: T, set: Int, way: Int = 1,
class SRAMTemplateWithArbiter[T <: Data](nRead: Int, gen: T, set: Int, way: Int = 1,
shouldReset: Boolean = false) extends Module {
val io = IO(new Bundle {
val r = Flipped(Vec(nRead, new SRAMReadBus(gen, set, way)))
val w = Flipped(new SRAMWriteBus(gen, set, way))
val ram = Module(new SRAMWrapper(sramName, gen, set, way, shouldReset, holdRead = false, singlePort = true))
val ram = Module(new SRAMTemplate(gen, set, way, shouldReset, holdRead = false, singlePort = true))
ram.io.w <> io.w
val readArb = Module(new Arbiter(chiselTypeOf(io.r(0).req.bits), nRead))
......@@ -187,7 +118,7 @@ class SRAMTemplateWithArbiter[T <: Data](sramName: String, nRead: Int, gen: T, s
ram.io.r.req <> readArb.io.out
// latch read results
io.r.map { r => {
io.r.map{ case r => {
r.resp.data := HoldUnless(ram.io.r.resp.data, RegNext(r.req.fire()))
......@@ -10,7 +10,7 @@ import xiangshan.backend.exu.Exu._
import xiangshan.frontend._
import xiangshan.mem._
import xiangshan.backend.fu.HasExceptionNO
import xiangshan.cache.{DCache,InstrUncache, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache}
import xiangshan.cache.{DCache,InstrUncache, DCacheParameters, ICache, ICacheParameters, L1plusCache, L1plusCacheParameters, PTW, Uncache, MemoryOpConstants, MissReq}
import xiangshan.cache.prefetch._
import chipsalliance.rocketchip.config
import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp}
......@@ -19,6 +19,7 @@ import freechips.rocketchip.devices.tilelink.{DevNullParams, TLError}
import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters}
import freechips.rocketchip.amba.axi4.{AXI4Deinterleaver, AXI4Fragmenter, AXI4IdIndexer, AXI4IdentityNode, AXI4ToTL, AXI4UserYanker}
import freechips.rocketchip.tile.HasFPUParameters
import sifive.blocks.inclusivecache.PrefetcherIO
import utils._
case class XSCoreParameters
......@@ -236,7 +237,7 @@ trait HasXSParameter {
// dcache prefetcher
val l2PrefetcherParameters = L2PrefetcherParameters(
enable = true,
_type = "stream",
_type = "bop",// "stream" or "bop"
streamParams = StreamPrefetchParameters(
streamCnt = 4,
streamSize = 4,
......@@ -244,7 +245,16 @@ trait HasXSParameter {
blockBytes = L2BlockSize,
reallocStreamOnMissInstantly = true,
cacheName = "dcache"
bopParams = BOPParameters(
rrTableEntries = 256,
rrTagBits = 12,
scoreBits = 5,
roundMax = 50,
badScore = 1,
blockBytes = L2BlockSize,
nEntries = dcacheParameters.nMissEntries * 2 // TODO: this is too large
......@@ -337,6 +347,7 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer)
val io = IO(new Bundle {
val externalInterrupt = new ExternalInterruptIO
val l2ToPrefetcher = Flipped(new PrefetcherIO(PAddrBits))
println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}")
......@@ -451,7 +462,16 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer)
ptw.io.sfence <> integerBlock.io.fenceio.sfence
ptw.io.csr <> integerBlock.io.csrio.tlb
l2Prefetcher.io.in <> memBlock.io.toDCachePrefetch
val l2PrefetcherIn = Wire(Decoupled(new MissReq))
if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "bop") {
l2PrefetcherIn.valid := io.l2ToPrefetcher.acquire.valid
l2PrefetcherIn.bits := DontCare
l2PrefetcherIn.bits.addr := io.l2ToPrefetcher.acquire.bits.address
l2PrefetcherIn.bits.cmd := Mux(io.l2ToPrefetcher.acquire.bits.write, MemoryOpConstants.M_XWR, MemoryOpConstants.M_XRD)
} else {
l2PrefetcherIn <> memBlock.io.toDCachePrefetch
l2Prefetcher.io.in <> l2PrefetcherIn
if (!env.FPGAPlatform) {
val debugIntReg, debugFpReg = WireInit(VecInit(Seq.fill(32)(0.U(XLEN.W))))
......@@ -41,7 +41,7 @@ class Radix2Divider(len: Int) extends AbstractDivider(len) {
val uopReg = RegEnable(uop, newReq)
val cnt = Counter(len)
when (newReq) {
when (newReq && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn)) {
state := s_log2
} .elsewhen (state === s_log2) {
// `canSkipShift` is calculated as following:
......@@ -37,7 +37,9 @@ class SRT4Divider(len: Int) extends AbstractDivider(len) {
when(io.in.fire()){ state := Mux(divZero, s_finish, s_lzd) }
when (io.in.fire() && !io.in.bits.uop.roqIdx.needFlush(io.redirectIn)) {
state := Mux(divZero, s_finish, s_lzd)
is(s_lzd){ // leading zero detection
state := s_normlize
......@@ -2,7 +2,7 @@ package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.{Code, RandomReplacement, HasTLDump, XSDebug, SRAMWrapper}
import utils.{Code, RandomReplacement, HasTLDump, XSDebug, SRAMTemplate}
import xiangshan.{HasXSLog}
import chipsalliance.rocketchip.config.Parameters
......@@ -130,7 +130,7 @@ class L1plusCacheDataArray extends L1plusCacheModule {
io.read.ready := !rwhazard
for (w <- 0 until nWays) {
val array = Module(new SRAMWrapper("L1Plus_Data", Bits((blockRows * encRowBits).W), set=nSets, way=1,
val array = Module(new SRAMTemplate(Bits((blockRows * encRowBits).W), set=nSets, way=1,
shouldReset=false, holdRead=false, singlePort=singlePort))
// data write
array.io.w.req.valid := io.write.bits.way_en(w) && io.write.valid
......@@ -209,7 +209,7 @@ class L1plusCacheMetadataArray extends L1plusCacheModule {
val rmask = Mux((nWays == 1).B, (-1).asSInt, io.read.bits.way_en.asSInt).asBools
def encTagBits = cacheParams.tagCode.width(tagBits)
val tag_array = Module(new SRAMWrapper("L1Plus_Meta", UInt(encTagBits.W), set=nSets, way=nWays,
val tag_array = Module(new SRAMTemplate(UInt(encTagBits.W), set=nSets, way=nWays,
shouldReset=false, holdRead=false, singlePort=true))
val valid_array = Reg(Vec(nSets, UInt(nWays.W)))
when (reset.toBool || io.flush) {
......@@ -3,7 +3,7 @@ package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.{ClientMetadata, TLClientParameters, TLEdgeOut}
import utils.{Code, RandomReplacement, XSDebug, SRAMWrapper}
import utils.{Code, RandomReplacement, XSDebug, SRAMTemplate}
import scala.math.max
......@@ -197,8 +197,7 @@ class DuplicatedDataArray extends AbstractDataArray
io.resp(j)(w)(r) := Cat((0 until rowWords).reverse map (k => resp(k)))
for (k <- 0 until rowWords) {
val array = Module(new SRAMWrapper(
val array = Module(new SRAMTemplate(
......@@ -245,7 +244,7 @@ class L1MetadataArray(onReset: () => L1Metadata) extends DCacheModule {
val metaBits = rstVal.getWidth
val encMetaBits = cacheParams.tagCode.width(metaBits)
val tag_array = Module(new SRAMWrapper("Dcache_Meta", UInt(encMetaBits.W), set=nSets, way=nWays,
val tag_array = Module(new SRAMTemplate(UInt(encMetaBits.W), set=nSets, way=nWays,
shouldReset=false, holdRead=false, singlePort=true))
// tag write
......@@ -191,8 +191,7 @@ class ICacheMetaArray extends ICachArray
val readResp = Output(Vec(nWays,UInt(tagBits.W)))
val metaArray = Module(new SRAMWrapper(
val metaArray = Module(new SRAMTemplate(
......@@ -233,8 +232,7 @@ class ICacheDataArray extends ICachArray
//dataEntryBits = 144
val dataArray = List.fill(nWays){List.fill(nBanks){Module(new SRAMWrapper(
val dataArray = List.fill(nWays){List.fill(nBanks){Module(new SRAMTemplate(
way = 1,
......@@ -495,8 +495,8 @@ class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
if (!env.FPGAPlatform) {
start = entry.io.req.fire(),
stop = entry.io.resp.fire(),
start = entry.io.block_idx.valid,
stop = !entry.io.block_idx.valid,
startHighPriority = true),
"perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10),
......@@ -12,21 +12,24 @@ case class BOPParameters(
scoreBits: Int,
roundMax: Int,
badScore: Int,
scores: Int = 52,
// TODO: Is 256-offset necessary, which will cross pages?
offsetList: Seq[Int] = Seq(
1, 2, 3, 4, 5, 6, 8, 9, 10, 12,
15, 16, 18, 20, 24, 25, 27, 30, 32, 36,
15, 16/*, 18, 20, 24, 25, 27, 30, 32, 36,
40, 45, 48, 50, 54, 60, 64, 72, 75, 80,
81, 90, 96, 100, 108, 120, 125, 128, 135, 144,
150, 160, 162, 180, 192, 200, 216, 225, 240, 243,
250, 256
250, 256*/
blockBytes: Int
blockBytes: Int,
nEntries: Int
) {
def scores = offsetList.length
def offsetWidth = log2Up(offsetList(scores - 1)) + 1
def rrIdxBits = log2Up(rrTableEntries)
def roundBits = log2Up(roundMax)
def scoreMax = (1 << scoreBits) - 1
def totalWidth = log2Up(nEntries) // id's width
class ScoreTableEntry(p: BOPParameters) extends PrefetchBundle {
......@@ -34,7 +37,7 @@ class ScoreTableEntry(p: BOPParameters) extends PrefetchBundle {
val score = UInt(p.scoreBits.W)
def apply(offset: UInt, score: UInt) = {
val entry = new ScoreTableEntry(p)
val entry = Wire(new ScoreTableEntry(p))
entry.offset := offset
entry.score := score
......@@ -78,9 +81,51 @@ class TestOffsetBundle(p: BOPParameters) extends PrefetchBundle {
override def cloneType: this.type = (new TestOffsetBundle(p)).asInstanceOf[this.type]
// Prefetch request emitted by the best-offset prefetcher: a PrefetchReq extended with an `id` field.
class BestOffsetPrefetchReq(p: BOPParameters) extends PrefetchReq {
// Request id, p.totalWidth bits wide.
val id = UInt(p.totalWidth.W)
// Debug formatting: prints the address, the write flag and the id on one line.
override def toPrintable: Printable = {
p"addr=0x${Hexadecimal(addr)} w=${write} id=0x${Hexadecimal(id)}"
// Rebuild the bundle from the same parameter object so that it can be cloned.
override def cloneType: this.type = (new BestOffsetPrefetchReq(p)).asInstanceOf[this.type]
// Response delivered to the best-offset prefetcher: a PrefetchResp extended with an `id` field.
class BestOffsetPrefetchResp(p: BOPParameters) extends PrefetchResp {
// Id of the request being answered, p.totalWidth bits wide.
val id = UInt(p.totalWidth.W)
override def toPrintable: Printable = {
// Rebuild the bundle from the same parameter object so that it can be cloned.
override def cloneType: this.type = (new BestOffsetPrefetchResp(p)).asInstanceOf[this.type]
// Finish message sent by the best-offset prefetcher: a PrefetchFinish extended with an `id` field.
class BestOffsetPrefetchFinish(p: BOPParameters) extends PrefetchFinish {
// Id of the prefetch being finished, p.totalWidth bits wide.
val id = UInt(p.totalWidth.W)
override def toPrintable: Printable = {
// Rebuild the bundle from the same parameter object so that it can be cloned.
override def cloneType: this.type = (new BestOffsetPrefetchFinish(p)).asInstanceOf[this.type]
// Port bundle of the best-offset prefetcher: a training input plus req/resp/finish handshake channels.
class BestOffsetPrefetchIO(p: BOPParameters) extends PrefetchBundle {
// Training input; ValidIO, so there is no ready signal and it cannot be back-pressured.
val train = Flipped(ValidIO(new PrefetchTrain))
// Outgoing prefetch request (ready/valid handshake).
val req = DecoupledIO(new BestOffsetPrefetchReq(p))
// Incoming response (ready/valid handshake, direction flipped).
val resp = Flipped(DecoupledIO(new BestOffsetPrefetchResp(p)))
// Outgoing finish message (ready/valid handshake).
val finish = DecoupledIO(new BestOffsetPrefetchFinish(p))
// Debug formatting: prints valid (and ready, where present) plus the payload of every channel.
override def toPrintable: Printable = {
p"train: v=${train.valid} ${train.bits} " +
p"req: v=${req.valid} r=${req.ready} ${req.bits} " +
p"resp: v=${resp.valid} r=${resp.ready} ${resp.bits} " +
p"finish: v=${finish.valid} r=${finish.ready} ${finish.bits}"
// Rebuild the bundle from the same parameter object so that it can be cloned.
override def cloneType: this.type = (new BestOffsetPrefetchIO(p)).asInstanceOf[this.type]
class RecentRequestTable(p: BOPParameters) extends PrefetchModule {
val io = IO(new Bundle {
val w = Flipped(ValidIO(UInt(PAddrBits.W)))
val w = Flipped(DecoupledIO(UInt(PAddrBits.W)))
val r = Flipped(new TestOffsetBundle(p))
def rrIdxBits = p.rrIdxBits
......@@ -108,10 +153,10 @@ class RecentRequestTable(p: BOPParameters) extends PrefetchModule {
val rrTable = Module(new SRAMWrapper("RR_Table", rrTableEntry(), set = rrTableEntries, way = 1, shouldReset = true))
val rrTable = Module(new SRAMTemplate(rrTableEntry(), set = rrTableEntries, way = 1, shouldReset = true, singlePort = true))
val wAddr = io.w.bits
rrTable.io.w.req.valid := io.w.valid
rrTable.io.w.req.valid := io.w.valid && !io.r.req.valid
rrTable.io.w.req.bits.setIdx := idx(wAddr)
rrTable.io.w.req.bits.data.valid := true.B
rrTable.io.w.req.bits.data.tag := tag(wAddr)
......@@ -122,32 +167,35 @@ class RecentRequestTable(p: BOPParameters) extends PrefetchModule {
rrTable.io.r.req.bits.setIdx := idx(rAddr)
rData := rrTable.io.r.resp.data(0)
val rwConflict = io.w.valid && io.r.req.fire() && idx(wAddr) === idx(rAddr)
when (rwConflict) {
rrTable.io.r.req.valid := false.B
when (RegNext(rwConflict)) {
rData.valid := true.B
rData.tag := RegNext(tag(wAddr))
val rwConflict = io.w.fire() && io.r.req.fire() && idx(wAddr) === idx(rAddr)
// when (rwConflict) {
// rrTable.io.r.req.valid := false.B
// }
// when (RegNext(rwConflict)) {
// rData.valid := true.B
// rData.tag := RegNext(tag(wAddr))
// }
io.w.ready := rrTable.io.w.req.ready && !io.r.req.valid
io.r.req.ready := true.B
io.r.resp.valid := RegNext(io.r.req.fire())
io.r.resp.valid := RegNext(rrTable.io.r.req.fire())
io.r.resp.bits.testOffset := RegNext(io.r.req.bits.testOffset)
io.r.resp.bits.ptr := RegNext(io.r.req.bits.ptr)
io.r.resp.bits.hit := rData.valid && rData.tag === RegNext(tag(rAddr))
assert(!RegNext(rwConflict), "single port SRAM should not read and write at the same time")
// debug info
XSDebug(io.w.valid, p"io.write: v=${io.w.valid} addr=0x${Hexadecimal(io.w.bits)}\n")
XSDebug(io.w.fire(), p"io.write: v=${io.w.valid} addr=0x${Hexadecimal(io.w.bits)}\n")
XSDebug(p"io.read: ${io.r}\n")
XSDebug(io.w.valid, p"wAddr=0x${Hexadecimal(wAddr)} idx=${Hexadecimal(idx(wAddr))} tag=${Hexadecimal(tag(wAddr))}\n")
XSDebug(io.w.fire(), p"wAddr=0x${Hexadecimal(wAddr)} idx=${Hexadecimal(idx(wAddr))} tag=${Hexadecimal(tag(wAddr))}\n")
XSDebug(io.r.req.fire(), p"rAddr=0x${Hexadecimal(rAddr)} idx=${Hexadecimal(idx(rAddr))} rData=${rData}\n")
XSDebug(rwConflict, p"write and read conflict!\n")
class OffsetScoreTable(p: BOPParameters) extends PrefetchModule {
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(UInt(PAddrBits.W))) // req addr from L1
val prefetchOffset = Output(UInt(p.offsetWidth.W))
val test = new TestOffsetBundle(p)
......@@ -158,33 +206,34 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule {
def roundBits = p.roundBits
def roundMax = p.roundMax
def scoreMax = p.scoreMax
def badScore = p.badScore
val prefetchOffset = RegInit(1.U(offsetWidth)) // best offset is 1, this is, a next-line prefetcher as initialization
val prefetchOffset = RegInit(2.U(offsetWidth.W)) // the prefetch offset resets to 2, so the initial configuration is not a next-line (offset 1) prefetcher
val st = RegInit(VecInit(offsetList.map(off => new ScoreTableEntry(p).apply(off.U, 0.U))))
val ptr = RegInit(0.U(log2Up(scores).W))
val round = RegInit(0.U(roundBits.W))
val bestOffset = RegInit(new ScoreTableEntry(p).apply(1.U, 0.U)) // the entry with the highest score while traversing
val testOffset = WireInit(0.U(offsetWidth.W))
val bestOffset = RegInit(new ScoreTableEntry(p).apply(2.U, 0.U)) // the entry with the highest score while traversing
val testOffset = WireInit(st(ptr).offset)
def winner(e1: ScoreTableEntry, e2: ScoreTableEntry): ScoreTableEntry = {
val w = new ScoreTableEntry(p)
val w = Wire(new ScoreTableEntry(p))
w := Mux(e1.score > e2.score, e1, e2)
val s_idle :: s_learn :: s_finish :: Nil = Enum(3)
val s_idle :: s_learn :: Nil = Enum(2)
val state = RegInit(s_idle)
// 1. At the start of a learning phase
// All the scores are reset to 0.
// At the end of every learning phase, the prefetch offset is updated as the one with the highest score.
when (state === s_idle) {
when (ptr =/= scores.U) {
st(ptr).score := 0.U
ptr := ptr + 1.U
}.otherwise {
ptr := 0.U
state := s_learn
st.foreach(_.score := 0.U)
ptr := 0.U
round := 0.U
bestOffset.score := badScore.U
prefetchOffset := bestOffset.offset
state := s_learn
// 2. During a learning phase
......@@ -196,16 +245,18 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule {
// (1) one of the score equals SCOREMAX, or
// (2) the number of rounds equals ROUNDMAX.
when (state === s_learn) {
testOffset := st(ptr).offset
when (io.test.req.fire()) {
val roundFinish = ptr === (scores - 1).U
ptr := Mux(roundFinish, 0.U, ptr + 1.U)
round := Mux(roundFinish, round + 1.U, round)
XSDebug(p"test offset ${testOffset} req fire\n")
// (2) the number of rounds equals ROUNDMAX.
when (round === roundMax.U) {
state := s_finish
when (round >= roundMax.U) {
state := s_idle
XSDebug(p"round reaches roundMax(${roundMax.U})\n")
when (io.test.resp.fire() && io.test.resp.bits.hit) {
......@@ -216,25 +267,148 @@ class OffsetScoreTable(p: BOPParameters) extends PrefetchModule {
st(io.test.resp.bits.ptr).score := newScore
bestOffset := winner(new ScoreTableEntry(p).apply(offset, newScore), bestOffset)
// (1) one of the score equals SCOREMAX
when (newScore === scoreMax.U) {
state := s_finish
when (newScore >= scoreMax.U) {
state := s_idle
XSDebug(p"newScore reaches scoreMax(${scoreMax.U})\n")
// 3. At the end of every learning phase, the prefetch offset is updated as the one with the highest score.
when (state === s_finish) {
prefetchOffset := bestOffset.offset
ptr := 0.U
round := 0.U
bestOffset.offset := 1.U
bestOffset.score := 0.U
state := s_idle
XSDebug(p"test offset ${offset} resp fire and hit. score ${oldScore} -> ${newScore}\n")
io.req.ready := true.B
io.prefetchOffset := prefetchOffset
io.test.req.valid := state === s_learn && round =/= roundMax.U
io.test.req.bits.addr := DontCare // assign this outside the score table
io.test.req.valid := state === s_learn && io.req.fire()
io.test.req.bits.addr := io.req.bits
io.test.req.bits.testOffset := testOffset
io.test.req.bits.ptr := ptr
io.test.resp.ready := true.B
XSDebug(p"state=${state} prefetchOffset=${prefetchOffset} ptr=${ptr} round=${round} bestOffset=${bestOffset} testOffset=${testOffset}\n")
// score table
XSDebug(p"OffsetScoreTable(idx:offset:score) as follows:\n")
for (i <- 0 until scores) {
if (i % 8 == 0) { XSDebug(p"${i.U}:${st(i)}\t") }
else if (i % 8 == 7 || i == scores - 1) { XSDebug(false, true.B, p"${i.U}:${st(i)}\n") }
else { XSDebug(false, true.B, p"${i.U}:${st(i)}\t") }
XSDebug(io.req.fire(), p"receive req from L1. io.req.bits=0x${Hexadecimal(io.req.bits)}\n")
// One prefetch slot of the best-offset prefetcher.
// It tracks a single prefetch through the states
//   s_idle -> s_req -> s_resp -> s_write_recent_req -> s_finish -> s_idle
// and advances only when the handshake belonging to the current state fires.
class BestOffsetPrefetchEntry(p: BOPParameters) extends PrefetchModule {
val io = IO(new Bundle {
// Id attached to every request and finish message sent by this entry.
val id = Input(UInt(p.totalWidth.W))
// Offset added to the trained block address, expressed in blocks (it is shifted by log2Up(blockBytes) below).
val prefetchOffset = Input(UInt(p.offsetWidth.W))
// Train / req / resp / finish channels of this entry.
val pft = new BestOffsetPrefetchIO(p)
// Valid whenever the entry is not idle; bits carry the address being prefetched.
val inflight = ValidIO(UInt(PAddrBits.W))
// Block address of the training access, offered once the response has arrived.
val writeRRTable = DecoupledIO(UInt(PAddrBits.W))
def blockBytes = p.blockBytes
// Clear the low log2Up(blockBytes) bits to obtain the block-aligned address.
def getBlockAddr(addr: UInt) = Cat(addr(PAddrBits - 1, log2Up(blockBytes)), 0.U(log2Up(blockBytes).W))
val s_idle :: s_req :: s_resp :: s_write_recent_req :: s_finish :: Nil = Enum(5)
val state = RegInit(s_idle)
// Latched prefetch request (address and write flag).
val req = RegInit(0.U.asTypeOf(new PrefetchReq))
// Latched block address of the access that trained this entry.
val baseAddr = RegInit(0.U(PAddrBits.W))
// Idle: a training access is accepted only in this state; it latches the request and the base address.
when (state === s_idle) {
when (io.pft.train.valid) {
state := s_req
// Prefetch target = block address of the training access + prefetchOffset blocks.
req.addr := getBlockAddr(io.pft.train.bits.addr) + (io.prefetchOffset << log2Up(blockBytes))
req.write := io.pft.train.bits.write
baseAddr := getBlockAddr(io.pft.train.bits.addr)
// Request: wait until the prefetch request has been accepted.
when (state === s_req) {
when (io.pft.req.fire()) {
state := s_resp
// Response: wait for the response handshake.
when (state === s_resp) {
when (io.pft.resp.fire()) {
state := s_write_recent_req
// Record: wait until the base address has been accepted on writeRRTable.
when (state === s_write_recent_req) {
when (io.writeRRTable.fire()) {
state := s_finish
// Finish: send the finish message, then return to idle.
when (state === s_finish) {
when (io.pft.finish.fire()) {
state := s_idle
// Each output channel is valid (or ready) only in its own state.
io.pft.req.valid := state === s_req
io.pft.req.bits.addr := req.addr
io.pft.req.bits.write := req.write
io.pft.req.bits.id := io.id
io.pft.resp.ready := state === s_resp
io.pft.finish.valid := state === s_finish
io.pft.finish.bits.id := io.id
// The in-flight address is the prefetch target (req.addr), not the base address.
io.inflight.valid := state =/= s_idle
io.inflight.bits := req.addr
io.writeRRTable.valid := state === s_write_recent_req
io.writeRRTable.bits := baseAddr // write this into recent request table
XSDebug(p"bopEntry ${io.id}: state=${state} prefetchOffset=${io.prefetchOffset} inflight=${io.inflight.valid} 0x${Hexadecimal(io.inflight.bits)} writeRRTable: ${io.writeRRTable.valid} 0x${Hexadecimal(io.writeRRTable.bits)} baseAddr=0x${Hexadecimal(baseAddr)} req: ${req}\n")
XSDebug(p"bopEntry ${io.id}: io.pft: ${io.pft}\n")
// Top level of the best-offset prefetcher.
// It instantiates the offset score table, the recent-request table and p.nEntries prefetch entries,
// and arbitrates the entries' req / finish / recent-request-write channels onto single ports.
class BestOffsetPrefetch(p: BOPParameters) extends PrefetchModule {
val io = IO(new BestOffsetPrefetchIO(p))
def nEntries = p.nEntries
def blockBytes = p.blockBytes
// Clear the low log2Up(blockBytes) bits to obtain the block-aligned address.
def getBlockAddr(addr: UInt) = Cat(addr(PAddrBits - 1, log2Up(blockBytes)), 0.U(log2Up(blockBytes).W))
val scoreTable = Module(new OffsetScoreTable(p))
val rrTable = Module(new RecentRequestTable(p))
// One arbiter per outgoing channel, each with one input per entry.
val reqArb = Module(new Arbiter(new BestOffsetPrefetchReq(p), nEntries))
val finishArb = Module(new Arbiter(new BestOffsetPrefetchFinish(p), nEntries))
val writeRRTableArb = Module(new Arbiter(UInt(PAddrBits.W), nEntries))
// Index of the entry chosen to take the next training access (driven further below).
val entryReadyIdx = Wire(UInt(log2Up(nEntries).W))
// Bit i is set when entry i is in flight on the same block address as the current training access.
val inflightMatchVec = Wire(Vec(nEntries, Bool()))
val bopEntries = (0 until nEntries).map { i =>
val bopEntry = Module(new BestOffsetPrefetchEntry(p))
bopEntry.io.id := i.U
// Every entry uses the offset currently selected by the score table.
bopEntry.io.prefetchOffset := scoreTable.io.prefetchOffset
// Train only the selected entry, and only if no in-flight entry matches the trained block address.
bopEntry.io.pft.train.valid := io.train.valid && i.U === entryReadyIdx && !inflightMatchVec.asUInt.orR
bopEntry.io.pft.train.bits := io.train.bits
reqArb.io.in(i) <> bopEntry.io.pft.req
// Route the response to the entry whose index equals the response id.
bopEntry.io.pft.resp.valid := io.resp.valid && i.U === io.resp.bits.id
bopEntry.io.pft.resp.bits := io.resp.bits
finishArb.io.in(i) <> bopEntry.io.pft.finish
writeRRTableArb.io.in(i) <> bopEntry.io.writeRRTable
// Lowest-numbered entry that is not in flight.
// NOTE(review): when every entry is busy the encoder still produces an index; confirm the selected busy entry ignores the train.
entryReadyIdx := PriorityEncoder(bopEntries.map { e => !e.io.inflight.valid })
(0 until nEntries).foreach(i =>
inflightMatchVec(i) := bopEntries(i).io.inflight.valid && bopEntries(i).io.inflight.bits === getBlockAddr(io.train.bits.addr)
io.req <> reqArb.io.out
// The response is accepted when the entry addressed by its id is ready for it.
io.resp.ready := VecInit(bopEntries.zipWithIndex.map { case (e, i) => i.U === io.resp.bits.id && e.io.pft.resp.ready }).asUInt.orR
io.finish <> finishArb.io.out
// Entries write into the recent-request table; the score table reads it through its test port.
rrTable.io.w <> writeRRTableArb.io.out
rrTable.io.r <> scoreTable.io.test
// Every training access is also presented to the score table as a block address.
scoreTable.io.req.valid := io.train.valid
scoreTable.io.req.bits := getBlockAddr(io.train.bits.addr)
XSDebug(p"io: ${io}\n")
XSDebug(p"entryReadyIdx=${entryReadyIdx} inflightMatchVec=${Binary(inflightMatchVec.asUInt)}\n")
......@@ -15,13 +15,30 @@ import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters,
TLEdgeOut, TLBundleA, TLBundleD,
ClientStates, ClientMetadata, TLHints
import sifive.blocks.inclusivecache.PrefetcherIO
case class L2PrefetcherParameters(
enable: Boolean,
_type: String,
streamParams: StreamPrefetchParameters
streamParams: StreamPrefetchParameters,
bopParams: BOPParameters
) {
def nEntries: Int = streamParams.streamCnt * streamParams.streamSize
// def nEntries: Int = streamParams.streamCnt * streamParams.streamSize
def nEntries: Int = {
if (enable && _type == "stream") { streamParams.streamCnt * streamParams.streamSize }
else if (enable && _type == "bop") { bopParams.nEntries }
else 1
def totalWidth: Int = {
if (enable && _type == "stream") streamParams.totalWidth
else if (enable && _type == "bop") bopParams.totalWidth
else 1
def blockBytes: Int = {
if (enable && _type == "stream") streamParams.blockBytes
else if (enable && _type == "bop") bopParams.blockBytes
else 64
class L2Prefetcher()(implicit p: Parameters) extends LazyModule with HasPrefetchParameters {
......@@ -37,18 +54,41 @@ class L2Prefetcher()(implicit p: Parameters) extends LazyModule with HasPrefetch
lazy val module = new L2PrefetcherImp(this)
class L2PrefetcherIO extends XSBundle with HasPrefetchParameters {
val in = Flipped(DecoupledIO(new MissReq))
// prefetch DCache lines in L2 using StreamPrefetch
class L2PrefetcherImp(outer: L2Prefetcher) extends LazyModuleImp(outer) with HasPrefetchParameters with HasXSLog {
val io = IO(new Bundle {
val in = Flipped(DecoupledIO(new MissReq))
// prefetch
// val mem_acquire = Decoupled(new TLBundleA(edge.bundle))
// val mem_grant = Flipped(Decoupled(new TLBundleD(edge.bundle)))
// val mem_finish = Decoupled(new TLBundleE(edge.bundle))
val io = IO(new L2PrefetcherIO)
val (bus, edge) = outer.clientNode.out.head
if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "stream") {
if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "bop") {
val bopParams = l2PrefetcherParameters.bopParams
val dPrefetch = Module(new BestOffsetPrefetch(bopParams))
dPrefetch.io.train.valid := io.in.fire()
dPrefetch.io.train.bits.addr := io.in.bits.addr
dPrefetch.io.train.bits.write := MemoryOpConstants.isWrite(io.in.bits.cmd)
dPrefetch.io.train.bits.miss := true.B
io.in.ready := true.B
bus.a.valid := dPrefetch.io.req.valid
bus.a.bits := DontCare
bus.a.bits := edge.Hint(
fromSource = dPrefetch.io.req.bits.id,
toAddress = dPrefetch.io.req.bits.addr,
lgSize = log2Up(bopParams.blockBytes).U,
param = Mux(dPrefetch.io.req.bits.write, TLHints.PREFETCH_WRITE, TLHints.PREFETCH_READ)
dPrefetch.io.req.ready := bus.a.ready
dPrefetch.io.resp.valid := bus.d.valid
dPrefetch.io.resp.bits.id := bus.d.bits.source(bopParams.totalWidth - 1, 0)
bus.d.ready := dPrefetch.io.resp.ready
dPrefetch.io.finish.ready := true.B
} else if (l2PrefetcherParameters.enable && l2PrefetcherParameters._type == "stream") {
val streamParams = l2PrefetcherParameters.streamParams
val dPrefetch = Module(new StreamPrefetch(streamParams))
dPrefetch.io.train.valid := io.in.fire()
......@@ -62,49 +102,44 @@ class L2PrefetcherImp(outer: L2Prefetcher) extends LazyModuleImp(outer) with Has
bus.a.bits := edge.Hint(
fromSource = dPrefetch.io.req.bits.id,
toAddress = dPrefetch.io.req.bits.addr,
lgSize = log2Up(streamParams.blockBytes).U,
lgSize = log2Up(l2PrefetcherParameters.blockBytes).U,
param = Mux(dPrefetch.io.req.bits.write, TLHints.PREFETCH_WRITE, TLHints.PREFETCH_READ) // TODO
dPrefetch.io.req.ready := bus.a.ready
bus.b.ready := true.B
bus.c.valid := false.B
bus.c.bits := DontCare
dPrefetch.io.resp.valid := bus.d.valid
dPrefetch.io.resp.bits.id := bus.d.bits.source(streamParams.totalWidth - 1, 0)
dPrefetch.io.resp.bits.id := bus.d.bits.source(l2PrefetcherParameters.totalWidth - 1, 0)
bus.d.ready := dPrefetch.io.resp.ready
bus.e.valid := false.B
bus.e.bits := DontCare
dPrefetch.io.finish.ready := true.B
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(bus.a.fire(), "perfCntL2PrefetchReqCnt", Perf)
def idWidth = log2Up(l2PrefetcherParameters.nEntries)
(0 until l2PrefetcherParameters.nEntries).foreach(i =>
start = bus.a.fire() && dPrefetch.io.req.bits.id(streamParams.totalWidth - 1, 0) === i.U,
stop = bus.d.fire() && bus.d.bits.source(streamParams.totalWidth - 1, 0) === i.U,
startHighPriority = true
"perfCntL2PrefetchPenaltyEntry" + Integer.toString(i, 10),
} else {
bus.a.valid := false.B
bus.a.bits := DontCare
bus.b.ready := true.B
bus.c.valid := false.B
bus.c.bits := DontCare
bus.d.ready := true.B
bus.e.valid := false.B
bus.e.bits := DontCare
bus.b.ready := true.B
bus.c.valid := false.B
bus.c.bits := DontCare
bus.e.valid := false.B
bus.e.bits := DontCare
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(bus.a.fire(), "perfCntL2PrefetchReqCnt", Perf)
(0 until l2PrefetcherParameters.nEntries).foreach(i =>
start = bus.a.fire() && bus.a.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) === i.U,
stop = bus.d.fire() && bus.d.bits.source(l2PrefetcherParameters.totalWidth - 1, 0) === i.U,
startHighPriority = true
"perfCntL2PrefetchPenaltyEntry" + Integer.toString(i, 10),
......@@ -40,11 +40,11 @@ class PrefetchTrain extends PrefetchBundle {
// IO bundle of a prefetcher. The fields visible here are the training input,
// the outgoing prefetch request channel and the incoming response channel.
class PrefetchIO extends PrefetchBundle {
// Training events fed into the prefetcher (valid-only interface, flipped so it is an input).
val train = Flipped(ValidIO(new PrefetchTrain))
// Prefetch requests issued by the prefetcher (ready/valid handshake).
val req = DecoupledIO(new PrefetchReq)
// Responses delivered back to the prefetcher (ready/valid handshake, flipped direction).
val resp = Flipped(DecoupledIO(new PrefetchResp))
// class PrefetchIO extends PrefetchBundle {
// val train = Flipped(ValidIO(new PrefetchTrain))
// val req = DecoupledIO(new PrefetchReq)
// val resp = Flipped(DecoupledIO(new PrefetchResp))
// }
// class FakePrefetcher extends PrefetchModule {
// val io = IO(new PrefetchIO)
......@@ -323,8 +323,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){
// two levels: l2-tlb-cache && pde/pte-cache
// l2-tlb-cache is a larger, RAM-backed edition of the TLB
// pde/pte-cache is a cache of the page table that speeds up the PTW
val tlbl2 = Module(new SRAMWrapper(
val tlbl2 = Module(new SRAMTemplate(
new L2TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen),
set = TlbL2LineNum,
singlePort = true
......@@ -339,8 +338,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){
val ptwl1 = Reg(Vec(PtwL1EntrySize, new PtwEntry(tagLen = PtwL1TagLen)))
val l1v = RegInit(0.U(PtwL1EntrySize.W)) // valid
val l1g = Reg(UInt(PtwL1EntrySize.W))
val ptwl2 = Module(new SRAMWrapper(
val ptwl2 = Module(new SRAMTemplate(
new PtwEntries(num = PtwL2LineSize, tagLen = PtwL2TagLen),
set = PtwL2LineNum,
singlePort = true
......@@ -34,7 +34,7 @@ class BIM extends BasePredictor with BimParams {
val bimAddr = new TableAddr(log2Up(BimSize), BimBanks)
val bim = List.fill(BimBanks) {
Module(new SRAMWrapper("Bim", UInt(2.W), set = nRows, shouldReset = false, holdRead = true))
Module(new SRAMTemplate(UInt(2.W), set = nRows, shouldReset = false, holdRead = true))
val doing_reset = RegInit(true.B)
......@@ -78,15 +78,15 @@ class BTB extends BasePredictor with BTBParams{
val data = List.fill(BtbWays) {
List.fill(BtbBanks) {
Module(new SRAMWrapper("BTB_Data", new BtbDataEntry, set = nRows, shouldReset = true, holdRead = true))
Module(new SRAMTemplate(new BtbDataEntry, set = nRows, shouldReset = true, holdRead = true))
val meta = List.fill(BtbWays) {
List.fill(BtbBanks) {
Module(new SRAMWrapper("BTB_Meta", new BtbMetaEntry, set = nRows, shouldReset = true, holdRead = true))
Module(new SRAMTemplate(new BtbMetaEntry, set = nRows, shouldReset = true, holdRead = true))
val edata = Module(new SRAMWrapper("BTB_Edata", UInt(VAddrBits.W), set = extendedNRows, shouldReset = true, holdRead = true))
val edata = Module(new SRAMTemplate(UInt(VAddrBits.W), set = extendedNRows, shouldReset = true, holdRead = true))
val if1_mask = io.inMask
val if2_mask = RegEnable(if1_mask, io.pc.valid)
......@@ -44,7 +44,7 @@ class SCTable(val nRows: Int, val ctrBits: Int, val histLen: Int) extends BaseSC
val table = List.fill(TageBanks) {
List.fill(2) {
Module(new SRAMWrapper("SC_Table", SInt(ctrBits.W), set=nRows, shouldReset=false, holdRead=true, singlePort=false))
Module(new SRAMTemplate(SInt(ctrBits.W), set=nRows, shouldReset=false, holdRead=true, singlePort=false))
......@@ -162,7 +162,7 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio
val hi_us = List.fill(TageBanks)(Module(new HL_Bank(nRows)))
val lo_us = List.fill(TageBanks)(Module(new HL_Bank(nRows)))
val table = List.fill(TageBanks)(Module(new SRAMWrapper(s"TageTable_H${histLen}_T${tagLen}", new TageEntry, set=nRows, shouldReset=false, holdRead=true, singlePort=false)))
val table = List.fill(TageBanks)(Module(new SRAMTemplate(new TageEntry, set=nRows, shouldReset=false, holdRead=true, singlePort=false)))
val if3_hi_us_r = WireInit(0.U.asTypeOf(Vec(TageBanks, Bool())))
val if3_lo_us_r = WireInit(0.U.asTypeOf(Vec(TageBanks, Bool())))
......@@ -55,7 +55,7 @@ class JBTAC extends XSModule {
val isRVC = Bool()
val jbtac = List.fill(JbtacBanks)(Module(new SRAMWrapper("JBTac", jbtacEntry(), set = JbtacSize / JbtacBanks, shouldReset = true, holdRead = true, singlePort = false)))
val jbtac = List.fill(JbtacBanks)(Module(new SRAMTemplate(jbtacEntry(), set = JbtacSize / JbtacBanks, shouldReset = true, holdRead = true, singlePort = false)))
val readEntries = Wire(Vec(JbtacBanks, jbtacEntry()))
......@@ -163,109 +163,43 @@ class LoadQueue extends XSModule
}.otherwise {
XSInfo(io.loadIn(i).valid, "load hit write to cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n",
val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
val loadWbData = Wire(new LQDataEntry)
loadWbData.paddr := io.loadIn(i).bits.paddr
loadWbData.mask := io.loadIn(i).bits.mask
loadWbData.data := io.loadIn(i).bits.data // fwd data
loadWbData.fwdMask := io.loadIn(i).bits.forwardMask
dataModule.io.wbWrite(i, loadWbIndex, loadWbData)
dataModule.io.wb.wen(i) := true.B
vaddrModule.io.waddr(i) := loadWbIndex
vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr
vaddrModule.io.wen(i) := true.B
debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio
val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
miss(loadWbIndex) := dcacheMissed
pending(loadWbIndex) := io.loadIn(i).bits.mmio
uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime
}.otherwise {
XSInfo(io.loadIn(i).valid, "load hit write to cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n",
val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
val loadWbData = Wire(new LQDataEntry)
loadWbData.paddr := io.loadIn(i).bits.paddr
loadWbData.mask := io.loadIn(i).bits.mask
loadWbData.data := io.loadIn(i).bits.data // fwd data
loadWbData.fwdMask := io.loadIn(i).bits.forwardMask
dataModule.io.wbWrite(i, loadWbIndex, loadWbData)
dataModule.io.wb.wen(i) := true.B
vaddrModule.io.waddr(i) := loadWbIndex
vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr
vaddrModule.io.wen(i) := true.B
debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio
val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
miss(loadWbIndex) := dcacheMissed
pending(loadWbIndex) := io.loadIn(i).bits.mmio
uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime
* Cache miss request
* (1) writeback: miss
* (2) send to dcache: listing
* (3) dcache response: datavalid
* (4) writeback to ROB: writeback
when(io.dcache.valid) {
XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data)
......@@ -303,7 +237,7 @@ class LoadQueue extends XSModule
val loadWbSel = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W))) // index selected last cycle
val loadWbSelV = RegInit(VecInit(List.fill(LoadPipelineWidth)(false.B))) // index selected in last cycle is valid
val loadWbSelV = Wire(Vec(LoadPipelineWidth, Bool())) // index selected in last cycle is valid
val loadWbSelVec = VecInit((0 until LoadQueueSize).map(i => {
allocated(i) && !writebacked(i) && datavalid(i)
......@@ -329,17 +263,11 @@ class LoadQueue extends XSModule
loadWbSelVGen(1) := loadOddSelVec.asUInt.orR
(0 until LoadPipelineWidth).map(i => {
val canGo = io.ldout(i).fire() || !loadWbSelV(i)
val valid = loadWbSelVGen(i)
loadWbSel(i) := RegNext(loadWbSelGen(i))
loadWbSelV(i) := RegNext(loadWbSelVGen(i), init = false.B)
// Mark them as writebacked, so they will not be selected in the next cycle
writebacked(loadWbSel(i)) := true.B
// update loadWbSelValidReg
loadWbSelV(i) := false.B
when(valid && canGo){
loadWbSelV(i) := true.B
......@@ -440,7 +368,9 @@ class LoadQueue extends XSModule
* Besides, load instructions in LoadUnit_S1 and S2 are also checked.
* Cycle 1: Redirect Generation
* There're three possible types of violations. Choose the oldest load.
* Set io.redirect according to the detected violation.
* Prepare redirect request according to the detected violation.
* Cycle 2: Redirect Fire
* Fire redirect request (if valid)
io.load_s1 := DontCare
def detectRollback(i: Int) = {
......@@ -540,18 +470,29 @@ class LoadQueue extends XSModule
val rollbackSelected = ParallelOperation(rollback, rollbackSel)
val lastCycleRedirect = RegNext(io.brqRedirect)
// S2: select rollback and generate rollback request
// Note that we use roqIdx - 1.U to flush the load instruction itself.
// Thus, if last cycle's roqIdx is equal to this cycle's roqIdx, the redirect is still triggered.
io.rollback.valid := rollbackSelected.valid &&
val rollbackGen = Wire(Valid(new Redirect))
val rollbackReg = Reg(Valid(new Redirect))
rollbackGen.valid := rollbackSelected.valid &&
(!lastCycleRedirect.valid || !isAfter(rollbackSelected.bits.roqIdx, lastCycleRedirect.bits.roqIdx)) &&
!(lastCycleRedirect.valid && lastCycleRedirect.bits.isUnconditional())
io.rollback.bits.roqIdx := rollbackSelected.bits.roqIdx
io.rollback.bits.level := RedirectLevel.flush
io.rollback.bits.interrupt := DontCare
io.rollback.bits.pc := DontCare
io.rollback.bits.target := rollbackSelected.bits.cf.pc
io.rollback.bits.brTag := rollbackSelected.bits.brTag
rollbackGen.bits.roqIdx := rollbackSelected.bits.roqIdx
rollbackGen.bits.level := RedirectLevel.flush
rollbackGen.bits.interrupt := DontCare
rollbackGen.bits.pc := DontCare
rollbackGen.bits.target := rollbackSelected.bits.cf.pc
rollbackGen.bits.brTag := rollbackSelected.bits.brTag
rollbackReg := rollbackGen
// S3: fire rollback request
io.rollback := rollbackReg
io.rollback.valid := rollbackReg.valid &&
(!lastCycleRedirect.valid || !isAfter(rollbackReg.bits.roqIdx, lastCycleRedirect.bits.roqIdx)) &&
!(lastCycleRedirect.valid && lastCycleRedirect.bits.isUnconditional())
when(io.rollback.valid) {
XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.pc, io.rollback.bits.roqIdx.asUInt)
......@@ -102,6 +102,18 @@ class StoreUnit_S1 extends XSModule {
// Store pipeline stage 2. As written here it is a pure pass-through stage:
// it always accepts its input and forwards the bundle and valid flag to `out`.
class StoreUnit_S2 extends XSModule {
val io = IO(new Bundle() {
// Bundle arriving from the previous stage.
val in = Flipped(Decoupled(new LsPipelineBundle))
// Bundle handed on to the next stage.
val out = Decoupled(new LsPipelineBundle)
// Never back-pressure the previous stage.
io.in.ready := true.B
// Payload and valid are wired straight through; io.out.ready is not consulted in these lines.
io.out.bits := io.in.bits
io.out.valid := io.in.valid
class StoreUnit_S3 extends XSModule {
val io = IO(new Bundle() {
val in = Flipped(Decoupled(new LsPipelineBundle))
val stout = DecoupledIO(new ExuOutput) // writeback store
......@@ -134,6 +146,7 @@ class StoreUnit extends XSModule {
val store_s0 = Module(new StoreUnit_S0)
val store_s1 = Module(new StoreUnit_S1)
val store_s2 = Module(new StoreUnit_S2)
val store_s3 = Module(new StoreUnit_S3)
store_s0.io.in <> io.stin
store_s0.io.dtlbReq <> io.dtlb.req
......@@ -146,7 +159,9 @@ class StoreUnit extends XSModule {
PipelineConnect(store_s1.io.out, store_s2.io.in, true.B, store_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect))
store_s2.io.stout <> io.stout
PipelineConnect(store_s2.io.out, store_s3.io.in, true.B, store_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect))
store_s3.io.stout <> io.stout
private def printPipeLine(pipeline: LsPipelineBundle, cond: Bool, name: String): Unit = {
