Commit b35479a0 authored by William Wang

Merge remote-tracking branch 'origin/master' into constantin

......@@ -161,7 +161,8 @@ class MinimalConfig(n: Int = 1) extends Config(
l3nWays = 8,
spSize = 2,
),
L2CacheParamsOpt = None // remove L2 Cache
L2CacheParamsOpt = None, // remove L2 Cache
prefetcher = None // if L2 pf_recv_node does not exist, disable SMS prefetcher
)
)
case SoCParamsKey =>
......@@ -244,7 +245,7 @@ class WithNKBL2
)),
reqField = Seq(PreferCacheField()),
echoField = Seq(DirtyField()),
prefetch = Some(huancun.prefetch.BOPParameters()),
prefetch = Some(huancun.prefetch.PrefetchReceiverParams()),
enablePerf = true,
sramDepthDiv = 2,
tagECC = Some("secded"),
......
package utils
import chisel3._
import chisel3.util._
class OverrideableQueue[T <: Data](gen: T, n: Int) extends Module {
val io = IO(new Bundle() {
val in = Flipped(ValidIO(gen))
val out = Decoupled(gen)
})
val entries = Seq.fill(n){ Reg(gen) }
val valids = Seq.fill(n){ RegInit(false.B) }
val rd_ptr = RegInit(0.U(log2Up(n).W))
val wr_ptr = RegInit(0.U(log2Up(n).W))
when(io.in.valid){
wr_ptr := wr_ptr + 1.U
}
when(io.out.fire){
rd_ptr := rd_ptr + 1.U
}
val w_mask = (0 until n).map(i => i.U === wr_ptr)
val r_mask = (0 until n).map(i => i.U === rd_ptr)
for((v, r) <- valids.zip(r_mask)){
when(r && io.out.fire){
v := false.B
}
}
for(((v, e), w) <- valids.zip(entries).zip(w_mask)){
when(io.in.valid && w){
v := true.B
e := io.in.bits
}
}
io.out.valid := Mux1H(r_mask, valids)
io.out.bits := Mux1H(r_mask, entries)
}
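A minimal usage sketch for the new queue (illustration only, not part of this commit; the wrapper module name is hypothetical). The input side is a ValidIO with no backpressure: when all n entries are occupied, a new in.valid overwrites the slot at wr_ptr, which is then the oldest entry, hence "overrideable".
class OverrideableQueueExample extends Module {
  val io = IO(new Bundle() {
    val in = Flipped(ValidIO(UInt(8.W)))
    val out = Decoupled(UInt(8.W))
  })
  // 4-entry byte queue; the producer is never stalled
  val q = Module(new OverrideableQueue(UInt(8.W), 4))
  q.io.in <> io.in
  io.out <> q.io.out
}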
......@@ -477,6 +477,14 @@ class CustomCSRCtrlIO(implicit p: Parameters) extends XSBundle {
// Prefetcher
val l1I_pf_enable = Output(Bool())
val l2_pf_enable = Output(Bool())
val l1D_pf_enable = Output(Bool())
val l1D_pf_train_on_hit = Output(Bool())
val l1D_pf_enable_agt = Output(Bool())
val l1D_pf_enable_pht = Output(Bool())
val l1D_pf_active_threshold = Output(UInt(4.W))
val l1D_pf_active_stride = Output(UInt(6.W))
val l1D_pf_enable_stride = Output(Bool())
val l2_pf_store_only = Output(Bool())
// ICache
val icache_parity_enable = Output(Bool())
// Labeled XiangShan
......
......@@ -30,6 +30,8 @@ import freechips.rocketchip.diplomacy.AddressSet
import system.SoCParamsKey
import huancun._
import huancun.debug._
import xiangshan.mem.prefetch.{PrefetcherParams, SMSParams}
import scala.math.min
case object XSTileKey extends Field[Seq[XSCoreParameters]]
......@@ -152,6 +154,7 @@ case class XSCoreParameters
LduCnt = 2,
StuCnt = 2
),
prefetcher: Option[PrefetcherParams] = Some(SMSParams()),
LoadPipelineWidth: Int = 2,
StorePipelineWidth: Int = 2,
VecMemSrcInWidth: Int = 2,
......@@ -237,7 +240,7 @@ case class XSCoreParameters
level = 2,
ways = 8,
sets = 1024, // default 512KB L2
prefetch = Some(huancun.prefetch.BOPParameters())
prefetch = Some(huancun.prefetch.PrefetchReceiverParams())
)),
L2NBanks: Int = 1,
usePTWRepeater: Boolean = false,
......
......@@ -31,6 +31,7 @@ import xiangshan.backend._
import xiangshan.backend.exu.{ExuConfig, Wb2Ctrl, WbArbiterWrapper}
import xiangshan.cache.mmu._
import xiangshan.frontend._
import xiangshan.mem.L1PrefetchFuzzer
import scala.collection.mutable.ListBuffer
......@@ -327,6 +328,13 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
exuBlocks(0).io.scheExtra.fpRfReadIn.get <> exuBlocks(1).io.scheExtra.fpRfReadOut.get
exuBlocks(0).io.scheExtra.fpStateReadIn.get <> exuBlocks(1).io.scheExtra.fpStateReadOut.get
for((c, e) <- ctrlBlock.io.ld_pc_read.zip(exuBlocks(0).io.issue.get)){
// read load pc at load s0
c.ptr := e.bits.uop.cf.ftqPtr
c.offset := e.bits.uop.cf.ftqOffset
}
// return load pc at load s2
memBlock.io.loadPc <> VecInit(ctrlBlock.io.ld_pc_read.map(_.data))
memBlock.io.issue <> exuBlocks(0).io.issue.get
// By default, instructions do not have exceptions when they enter the function units.
memBlock.io.issue.map(_.bits.uop.clearExceptions())
......
......@@ -126,6 +126,10 @@ class XSTile()(implicit p: Parameters) extends LazyModule
l2cache match {
case Some(l2) =>
misc.l2_binder.get :*= l2.node :*= TLBuffer() :*= TLBuffer() :*= misc.l1_xbar
l2.pf_recv_node.map(recv => {
println("Connecting L1 prefetcher to L2!")
recv := core.memBlock.pf_sender_opt.get
})
case None =>
}
......
......@@ -28,7 +28,7 @@ import xiangshan.backend.dispatch.{Dispatch, Dispatch2Rs, DispatchQueue}
import xiangshan.backend.fu.PFEvent
import xiangshan.backend.rename.{Rename, RenameTableWrapper}
import xiangshan.backend.rob.{Rob, RobCSRIO, RobLsqIO}
import xiangshan.frontend.{FtqRead, Ftq_RF_Components}
import xiangshan.frontend.{FtqPtr, FtqRead, Ftq_RF_Components}
import xiangshan.mem.mdp.{LFST, SSIT, WaitTable}
import xiangshan.ExceptionNO._
import xiangshan.backend.exu.ExuConfig
......@@ -214,6 +214,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Input(UInt(log2Ceil(EnsbufferWidth + 1).W))
val ld_pc_read = Vec(exuParameters.LduCnt, Flipped(new FtqRead(UInt(VAddrBits.W))))
// from int block
val exuRedirect = Vec(exuParameters.AluCnt + exuParameters.JmpCnt, Flipped(ValidIO(new ExuOutput)))
val stIn = Vec(exuParameters.StuCnt, Flipped(ValidIO(new ExuInput)))
......@@ -267,8 +268,11 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val fpDq = Module(new DispatchQueue(dpParams.FpDqSize, RenameWidth, dpParams.FpDqDeqWidth))
val lsDq = Module(new DispatchQueue(dpParams.LsDqSize, RenameWidth, dpParams.LsDqDeqWidth))
val redirectGen = Module(new RedirectGenerator)
// jumpPc (2) + redirects (1) + loadPredUpdate (1) + jalr_target (1) + robFlush (1)
val pcMem = Module(new SyncDataModuleTemplate(new Ftq_RF_Components, FtqSize, 6, 1, "BackendPC"))
// jumpPc (2) + redirects (1) + loadPredUpdate (1) + jalr_target (1) + [ld pc (LduCnt)] + robFlush (1)
val pcMem = Module(new SyncDataModuleTemplate(
new Ftq_RF_Components, FtqSize,
6 + exuParameters.LduCnt, 1, "CtrlPcMem")
)
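// pcMem read port map, per the comment above and the reads below:
// 0-1: jumpPc, 2: redirects, 3: loadPredUpdate, 4: jalr_target,
// 5 .. 5+LduCnt-1: load pc, last: robFlush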
val rob = outer.rob.module
pcMem.io.wen.head := RegNext(io.frontend.fromFtq.pc_mem_wen)
......@@ -538,6 +542,11 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val jalrTargetRead = pcMem.io.rdata(4).startAddr
val read_from_newest_entry = RegNext(jalrTargetReadPtr) === RegNext(io.frontend.fromFtq.newest_entry_ptr)
io.jalr_target := Mux(read_from_newest_entry, RegNext(io.frontend.fromFtq.newest_entry_target), jalrTargetRead)
for(i <- 0 until exuParameters.LduCnt){
// load s0 -> get rdata (s1) -> reg next (s2) -> output (s2)
pcMem.io.raddr(i + 5) := io.ld_pc_read(i).ptr.value
io.ld_pc_read(i).data := pcMem.io.rdata(i + 5).getPc(RegNext(io.ld_pc_read(i).offset))
}
rob.io.hartId := io.hartId
io.cpu_halt := DelayN(rob.io.cpu_halt, 5)
......
......@@ -19,8 +19,9 @@ package xiangshan.backend
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp}
import freechips.rocketchip.diplomacy.{BundleBridgeSource, LazyModule, LazyModuleImp}
import freechips.rocketchip.tile.HasFPUParameters
import huancun.PrefetchRecv
import utils._
import utility._
import xiangshan._
......@@ -30,6 +31,7 @@ import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache._
import xiangshan.cache.mmu.{VectorTlbPtwIO, TLBNonBlock, TlbReplace}
import xiangshan.mem._
import xiangshan.mem.prefetch.{BasePrefecher, SMSParams, SMSPrefetcher}
class Std(implicit p: Parameters) extends FunctionUnit {
io.in.ready := true.B
......@@ -43,6 +45,9 @@ class MemBlock()(implicit p: Parameters) extends LazyModule
val dcache = LazyModule(new DCacheWrapper())
val uncache = LazyModule(new Uncache())
val pf_sender_opt = coreParams.prefetcher.map(_ =>
BundleBridgeSource(() => new PrefetchRecv)
)
lazy val module = new MemBlockImp(this)
......@@ -69,6 +74,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val loadFastMatch = Vec(exuParameters.LduCnt, Input(UInt(exuParameters.LduCnt.W)))
val loadFastImm = Vec(exuParameters.LduCnt, Input(UInt(12.W)))
val rsfeedback = Vec(exuParameters.StuCnt, new MemRSFeedbackIO)
val loadPc = Vec(exuParameters.LduCnt, Input(UInt(VAddrBits.W))) // for hw prefetch
val stIssuePtr = Output(new SqPtr())
val int2vlsu = Flipped(new Int2VLSUIO)
val vec2vlsu = Flipped(new Vec2VLSUIO)
......@@ -79,10 +85,12 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val vlsu2vec = new VLSU2VecIO
val vlsu2int = new VLSU2IntIO
val vlsu2ctrl = new VLSU2CtrlIO
// prefetch to l1 req
val prefetch_req = Flipped(DecoupledIO(new L1PrefetchReq))
// misc
val stIn = Vec(exuParameters.StuCnt, ValidIO(new ExuInput))
val memoryViolation = ValidIO(new Redirect)
val ptw = new VectorTlbPtwIO(exuParameters.LduCnt + exuParameters.StuCnt)
val ptw = new VectorTlbPtwIO(exuParameters.LduCnt + exuParameters.StuCnt + 1) // load + store + hw prefetch
val sfence = Input(new SfenceBundle)
val tlbCsr = Input(new TlbCsrBundle)
val fenceToSbuffer = Flipped(new FenceToSbuffer)
......@@ -119,6 +127,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val csrCtrl = DelayN(io.csrCtrl, 2)
dcache.io.csr.distribute_csr <> csrCtrl.distribute_csr
dcache.io.l2_pf_store_only := RegNext(io.csrCtrl.l2_pf_store_only, false.B)
io.csrUpdate := RegNext(dcache.io.csr.update)
io.error <> RegNext(RegNext(dcache.io.error))
when(!csrCtrl.cache_error_enable){
......@@ -131,6 +140,31 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val stdExeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StdExeUnit))
val stData = stdExeUnits.map(_.io.out)
val exeUnits = loadUnits ++ storeUnits
val l1_pf_req = Wire(Decoupled(new L1PrefetchReq()))
val prefetcherOpt: Option[BasePrefecher] = coreParams.prefetcher.map {
case _: SMSParams =>
val sms = Module(new SMSPrefetcher())
sms.io_agt_en := RegNextN(io.csrCtrl.l1D_pf_enable_agt, 2, Some(false.B))
sms.io_pht_en := RegNextN(io.csrCtrl.l1D_pf_enable_pht, 2, Some(false.B))
sms.io_act_threshold := RegNextN(io.csrCtrl.l1D_pf_active_threshold, 2, Some(12.U))
sms.io_act_stride := RegNextN(io.csrCtrl.l1D_pf_active_stride, 2, Some(30.U))
sms.io_stride_en := RegNextN(io.csrCtrl.l1D_pf_enable_stride, 2, Some(true.B))
sms
}
prefetcherOpt.foreach(pf => {
val pf_to_l2 = ValidIODelay(pf.io.pf_addr, 2)
outer.pf_sender_opt.get.out.head._1.addr_valid := pf_to_l2.valid
outer.pf_sender_opt.get.out.head._1.addr := pf_to_l2.bits
outer.pf_sender_opt.get.out.head._1.l2_pf_en := RegNextN(io.csrCtrl.l2_pf_enable, 2, Some(true.B))
pf.io.enable := RegNextN(io.csrCtrl.l1D_pf_enable, 2, Some(false.B))
})
prefetcherOpt match {
case Some(pf) => l1_pf_req <> pf.io.l1_req
case None =>
l1_pf_req.valid := false.B
l1_pf_req.bits := DontCare
}
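// For reference, the prefetcher-side interface assumed by the wiring in this
// diff (reconstructed from the connections in this file; the actual
// BasePrefecher definition in xiangshan.mem.prefetch may carry more fields):
//   io.enable  - Bool, L1D prefetch enable from csr
//   io.ld_in   - per-load-pipe training inputs (see the loadUnits loop below)
//   io.tlb_req - dtlb requestor, hooked to the extra dtlb port
//   io.pf_addr - ValidIO prefetch address, forwarded to L2 via pf_sender_opt
//   io.l1_req  - DecoupledIO(L1PrefetchReq), prefetch request into load pipes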
val pf_train_on_hit = RegNextN(io.csrCtrl.l1D_pf_train_on_hit, 2, Some(true.B))
loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2))
storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2))
......@@ -159,6 +193,35 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.otherFastWakeup.take(2).zip(loadUnits.map(_.io.fastUop)).foreach{case(a,b)=> a := b}
val stOut = io.writeback.drop(exuParameters.LduCnt).dropRight(exuParameters.StuCnt)
// prefetch to l1 req
loadUnits.foreach(load_unit => {
load_unit.io.prefetch_req.valid <> l1_pf_req.valid
load_unit.io.prefetch_req.bits <> l1_pf_req.bits
})
// force low confidence for loadUnits(0): when its stage 0 is busy, hw prefetch will never use that pipeline
loadUnits(0).io.prefetch_req.bits.confidence := 0.U
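// a low confidence (0) req is only accepted when some load pipeline s0 is
// idle this cycle; a high confidence req is always accepted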
l1_pf_req.ready := (l1_pf_req.bits.confidence > 0.U) ||
loadUnits.map(!_.io.ldin.valid).reduce(_ || _)
// l1 pf fuzzer interface
val DebugEnableL1PFFuzzer = false
if (DebugEnableL1PFFuzzer) {
// l1 pf req fuzzer
val fuzzer = Module(new L1PrefetchFuzzer())
fuzzer.io.vaddr := DontCare
fuzzer.io.paddr := DontCare
// override load_unit prefetch_req
loadUnits.foreach(load_unit => {
load_unit.io.prefetch_req.valid <> fuzzer.io.req.valid
load_unit.io.prefetch_req.bits <> fuzzer.io.req.bits
})
fuzzer.io.req.ready := l1_pf_req.ready
}
// TODO: fast load wakeup
val lsq = Module(new LsqWrappper)
val vlsq = Module(new DummyVectorLsq)
val sbuffer = Module(new Sbuffer)
......@@ -182,7 +245,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val tlb_st = Module(new TLBNonBlock(exuParameters.StuCnt, 1, sttlbParams))
tlb_st.io // let the module have a name in the waveform
})
val dtlb = dtlb_ld ++ dtlb_st
val dtlb_prefetch = VecInit(Seq.fill(1){
val tlb_prefetch = Module(new TLBNonBlock(1, 2, sttlbParams))
tlb_prefetch.io // let the module have a name in the waveform
})
val dtlb = dtlb_ld ++ dtlb_st ++ dtlb_prefetch
val dtlb_reqs = dtlb.map(_.requestor).flatten
val dtlb_pmps = dtlb.map(_.pmp).flatten
dtlb.map(_.sfence := sfence)
......@@ -192,7 +259,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
require(ldtlbParams.outReplace == sttlbParams.outReplace)
require(ldtlbParams.outReplace)
val replace = Module(new TlbReplace(exuParameters.LduCnt + exuParameters.StuCnt, ldtlbParams))
val replace = Module(new TlbReplace(exuParameters.LduCnt + exuParameters.StuCnt + 1, ldtlbParams))
replace.io.apply_sep(dtlb_ld.map(_.replace) ++ dtlb_st.map(_.replace), io.ptw.resp.bits.data.entry.tag)
} else {
if (ldtlbParams.outReplace) {
......@@ -209,10 +276,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val ptw_resp_v = RegNext(io.ptw.resp.valid && !(sfence.valid && tlbcsr.satp.changed), init = false.B)
io.ptw.resp.ready := true.B
(dtlb.map(a => a.ptw.req.map(b => b)))
.flatten
dtlb.flatMap(a => a.ptw.req)
.zipWithIndex
.map{ case (tlb, i) =>
.foreach{ case (tlb, i) =>
tlb <> io.ptw.req(i)
val vector_hit = if (refillBothTlb) Cat(ptw_resp_next.vector).orR
else if (i < exuParameters.LduCnt) Cat(ptw_resp_next.vector.take(exuParameters.LduCnt)).orR
......@@ -220,12 +286,13 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.ptw.req(i).valid := tlb.valid && !(ptw_resp_v && vector_hit &&
ptw_resp_next.data.entry.hit(tlb.bits.vpn, tlbcsr.satp.asid, allType = true, ignoreAsid = true))
}
dtlb.map(_.ptw.resp.bits := ptw_resp_next.data)
dtlb.foreach(_.ptw.resp.bits := ptw_resp_next.data)
if (refillBothTlb) {
dtlb.map(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector).orR)
dtlb.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector).orR)
} else {
dtlb_ld.map(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.take(exuParameters.LduCnt)).orR)
dtlb_st.map(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.drop(exuParameters.LduCnt)).orR)
dtlb_ld.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.take(exuParameters.LduCnt)).orR)
dtlb_st.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.drop(exuParameters.LduCnt).take(exuParameters.StuCnt)).orR)
dtlb_prefetch.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.drop(exuParameters.LduCnt + exuParameters.StuCnt)).orR)
}
......@@ -233,7 +300,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val pmp = Module(new PMP())
pmp.io.distribute_csr <> csrCtrl.distribute_csr
val pmp_check = VecInit(Seq.fill(exuParameters.LduCnt + exuParameters.StuCnt)(Module(new PMPChecker(3)).io))
val pmp_check = VecInit(Seq.fill(exuParameters.LduCnt + exuParameters.StuCnt + 1)(Module(new PMPChecker(3)).io))
for ((p,d) <- pmp_check zip dtlb_pmps) {
p.apply(tlbcsr.priv.dmode, pmp.io.pmp, pmp.io.pma, d)
require(p.req.bits.size.getWidth == d.bits.size.getWidth)
......@@ -285,6 +352,18 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
for (s <- 0 until StorePipelineWidth) {
loadUnits(i).io.reExecuteQuery(s) := storeUnits(s).io.reExecuteQuery
}
// prefetch
prefetcherOpt.foreach(pf => {
pf.io.ld_in(i).valid := Mux(pf_train_on_hit,
loadUnits(i).io.prefetch_train.valid,
loadUnits(i).io.prefetch_train.valid && loadUnits(i).io.prefetch_train.bits.isFirstIssue && (
loadUnits(i).io.prefetch_train.bits.miss || loadUnits(i).io.prefetch_train.bits.meta_prefetch
)
)
pf.io.ld_in(i).bits := loadUnits(i).io.prefetch_train.bits
pf.io.ld_in(i).bits.uop.cf.pc := Mux(loadUnits(i).io.s2IsPointerChasing, io.loadPc(i), RegNext(io.loadPc(i)))
})
// load to load fast forward: load(i) prefers data(i)
val fastPriority = (i until exuParameters.LduCnt) ++ (0 until i)
val fastValidVec = fastPriority.map(j => loadUnits(j).io.fastpathOut.valid)
......@@ -351,6 +430,13 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
p"has trigger hit vec ${io.writeback(i).bits.uop.cf.trigger.backendHit}\n")
}
// Prefetcher
val PrefetcherDTLBPortIndex = exuParameters.LduCnt + exuParameters.StuCnt
dtlb_reqs(PrefetcherDTLBPortIndex) := DontCare
dtlb_reqs(PrefetcherDTLBPortIndex).req.valid := false.B
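// chisel last-connect semantics: when a prefetcher exists, its tlb_req
// overrides the tie-off above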
prefetcherOpt.foreach(pf => {
dtlb_reqs(PrefetcherDTLBPortIndex) <> pf.io.tlb_req
})
// StoreUnit
for (i <- 0 until exuParameters.StuCnt) {
......@@ -539,9 +625,14 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// for atomicsUnit, it uses loadUnit(0)'s TLB port
when (state =/= s_normal) {
// use store wb port instead of load
loadUnits(0).io.ldout.ready := false.B
// use load_0's TLB
atomicsUnit.io.dtlb <> amoTlb
// hw prefetch should be disabled while executing atomic insts
loadUnits.map(i => i.io.prefetch_req.valid := false.B)
// make sure there are no in-flight uops in the load unit
assert(!loadUnits(0).io.ldout.valid)
}
......
......@@ -483,9 +483,35 @@ class CSR(implicit p: Parameters) extends FunctionUnit with HasCSRConst with PMP
// spfctl Bit 0: L1I Cache Prefetcher Enable
// spfctl Bit 1: L2Cache Prefetcher Enable
val spfctl = RegInit(UInt(XLEN.W), "b11".U)
// spfctl Bit 2: L1D Cache Prefetcher Enable
// spfctl Bit 3: L1D train prefetch on hit
// spfctl Bit 4: L1D prefetch enable agt
// spfctl Bit 5: L1D prefetch enable pht
// spfctl Bit [9:6]: L1D prefetch active page threshold
// spfctl Bit [15:10]: L1D prefetch active page stride
// turn off L2 BOP, turn on L1 SMS by default
val spfctl = RegInit(UInt(XLEN.W), Seq(
0 << 17, // L2 pf store only [17] init: false
1 << 16, // L1D pf enable stride [16] init: true
30 << 10, // L1D active page stride [15:10] init: 30
12 << 6, // L1D active page threshold [9:6] init: 12
1 << 5, // L1D enable pht [5] init: true
1 << 4, // L1D enable agt [4] init: true
0 << 3, // L1D train on hit [3] init: false
1 << 2, // L1D pf enable [2] init: true
1 << 1, // L2 pf enable [1] init: true
1 << 0, // L1I pf enable [0] init: true
).reduce(_|_).U(XLEN.W))
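// assuming the field layout above, the reduced init value is 0x17B37:
// L1I/L2/L1D pf on, train-on-hit off, agt/pht/stride on, threshold = 12,
// stride = 30, L2 store-only off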
csrio.customCtrl.l1I_pf_enable := spfctl(0)
csrio.customCtrl.l2_pf_enable := spfctl(1)
csrio.customCtrl.l1D_pf_enable := spfctl(2)
csrio.customCtrl.l1D_pf_train_on_hit := spfctl(3)
csrio.customCtrl.l1D_pf_enable_agt := spfctl(4)
csrio.customCtrl.l1D_pf_enable_pht := spfctl(5)
csrio.customCtrl.l1D_pf_active_threshold := spfctl(9, 6)
csrio.customCtrl.l1D_pf_active_stride := spfctl(15, 10)
csrio.customCtrl.l1D_pf_enable_stride := spfctl(16)
csrio.customCtrl.l2_pf_store_only := spfctl(17)
// sfetchctl Bit 0: L1I Cache Parity check enable
val sfetchctl = RegInit(UInt(XLEN.W), "b0".U)
......
......@@ -100,12 +100,21 @@ trait HasDCacheParameters extends HasL1CacheParameters {
def blockProbeAfterGrantCycles = 8 // give the processor some time to issue a request after a grant
def nSourceType = 3
def nSourceType = 10
def sourceTypeWidth = log2Up(nSourceType)
// non-prefetch source < 3
def LOAD_SOURCE = 0
def STORE_SOURCE = 1
def AMO_SOURCE = 2
def SOFT_PREFETCH = 3
// prefetch source >= 3
def DCACHE_PREFETCH_SOURCE = 3
def SOFT_PREFETCH = 4
def HW_PREFETCH_AGT = 5
def HW_PREFETCH_PHT_CUR = 6
def HW_PREFETCH_PHT_INC = 7
def HW_PREFETCH_PHT_DEC = 8
def HW_PREFETCH_BOP = 9
def HW_PREFETCH_STRIDE = 10
// each source uses an id to distinguish its multiple reqs
def reqIdWidth = log2Up(nEntries) max log2Up(StoreBufferSize)
......@@ -142,6 +151,10 @@ trait HasDCacheParameters extends HasL1CacheParameters {
// uncache
val uncacheIdxBits = log2Up(StoreQueueSize) max log2Up(LoadQueueSize)
// hardware prefetch parameters
// high confidence hardware prefetch port
val HighConfHWPFLoadPort = LoadPipelineWidth - 1 // use the last load port by default
val IgnorePrefetchConfidence = false
// parameters about duplicating regs to solve fanout
// In Main Pipe:
......@@ -274,6 +287,15 @@ class ReplacementWayReqIO(implicit p: Parameters) extends DCacheBundle {
val way = Input(UInt(log2Up(nWays).W))
}
class DCacheExtraMeta(implicit p: Parameters) extends DCacheBundle
{
val error = Bool() // cache line has been marked as corrupted by l2, or an ecc error was detected on store
val prefetch = Bool() // cache line is first required by prefetch
val access = Bool() // cache line has been accessed by load / store
// val debug_access_timestamp = UInt(64.W) // last time a load / store / refill accessed that cache line
}
// memory request in word granularity(load, mmio, lr/sc, atomics)
class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
{
......@@ -336,6 +358,8 @@ class BaseDCacheWordResp(implicit p: Parameters) extends DCacheBundle
class DCacheWordResp(implicit p: Parameters) extends BaseDCacheWordResp
{
val meta_prefetch = Bool()
val meta_access = Bool()
// 1 cycle after data resp
val error_delayed = Bool() // all kinds of errors, include tag error
}
......@@ -461,6 +485,7 @@ class DCacheLoadIO(implicit p: Parameters) extends DCacheWordIO
// kill previous cycle's req
val s1_kill = Output(Bool())
val s2_kill = Output(Bool())
val s2_pc = Output(UInt(VAddrBits.W))
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr_dup_lsu = Output(UInt(PAddrBits.W)) // lsu side paddr
......@@ -618,6 +643,7 @@ class DCacheToLsuIO(implicit p: Parameters) extends DCacheBundle {
class DCacheIO(implicit p: Parameters) extends DCacheBundle {
val hartId = Input(UInt(8.W))
val l2_pf_store_only = Input(Bool())
val lsu = new DCacheToLsuIO
val csr = new L1CacheToCsrIO
val error = new L1CacheErrorInfo
......@@ -664,8 +690,10 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// core data structures
val bankedDataArray = Module(new BankedDataArray)
val metaArray = Module(new AsynchronousMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val errorArray = Module(new ErrorArray(readPorts = LoadPipelineWidth + 1, writePorts = 2)) // TODO: add it to meta array
val metaArray = Module(new L1CohMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val errorArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val prefetchArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2)) // prefetch flag array
val accessArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = LoadPipelineWidth + 2))
val tagArray = Module(new DuplicatedTagArray(readPorts = LoadPipelineWidth + 1))
bankedDataArray.dump()
......@@ -680,6 +708,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
val wb = Module(new WritebackQueue(edge))
missQueue.io.hartId := io.hartId
missQueue.io.l2_pf_store_only := RegNext(io.l2_pf_store_only, false.B)
val errors = ldu.map(_.io.error) ++ // load error
Seq(mainPipe.io.error) // store / misc error
......@@ -687,6 +716,8 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// meta array
// read / write coh meta
val meta_read_ports = ldu.map(_.io.meta_read) ++
Seq(mainPipe.io.meta_read)
val meta_resp_ports = ldu.map(_.io.meta_resp) ++
......@@ -699,16 +730,41 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
meta_resp_ports.zip(metaArray.io.resp).foreach { case (p, r) => p := r }
meta_write_ports.zip(metaArray.io.write).foreach { case (p, w) => w <> p }
val error_flag_resp_ports = ldu.map(_.io.error_flag_resp) ++
Seq(mainPipe.io.error_flag_resp)
// read extra meta
meta_read_ports.zip(errorArray.io.read).foreach { case (p, r) => r <> p }
meta_read_ports.zip(prefetchArray.io.read).foreach { case (p, r) => r <> p }
meta_read_ports.zip(accessArray.io.read).foreach { case (p, r) => r <> p }
val extra_meta_resp_ports = ldu.map(_.io.extra_meta_resp) ++
Seq(mainPipe.io.extra_meta_resp)
extra_meta_resp_ports.zip(errorArray.io.resp).foreach { case (p, r) => {
(0 until nWays).map(i => { p(i).error := r(i) })
}}
extra_meta_resp_ports.zip(prefetchArray.io.resp).foreach { case (p, r) => {
(0 until nWays).map(i => { p(i).prefetch := r(i) })
}}
extra_meta_resp_ports.zip(accessArray.io.resp).foreach { case (p, r) => {
(0 until nWays).map(i => { p(i).access := r(i) })
}}
// write extra meta
val error_flag_write_ports = Seq(
mainPipe.io.error_flag_write,
refillPipe.io.error_flag_write
mainPipe.io.error_flag_write, // error flag generated by corrupted store
refillPipe.io.error_flag_write // corrupted signal from l2
)
meta_read_ports.zip(errorArray.io.read).foreach { case (p, r) => r <> p }
error_flag_resp_ports.zip(errorArray.io.resp).foreach { case (p, r) => p := r }
error_flag_write_ports.zip(errorArray.io.write).foreach { case (p, w) => w <> p }
val prefetch_flag_write_ports = Seq(
mainPipe.io.prefetch_flag_write, // set prefetch_flag to false if coh is set to Nothing
refillPipe.io.prefetch_flag_write // refill required by prefetch will set prefetch_flag
)
prefetch_flag_write_ports.zip(prefetchArray.io.write).foreach { case (p, w) => w <> p }
val access_flag_write_ports = ldu.map(_.io.access_flag_write) ++ Seq(
mainPipe.io.access_flag_write,
refillPipe.io.access_flag_write
)
access_flag_write_ports.zip(accessArray.io.write).foreach { case (p, w) => w <> p }
//----------------------------------------
// tag array
require(tagArray.io.read.size == (ldu.size + 1))
......
......@@ -34,7 +34,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// meta and data array read port
val meta_read = DecoupledIO(new MetaReadReq)
val meta_resp = Input(Vec(nWays, new Meta))
val error_flag_resp = Input(Vec(nWays, Bool()))
val extra_meta_resp = Input(Vec(nWays, new DCacheExtraMeta))
val tag_read = DecoupledIO(new TagReadReq)
val tag_resp = Input(Vec(nWays, UInt(encTagBits.W)))
......@@ -43,6 +43,9 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val banked_data_resp = Input(new L1BankedDataReadResult())
val read_error_delayed = Input(Bool())
// access bit update
val access_flag_write = DecoupledIO(new FlagMetaWriteReq)
// banked data read conflict
val bank_conflict_slow = Input(Bool())
val bank_conflict_fast = Input(Bool())
......@@ -183,13 +186,16 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// this simplifies our logic in s2 stage
val s1_hit_meta = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => meta_resp(w))), s1_fake_meta)
val s1_hit_coh = s1_hit_meta.coh
val s1_hit_error = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.error_flag_resp(w))), false.B)
val s1_hit_error = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.extra_meta_resp(w).error)), false.B)
val s1_hit_prefetch = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.extra_meta_resp(w).prefetch)), false.B)
val s1_hit_access = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.extra_meta_resp(w).access)), false.B)
io.replace_way.set.valid := RegNext(s0_fire)
io.replace_way.set.bits := get_idx(s1_vaddr)
val s1_repl_way_en = UIntToOH(io.replace_way.way)
val s1_repl_tag = Mux1H(s1_repl_way_en, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en, wayMap(w => meta_resp(w).coh))
val s1_repl_extra_meta = Mux1H(s1_repl_way_en, wayMap(w => io.extra_meta_resp(w)))
val s1_need_replacement = !s1_tag_match_dup_dc
val s1_way_en = Mux(s1_need_replacement, s1_repl_way_en, s1_tag_match_way_dup_dc)
......@@ -232,6 +238,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
dump_pipeline_reqs("LoadPipe s2", s2_valid, s2_req)
// hit, miss, nack, permission checking
// dcache side tag match
val s2_tag_match_way = RegEnable(s1_tag_match_way_dup_dc, s1_fire)
......@@ -244,12 +251,13 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s2_hit_meta = RegEnable(s1_hit_meta, s1_fire)
val s2_hit_coh = RegEnable(s1_hit_coh, s1_fire)
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1 // redundant
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3 // redundant
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1 // for write prefetch
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3 // for write prefetch
val s2_way_en = RegEnable(s1_way_en, s1_fire)
val s2_repl_coh = RegEnable(s1_repl_coh, s1_fire)
val s2_repl_tag = RegEnable(s1_repl_tag, s1_fire)
val s2_repl_extra_meta = RegEnable(s1_repl_extra_meta, s1_fire) // not used for now
val s2_encTag = RegEnable(s1_encTag, s1_fire)
// when req got nacked, upper levels should replay this request
......@@ -269,9 +277,10 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s2_tag_error = dcacheParameters.tagCode.decode(s2_encTag).error // error reported by tag ecc check
val s2_flag_error = RegEnable(s1_flag_error, s1_fire)
val s2_hit_prefetch = RegEnable(s1_hit_prefetch, s1_fire)
val s2_hit_access = RegEnable(s1_hit_access, s1_fire)
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_coh === s2_new_hit_coh && !s2_wpu_pred_fail
// assert(!RegNext(s2_valid && (s2_tag_match && !s2_hit)))
// assert(!RegNext(s2_valid && (s2_hit_dup_lsu =/= s2_hit)))
// only dump these signals when they are actually valid
dump_pipeline_valids("LoadPipe s2", "s2_hit", s2_valid && s2_hit)
......@@ -293,6 +302,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.miss_req.bits.replace_coh := s2_repl_coh
io.miss_req.bits.replace_tag := s2_repl_tag
io.miss_req.bits.cancel := io.lsu.s2_kill || s2_tag_error
io.miss_req.bits.pc := io.lsu.s2_pc
// send back response
val resp = Wire(ValidIO(new DCacheWordResp))
......@@ -312,11 +322,15 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
resp.bits.replay := (resp.bits.miss && (!io.miss_req.fire() || s2_nack)) || io.bank_conflict_slow || s2_wpu_pred_fail
resp.bits.replayCarry.valid := resp.bits.miss
resp.bits.replayCarry.real_way_en := s2_real_way_en
resp.bits.meta_prefetch := s2_hit_prefetch
resp.bits.meta_access := s2_hit_access
resp.bits.tag_error := s2_tag_error // report tag_error in load s2
resp.bits.mshr_id := io.miss_resp.id
XSPerfAccumulate("dcache_read_bank_conflict", io.bank_conflict_slow && s2_valid)
XSPerfAccumulate("wpu_pred_fail", s2_wpu_pred_fail && s2_valid)
XSPerfAccumulate("dcache_read_bank_conflict", io.bank_conflict_slow && s2_valid)
XSPerfAccumulate("dcache_read_from_prefetched_line", s2_valid && s2_hit_prefetch && !resp.bits.miss)
XSPerfAccumulate("dcache_first_read_from_prefetched_line", s2_valid && s2_hit_prefetch && !resp.bits.miss && !s2_hit_access)
io.lsu.resp.valid := resp.valid
io.lsu.resp.bits := resp.bits
......@@ -337,11 +351,13 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// report ecc error and get selected dcache data
val s3_valid = RegNext(s2_valid)
val s3_vaddr = RegEnable(s2_vaddr, s2_fire)
val s3_paddr = RegEnable(s2_paddr, s2_fire)
val s3_hit = RegEnable(s2_hit, s2_fire)
val s3_tag_match_way = RegEnable(s2_tag_match_way, s2_fire)
val s3_banked_data_resp_word = io.banked_data_resp.raw_data
val s3_data_error = io.read_error_delayed // banked_data_resp_word.error && !bank_conflict
val s3_data_error = io.read_error_delayed && s3_hit // banked_data_resp_word.error && !bank_conflict
val s3_tag_error = RegEnable(s2_tag_error, s2_fire)
val s3_flag_error = RegEnable(s2_flag_error, s2_fire)
val s3_error = s3_tag_error || s3_flag_error || s3_data_error
......@@ -361,12 +377,17 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// report tag error / l2 corrupted to CACHE_ERROR csr
io.error.valid := s3_error && s3_valid
// update plru, report error in s3
// update plru in s3
io.replace_access.valid := RegNext(RegNext(RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) && !s2_nack_no_mshr)
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.addr)))
io.replace_access.bits.way := RegNext(RegNext(Mux(s1_tag_match_dup_dc, OHToUInt(s1_tag_match_way_dup_dc), io.replace_way.way)))
// update access bit
io.access_flag_write.valid := s3_valid && s3_hit
io.access_flag_write.bits.idx := get_idx(s3_vaddr)
io.access_flag_write.bits.way_en := s3_tag_match_way
io.access_flag_write.bits.flag := true.B
// --------------------------------------------------------------------------------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool,
......
......@@ -131,9 +131,11 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
// meta array
val meta_read = DecoupledIO(new MetaReadReq)
val meta_resp = Input(Vec(nWays, new Meta))
val meta_write = DecoupledIO(new MetaWriteReq)
val error_flag_resp = Input(Vec(nWays, Bool()))
val error_flag_write = DecoupledIO(new ErrorWriteReq)
val meta_write = DecoupledIO(new CohMetaWriteReq)
val extra_meta_resp = Input(Vec(nWays, new DCacheExtraMeta))
val error_flag_write = DecoupledIO(new FlagMetaWriteReq)
val prefetch_flag_write = DecoupledIO(new FlagMetaWriteReq)
val access_flag_write = DecoupledIO(new FlagMetaWriteReq)
// tag sram
val tag_read = DecoupledIO(new TagReadReq)
......@@ -282,9 +284,13 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val s1_hit_tag = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => tag_resp(w))), get_tag(s1_req.addr))
val s1_hit_coh = ClientMetadata(Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => meta_resp(w))), 0.U))
val s1_encTag = Mux1H(s1_tag_match_way, wayMap((w: Int) => enc_tag_resp(w)))
val s1_flag_error = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => io.error_flag_resp(w))), false.B)
val s1_flag_error = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => io.extra_meta_resp(w).error)), false.B)
val s1_extra_meta = Mux1H(s1_tag_match_way, wayMap(w => io.extra_meta_resp(w)))
val s1_l2_error = s1_req.error
XSPerfAccumulate("probe_unused_prefetch", s1_req.probe && s1_extra_meta.prefetch && !s1_extra_meta.access) // may not be accurate
XSPerfAccumulate("replace_unused_prefetch", s1_req.replace && s1_extra_meta.prefetch && !s1_extra_meta.access) // may not be accurate
// replacement policy
val s1_repl_way_en = WireInit(0.U(nWays.W))
s1_repl_way_en := Mux(RegNext(s0_fire), UIntToOH(io.replace_way.way), RegNext(s1_repl_way_en))
......@@ -1412,6 +1418,7 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
miss_req.replace_tag := s2_repl_tag
miss_req.id := s2_req.id
miss_req.cancel := false.B
miss_req.pc := DontCare
io.store_replay_resp.valid := s2_valid_dup(5) && s2_can_go_to_mq_dup(1) && replay && s2_req.isStore
io.store_replay_resp.bits.data := DontCare
......@@ -1470,7 +1477,22 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
io.error_flag_write.valid := s3_fire_dup_for_err_w_valid && update_meta_dup_for_err_w_valid && s3_l2_error
io.error_flag_write.bits.idx := s3_idx_dup(3)
io.error_flag_write.bits.way_en := s3_way_en_dup(1)
io.error_flag_write.bits.error := s3_l2_error
io.error_flag_write.bits.flag := s3_l2_error
// if we use (prefetch_flag && meta =/= ClientStates.Nothing) for the prefetch check,
// prefetch_flag_write can be omitted
// io.prefetch_flag_write.valid := io.meta_write.valid && new_coh === ClientStates.Nothing
// io.prefetch_flag_write.bits.idx := s3_idx_dup(3)
// io.prefetch_flag_write.bits.way_en := s3_way_en_dup(1)
// io.prefetch_flag_write.bits.flag := false.B
io.prefetch_flag_write.valid := false.B
io.prefetch_flag_write.bits := DontCare
// probe / replace will not update access bit
io.access_flag_write.valid := s3_fire_dup_for_meta_w_valid && !s3_req.probe && !s3_req.replace
io.access_flag_write.bits.idx := s3_idx_dup(3)
io.access_flag_write.bits.way_en := s3_way_en_dup(1)
io.access_flag_write.bits.flag := true.B
io.tag_write.valid := s3_fire_dup_for_tag_w_valid && s3_req_miss_dup_for_tag_w_valid
io.tag_write.bits.idx := s3_idx_dup(4)
......
......@@ -30,6 +30,7 @@ import difftest._
import huancun.{AliasKey, DirtyKey, PreferCacheKey, PrefetchKey}
import utility.FastArbiter
import mem.{AddPipelineReg}
import mem.trace._
class MissReqWoStoreData(implicit p: Parameters) extends DCacheBundle {
val source = UInt(sourceTypeWidth.W)
......@@ -37,6 +38,7 @@ class MissReqWoStoreData(implicit p: Parameters) extends DCacheBundle {
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W)
val way_en = UInt(DCacheWays.W)
val pc = UInt(VAddrBits.W)
// store
val full_overwrite = Bool()
......@@ -61,9 +63,13 @@ class MissReqWoStoreData(implicit p: Parameters) extends DCacheBundle {
// 2. pmp check failed
val cancel = Bool() // cancel is slow to generate, it will cancel missreq.valid
def isLoad = source === LOAD_SOURCE.U
def isStore = source === STORE_SOURCE.U
def isAMO = source === AMO_SOURCE.U
// Req source decode
// Note that req source is NOT cmd type
// For instance, a req which isFromPrefetch may have R or W cmd
def isFromLoad = source === LOAD_SOURCE.U
def isFromStore = source === STORE_SOURCE.U
def isFromAMO = source === AMO_SOURCE.U
def isFromPrefetch = source >= DCACHE_PREFETCH_SOURCE.U
def hit = req_coh.isValid()
}
......@@ -103,6 +109,7 @@ class MissReq(implicit p: Parameters) extends MissReqWoStoreData {
out.replace_tag := replace_tag
out.id := id
out.cancel := cancel
out.pc := pc
out
}
}
......@@ -160,6 +167,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val req_handled_by_this_entry = Output(Bool())
val forwardInfo = Output(new MissEntryForwardIO)
val l2_pf_store_only = Input(Bool())
})
assert(!RegNext(io.primary_valid && !io.primary_ready))
......@@ -169,6 +177,8 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val req_valid = RegInit(false.B)
val set = addr_to_dcache_set(req.vaddr)
val input_req_is_prefetch = isPrefetch(io.req.bits.cmd)
val s_acquire = RegInit(true.B)
val s_grantack = RegInit(true.B)
val s_replace_req = RegInit(true.B)
......@@ -188,11 +198,13 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val data_not_refilled = !w_grantfirst
val error = RegInit(false.B)
val prefetch = RegInit(false.B)
val access = RegInit(false.B)
val should_refill_data_reg = Reg(Bool())
val should_refill_data = WireInit(should_refill_data_reg)
// val full_overwrite = req.isStore && req_store_mask.andR
// val full_overwrite = req.isFromStore && req_store_mask.andR
val full_overwrite = Reg(Bool())
val (_, _, refill_done, refill_count) = edge.count(io.mem_grant)
......@@ -235,46 +247,51 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
w_grantfirst := false.B
w_grantlast := false.B
s_write_storedata := !io.req.bits.isStore // only store need to wait for data
full_overwrite := io.req.bits.isStore && io.req.bits.full_overwrite
s_write_storedata := !io.req.bits.isFromStore // only store needs to wait for data
full_overwrite := io.req.bits.isFromStore && io.req.bits.full_overwrite
when (!io.req.bits.isAMO) {
when (!io.req.bits.isFromAMO) {
s_refill := false.B
w_refill_resp := false.B
}
when (!io.req.bits.hit && io.req.bits.replace_coh.isValid() && !io.req.bits.isAMO) {
when (!io.req.bits.hit && io.req.bits.replace_coh.isValid() && !io.req.bits.isFromAMO) {
s_replace_req := false.B
w_replace_resp := false.B
}
when (io.req.bits.isAMO) {
when (io.req.bits.isFromAMO) {
s_mainpipe_req := false.B
w_mainpipe_resp := false.B
}
should_refill_data_reg := io.req.bits.isLoad
should_refill_data_reg := io.req.bits.isFromLoad
error := false.B
prefetch := input_req_is_prefetch
access := false.B
}
when (secondary_fire) {
assert(io.req.bits.req_coh.state <= req.req_coh.state)
assert(!(io.req.bits.isAMO || req.isAMO))
assert(io.req.bits.req_coh.state <= req.req_coh.state || (prefetch && !access))
assert(!(io.req.bits.isFromAMO || req.isFromAMO))
// use the most up-to-date meta
req.req_coh := io.req.bits.req_coh
when (io.req.bits.isStore) {
when (io.req.bits.isFromStore) {
req := io.req.bits
req.addr := get_block_addr(io.req.bits.addr)
req.way_en := req.way_en
req.replace_coh := req.replace_coh
req.replace_tag := req.replace_tag
s_write_storedata := false.B // only store needs to wait for data
full_overwrite := io.req.bits.isStore && io.req.bits.full_overwrite
full_overwrite := io.req.bits.isFromStore && io.req.bits.full_overwrite
}
should_refill_data := should_refill_data_reg || io.req.bits.isLoad
should_refill_data := should_refill_data_reg || io.req.bits.isFromLoad
should_refill_data_reg := should_refill_data
when (!input_req_is_prefetch) {
access := true.B // when merge non-prefetch req, set access bit
}
}
when (io.mem_acquire.fire()) {
......@@ -301,7 +318,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// new_data(i) := req.store_data(rowBits * (i + 1) - 1, rowBits * i)
new_data(i) := refill_and_store_data(i)
// we only need to merge data for Store
new_mask(i) := Mux(req.isStore, req_store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U)
new_mask(i) := Mux(req.isFromStore, req_store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U)
}
val hasData = RegInit(true.B)
......@@ -362,19 +379,21 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
w_mainpipe_resp := true.B
}
def before_read_sent_can_merge(new_req: MissReqWoStoreData): Bool = {
acquire_not_sent && req.isLoad && (new_req.isLoad || new_req.isStore)
def before_req_sent_can_merge(new_req: MissReqWoStoreData): Bool = {
acquire_not_sent && (req.isFromLoad || req.isFromPrefetch) && (new_req.isFromLoad || new_req.isFromStore)
}
def before_data_refill_can_merge(new_req: MissReqWoStoreData): Bool = {
data_not_refilled && (req.isLoad || req.isStore) && new_req.isLoad
data_not_refilled && (req.isFromLoad || req.isFromStore || req.isFromPrefetch) && new_req.isFromLoad
}
// Note that late prefetch will be ignored
def should_merge(new_req: MissReqWoStoreData): Bool = {
val block_match = get_block(req.addr) === get_block(new_req.addr)
block_match &&
(
before_read_sent_can_merge(new_req) ||
before_req_sent_can_merge(new_req) ||
before_data_refill_can_merge(new_req)
)
}
......@@ -392,7 +411,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
req_valid &&
Mux(
block_match,
!before_read_sent_can_merge(new_req) &&
!before_req_sent_can_merge(new_req) &&
!before_data_refill_can_merge(new_req),
set_match && new_req.way_en === req.way_en
)
......@@ -437,7 +456,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// resolve cache alias by L2
io.mem_acquire.bits.user.lift(AliasKey).foreach( _ := req.vaddr(13, 12))
// trigger prefetch
io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := true.B)
io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := Mux(io.l2_pf_store_only, req.isFromStore, true.B))
// prefer not to cache data in L2 by default
io.mem_acquire.bits.user.lift(PreferCacheKey).foreach(_ := false.B)
require(nSets <= 256)
......@@ -471,7 +490,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
refill.addr := req.addr
refill.way_en := req.way_en
refill.wmask := Mux(
hasData || req.isLoad,
hasData || req.isFromLoad,
~0.U(DCacheBanks.W),
VecInit((0 until DCacheBanks).map(i => get_mask_of_bank(i, req_store_mask).orR)).asUInt
)
......@@ -493,6 +512,8 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
}
refill.meta.coh := ClientMetadata(missCohGen(req.cmd, grant_param, isDirty))
refill.error := error
refill.prefetch := prefetch
refill.access := access
refill.alias := req.vaddr(13, 12) // TODO
io.main_pipe_req.valid := !s_mainpipe_req && w_grantlast
......@@ -535,13 +556,14 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
XSPerfAccumulate("penalty_waiting_for_channel_D", s_acquire && !w_grantlast && !io.mem_grant.valid)
XSPerfAccumulate("penalty_waiting_for_channel_E", io.mem_finish.valid && !io.mem_finish.ready)
XSPerfAccumulate("penalty_from_grant_to_refill", !w_refill_resp && w_grantlast)
XSPerfAccumulate("soft_prefetch_number", primary_fire && io.req.bits.source === SOFT_PREFETCH.U)
XSPerfAccumulate("prefetch_req_primary", primary_fire && io.req.bits.source === DCACHE_PREFETCH_SOURCE.U)
XSPerfAccumulate("prefetch_req_merged", secondary_fire && io.req.bits.source === DCACHE_PREFETCH_SOURCE.U)
val (mshr_penalty_sample, mshr_penalty) = TransactionLatencyCounter(RegNext(primary_fire), release_entry)
XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 0, 20, 1, true, true)
XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 20, 100, 10, true, false)
val load_miss_begin = primary_fire && io.req.bits.isLoad
val load_miss_begin = primary_fire && io.req.bits.isFromLoad
val refill_finished = RegNext(!w_grantlast && refill_done) && should_refill_data
val (load_miss_penalty_sample, load_miss_penalty) = TransactionLatencyCounter(load_miss_begin, refill_finished) // not real refill finish time
XSPerfHistogram("load_miss_penalty_to_use", load_miss_penalty, load_miss_penalty_sample, 0, 20, 1, true, true)
......@@ -590,6 +612,7 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
// forward missqueue
val forward = Vec(LoadPipelineWidth, new LduToMissqueueForwardIO)
val l2_pf_store_only = Input(Bool())
})
// 128KBL1: FIXME: provide vaddr for l2
......@@ -656,6 +679,7 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
e.io.hartId := io.hartId
e.io.id := i.U
e.io.l2_pf_store_only := io.l2_pf_store_only
e.io.req.valid := io.req.valid
e.io.primary_valid := io.req.valid &&
!merge &&
......@@ -707,6 +731,17 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
io.full := ~Cat(entries.map(_.io.primary_ready)).andR
// L1MissTrace Chisel DB
val debug_miss_trace = Wire(new L1MissTrace)
debug_miss_trace.vaddr := io.req.bits.vaddr
debug_miss_trace.paddr := io.req.bits.addr
debug_miss_trace.source := io.req.bits.source
debug_miss_trace.pc := io.req.bits.pc
val table = ChiselDB.createTable("L1MissQMissTrace_hart"+ p(XSCoreParamsKey).HartId.toString, new L1MissTrace)
table.log(debug_miss_trace, io.req.valid && !io.req.bits.cancel && alloc, "MissQueue", clock, reset)
// Difftest
if (env.EnableDifftest) {
val difftest = Module(new DifftestRefillEvent)
difftest.io.clock := clock
......@@ -717,11 +752,14 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
difftest.io.data := io.refill_to_ldq.bits.data_raw.asTypeOf(difftest.io.data)
}
// Perf count
XSPerfAccumulate("miss_req", io.req.fire())
XSPerfAccumulate("miss_req_allocate", io.req.fire() && alloc)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && io.req.bits.isLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isLoad)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isFromLoad)
XSPerfAccumulate("probe_blocked_by_miss", io.probe_block)
XSPerfAccumulate("prefetch_primary_fire", io.req.fire() && alloc && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_secondary_fire", io.req.fire() && merge && io.req.bits.isFromPrefetch)
val max_inflight = RegInit(0.U((log2Up(cfg.nMissEntries) + 1).W))
val num_valids = PopCount(~Cat(primary_ready_vec).asUInt)
when (num_valids > max_inflight) {
......
......@@ -30,6 +30,8 @@ class RefillPipeReqCtrl(implicit p: Parameters) extends DCacheBundle {
val id = UInt(reqIdWidth.W)
val error = Bool()
val prefetch = Bool()
val access = Bool()
def paddrWithVirtualAlias: UInt = {
Cat(alias, addr(DCacheSameVPAddrLength - 1, 0))
......@@ -51,6 +53,8 @@ class RefillPipeReq(implicit p: Parameters) extends RefillPipeReqCtrl {
ctrl.miss_id := miss_id
ctrl.id := id
ctrl.error := error
ctrl.prefetch := prefetch
ctrl.access := access
ctrl
}
}
......@@ -67,8 +71,10 @@ class RefillPipe(implicit p: Parameters) extends DCacheModule {
val data_write = DecoupledIO(new L1BankedDataWriteReq)
val data_write_dup = Vec(DCacheBanks, Valid(new L1BankedDataWriteReqCtrl))
val meta_write = DecoupledIO(new MetaWriteReq)
val error_flag_write = DecoupledIO(new ErrorWriteReq)
val meta_write = DecoupledIO(new CohMetaWriteReq)
val error_flag_write = DecoupledIO(new FlagMetaWriteReq)
val prefetch_flag_write = DecoupledIO(new FlagMetaWriteReq)
val access_flag_write = DecoupledIO(new FlagMetaWriteReq)
val tag_write = DecoupledIO(new TagWriteReq)
val store_resp = ValidIO(new DCacheLineResp)
val release_wakeup = ValidIO(UInt(log2Up(cfg.nMissEntries).W))
......@@ -113,7 +119,17 @@ class RefillPipe(implicit p: Parameters) extends DCacheModule {
io.error_flag_write.valid := io.req_dup_for_err_w.valid
io.error_flag_write.bits.idx := req_dup_for_err_w.idx
io.error_flag_write.bits.way_en := req_dup_for_err_w.way_en
io.error_flag_write.bits.error := refill_w_req.error
io.error_flag_write.bits.flag := refill_w_req.error
io.prefetch_flag_write.valid := io.req_dup_for_err_w.valid
io.prefetch_flag_write.bits.idx := req_dup_for_err_w.idx
io.prefetch_flag_write.bits.way_en := req_dup_for_err_w.way_en
io.prefetch_flag_write.bits.flag := refill_w_req.prefetch
io.access_flag_write.valid := io.req_dup_for_err_w.valid
io.access_flag_write.bits.idx := req_dup_for_err_w.idx
io.access_flag_write.bits.way_en := req_dup_for_err_w.way_en
io.access_flag_write.bits.flag := refill_w_req.access
io.tag_write.valid := io.req_dup_for_tag_w.valid
io.tag_write.bits.idx := req_dup_for_tag_w.idx
......
......@@ -40,19 +40,19 @@ class MetaReadReq(implicit p: Parameters) extends DCacheBundle {
val way_en = UInt(nWays.W)
}
class MetaWriteReq(implicit p: Parameters) extends MetaReadReq {
class CohMetaWriteReq(implicit p: Parameters) extends MetaReadReq {
val meta = new Meta
}
class ErrorWriteReq(implicit p: Parameters) extends MetaReadReq {
val error = Bool()
class FlagMetaWriteReq(implicit p: Parameters) extends MetaReadReq {
val flag = Bool()
}
class AsynchronousMetaArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
class L1CohMetaArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
val io = IO(new Bundle() {
val read = Vec(readPorts, Flipped(DecoupledIO(new MetaReadReq)))
val resp = Output(Vec(readPorts, Vec(nWays, new Meta)))
val write = Vec(writePorts, Flipped(DecoupledIO(new MetaWriteReq)))
val write = Vec(writePorts, Flipped(DecoupledIO(new CohMetaWriteReq)))
})
val meta_array = RegInit(
......@@ -103,12 +103,12 @@ class AsynchronousMetaArray(readPorts: Int, writePorts: Int)(implicit p: Paramet
}
}
class ErrorArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
class L1FlagMetaArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
val io = IO(new Bundle() {
val read = Vec(readPorts, Flipped(DecoupledIO(new MetaReadReq)))
val resp = Output(Vec(readPorts, Vec(nWays, Bool())))
val write = Vec(writePorts, Flipped(DecoupledIO(new ErrorWriteReq)))
// customized cache op port
val write = Vec(writePorts, Flipped(DecoupledIO(new FlagMetaWriteReq)))
// customized cache op port
// val cacheOp = Flipped(new L1CacheInnerOpIO)
})
......@@ -152,7 +152,7 @@ class ErrorArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extend
s0_way_wen(way)(wport) := write.valid && wen
s1_way_wen(way)(wport) := RegNext(s0_way_wen(way)(wport))
s1_way_waddr(way)(wport) := RegEnable(write.bits.idx, s0_way_wen(way)(wport))
s1_way_wdata(way)(wport) := RegEnable(write.bits.error, s0_way_wen(way)(wport))
s1_way_wdata(way)(wport) := RegEnable(write.bits.flag, s0_way_wen(way)(wport))
when (s1_way_wen(way)(wport)) {
meta_array(s1_way_waddr(way)(wport))(way) := s1_way_wdata(way)(wport)
}
......
......@@ -359,6 +359,8 @@ class TlbReq(implicit p: Parameters) extends TlbBundle {
val cmd = Output(TlbCmd())
val size = Output(UInt(log2Ceil(log2Ceil(XLEN/8)+1).W))
val kill = Output(Bool()) // Used by blocked tlb that needs to sync with other modules like icache
// do not translate, but still do pmp/pma check
val no_translate = Output(Bool())
val debug = new Bundle {
val pc = Output(UInt(XLEN.W))
val robIdx = Output(new RobPtr)
......
......@@ -72,6 +72,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
// val vmEnable = satp.mode === 8.U // && (mode < ModeM) // FIXME: fix me when boot xv6/linux...
val vmEnable = if (EnbaleTlbDebug) (satp.mode === 8.U)
else (satp.mode === 8.U && (mode < ModeM))
val portTranslateEnable = (0 until Width).map(i => vmEnable && !req(i).bits.no_translate)
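// a no_translate request keeps paddr := vaddr on its port but still goes
// through the pmp/pma check (see TlbReq.no_translate)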
val req_in = req
val req_out = req.map(a => RegEnable(a.bits, a.fire()))
......@@ -118,10 +119,11 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
def TLBRead(i: Int) = {
val (e_hit, e_ppn, e_perm, e_super_hit, e_super_ppn, static_pm) = entries.io.r_resp_apply(i)
val (p_hit, p_ppn, p_perm) = ptw_resp_bypass(get_pn(req_in(i).bits.vaddr))
val enable = portTranslateEnable(i)
val hit = e_hit || p_hit
val miss = !hit && vmEnable
val fast_miss = !(e_super_hit || p_hit) && vmEnable
val miss = !hit && enable
val fast_miss = !(e_super_hit || p_hit) && enable
hit.suggestName(s"hit_read_${i}")
miss.suggestName(s"miss_read_${i}")
......@@ -138,15 +140,15 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
perm(d) := Mux(p_hit, p_perm, e_perm(d))
val paddr = Cat(ppn(d), get_off(req_out(i).vaddr))
resp(i).bits.paddr(d) := Mux(vmEnable, paddr, vaddr)
resp(i).bits.paddr(d) := Mux(enable, paddr, vaddr)
}
XSDebug(req_out_v(i), p"(${i.U}) hit:${hit} miss:${miss} ppn:${Hexadecimal(ppn(0))} perm:${perm(0)}\n")
val pmp_paddr = Mux(vmEnable, Cat(Mux(p_hit, p_ppn, e_super_ppn), get_off(req_out(i).vaddr)), vaddr)
val pmp_paddr = Mux(enable, Cat(Mux(p_hit, p_ppn, e_super_ppn), get_off(req_out(i).vaddr)), vaddr)
// pmp_paddr is functionally the same as paddr. It abandons normal_ppn for timing optimization.
// val pmp_paddr = Mux(vmEnable, paddr, vaddr)
val static_pm_valid = !(e_super_hit || p_hit) && vmEnable && q.partialStaticPMP.B
// val pmp_paddr = Mux(enable, paddr, vaddr)
val static_pm_valid = !(e_super_hit || p_hit) && enable && q.partialStaticPMP.B
(hit, miss, pmp_paddr, static_pm, static_pm_valid, perm)
}
......@@ -174,7 +176,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
val ldPf = (ldPermFail || pf) && (TlbCmd.isRead(cmd) && !TlbCmd.isAmo(cmd))
val stPf = (stPermFail || pf) && (TlbCmd.isWrite(cmd) || TlbCmd.isAmo(cmd))
val instrPf = (instrPermFail || pf) && TlbCmd.isExec(cmd)
val fault_valid = vmEnable
val fault_valid = portTranslateEnable(idx)
resp(idx).bits.excp(nDups).pf.ld := (ldPf || ldUpdate) && fault_valid && !af
resp(idx).bits.excp(nDups).pf.st := (stPf || stUpdate) && fault_valid && !af
resp(idx).bits.excp(nDups).pf.instr := (instrPf || instrUpdate) && fault_valid && !af
......@@ -218,8 +220,8 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
io.ptw.req(idx).fire() || resp(idx).fire(), flush_pipe(idx))
// when ptw resp, check if hit, reset miss_v, resp to lsu/ifu
resp(idx).valid := req_out_v(idx) && !(miss_v && vmEnable)
when (io.ptw.resp.fire() && hit && req_out_v(idx) && vmEnable) {
resp(idx).valid := req_out_v(idx) && !(miss_v && portTranslateEnable(idx))
when (io.ptw.resp.fire() && hit && req_out_v(idx) && portTranslateEnable(idx)) {
val pte = io.ptw.resp.bits
resp(idx).valid := true.B
resp(idx).bits.miss := false.B // for blocked tlb, this is useless
......@@ -242,7 +244,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
// however, some outside modules like icache don't care about flushPipe and are still waiting for the tlb resp;
// just assert resp valid and raise a page fault to let the request go through. The pipe (ifu) will abandon it.
if (!q.outsideRecvFlush) {
when (req_out_v(idx) && flush_pipe(idx) && vmEnable) {
when (req_out_v(idx) && flush_pipe(idx) && portTranslateEnable(idx)) {
resp(idx).valid := true.B
for (d <- 0 until nRespDups) {
resp(idx).bits.excp(d).pf.ld := true.B // sfence happened, pf for not to use this addr
......@@ -271,21 +273,21 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
val result_ok = req_in.map(a => RegNext(a.fire()))
val perfEvents =
Seq(
("access", PopCount((0 until Width).map{i => if (Block(i)) io.requestor(i).req.fire() else vmEnable && result_ok(i) })),
("miss ", PopCount((0 until Width).map{i => if (Block(i)) vmEnable && result_ok(i) && missVec(i) else ptw.req(i).fire() })),
("access", PopCount((0 until Width).map{i => if (Block(i)) io.requestor(i).req.fire() else portTranslateEnable(i) && result_ok(i) })),
("miss ", PopCount((0 until Width).map{i => if (Block(i)) portTranslateEnable(i) && result_ok(i) && missVec(i) else ptw.req(i).fire() })),
)
generatePerfEvent()
// perf log
for (i <- 0 until Width) {
if (Block(i)) {
XSPerfAccumulate(s"access${i}",result_ok(i) && vmEnable)
XSPerfAccumulate(s"access${i}",result_ok(i) && portTranslateEnable(i))
XSPerfAccumulate(s"miss${i}", result_ok(i) && missVec(i))
} else {
XSPerfAccumulate("first_access" + Integer.toString(i, 10), result_ok(i) && vmEnable && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("access" + Integer.toString(i, 10), result_ok(i) && vmEnable)
XSPerfAccumulate("first_miss" + Integer.toString(i, 10), result_ok(i) && vmEnable && missVec(i) && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("miss" + Integer.toString(i, 10), result_ok(i) && vmEnable && missVec(i))
XSPerfAccumulate("first_access" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i) && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("access" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i))
XSPerfAccumulate("first_miss" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i) && missVec(i) && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("miss" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i) && missVec(i))
}
}
XSPerfAccumulate("ptw_resp_count", ptw.resp.fire())
......@@ -322,7 +324,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
val difftest = Module(new DifftestL1TLBEvent)
difftest.io.clock := clock
difftest.io.coreid := p(XSCoreParamsKey).HartId.asUInt
difftest.io.valid := RegNext(io.requestor(i).req.fire) && !RegNext(io.requestor(i).req_kill) && io.requestor(i).resp.fire && !io.requestor(i).resp.bits.miss && !pf && !af && vmEnable
difftest.io.valid := RegNext(io.requestor(i).req.fire) && !RegNext(io.requestor(i).req_kill) && io.requestor(i).resp.fire && !io.requestor(i).resp.bits.miss && !pf && !af && portTranslateEnable(i)
difftest.io.index := i.U
difftest.io.l1tlbid := l1tlbid
difftest.io.satp := io.csr.satp.ppn
......
......@@ -551,6 +551,7 @@ class NewIFU(implicit p: Parameters) extends XSModule
io.iTLBInter.req.bits.kill := false.B // IFU use itlb for mmio, doesn't need sync, set it to false
io.iTLBInter.req.bits.cmd := TlbCmd.exec
io.iTLBInter.req.bits.debug.robIdx := DontCare
io.iTLBInter.req.bits.no_translate := false.B
io.iTLBInter.req.bits.debug.isFirstIssue := DontCare
io.pmp.req.valid := (mmio_state === m_sendPMP) && f3_req_is_mmio
......
......@@ -196,6 +196,7 @@ class ICacheMainPipe(implicit p: Parameters) extends ICacheModule
toITLB.map{port =>
port.bits.cmd := TlbCmd.exec
port.bits.debug.robIdx := DontCare
port.bits.no_translate := false.B
port.bits.debug.isFirstIssue := DontCare
}
......
......@@ -107,6 +107,7 @@ class IPrefetchPipe(implicit p: Parameters) extends IPrefetchModule
toITLB.bits.kill := DontCare
toITLB.bits.cmd := TlbCmd.exec
toITLB.bits.debug.robIdx := DontCare
toITLB.bits.no_translate := false.B
toITLB.bits.debug.isFirstIssue := DontCare
......
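The IFU, ICacheMainPipe and IPrefetchPipe hunks all tie the new no_translate field low, since instruction-side requests always want translation. A sketch of how the added field plausibly sits in TlbReq; the field name comes from the diff, the rest of the bundle shape is an assumption:

  class TlbReq(implicit p: Parameters) extends TlbBundle {
    // ... existing fields: vaddr, cmd, size, kill, debug ...
    // new: when true, the TLB port returns vaddr as paddr and skips the lookup
    val no_translate = Bool()
  }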
......@@ -68,8 +68,10 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundleWithMicroOp with
val forwardMask = Vec(8, Bool())
val forwardData = Vec(8, UInt(8.W))
//softprefetch
val isSoftPrefetch = Bool()
// prefetch
val isPrefetch = Bool()
val isHWPrefetch = Bool()
def isSWPrefetch = isPrefetch && !isHWPrefetch
// For debug usage
val isFirstIssue = Bool()
......@@ -84,6 +86,37 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundleWithMicroOp with
val forward_tlDchannel = Bool()
}
class LdPrefetchTrainBundle(implicit p: Parameters) extends LsPipelineBundle {
val meta_prefetch = Bool()
val meta_access = Bool()
def fromLsPipelineBundle(input: LsPipelineBundle) = {
vaddr := input.vaddr
paddr := input.paddr
mask := input.mask
data := input.data
uop := input.uop
wlineflag := input.wlineflag
miss := input.miss
tlbMiss := input.tlbMiss
ptwBack := input.ptwBack
mmio := input.mmio
rsIdx := input.rsIdx
forwardMask := input.forwardMask
forwardData := input.forwardData
isPrefetch := input.isPrefetch
isHWPrefetch := input.isHWPrefetch
isFirstIssue := input.isFirstIssue
meta_prefetch := DontCare
meta_access := DontCare
forward_tlDchannel := DontCare
mshrid := DontCare
replayCarry := DontCare
atomic := DontCare
isLoadReplay := DontCare
}
}
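A possible use of LdPrefetchTrainBundle: a load unit repackages its stage-2 result and drives the prefetcher's training port. All names below (s2_out, s2_valid, the meta sources, io.prefetch_train) are assumptions for illustration:

  val train = Wire(new LdPrefetchTrainBundle())
  train.fromLsPipelineBundle(s2_out)        // s2_out: an LsPipelineBundle from load s2
  train.meta_prefetch := s2_hit_prefetched  // assumed: hit block was brought in by a prefetch
  train.meta_access := s2_dcache_accessed   // assumed: dcache access outcome
  io.prefetch_train.valid := s2_valid && !s2_out.isHWPrefetch
  io.prefetch_train.bits := train

Note that fromLsPipelineBundle sets meta_prefetch/meta_access to DontCare, so the two overrides after the call take effect (last connect wins).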
class LqWriteBundle(implicit p: Parameters) extends LsPipelineBundle {
// queue entry data, except flag bits, will be updated if writeQueue is true,
// valid bit in LqWriteBundle will be ignored
......@@ -104,7 +137,8 @@ class LqWriteBundle(implicit p: Parameters) extends LsPipelineBundle {
rsIdx := input.rsIdx
forwardMask := input.forwardMask
forwardData := input.forwardData
isSoftPrefetch := input.isSoftPrefetch
isPrefetch := input.isPrefetch
isHWPrefetch := input.isHWPrefetch
isFirstIssue := input.isFirstIssue
isLoadReplay := input.isLoadReplay
mshrid := input.mshrid
......
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem.trace
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
class L1MissTrace extends Bundle {
val vaddr = UInt(39.W)
val paddr = UInt(36.W)
val source = UInt(4.W)
val pc = UInt(39.W)
}
\ No newline at end of file
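A trace entry would be filled at the point a miss leaves the L1; a minimal sketch with assumed signal sources:

  val mtrace = Wire(new L1MissTrace)
  mtrace.vaddr := miss_req_vaddr    // assumed
  mtrace.paddr := miss_req_paddr    // assumed
  mtrace.source := miss_req_source  // assumed: which pipe/requestor missed
  mtrace.pc := miss_req_pc          // assumed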
......@@ -347,7 +347,8 @@ class LoadQueue(implicit p: Parameters) extends XSModule
})
(0 until LoadPipelineWidth).map(i => {
vaddrModule.io.raddr(LoadPipelineWidth + i) := loadReplaySelGen(i)
// vaddrModule read ports 0 and 1 are used by exception and mmio
vaddrModule.io.raddr(2 + i) := loadReplaySelGen(i)
})
(0 until LoadPipelineWidth).map(i => {
......
......@@ -55,6 +55,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
val in = Reg(new ExuInput())
val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
val atom_override_xtval = RegInit(false.B)
val have_sent_first_tlb_req = RegInit(false.B)
val isLr = in.uop.ctrl.fuOpType === LSUOpType.lr_w || in.uop.ctrl.fuOpType === LSUOpType.lr_d
// paddr after translation
val paddr = Reg(UInt())
......@@ -100,6 +101,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
in := io.in.bits
in.src(1) := in.src(1) // leave src2 unchanged
state := s_tlb_and_flush_sbuffer_req
have_sent_first_tlb_req := false.B
}
}
......@@ -136,7 +138,12 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
// send req to sbuffer to flush it if it is not empty
io.flush_sbuffer.valid := Mux(sbuffer_empty, false.B, true.B)
when(io.dtlb.resp.fire){
// do not accept the tlb resp in the first cycle
// this limitation is for the hw prefetcher:
// when !have_sent_first_tlb_req, the tlb resp may come from the hw prefetcher
have_sent_first_tlb_req := true.B
when(io.dtlb.resp.fire && have_sent_first_tlb_req){
paddr := io.dtlb.resp.bits.paddr(0)
// exception handling
val addrAligned = LookupTree(in.uop.ctrl.fuOpType(1,0), List(
......
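The have_sent_first_tlb_req guard matters when the atomics unit shares its dtlb port with a hardware prefetcher: a response that shows up before the unit's own first request has gone out can only belong to the other master. A hedged sketch of the kind of port sharing that creates the hazard; module and port names are assumptions:

  // two masters muxed onto one dtlb port; the resp is seen by both,
  // so each must ignore responses triggered by the other
  val tlb_arb = Module(new Arbiter(new TlbReq, 2))
  tlb_arb.io.in(0) <> atomicsUnit.io.dtlb.req   // assumed
  tlb_arb.io.in(1) <> prefetcher.io.tlb_req.req // assumed
  dtlb.io.requestor(0).req <> tlb_arb.io.out    // assumed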
......@@ -53,6 +53,7 @@ class StoreUnit_S0(implicit p: Parameters) extends XSModule {
io.dtlbReq.bits.size := LSUOpType.size(io.in.bits.uop.ctrl.fuOpType)
io.dtlbReq.bits.kill := DontCare
io.dtlbReq.bits.debug.robIdx := io.in.bits.uop.robIdx
io.dtlbReq.bits.no_translate := false.B
io.dtlbReq.bits.debug.pc := io.in.bits.uop.cf.pc
io.dtlbReq.bits.debug.isFirstIssue := io.isFirstIssue
......
package xiangshan.mem.prefetch
import chisel3._
import chisel3.util._
import chipsalliance.rocketchip.config.Parameters
import xiangshan._
import xiangshan.cache.mmu.TlbRequestIO
import xiangshan.mem.{LdPrefetchTrainBundle, L1PrefetchReq}
class PrefetcherIO()(implicit p: Parameters) extends XSBundle {
val ld_in = Flipped(Vec(exuParameters.LduCnt, ValidIO(new LdPrefetchTrainBundle())))
val tlb_req = new TlbRequestIO(nRespDups = 2)
val pf_addr = ValidIO(UInt(PAddrBits.W))
val l1_req = DecoupledIO(new L1PrefetchReq())
val enable = Input(Bool())
}
trait PrefetcherParams
abstract class BasePrefecher()(implicit p: Parameters) extends XSModule {
val io = IO(new PrefetcherIO())
}
\ No newline at end of file
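BasePrefecher (the spelling is the source's) pins down the interface every L1 prefetcher implements: training inputs, a tlb port, and two request channels. A minimal do-nothing subclass to show the contract; everything beyond the inherited io is illustrative:

  class NullPrefetcher()(implicit p: Parameters) extends BasePrefecher {
    io.tlb_req.req.valid := false.B
    io.tlb_req.req.bits := DontCare
    io.tlb_req.req_kill := false.B
    io.tlb_req.resp.ready := true.B
    io.pf_addr.valid := false.B
    io.pf_addr.bits := 0.U
    io.l1_req.valid := false.B
    io.l1_req.bits := DontCare
  }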
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan.ExceptionNO._
import xiangshan._
import xiangshan.backend.fu.PMPRespBundle
import xiangshan.cache._
import xiangshan.cache.mmu.{TlbCmd, TlbReq, TlbRequestIO, TlbResp}
class L1PrefetchReq (implicit p: Parameters) extends XSBundle with HasDCacheParameters{
val paddr = UInt(PAddrBits.W)
val alias = UInt(2.W)
val confidence = UInt(1.W)
val is_store = Bool()
// only the index bits are used, do not use the tag bits
def getVaddr(): UInt = {
Cat(alias, paddr(DCacheSameVPAddrLength-1, 0))
}
// when l1 cache prefetch req arrives at load unit:
// if (confidence == 1)
// override load unit 2 load req
// else if (load unit 1/2 is available)
// send prefetch req
// else
// report prefetch !ready
}
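The comment block above is the intended arbitration policy at the load unit, restated as a sketch; pf_req, ld1_idle and ld2_idle are assumed names:

  // confidence == 1: the prefetch is considered hot and may steal
  // load unit 2's issue slot; otherwise it only uses an idle unit
  pf_req.ready := Mux(pf_req.bits.confidence === 1.U,
    true.B,
    ld1_idle || ld2_idle)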
class L1PrefetchHint (implicit p: Parameters) extends XSBundle with HasDCacheParameters{
val loadbusy = Bool()
val missqbusy = Bool()
}
class L1PrefetchFuzzer(implicit p: Parameters) extends DCacheModule{
val io = IO(new Bundle() {
// prefetch req interface
val req = Decoupled(new L1PrefetchReq())
// for fuzzer address gen
val vaddr = Input(UInt(VAddrBits.W))
val paddr = Input(UInt(PAddrBits.W))
})
// a prefetch req queue is not provided here, so the prefetcher must
// maintain its own prefetch req queue.
val rand_offset = LFSR64(seed=Some(123L))(5,0) << 6
val rand_addr_select = LFSR64(seed=Some(567L))(3,0) === 0.U
// use valid vaddr and paddr
val rand_vaddr = DelayN(io.vaddr, 2)
val rand_paddr = DelayN(io.paddr, 2)
io.req.bits.paddr := 0x80000000L.U + rand_offset
io.req.bits.alias := io.req.bits.paddr(13,12)
io.req.bits.confidence := LFSR64(seed=Some(789L))(4,0) === 0.U
io.req.bits.is_store := LFSR64(seed=Some(890L))(4,0) === 0.U
io.req.valid := LFSR64(seed=Some(901L))(3,0) === 0.U
}
\ No newline at end of file
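A possible hookup for the fuzzer, which stands in for a real prefetcher during stress testing; the sampled vaddr/paddr sources and the sink port are assumptions:

  val fuzzer = Module(new L1PrefetchFuzzer())
  fuzzer.io.vaddr := loadUnits(0).io.debug_ls.vaddr // assumed: any live vaddr to sample
  fuzzer.io.paddr := loadUnits(0).io.debug_ls.paddr // assumed: any live paddr to sample
  loadUnits(1).io.prefetch_req <> fuzzer.io.req     // assumed prefetch issue port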