Unverified commit 09203307, authored by William Wang, committed by GitHub

mem: reduce refill to use latency (#1401)

* mem: optimize missq reject to lq timing

The DCache replay request is quite slow to generate, as it needs to compare
the load address with the addresses in all valid miss queue entries.

We now delay the usage of the replay request from the data cache: the replay
request does not influence the normal execution flow until load_s3 (1 cycle
after load_s2, when the load result is written back to RS).

It is worth mentioning that "select refilling inst for load writeback" will
be disabled if dcacheRequireReplay was set in the last cycle.
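
A minimal sketch of the delayed-consumption idea (all names here are illustrative, not the actual XiangShan signals): the slow miss-queue reject is registered once, so it only gates state updates one stage later, in load_s3.

```scala
import chisel3._

// Sketch only: SlowRejectDelay, missQueueReject, etc. are hypothetical names.
class SlowRejectDelay extends Module {
  val io = IO(new Bundle {
    val missQueueReject = Input(Bool())  // slow: derived from a wide address compare
    val loadS2Fire      = Input(Bool())
    val replayInS3      = Output(Bool()) // consumed one cycle later, in load_s3
  })
  // Register the late-arriving reject so it only affects state updates in the
  // next stage; load_s2 writeback to RS proceeds unaffected this cycle.
  io.replayInS3 := RegNext(io.loadS2Fire && io.missQueueReject, init = false.B)
}
```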

* dcache: compare probe block addr instead of full addr
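
For reference, a block-address compare simply drops the low offset bits before the equality check; a hedged sketch with assumed widths standing in for the real PAddrBits / DCacheIndexOffset parameters (see the refill_addr_hit helper added below):

```scala
import chisel3._

object BlockAddrCompare {
  // A probe (or refill) only needs to match the block address, so the
  // compare drops the low offset bits. Default widths are assumptions.
  def blockAddrHit(a: UInt, b: UInt, paddrBits: Int = 36, offsetBits: Int = 6): Bool =
    a(paddrBits - 1, offsetBits) === b(paddrBits - 1, offsetBits)
}
```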

* mem: do not replay from RS when ldld vio or fwd failed

An ld-ld violation or a forward failure makes a normal load inst replay from
fetch. If the TLB hits and an ld-ld violation / forward failure happens, we
write that inst back immediately; such insts will not be replayed from RS.

It should fix "mem: optimize missq reject to lq timing".
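
A hedged sketch of this policy (hypothetical IO names, not the real LoadUnit interface):

```scala
import chisel3._

// Sketch: on a TLB hit with an ld-ld violation or forward failure, write the
// load back immediately and mark it for replay-from-fetch; RS is told the
// load "hit" so it never replays it.
class ReplayFromFetchSketch extends Module {
  val io = IO(new Bundle {
    val tlbHit        = Input(Bool())
    val ldldViolation = Input(Bool())
    val forwardFail   = Input(Bool())
    val wbValid       = Output(Bool()) // writeback this cycle
    val replayInst    = Output(Bool()) // refetch from the frontend
    val rsFeedbackHit = Output(Bool()) // suppress RS replay
  })
  val needRefetch = io.tlbHit && (io.ldldViolation || io.forwardFail)
  io.wbValid       := needRefetch // write back at once instead of waiting
  io.replayInst    := needRefetch // the frontend refetches the instruction
  io.rsFeedbackHit := needRefetch // RS sees a hit, so no replay from RS
}
```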

* mem: fix replay from rs condition

* mem: reduce refill to use latency

This commit updates the lq entry flags carefully in load_s3 to avoid extra
refill delay. It removes the extra refill delay introduced by #1375 without
harming memblock timing.

In #1375, we delayed the load refill when a dcache miss queue entry failed
to accept a miss; #1375 traded performance for better timing.
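
The fix can be pictured as follows (a sketch with illustrative names, mirroring the s3_refill_hit_load_paddr logic in the LoadUnit hunk below): if the refill seen in load_s3 covers the missed load's block, report hit to RS so the mshrFull replay is cancelled and the load uses the refilled data directly.

```scala
import chisel3._

// Sketch only: names and parameters are illustrative.
class RefillCancelsReplaySketch(paddrBits: Int, blockOffBits: Int) extends Module {
  val io = IO(new Bundle {
    val s2Paddr     = Input(UInt(paddrBits.W))
    val s2MshrFull  = Input(Bool()) // load_s2 wanted a mshrFull replay
    val refillValid = Input(Bool())
    val refillAddr  = Input(UInt(paddrBits.W))
    val feedbackHit = Output(Bool()) // final hit bit sent to RS in load_s3
  })
  def blockHit(a: UInt, b: UInt): Bool =
    a(paddrBits - 1, blockOffBits) === b(paddrBits - 1, blockOffBits)

  val s3Paddr    = RegNext(io.s2Paddr)
  val s3MshrFull = RegNext(io.s2MshrFull, init = false.B)
  // A refill for the same block arrived just in time: drop the replay.
  io.feedbackHit := s3MshrFull && io.refillValid && blockHit(s3Paddr, io.refillAddr)
}
```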

* mem: fix rs feedback priority

When dataInvalid && mshrFull, a successful refill should not cancel the RS
replay.
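
A sketch of the corrected priority (encodings are illustrative; the real code uses RSFeedbackType, see the LoadUnit_S2 hunk below). With the old order, dataInvalid && mshrFull was reported as mshrFull, so a later refill could wrongly cancel a replay that was actually waiting on store data.

```scala
import chisel3._

// Sketch: feedback priority tlbMiss > dataInvalid > mshrFull.
class FeedbackPrioritySketch extends Module {
  val io = IO(new Bundle {
    val tlbMiss     = Input(Bool())
    val dataInvalid = Input(Bool())
    val mshrFull    = Input(Bool())
    val sourceType  = Output(UInt(2.W))
  })
  val TLB_MISS     = 0.U(2.W) // illustrative encodings
  val DATA_INVALID = 1.U(2.W)
  val MSHR_FULL    = 2.U(2.W)
  // dataInvalid is checked before mshrFull, so a load waiting on store data
  // is never misreported as a mshrFull replay.
  io.sourceType := Mux(io.tlbMiss, TLB_MISS,
                   Mux(io.dataInvalid, DATA_INVALID, MSHR_FULL))
}
```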
Parent 8d8ac704
@@ -103,6 +103,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   val dcache = outer.dcache.module
   val uncache = outer.uncache.module
+  val delayedDcacheRefill = RegNext(dcache.io.lsu.lsq)
+
   val csrCtrl = DelayN(io.csrCtrl, 2)
   dcache.io.csr.distribute_csr <> csrCtrl.distribute_csr
   io.csrUpdate := RegNext(dcache.io.csr.update)
@@ -261,6 +263,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     // ld-ld violation check
     loadUnits(i).io.lsq.loadViolationQuery <> lsq.io.loadViolationQuery(i)
     loadUnits(i).io.csrCtrl <> csrCtrl
+    // dcache refill req
+    loadUnits(i).io.refill <> delayedDcacheRefill
     // dtlb
     loadUnits(i).io.tlb <> dtlb_ld(i).requestor(0)
     // pmp
@@ -430,9 +434,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   io.memoryViolation <> lsq.io.rollback
   lsq.io.uncache <> uncache.io.lsq
-  // delay dcache refill for 1 cycle for better timing
-  // TODO: remove RegNext after fixing refill paddr timing
-  // lsq.io.dcache <> dcache.io.lsu.lsq
-  lsq.io.dcache := RegNext(dcache.io.lsu.lsq)
+  lsq.io.refill := delayedDcacheRefill
   lsq.io.release := dcache.io.lsu.release
   lsq.io.lqCancelCnt <> io.lqCancelCnt
   lsq.io.sqCancelCnt <> io.sqCancelCnt
@@ -445,9 +447,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   // Sbuffer
   sbuffer.io.csrCtrl <> csrCtrl
   sbuffer.io.dcache <> dcache.io.lsu.store
-  // TODO: if dcache sbuffer resp needs to be delayed
-  // sbuffer.io.dcache.pipe_resp.valid := RegNext(dcache.io.lsu.store.pipe_resp.valid)
-  // sbuffer.io.dcache.pipe_resp.bits := RegNext(dcache.io.lsu.store.pipe_resp.bits)
   // flush sbuffer
   val fenceFlush = io.fenceToSbuffer.flushSb
@@ -157,6 +157,10 @@ trait HasDCacheParameters extends HasL1CacheParameters {
     data(DCacheSRAMRowBytes * (bank + 1) - 1, DCacheSRAMRowBytes * bank)
   }
 
+  def refill_addr_hit(a: UInt, b: UInt): Bool = {
+    a(PAddrBits-1, DCacheIndexOffset) === b(PAddrBits-1, DCacheIndexOffset)
+  }
+
   def arbiter[T <: Bundle](
     in: Seq[DecoupledIO[T]],
     out: DecoupledIO[T],
@@ -70,7 +70,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParameters
     val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO))
     val rob = Flipped(new RobLsqIO)
     val rollback = Output(Valid(new Redirect))
-    val dcache = Flipped(ValidIO(new Refill))
+    val refill = Flipped(ValidIO(new Refill))
     val release = Flipped(ValidIO(new Release))
     val uncache = new DCacheWordIO
     val exceptionAddr = new ExceptionAddrIO
@@ -120,7 +120,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParameters
   loadQueue.io.ldout <> io.ldout
   loadQueue.io.rob <> io.rob
   loadQueue.io.rollback <> io.rollback
-  loadQueue.io.dcache <> io.dcache
+  loadQueue.io.refill <> io.refill
   loadQueue.io.release <> io.release
   loadQueue.io.trigger <> io.trigger
   loadQueue.io.exceptionAddr.isStore := DontCare
@@ -93,7 +93,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO))
     val rob = Flipped(new RobLsqIO)
     val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store
-    val dcache = Flipped(ValidIO(new Refill)) // TODO: to be renamed
+    val refill = Flipped(ValidIO(new Refill))
     val release = Flipped(ValidIO(new Release))
     val uncache = new DCacheWordIO
     val exceptionAddr = new ExceptionAddrIO
@@ -264,15 +264,15 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire())
   }
 
-  when(io.dcache.valid) {
-    XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data)
+  when(io.refill.valid) {
+    XSDebug("miss resp: paddr:0x%x data %x\n", io.refill.bits.addr, io.refill.bits.data)
   }
 
   // Refill 64 bit in a cycle
   // Refill data comes back from io.dcache.resp
-  dataModule.io.refill.valid := io.dcache.valid
-  dataModule.io.refill.paddr := io.dcache.bits.addr
-  dataModule.io.refill.data := io.dcache.bits.data
+  dataModule.io.refill.valid := io.refill.valid
+  dataModule.io.refill.paddr := io.refill.bits.addr
+  dataModule.io.refill.data := io.refill.bits.data
 
   val dcacheRequireReplay = WireInit(VecInit((0 until LoadPipelineWidth).map(i =>{
     RegNext(io.loadIn(i).fire()) && RegNext(io.dcacheRequireReplay(i))
@@ -284,10 +284,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     when(dataModule.io.refill.valid && dataModule.io.refill.refillMask(i) && dataModule.io.refill.matchMask(i)) {
       datavalid(i) := true.B
       miss(i) := false.B
-      when(!dcacheRequireReplay.asUInt.orR){
-        refilling(i) := true.B
-      }
-      when(io.dcache.bits.error) {
+      when(io.refill.bits.error) {
         error(i) := true.B
       }
     }
@@ -299,7 +296,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     // dcacheRequireReplay will be used to update lq flag 1 cycle after for better timing
     //
     // io.dcacheRequireReplay comes from dcache miss req reject, which is quite slow to generate
-    when(dcacheRequireReplay(i)) {
+    when(dcacheRequireReplay(i) && !refill_addr_hit(RegNext(io.loadIn(i).bits.paddr), io.refill.bits.addr)) {
       // do not writeback if that inst will be resend from rs
       // rob writeback will not be triggered by a refill before inst replay
       miss(RegNext(loadWbIndex)) := false.B // disable refill listening
@@ -790,7 +787,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     dataModule.io.uncacheWrite(deqPtr, io.uncache.resp.bits.data(XLEN-1, 0))
     dataModule.io.uncache.wen := true.B
 
-    XSDebug("uncache resp: data %x\n", io.dcache.bits.data)
+    XSDebug("uncache resp: data %x\n", io.refill.bits.data)
   }
 
   // Read vaddr for mem exception
@@ -849,7 +846,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
   XSPerfAccumulate("rollback", io.rollback.valid) // rollback redirect generated
   XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req
   XSPerfAccumulate("mmioCnt", io.uncache.req.fire())
-  XSPerfAccumulate("refill", io.dcache.valid)
+  XSPerfAccumulate("refill", io.refill.valid)
   XSPerfAccumulate("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire()))))
   XSPerfAccumulate("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))))
   XSPerfAccumulate("utilization_miss", PopCount((0 until LoadQueueSize).map(i => allocated(i) && miss(i))))
@@ -858,7 +855,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     ("rollback ", io.rollback.valid ),
     ("mmioCycle ", uncacheState =/= s_idle ),
     ("mmio_Cnt ", io.uncache.req.fire() ),
-    ("refill ", io.dcache.valid ),
+    ("refill ", io.refill.valid ),
     ("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire()))) ),
     ("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))) ),
     ("ltq_1_4_valid ", (validCount < (LoadQueueSize.U/4.U)) ),
@@ -451,10 +451,12 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
   io.rsFeedback.bits.hit := !s2_need_replay_from_rs
   io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
   io.rsFeedback.bits.flushState := io.in.bits.ptwBack
+  // feedback source priority: tlbMiss > dataInvalid > mshrFull
+  // general case priority: tlbMiss > exception (include forward_fail / ldld_violation) > mmio > dataInvalid > mshrFull > normal miss / hit
   io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss,
-    Mux(s2_cache_replay,
-      RSFeedbackType.mshrFull,
-      RSFeedbackType.dataInvalid
+    Mux(s2_data_invalid,
+      RSFeedbackType.dataInvalid,
+      RSFeedbackType.mshrFull
     )
   )
   io.rsFeedback.bits.dataInvalidSqIdx.value := io.dataInvalidSqIdx
@@ -496,7 +498,11 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
   XSPerfAccumulate("replay_from_fetch_load_vio", io.out.valid && ldldVioReplay)
 }
 
-class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with HasPerfEvents {
+class LoadUnit(implicit p: Parameters) extends XSModule
+  with HasLoadHelper
+  with HasPerfEvents
+  with HasDCacheParameters
+{
   val io = IO(new Bundle() {
     val ldin = Flipped(Decoupled(new ExuInput))
     val ldout = Decoupled(new ExuOutput)
@@ -508,6 +514,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
     val dcache = new DCacheLoadIO
     val sbuffer = new LoadForwardQueryIO
     val lsq = new LoadToLsqIO
+    val refill = Flipped(ValidIO(new Refill))
     val fastUop = ValidIO(new MicroOp) // early wakeup signal generated in load_s1
     val trigger = Vec(3, new LoadUnitTriggerIO)
@@ -569,8 +576,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   io.lsq.dcacheRequireReplay := load_s2.io.dcacheRequireReplay
 
   // feedback tlb miss / dcache miss queue full
-  io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits)
   io.feedbackSlow.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect))
+  io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits)
+  val s3_replay_for_mshrfull = RegNext(!load_s2.io.rsFeedback.bits.hit && load_s2.io.rsFeedback.bits.sourceType === RSFeedbackType.mshrFull)
+  val s3_refill_hit_load_paddr = refill_addr_hit(RegNext(load_s2.io.out.bits.paddr), io.refill.bits.addr)
+  // update replay request
+  io.feedbackSlow.bits.hit := RegNext(load_s2.io.rsFeedback.bits).hit ||
+    s3_refill_hit_load_paddr && s3_replay_for_mshrfull
 
   // feedback bank conflict to rs
   io.feedbackFast.bits := load_s1.io.rsFeedback.bits
@@ -635,7 +647,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   io.lsq.ldout.ready := !hitLoadOut.valid
 
   when(io.feedbackSlow.valid && !io.feedbackSlow.bits.hit){
-    // when need replay from rs, inst should not be writebacked to rob
-    assert(RegNext(!hitLoadOut.valid))
+    // when need replay from rs
+    // * inst should not be writebacked to lq, or
+    // * lq state will be updated in load_s3 (next cycle)
+    assert(RegNext(!io.lsq.loadIn.valid) || RegNext(load_s2.io.dcacheRequireReplay))
   }