From 09203307ca868620f1a4adc68afd3002457ac935 Mon Sep 17 00:00:00 2001 From: William Wang Date: Sat, 2 Apr 2022 19:08:46 +0800 Subject: [PATCH] mem: reduce refill to use latency (#1401) * mem: optimize missq reject to lq timing DCache replay request is quite slow to generate, as it needs to compare load address with address in all valid miss queue entries. Now we delay the usage of replay request from data cache. Now replay request will not influence normal execution flow until load_s3 (1 cycle after load_s2, load result writeback to RS). It is worth mentioning that "select refilling inst for load writeback" will be disabled if dcacheRequireReplay in the last cycle. * dcache: compare probe block addr instead of full addr * mem: do not replay from RS when ldld vio or fwd failed ld-ld violation or forward failure will let a normal load inst replay from fetch. If TLB hit and ld-ld violation / forward failure happens, we write back that inst immediately. Meanwhile, such insts will not be replayed from rs. It should fix "mem: optimize missq reject to lq timing" * mem: fix replay from rs condition * mem: reduce refill to use latency This commit updates lq entry flags carefully in load_s3 to avoid extra refill delay. It will remove the extra refill delay introduced by #1375 without harming memblock timing. In #1375, we delayed load refill when dcache miss queue entry fails to accept a miss. #1375 exchanges performance for better timing. * mem: fix rs feedback priority When dataInvalid && mshrFull, a successful refill should not cancel rs replay. 
--- .../scala/xiangshan/backend/MemBlock.scala | 11 ++++---- .../cache/dcache/DCacheWrapper.scala | 4 +++ .../xiangshan/mem/lsqueue/LSQWrapper.scala | 4 +-- .../xiangshan/mem/lsqueue/LoadQueue.scala | 25 ++++++++---------- .../xiangshan/mem/pipeline/LoadUnit.scala | 26 +++++++++++++++---- 5 files changed, 43 insertions(+), 27 deletions(-) diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 036c13c97..44954de9c 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -103,6 +103,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) val dcache = outer.dcache.module val uncache = outer.uncache.module + val delayedDcacheRefill = RegNext(dcache.io.lsu.lsq) + val csrCtrl = DelayN(io.csrCtrl, 2) dcache.io.csr.distribute_csr <> csrCtrl.distribute_csr io.csrUpdate := RegNext(dcache.io.csr.update) @@ -261,6 +263,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) // ld-ld violation check loadUnits(i).io.lsq.loadViolationQuery <> lsq.io.loadViolationQuery(i) loadUnits(i).io.csrCtrl <> csrCtrl + // dcache refill req + loadUnits(i).io.refill <> delayedDcacheRefill // dtlb loadUnits(i).io.tlb <> dtlb_ld(i).requestor(0) // pmp @@ -430,9 +434,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) io.memoryViolation <> lsq.io.rollback lsq.io.uncache <> uncache.io.lsq // delay dcache refill for 1 cycle for better timing - // TODO: remove RegNext after fixing refill paddr timing - // lsq.io.dcache <> dcache.io.lsu.lsq - lsq.io.dcache := RegNext(dcache.io.lsu.lsq) + lsq.io.refill := delayedDcacheRefill lsq.io.release := dcache.io.lsu.release lsq.io.lqCancelCnt <> io.lqCancelCnt lsq.io.sqCancelCnt <> io.sqCancelCnt @@ -445,9 +447,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) // Sbuffer sbuffer.io.csrCtrl <> csrCtrl sbuffer.io.dcache <> dcache.io.lsu.store - // TODO: if dcache sbuffer resp needs 
to ne delayed - // sbuffer.io.dcache.pipe_resp.valid := RegNext(dcache.io.lsu.store.pipe_resp.valid) - // sbuffer.io.dcache.pipe_resp.bits := RegNext(dcache.io.lsu.store.pipe_resp.bits) // flush sbuffer val fenceFlush = io.fenceToSbuffer.flushSb diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala index f526fd7e5..471b563ef 100644 --- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala @@ -157,6 +157,10 @@ trait HasDCacheParameters extends HasL1CacheParameters { data(DCacheSRAMRowBytes * (bank + 1) - 1, DCacheSRAMRowBytes * bank) } + def refill_addr_hit(a: UInt, b: UInt): Bool = { + a(PAddrBits-1, DCacheIndexOffset) === b(PAddrBits-1, DCacheIndexOffset) + } + def arbiter[T <: Bundle]( in: Seq[DecoupledIO[T]], out: DecoupledIO[T], diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index e15905576..a42c0f591 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -70,7 +70,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO)) val rob = Flipped(new RobLsqIO) val rollback = Output(Valid(new Redirect)) - val dcache = Flipped(ValidIO(new Refill)) + val refill = Flipped(ValidIO(new Refill)) val release = Flipped(ValidIO(new Release)) val uncache = new DCacheWordIO val exceptionAddr = new ExceptionAddrIO @@ -120,7 +120,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet loadQueue.io.ldout <> io.ldout loadQueue.io.rob <> io.rob loadQueue.io.rollback <> io.rollback - loadQueue.io.dcache <> io.dcache + loadQueue.io.refill <> io.refill loadQueue.io.release <> io.release loadQueue.io.trigger <> io.trigger loadQueue.io.exceptionAddr.isStore 
:= DontCare diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index 48d6ffadd..3649c7a1d 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -93,7 +93,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO)) val rob = Flipped(new RobLsqIO) val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store - val dcache = Flipped(ValidIO(new Refill)) // TODO: to be renamed + val refill = Flipped(ValidIO(new Refill)) val release = Flipped(ValidIO(new Release)) val uncache = new DCacheWordIO val exceptionAddr = new ExceptionAddrIO @@ -264,15 +264,15 @@ class LoadQueue(implicit p: Parameters) extends XSModule vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire()) } - when(io.dcache.valid) { - XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data) + when(io.refill.valid) { + XSDebug("miss resp: paddr:0x%x data %x\n", io.refill.bits.addr, io.refill.bits.data) } // Refill 64 bit in a cycle // Refill data comes back from io.dcache.resp - dataModule.io.refill.valid := io.dcache.valid - dataModule.io.refill.paddr := io.dcache.bits.addr - dataModule.io.refill.data := io.dcache.bits.data + dataModule.io.refill.valid := io.refill.valid + dataModule.io.refill.paddr := io.refill.bits.addr + dataModule.io.refill.data := io.refill.bits.data val dcacheRequireReplay = WireInit(VecInit((0 until LoadPipelineWidth).map(i =>{ RegNext(io.loadIn(i).fire()) && RegNext(io.dcacheRequireReplay(i)) @@ -284,10 +284,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule when(dataModule.io.refill.valid && dataModule.io.refill.refillMask(i) && dataModule.io.refill.matchMask(i)) { datavalid(i) := true.B miss(i) := false.B - when(!dcacheRequireReplay.asUInt.orR){ - refilling(i) := true.B - } - 
when(io.dcache.bits.error) { + when(io.refill.bits.error) { error(i) := true.B } } @@ -299,7 +296,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule // dcacheRequireReplay will be used to update lq flag 1 cycle after for better timing // // io.dcacheRequireReplay comes from dcache miss req reject, which is quite slow to generate - when(dcacheRequireReplay(i)) { + when(dcacheRequireReplay(i) && !refill_addr_hit(RegNext(io.loadIn(i).bits.paddr), io.refill.bits.addr)) { // do not writeback if that inst will be resend from rs // rob writeback will not be triggered by a refill before inst replay miss(RegNext(loadWbIndex)) := false.B // disable refill listening @@ -790,7 +787,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule dataModule.io.uncacheWrite(deqPtr, io.uncache.resp.bits.data(XLEN-1, 0)) dataModule.io.uncache.wen := true.B - XSDebug("uncache resp: data %x\n", io.dcache.bits.data) + XSDebug("uncache resp: data %x\n", io.refill.bits.data) } // Read vaddr for mem exception @@ -849,7 +846,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule XSPerfAccumulate("rollback", io.rollback.valid) // rollback redirect generated XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req XSPerfAccumulate("mmioCnt", io.uncache.req.fire()) - XSPerfAccumulate("refill", io.dcache.valid) + XSPerfAccumulate("refill", io.refill.valid) XSPerfAccumulate("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire())))) XSPerfAccumulate("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready)))) XSPerfAccumulate("utilization_miss", PopCount((0 until LoadQueueSize).map(i => allocated(i) && miss(i)))) @@ -858,7 +855,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule ("rollback ", io.rollback.valid ), ("mmioCycle ", uncacheState =/= s_idle ), ("mmio_Cnt ", io.uncache.req.fire() ), - ("refill ", io.dcache.valid ), + ("refill ", io.refill.valid ), ("writeback_success", 
PopCount(VecInit(io.ldout.map(i => i.fire()))) ), ("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))) ), ("ltq_1_4_valid ", (validCount < (LoadQueueSize.U/4.U)) ), diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 56f2b85c0..1663164a7 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -451,10 +451,12 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { io.rsFeedback.bits.hit := !s2_need_replay_from_rs io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx io.rsFeedback.bits.flushState := io.in.bits.ptwBack + // feedback source priority: tlbMiss > dataInvalid > mshrFull + // general case priority: tlbMiss > exception (include forward_fail / ldld_violation) > mmio > dataInvalid > mshrFull > normal miss / hit io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss, - Mux(s2_cache_replay, - RSFeedbackType.mshrFull, - RSFeedbackType.dataInvalid + Mux(s2_data_invalid, + RSFeedbackType.dataInvalid, + RSFeedbackType.mshrFull ) ) io.rsFeedback.bits.dataInvalidSqIdx.value := io.dataInvalidSqIdx @@ -496,7 +498,11 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { XSPerfAccumulate("replay_from_fetch_load_vio", io.out.valid && ldldVioReplay) } -class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with HasPerfEvents { +class LoadUnit(implicit p: Parameters) extends XSModule + with HasLoadHelper + with HasPerfEvents + with HasDCacheParameters +{ val io = IO(new Bundle() { val ldin = Flipped(Decoupled(new ExuInput)) val ldout = Decoupled(new ExuOutput) @@ -508,6 +514,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with val dcache = new DCacheLoadIO val sbuffer = new LoadForwardQueryIO val lsq = new LoadToLsqIO + val refill = Flipped(ValidIO(new Refill)) val fastUop = ValidIO(new 
MicroOp) // early wakeup signal generated in load_s1 val trigger = Vec(3, new LoadUnitTriggerIO) @@ -569,8 +576,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with io.lsq.dcacheRequireReplay := load_s2.io.dcacheRequireReplay // feedback tlb miss / dcache miss queue full - io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits) io.feedbackSlow.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect)) + io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits) + val s3_replay_for_mshrfull = RegNext(!load_s2.io.rsFeedback.bits.hit && load_s2.io.rsFeedback.bits.sourceType === RSFeedbackType.mshrFull) + val s3_refill_hit_load_paddr = refill_addr_hit(RegNext(load_s2.io.out.bits.paddr), io.refill.bits.addr) + // update replay request + io.feedbackSlow.bits.hit := RegNext(load_s2.io.rsFeedback.bits).hit || + s3_refill_hit_load_paddr && s3_replay_for_mshrfull // feedback bank conflict to rs io.feedbackFast.bits := load_s1.io.rsFeedback.bits @@ -635,7 +647,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with io.lsq.ldout.ready := !hitLoadOut.valid when(io.feedbackSlow.valid && !io.feedbackSlow.bits.hit){ + // when need replay from rs, inst should not be writebacked to rob assert(RegNext(!hitLoadOut.valid)) + // when need replay from rs + // * inst should not be writebacked to lq, or + // * lq state will be updated in load_s3 (next cycle) assert(RegNext(!io.lsq.loadIn.valid) || RegNext(load_s2.io.dcacheRequireReplay)) } -- GitLab