From 09203307ca868620f1a4adc68afd3002457ac935 Mon Sep 17 00:00:00 2001 From: William Wang Date: Sat, 2 Apr 2022 19:08:46 +0800 Subject: [PATCH] mem: reduce refill to use latency (#1401) * mem: optimize missq reject to lq timing DCache replay request is quite slow to generate, as it needs to compare load address with address in all valid miss queue entries. Now we delay the usage of replay request from data cache. Now replay request will not influence normal execution flow until load_s3 (1 cycle after load_s2, load result writeback to RS). It is worth mentioning that "select refilling inst for load writeback" will be disabled if dcacheRequireReplay in the last cycle. * dcache: compare probe block addr instead of full addr * mem: do not replay from RS when ldld vio or fwd failed ld-ld violation or forward failure will let a normal load inst replay from fetch. If TLB hit and ld-ld violation / forward failure happens, we write back that inst immediately. Meanwhile, such insts will not be replayed from rs. It should fix "mem: optimize missq reject to lq timing" * mem: fix replay from rs condition * mem: reduce refill to use latency This commit updates lq entry flags carefully in load_s3 to avoid extra refill delay. It will remove the extra refill delay introduced by #1375 without harming memblock timing. In #1375, we delayed load refill when dcache miss queue entry fails to accept a miss. #1375 exchanges performance for better timing. * mem: fix rs feedback priority When dataInvalid && mshrFull, a successful refill should not cancel rs replay. 
--- .../scala/xiangshan/backend/MemBlock.scala | 11 ++++---- .../cache/dcache/DCacheWrapper.scala | 4 +++ .../xiangshan/mem/lsqueue/LSQWrapper.scala | 4 +-- .../xiangshan/mem/lsqueue/LoadQueue.scala | 25 ++++++++---------- .../xiangshan/mem/pipeline/LoadUnit.scala | 26 +++++++++++++++---- 5 files changed, 43 insertions(+), 27 deletions(-) diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 036c13c97..44954de9c 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -103,6 +103,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) val dcache = outer.dcache.module val uncache = outer.uncache.module + val delayedDcacheRefill = RegNext(dcache.io.lsu.lsq) + val csrCtrl = DelayN(io.csrCtrl, 2) dcache.io.csr.distribute_csr <> csrCtrl.distribute_csr io.csrUpdate := RegNext(dcache.io.csr.update) @@ -261,6 +263,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) // ld-ld violation check loadUnits(i).io.lsq.loadViolationQuery <> lsq.io.loadViolationQuery(i) loadUnits(i).io.csrCtrl <> csrCtrl + // dcache refill req + loadUnits(i).io.refill <> delayedDcacheRefill // dtlb loadUnits(i).io.tlb <> dtlb_ld(i).requestor(0) // pmp @@ -430,9 +434,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) io.memoryViolation <> lsq.io.rollback lsq.io.uncache <> uncache.io.lsq // delay dcache refill for 1 cycle for better timing - // TODO: remove RegNext after fixing refill paddr timing - // lsq.io.dcache <> dcache.io.lsu.lsq - lsq.io.dcache := RegNext(dcache.io.lsu.lsq) + lsq.io.refill := delayedDcacheRefill lsq.io.release := dcache.io.lsu.release lsq.io.lqCancelCnt <> io.lqCancelCnt lsq.io.sqCancelCnt <> io.sqCancelCnt @@ -445,9 +447,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) // Sbuffer sbuffer.io.csrCtrl <> csrCtrl sbuffer.io.dcache <> dcache.io.lsu.store - // TODO: if dcache sbuffer resp needs 
to ne delayed - // sbuffer.io.dcache.pipe_resp.valid := RegNext(dcache.io.lsu.store.pipe_resp.valid) - // sbuffer.io.dcache.pipe_resp.bits := RegNext(dcache.io.lsu.store.pipe_resp.bits) // flush sbuffer val fenceFlush = io.fenceToSbuffer.flushSb diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala index f526fd7e5..471b563ef 100644 --- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala @@ -157,6 +157,10 @@ trait HasDCacheParameters extends HasL1CacheParameters { data(DCacheSRAMRowBytes * (bank + 1) - 1, DCacheSRAMRowBytes * bank) } + def refill_addr_hit(a: UInt, b: UInt): Bool = { + a(PAddrBits-1, DCacheIndexOffset) === b(PAddrBits-1, DCacheIndexOffset) + } + def arbiter[T <: Bundle]( in: Seq[DecoupledIO[T]], out: DecoupledIO[T], diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index e15905576..a42c0f591 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -70,7 +70,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO)) val rob = Flipped(new RobLsqIO) val rollback = Output(Valid(new Redirect)) - val dcache = Flipped(ValidIO(new Refill)) + val refill = Flipped(ValidIO(new Refill)) val release = Flipped(ValidIO(new Release)) val uncache = new DCacheWordIO val exceptionAddr = new ExceptionAddrIO @@ -120,7 +120,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet loadQueue.io.ldout <> io.ldout loadQueue.io.rob <> io.rob loadQueue.io.rollback <> io.rollback - loadQueue.io.dcache <> io.dcache + loadQueue.io.refill <> io.refill loadQueue.io.release <> io.release loadQueue.io.trigger <> io.trigger loadQueue.io.exceptionAddr.isStore 
:= DontCare diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index 48d6ffadd..3649c7a1d 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -93,7 +93,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO)) val rob = Flipped(new RobLsqIO) val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store - val dcache = Flipped(ValidIO(new Refill)) // TODO: to be renamed + val refill = Flipped(ValidIO(new Refill)) val release = Flipped(ValidIO(new Release)) val uncache = new DCacheWordIO val exceptionAddr = new ExceptionAddrIO @@ -264,15 +264,15 @@ class LoadQueue(implicit p: Parameters) extends XSModule vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire()) } - when(io.dcache.valid) { - XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data) + when(io.refill.valid) { + XSDebug("miss resp: paddr:0x%x data %x\n", io.refill.bits.addr, io.refill.bits.data) } // Refill 64 bit in a cycle // Refill data comes back from io.dcache.resp - dataModule.io.refill.valid := io.dcache.valid - dataModule.io.refill.paddr := io.dcache.bits.addr - dataModule.io.refill.data := io.dcache.bits.data + dataModule.io.refill.valid := io.refill.valid + dataModule.io.refill.paddr := io.refill.bits.addr + dataModule.io.refill.data := io.refill.bits.data val dcacheRequireReplay = WireInit(VecInit((0 until LoadPipelineWidth).map(i =>{ RegNext(io.loadIn(i).fire()) && RegNext(io.dcacheRequireReplay(i)) @@ -284,10 +284,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule when(dataModule.io.refill.valid && dataModule.io.refill.refillMask(i) && dataModule.io.refill.matchMask(i)) { datavalid(i) := true.B miss(i) := false.B - when(!dcacheRequireReplay.asUInt.orR){ - refilling(i) := true.B - } - 
when(io.dcache.bits.error) { + when(io.refill.bits.error) { error(i) := true.B } } @@ -299,7 +296,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule // dcacheRequireReplay will be used to update lq flag 1 cycle after for better timing // // io.dcacheRequireReplay comes from dcache miss req reject, which is quite slow to generate - when(dcacheRequireReplay(i)) { + when(dcacheRequireReplay(i) && !refill_addr_hit(RegNext(io.loadIn(i).bits.paddr), io.refill.bits.addr)) { // do not writeback if that inst will be resend from rs // rob writeback will not be triggered by a refill before inst replay miss(RegNext(loadWbIndex)) := false.B // disable refill listening @@ -790,7 +787,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule dataModule.io.uncacheWrite(deqPtr, io.uncache.resp.bits.data(XLEN-1, 0)) dataModule.io.uncache.wen := true.B - XSDebug("uncache resp: data %x\n", io.dcache.bits.data) + XSDebug("uncache resp: data %x\n", io.refill.bits.data) } // Read vaddr for mem exception @@ -849,7 +846,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule XSPerfAccumulate("rollback", io.rollback.valid) // rollback redirect generated XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req XSPerfAccumulate("mmioCnt", io.uncache.req.fire()) - XSPerfAccumulate("refill", io.dcache.valid) + XSPerfAccumulate("refill", io.refill.valid) XSPerfAccumulate("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire())))) XSPerfAccumulate("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready)))) XSPerfAccumulate("utilization_miss", PopCount((0 until LoadQueueSize).map(i => allocated(i) && miss(i)))) @@ -858,7 +855,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule ("rollback ", io.rollback.valid ), ("mmioCycle ", uncacheState =/= s_idle ), ("mmio_Cnt ", io.uncache.req.fire() ), - ("refill ", io.dcache.valid ), + ("refill ", io.refill.valid ), ("writeback_success", 
PopCount(VecInit(io.ldout.map(i => i.fire()))) ), ("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))) ), ("ltq_1_4_valid ", (validCount < (LoadQueueSize.U/4.U)) ), diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 56f2b85c0..1663164a7 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -451,10 +451,12 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { io.rsFeedback.bits.hit := !s2_need_replay_from_rs io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx io.rsFeedback.bits.flushState := io.in.bits.ptwBack + // feedback source priority: tlbMiss > dataInvalid > mshrFull + // general case priority: tlbMiss > exception (include forward_fail / ldld_violation) > mmio > dataInvalid > mshrFull > normal miss / hit io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss, - Mux(s2_cache_replay, - RSFeedbackType.mshrFull, - RSFeedbackType.dataInvalid + Mux(s2_data_invalid, + RSFeedbackType.dataInvalid, + RSFeedbackType.mshrFull ) ) io.rsFeedback.bits.dataInvalidSqIdx.value := io.dataInvalidSqIdx @@ -496,7 +498,11 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper { XSPerfAccumulate("replay_from_fetch_load_vio", io.out.valid && ldldVioReplay) } -class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with HasPerfEvents { +class LoadUnit(implicit p: Parameters) extends XSModule + with HasLoadHelper + with HasPerfEvents + with HasDCacheParameters +{ val io = IO(new Bundle() { val ldin = Flipped(Decoupled(new ExuInput)) val ldout = Decoupled(new ExuOutput) @@ -508,6 +514,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with val dcache = new DCacheLoadIO val sbuffer = new LoadForwardQueryIO val lsq = new LoadToLsqIO + val refill = Flipped(ValidIO(new Refill)) val fastUop = ValidIO(new 
MicroOp) // early wakeup signal generated in load_s1 val trigger = Vec(3, new LoadUnitTriggerIO) @@ -569,8 +576,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with io.lsq.dcacheRequireReplay := load_s2.io.dcacheRequireReplay // feedback tlb miss / dcache miss queue full - io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits) io.feedbackSlow.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect)) + io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits) + val s3_replay_for_mshrfull = RegNext(!load_s2.io.rsFeedback.bits.hit && load_s2.io.rsFeedback.bits.sourceType === RSFeedbackType.mshrFull) + val s3_refill_hit_load_paddr = refill_addr_hit(RegNext(load_s2.io.out.bits.paddr), io.refill.bits.addr) + // update replay request + io.feedbackSlow.bits.hit := RegNext(load_s2.io.rsFeedback.bits).hit || + s3_refill_hit_load_paddr && s3_replay_for_mshrfull // feedback bank conflict to rs io.feedbackFast.bits := load_s1.io.rsFeedback.bits @@ -635,7 +647,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with io.lsq.ldout.ready := !hitLoadOut.valid when(io.feedbackSlow.valid && !io.feedbackSlow.bits.hit){ + // when need replay from rs, inst should not be writebacked to rob assert(RegNext(!hitLoadOut.valid)) + // when need replay from rs + // * inst should not be writebacked to lq, or + // * lq state will be updated in load_s3 (next cycle) assert(RegNext(!io.lsq.loadIn.valid) || RegNext(load_s2.io.dcacheRequireReplay)) } -- GitLab