Unverified commit 09203307, authored by William Wang, committed by GitHub

mem: reduce refill to use latency (#1401)

* mem: optimize missq reject to lq timing

The DCache replay request is quite slow to generate, as it needs to compare
the load address with the addresses in all valid miss queue entries.

We now delay the usage of the replay request from the data cache: the replay
request does not influence the normal execution flow until load_s3 (1 cycle
after load_s2, when the load result is written back to RS).

It is worth mentioning that "select refilling inst for load writeback" will
be disabled if dcacheRequireReplay was set in the last cycle.
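
A minimal sketch of the delayed-consumption idea (all names here are illustrative, not the actual XiangShan signals): the slow miss-queue reject is registered once, so it only gates state updates one stage later, in load_s3.

```scala
import chisel3._

// Sketch only: SlowRejectDelay, missQueueReject, etc. are hypothetical names.
class SlowRejectDelay extends Module {
  val io = IO(new Bundle {
    val missQueueReject = Input(Bool())  // slow: derived from a wide address compare
    val loadS2Fire      = Input(Bool())
    val replayInS3      = Output(Bool()) // consumed one cycle later, in load_s3
  })
  // Register the late-arriving reject so it only affects state updates in the
  // next stage; load_s2 writeback to RS proceeds unaffected this cycle.
  io.replayInS3 := RegNext(io.loadS2Fire && io.missQueueReject, init = false.B)
}
```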

* dcache: compare probe block addr instead of full addr
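
For reference, a block-address compare simply drops the low offset bits before the equality check; a hedged sketch with assumed widths standing in for the real PAddrBits / DCacheIndexOffset parameters (see the refill_addr_hit helper added below):

```scala
import chisel3._

object BlockAddrCompare {
  // A probe (or refill) only needs to match the block address, so the
  // compare drops the low offset bits. Default widths are assumptions.
  def blockAddrHit(a: UInt, b: UInt, paddrBits: Int = 36, offsetBits: Int = 6): Bool =
    a(paddrBits - 1, offsetBits) === b(paddrBits - 1, offsetBits)
}
```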

* mem: do not replay from RS when ldld vio or fwd failed

An ld-ld violation or a forward failure makes a normal load inst replay from
fetch. If the TLB hits and an ld-ld violation / forward failure happens, we
write that inst back immediately; such insts will not be replayed from RS.

It should fix "mem: optimize missq reject to lq timing".
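
A hedged sketch of this policy (hypothetical IO names, not the real LoadUnit interface):

```scala
import chisel3._

// Sketch: on a TLB hit with an ld-ld violation or forward failure, write the
// load back immediately and mark it for replay-from-fetch; RS is told the
// load "hit" so it never replays it.
class ReplayFromFetchSketch extends Module {
  val io = IO(new Bundle {
    val tlbHit        = Input(Bool())
    val ldldViolation = Input(Bool())
    val forwardFail   = Input(Bool())
    val wbValid       = Output(Bool()) // writeback this cycle
    val replayInst    = Output(Bool()) // refetch from the frontend
    val rsFeedbackHit = Output(Bool()) // suppress RS replay
  })
  val needRefetch = io.tlbHit && (io.ldldViolation || io.forwardFail)
  io.wbValid       := needRefetch // write back at once instead of waiting
  io.replayInst    := needRefetch // the frontend refetches the instruction
  io.rsFeedbackHit := needRefetch // RS sees a hit, so no replay from RS
}
```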

* mem: fix replay from rs condition

* mem: reduce refill to use latency

This commit updates the lq entry flags carefully in load_s3 to avoid extra
refill delay. It removes the extra refill delay introduced by #1375 without
harming memblock timing.

In #1375, we delayed the load refill when a dcache miss queue entry failed
to accept a miss; #1375 traded performance for better timing.
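
The fix can be pictured as follows (a sketch with illustrative names, mirroring the s3_refill_hit_load_paddr logic in the LoadUnit hunk below): if the refill seen in load_s3 covers the missed load's block, report hit to RS so the mshrFull replay is cancelled and the load uses the refilled data directly.

```scala
import chisel3._

// Sketch only: names and parameters are illustrative.
class RefillCancelsReplaySketch(paddrBits: Int, blockOffBits: Int) extends Module {
  val io = IO(new Bundle {
    val s2Paddr     = Input(UInt(paddrBits.W))
    val s2MshrFull  = Input(Bool()) // load_s2 wanted a mshrFull replay
    val refillValid = Input(Bool())
    val refillAddr  = Input(UInt(paddrBits.W))
    val feedbackHit = Output(Bool()) // final hit bit sent to RS in load_s3
  })
  def blockHit(a: UInt, b: UInt): Bool =
    a(paddrBits - 1, blockOffBits) === b(paddrBits - 1, blockOffBits)

  val s3Paddr    = RegNext(io.s2Paddr)
  val s3MshrFull = RegNext(io.s2MshrFull, init = false.B)
  // A refill for the same block arrived just in time: drop the replay.
  io.feedbackHit := s3MshrFull && io.refillValid && blockHit(s3Paddr, io.refillAddr)
}
```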

* mem: fix rs feedback priority

When dataInvalid && mshrFull, a successful refill should not cancel the RS
replay.
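
A sketch of the corrected priority (encodings are illustrative; the real code uses RSFeedbackType, see the LoadUnit_S2 hunk below). With the old order, dataInvalid && mshrFull was reported as mshrFull, so a later refill could wrongly cancel a replay that was actually waiting on store data.

```scala
import chisel3._

// Sketch: feedback priority tlbMiss > dataInvalid > mshrFull.
class FeedbackPrioritySketch extends Module {
  val io = IO(new Bundle {
    val tlbMiss     = Input(Bool())
    val dataInvalid = Input(Bool())
    val mshrFull    = Input(Bool())
    val sourceType  = Output(UInt(2.W))
  })
  val TLB_MISS     = 0.U(2.W) // illustrative encodings
  val DATA_INVALID = 1.U(2.W)
  val MSHR_FULL    = 2.U(2.W)
  // dataInvalid is checked before mshrFull, so a load waiting on store data
  // is never misreported as a mshrFull replay.
  io.sourceType := Mux(io.tlbMiss, TLB_MISS,
                   Mux(io.dataInvalid, DATA_INVALID, MSHR_FULL))
}
```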
Parent 8d8ac704
@@ -103,6 +103,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   val dcache = outer.dcache.module
   val uncache = outer.uncache.module
+  val delayedDcacheRefill = RegNext(dcache.io.lsu.lsq)
+
   val csrCtrl = DelayN(io.csrCtrl, 2)
   dcache.io.csr.distribute_csr <> csrCtrl.distribute_csr
   io.csrUpdate := RegNext(dcache.io.csr.update)
@@ -261,6 +263,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     // ld-ld violation check
     loadUnits(i).io.lsq.loadViolationQuery <> lsq.io.loadViolationQuery(i)
     loadUnits(i).io.csrCtrl <> csrCtrl
+    // dcache refill req
+    loadUnits(i).io.refill <> delayedDcacheRefill
     // dtlb
     loadUnits(i).io.tlb <> dtlb_ld(i).requestor(0)
     // pmp
@@ -430,9 +434,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   io.memoryViolation <> lsq.io.rollback
   lsq.io.uncache <> uncache.io.lsq
-  // delay dcache refill for 1 cycle for better timing
-  // TODO: remove RegNext after fixing refill paddr timing
-  // lsq.io.dcache <> dcache.io.lsu.lsq
-  lsq.io.dcache := RegNext(dcache.io.lsu.lsq)
+  lsq.io.refill := delayedDcacheRefill
   lsq.io.release := dcache.io.lsu.release
   lsq.io.lqCancelCnt <> io.lqCancelCnt
   lsq.io.sqCancelCnt <> io.sqCancelCnt
@@ -445,9 +447,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   // Sbuffer
   sbuffer.io.csrCtrl <> csrCtrl
   sbuffer.io.dcache <> dcache.io.lsu.store
-  // TODO: if dcache sbuffer resp needs to be delayed
-  // sbuffer.io.dcache.pipe_resp.valid := RegNext(dcache.io.lsu.store.pipe_resp.valid)
-  // sbuffer.io.dcache.pipe_resp.bits := RegNext(dcache.io.lsu.store.pipe_resp.bits)
   // flush sbuffer
   val fenceFlush = io.fenceToSbuffer.flushSb
@@ -157,6 +157,10 @@ trait HasDCacheParameters extends HasL1CacheParameters {
     data(DCacheSRAMRowBytes * (bank + 1) - 1, DCacheSRAMRowBytes * bank)
   }
 
+  def refill_addr_hit(a: UInt, b: UInt): Bool = {
+    a(PAddrBits-1, DCacheIndexOffset) === b(PAddrBits-1, DCacheIndexOffset)
+  }
+
   def arbiter[T <: Bundle](
     in: Seq[DecoupledIO[T]],
     out: DecoupledIO[T],
@@ -70,7 +70,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParameters
     val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO))
     val rob = Flipped(new RobLsqIO)
     val rollback = Output(Valid(new Redirect))
-    val dcache = Flipped(ValidIO(new Refill))
+    val refill = Flipped(ValidIO(new Refill))
     val release = Flipped(ValidIO(new Release))
     val uncache = new DCacheWordIO
     val exceptionAddr = new ExceptionAddrIO
@@ -120,7 +120,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParameters
   loadQueue.io.ldout <> io.ldout
   loadQueue.io.rob <> io.rob
   loadQueue.io.rollback <> io.rollback
-  loadQueue.io.dcache <> io.dcache
+  loadQueue.io.refill <> io.refill
   loadQueue.io.release <> io.release
   loadQueue.io.trigger <> io.trigger
   loadQueue.io.exceptionAddr.isStore := DontCare
@@ -93,7 +93,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO))
     val rob = Flipped(new RobLsqIO)
     val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store
-    val dcache = Flipped(ValidIO(new Refill)) // TODO: to be renamed
+    val refill = Flipped(ValidIO(new Refill))
     val release = Flipped(ValidIO(new Release))
     val uncache = new DCacheWordIO
     val exceptionAddr = new ExceptionAddrIO
@@ -264,15 +264,15 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire())
   }
 
-  when(io.dcache.valid) {
-    XSDebug("miss resp: paddr:0x%x data %x\n", io.dcache.bits.addr, io.dcache.bits.data)
+  when(io.refill.valid) {
+    XSDebug("miss resp: paddr:0x%x data %x\n", io.refill.bits.addr, io.refill.bits.data)
   }
 
   // Refill 64 bit in a cycle
   // Refill data comes back from io.dcache.resp
-  dataModule.io.refill.valid := io.dcache.valid
-  dataModule.io.refill.paddr := io.dcache.bits.addr
-  dataModule.io.refill.data := io.dcache.bits.data
+  dataModule.io.refill.valid := io.refill.valid
+  dataModule.io.refill.paddr := io.refill.bits.addr
+  dataModule.io.refill.data := io.refill.bits.data
 
   val dcacheRequireReplay = WireInit(VecInit((0 until LoadPipelineWidth).map(i =>{
     RegNext(io.loadIn(i).fire()) && RegNext(io.dcacheRequireReplay(i))
@@ -284,10 +284,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     when(dataModule.io.refill.valid && dataModule.io.refill.refillMask(i) && dataModule.io.refill.matchMask(i)) {
       datavalid(i) := true.B
       miss(i) := false.B
-      when(!dcacheRequireReplay.asUInt.orR){
-        refilling(i) := true.B
-      }
-      when(io.dcache.bits.error) {
+      when(io.refill.bits.error) {
         error(i) := true.B
       }
     }
@@ -299,7 +296,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     // dcacheRequireReplay will be used to update lq flag 1 cycle after for better timing
     //
     // io.dcacheRequireReplay comes from dcache miss req reject, which is quite slow to generate
-    when(dcacheRequireReplay(i)) {
+    when(dcacheRequireReplay(i) && !refill_addr_hit(RegNext(io.loadIn(i).bits.paddr), io.refill.bits.addr)) {
       // do not writeback if that inst will be resend from rs
       // rob writeback will not be triggered by a refill before inst replay
       miss(RegNext(loadWbIndex)) := false.B // disable refill listening
@@ -790,7 +787,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     dataModule.io.uncacheWrite(deqPtr, io.uncache.resp.bits.data(XLEN-1, 0))
     dataModule.io.uncache.wen := true.B
 
-    XSDebug("uncache resp: data %x\n", io.dcache.bits.data)
+    XSDebug("uncache resp: data %x\n", io.refill.bits.data)
   }
 
   // Read vaddr for mem exception
@@ -849,7 +846,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
   XSPerfAccumulate("rollback", io.rollback.valid) // rollback redirect generated
   XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req
   XSPerfAccumulate("mmioCnt", io.uncache.req.fire())
-  XSPerfAccumulate("refill", io.dcache.valid)
+  XSPerfAccumulate("refill", io.refill.valid)
   XSPerfAccumulate("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire()))))
   XSPerfAccumulate("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))))
   XSPerfAccumulate("utilization_miss", PopCount((0 until LoadQueueSize).map(i => allocated(i) && miss(i))))
@@ -858,7 +855,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
     ("rollback ", io.rollback.valid ),
     ("mmioCycle ", uncacheState =/= s_idle ),
     ("mmio_Cnt ", io.uncache.req.fire() ),
-    ("refill ", io.dcache.valid ),
+    ("refill ", io.refill.valid ),
     ("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire()))) ),
     ("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))) ),
     ("ltq_1_4_valid ", (validCount < (LoadQueueSize.U/4.U)) ),
@@ -451,10 +451,12 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
   io.rsFeedback.bits.hit := !s2_need_replay_from_rs
   io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
   io.rsFeedback.bits.flushState := io.in.bits.ptwBack
+  // feedback source priority: tlbMiss > dataInvalid > mshrFull
+  // general case priority: tlbMiss > exception (include forward_fail / ldld_violation) > mmio > dataInvalid > mshrFull > normal miss / hit
   io.rsFeedback.bits.sourceType := Mux(s2_tlb_miss, RSFeedbackType.tlbMiss,
-    Mux(s2_cache_replay,
-      RSFeedbackType.mshrFull,
-      RSFeedbackType.dataInvalid
+    Mux(s2_data_invalid,
+      RSFeedbackType.dataInvalid,
+      RSFeedbackType.mshrFull
     )
   )
   io.rsFeedback.bits.dataInvalidSqIdx.value := io.dataInvalidSqIdx
@@ -496,7 +498,11 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
   XSPerfAccumulate("replay_from_fetch_load_vio", io.out.valid && ldldVioReplay)
 }
 
-class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper with HasPerfEvents {
+class LoadUnit(implicit p: Parameters) extends XSModule
+  with HasLoadHelper
+  with HasPerfEvents
+  with HasDCacheParameters
+{
   val io = IO(new Bundle() {
     val ldin = Flipped(Decoupled(new ExuInput))
     val ldout = Decoupled(new ExuOutput)
@@ -508,6 +514,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
     val dcache = new DCacheLoadIO
     val sbuffer = new LoadForwardQueryIO
     val lsq = new LoadToLsqIO
+    val refill = Flipped(ValidIO(new Refill))
     val fastUop = ValidIO(new MicroOp) // early wakeup signal generated in load_s1
     val trigger = Vec(3, new LoadUnitTriggerIO)
@@ -569,8 +576,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   io.lsq.dcacheRequireReplay := load_s2.io.dcacheRequireReplay
 
   // feedback tlb miss / dcache miss queue full
-  io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits)
   io.feedbackSlow.valid := RegNext(load_s2.io.rsFeedback.valid && !load_s2.io.out.bits.uop.robIdx.needFlush(io.redirect))
+  io.feedbackSlow.bits := RegNext(load_s2.io.rsFeedback.bits)
+  val s3_replay_for_mshrfull = RegNext(!load_s2.io.rsFeedback.bits.hit && load_s2.io.rsFeedback.bits.sourceType === RSFeedbackType.mshrFull)
+  val s3_refill_hit_load_paddr = refill_addr_hit(RegNext(load_s2.io.out.bits.paddr), io.refill.bits.addr)
+  // update replay request
+  io.feedbackSlow.bits.hit := RegNext(load_s2.io.rsFeedback.bits).hit ||
+    s3_refill_hit_load_paddr && s3_replay_for_mshrfull
 
   // feedback bank conflict to rs
   io.feedbackFast.bits := load_s1.io.rsFeedback.bits
@@ -635,7 +647,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   io.lsq.ldout.ready := !hitLoadOut.valid
 
   when(io.feedbackSlow.valid && !io.feedbackSlow.bits.hit){
-    // when need replay from rs, inst should not be writebacked to rob
-    assert(RegNext(!hitLoadOut.valid))
+    // when need replay from rs
+    // * inst should not be writebacked to lq, or
+    // * lq state will be updated in load_s3 (next cycle)
+    assert(RegNext(!io.lsq.loadIn.valid) || RegNext(load_s2.io.dcacheRequireReplay))
   }