Unverified commit 10551d4e, authored by Yinan Xu, committed by GitHub

lsq: add LsqEnqCtrl to optimize enqueue timing (#1380)

This commit adds an LsqEnqCtrl module that inserts one extra clock cycle
between dispatch and the load/store queues.

LsqEnqCtrl maintains the lqEnqPtr/sqEnqPtr and lqCounter/sqCounter.
They are used to determine whether the load/store queue can accept new
instructions. After that, the instructions are sent to the load/store queue.
This module decouples queue allocation from the real enqueue.

Besides, uop storage in the load/store queue is optimized. At dispatch,
only robIdx is required. Other information is naturally carried along the
pipeline and can be stored in the load/store queue later if needed. For
example, the exception vector, trigger, ftqIdx, pdest, etc. are
unnecessary before the instruction leaves the load/store pipeline.
Parent: 67c26c34
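The decoupling idea can be illustrated with a minimal, self-contained Chisel sketch. This is a simplified illustration, not the actual LsqEnqCtrl shown in the diff below; the names SimpleEnqCtrl, EnqEntry, deqCnt and the 8-bit payload are assumptions made for the example, and redirect/flush handling is omitted. The control module keeps its own free-entry counter and allocation pointer, answers canAccept purely from registers (for a short timing path at dispatch), and forwards accepted requests to the real queue one cycle later.

import chisel3._
import chisel3.util._

// What gets forwarded to the real queue one cycle after allocation.
class EnqEntry(entries: Int) extends Bundle {
  val payload = UInt(8.W)               // stands in for the minimal uop info (e.g. robIdx)
  val idx     = UInt(log2Up(entries).W) // pre-allocated queue index
}

// Illustrative decoupled enqueue controller: accept/allocate here, enqueue
// into the real queue one cycle later (assumes a power-of-two depth so the
// pointer wraps naturally).
class SimpleEnqCtrl(entries: Int, enqWidth: Int) extends Module {
  val io = IO(new Bundle {
    val req       = Vec(enqWidth, Flipped(Valid(UInt(8.W))))
    val canAccept = Output(Bool())
    val deqCnt    = Input(UInt(log2Up(enqWidth + 1).W)) // entries freed this cycle
    val toQueue   = Vec(enqWidth, Valid(new EnqEntry(entries)))
  })
  val enqPtr  = RegInit(0.U(log2Up(entries).W))           // next allocation index
  val counter = RegInit(entries.U(log2Up(entries + 1).W)) // number of free entries

  // canAccept depends only on registers, so dispatch sees a short timing path
  io.canAccept := counter >= enqWidth.U
  val enqNum = PopCount(io.req.map(_.valid))
  val doEnq  = io.canAccept

  when (doEnq) { enqPtr := enqPtr + enqNum }
  counter := counter + io.deqCnt - Mux(doEnq, enqNum, 0.U)

  // forward accepted requests to the real queue one cycle later,
  // together with the index that was allocated for them
  for (i <- 0 until enqWidth) {
    val allocIdx = enqPtr + PopCount(io.req.take(i).map(_.valid))
    val fire     = io.req(i).valid && doEnq
    io.toQueue(i).valid        := RegNext(fire, false.B)
    io.toQueue(i).bits.payload := RegEnable(io.req(i).bits, fire)
    io.toQueue(i).bits.idx     := RegEnable(allocIdx, fire)
  }
}

In the real LsqEnqCtrl below, the counters are additionally adjusted by the commit counts from the ROB (lcommit/scommit) and by the cancel counts coming back from the LSQ after a redirect (lqCancelCnt/sqCancelCnt), which is exactly what the new wiring in XSCore, MemBlock and Scheduler carries through.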
@@ -287,6 +287,12 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
ctrlBlock.io.stIn <> memBlock.io.stIn
ctrlBlock.io.memoryViolation <> memBlock.io.memoryViolation
exuBlocks.head.io.scheExtra.enqLsq.get <> memBlock.io.enqLsq
exuBlocks.foreach(b => {
b.io.scheExtra.lcommit := ctrlBlock.io.robio.lsq.lcommit
b.io.scheExtra.scommit := memBlock.io.sqDeq
b.io.scheExtra.lqCancelCnt := memBlock.io.lqCancelCnt
b.io.scheExtra.sqCancelCnt := memBlock.io.sqCancelCnt
})
val sourceModules = outer.writebackSources.map(_.map(_.module.asInstanceOf[HasWritebackSourceImp]))
outer.ctrlBlock.generateWritebackIO()
@@ -93,6 +93,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val dcacheMSHRFull = Output(Bool())
}
val perfEventsPTW = Input(Vec(19, new PerfEvent))
val lqCancelCnt = Output(UInt(log2Up(LoadQueueSize + 1).W))
val sqCancelCnt = Output(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Output(UInt(2.W))
})
override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.writeback))
@@ -415,6 +418,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// lsq.io.dcache <> dcache.io.lsu.lsq
lsq.io.dcache := RegNext(dcache.io.lsu.lsq)
lsq.io.release := dcache.io.lsu.release
lsq.io.lqCancelCnt <> io.lqCancelCnt
lsq.io.sqCancelCnt <> io.sqCancelCnt
lsq.io.sqDeq <> io.sqDeq
// LSQ to store buffer
lsq.io.sbuffer <> sbuffer.io.in
@@ -29,7 +29,7 @@ import xiangshan.backend.fu.fpu.FMAMidResultIO
import xiangshan.backend.issue.ReservationStationWrapper
import xiangshan.backend.regfile.{Regfile, RfReadPort}
import xiangshan.backend.rename.{BusyTable, BusyTableReadIO}
import xiangshan.mem.{LsqEnqIO, MemWaitUpdateReq, SqPtr}
import xiangshan.mem.{LsqEnqCtrl, LsqEnqIO, MemWaitUpdateReq, SqPtr}
class DispatchArbiter(func: Seq[MicroOp => Bool])(implicit p: Parameters) extends XSModule {
val numTarget = func.length
@@ -251,6 +251,11 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
val stIssuePtr = Input(new SqPtr())
// special ports for load / store rs
val enqLsq = if (outer.numReplayPorts > 0) Some(Flipped(new LsqEnqIO)) else None
val lcommit = Input(UInt(log2Up(CommitWidth + 1).W))
val scommit = Input(UInt(log2Up(CommitWidth + 1).W))
// from lsq
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val memWaitUpdateReq = Flipped(new MemWaitUpdateReq)
// debug
val debug_int_rat = Vec(32, Input(UInt(PhyRegIdxWidth.W)))
@@ -283,7 +288,16 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
val dispatch2 = outer.dispatch2.map(_.module)
// dirty code for ls dp
dispatch2.foreach(dp => if (dp.io.enqLsq.isDefined) dp.io.enqLsq.get <> io.extra.enqLsq.get)
dispatch2.foreach(dp => if (dp.io.enqLsq.isDefined) {
val lsqCtrl = Module(new LsqEnqCtrl)
lsqCtrl.io.redirect <> io.redirect
lsqCtrl.io.enq <> dp.io.enqLsq.get
lsqCtrl.io.lcommit := io.extra.lcommit
lsqCtrl.io.scommit := io.extra.scommit
lsqCtrl.io.lqCancelCnt := io.extra.lqCancelCnt
lsqCtrl.io.sqCancelCnt := io.extra.sqCancelCnt
io.extra.enqLsq.get <> lsqCtrl.io.enqLsq
})
io.in <> dispatch2.flatMap(_.io.in)
val readIntState = dispatch2.flatMap(_.io.readIntState.getOrElse(Seq()))
@@ -78,6 +78,9 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet
val issuePtrExt = Output(new SqPtr)
val sqFull = Output(Bool())
val lqFull = Output(Bool())
val lqCancelCnt = Output(UInt(log2Up(LoadQueueSize + 1).W))
val sqCancelCnt = Output(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Output(UInt(2.W))
val trigger = Vec(LoadPipelineWidth, new LqTriggerIO)
})
@@ -121,6 +124,7 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet
loadQueue.io.release <> io.release
loadQueue.io.trigger <> io.trigger
loadQueue.io.exceptionAddr.isStore := DontCare
loadQueue.io.lqCancelCnt <> io.lqCancelCnt
// store queue wiring
// storeQueue.io <> DontCare
@@ -133,6 +137,8 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet
storeQueue.io.rob <> io.rob
storeQueue.io.exceptionAddr.isStore := DontCare
storeQueue.io.issuePtrExt <> io.issuePtrExt
storeQueue.io.sqCancelCnt <> io.sqCancelCnt
storeQueue.io.sqDeq <> io.sqDeq
loadQueue.io.load_s1 <> io.forward
storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
@@ -196,3 +202,80 @@ class LsqWrappper(implicit p: Parameters) extends XSModule with HasDCacheParamet
val perfEvents = Seq(loadQueue, storeQueue).flatMap(_.getPerfEvents)
generatePerfEvent()
}
class LsqEnqCtrl(implicit p: Parameters) extends XSModule {
val io = IO(new Bundle {
val redirect = Flipped(ValidIO(new Redirect))
// to dispatch
val enq = new LsqEnqIO
// from rob
val lcommit = Input(UInt(log2Up(CommitWidth + 1).W))
val scommit = Input(UInt(log2Up(CommitWidth + 1).W))
// from/to lsq
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val enqLsq = Flipped(new LsqEnqIO)
})
val lqPtr = RegInit(0.U.asTypeOf(new LqPtr))
val sqPtr = RegInit(0.U.asTypeOf(new SqPtr))
val lqCounter = RegInit(LoadQueueSize.U(log2Up(LoadQueueSize + 1).W))
val sqCounter = RegInit(StoreQueueSize.U(log2Up(StoreQueueSize + 1).W))
val canAccept = RegInit(false.B)
val loadEnqNumber = PopCount(io.enq.req.zip(io.enq.needAlloc).map(x => x._1.valid && x._2(0)))
val storeEnqNumber = PopCount(io.enq.req.zip(io.enq.needAlloc).map(x => x._1.valid && x._2(1)))
// How to update ptr and counter:
// (1) by default, updated according to enq/commit
// (2) when redirect and dispatch queue is empty, update according to lsq
val t1_redirect = RegNext(io.redirect.valid)
val t2_redirect = RegNext(t1_redirect)
val t2_update = t2_redirect && !VecInit(io.enq.needAlloc.map(_.orR)).asUInt.orR
val t3_update = RegNext(t2_update)
val t3_lqCancelCnt = RegNext(io.lqCancelCnt)
val t3_sqCancelCnt = RegNext(io.sqCancelCnt)
when (t3_update) {
lqPtr := lqPtr - t3_lqCancelCnt
lqCounter := lqCounter + io.lcommit + t3_lqCancelCnt
sqPtr := sqPtr - t3_sqCancelCnt
sqCounter := sqCounter + io.scommit + t3_sqCancelCnt
}.elsewhen (!io.redirect.valid && io.enq.canAccept) {
lqPtr := lqPtr + loadEnqNumber
lqCounter := lqCounter + io.lcommit - loadEnqNumber
sqPtr := sqPtr + storeEnqNumber
sqCounter := sqCounter + io.scommit - storeEnqNumber
}.otherwise {
lqCounter := lqCounter + io.lcommit
sqCounter := sqCounter + io.scommit
}
val maxAllocate = Seq(exuParameters.LduCnt, exuParameters.StuCnt).max
val ldCanAccept = lqCounter >= loadEnqNumber +& maxAllocate.U
val sqCanAccept = sqCounter >= storeEnqNumber +& maxAllocate.U
// It is possible that t3_update and enq are true at the same clock cycle.
// For example, if redirect.valid lasts more than one clock cycle,
// after the last redirect, new instructions may enter, but the previous redirect
// has not been resolved yet (i.e., not updated according to the cancel count from LSQ).
// To solve the issue easily, we block enqueue when t3_update, which is RegNext(t2_update).
io.enq.canAccept := RegNext(ldCanAccept && sqCanAccept && !t2_update)
val lqOffset = Wire(Vec(io.enq.resp.length, UInt(log2Up(maxAllocate + 1).W)))
val sqOffset = Wire(Vec(io.enq.resp.length, UInt(log2Up(maxAllocate + 1).W)))
for ((resp, i) <- io.enq.resp.zipWithIndex) {
lqOffset(i) := PopCount(io.enq.needAlloc.take(i).map(a => a(0)))
resp.lqIdx := lqPtr + lqOffset(i)
sqOffset(i) := PopCount(io.enq.needAlloc.take(i).map(a => a(1)))
resp.sqIdx := sqPtr + sqOffset(i)
}
io.enqLsq.needAlloc := RegNext(io.enq.needAlloc)
io.enqLsq.req.zip(io.enq.req).zip(io.enq.resp).foreach{ case ((toLsq, enq), resp) =>
val do_enq = enq.valid && !io.redirect.valid && io.enq.canAccept
toLsq.valid := RegNext(do_enq)
toLsq.bits := RegEnable(enq.bits, do_enq)
toLsq.bits.lqIdx := RegEnable(resp.lqIdx, do_enq)
toLsq.bits.sqIdx := RegEnable(resp.sqIdx, do_enq)
}
}
@@ -99,6 +99,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
val uncache = new DCacheWordIO
val exceptionAddr = new ExceptionAddrIO
val lqFull = Output(Bool())
val lqCancelCnt = Output(UInt(log2Up(LoadQueueSize + 1).W))
val trigger = Vec(LoadPipelineWidth, new LqTriggerIO)
})
@@ -128,11 +129,13 @@ class LoadQueue(implicit p: Parameters) extends XSModule
val enqPtrExt = RegInit(VecInit((0 until io.enq.req.length).map(_.U.asTypeOf(new LqPtr))))
val deqPtrExt = RegInit(0.U.asTypeOf(new LqPtr))
val deqPtrExtNext = Wire(new LqPtr)
val allowEnqueue = RegInit(true.B)
val enqPtr = enqPtrExt(0).value
val deqPtr = deqPtrExt.value
val validCount = distanceBetween(enqPtrExt(0), deqPtrExt)
val allowEnqueue = validCount <= (LoadQueueSize - 2).U
val deqMask = UIntToMask(deqPtr, LoadQueueSize)
val enqMask = UIntToMask(enqPtr, LoadQueueSize)
@@ -145,12 +148,14 @@ class LoadQueue(implicit p: Parameters) extends XSModule
*/
io.enq.canAccept := allowEnqueue
val canEnqueue = io.enq.req.map(_.valid)
val enqCancel = io.enq.req.map(_.bits.robIdx.needFlush(io.brqRedirect))
for (i <- 0 until io.enq.req.length) {
val offset = if (i == 0) 0.U else PopCount(io.enq.needAlloc.take(i))
val lqIdx = enqPtrExt(offset)
val index = lqIdx.value
when (io.enq.req(i).valid && io.enq.canAccept && io.enq.sqCanAccept && !io.brqRedirect.valid) {
uop(index) := io.enq.req(i).bits
val index = io.enq.req(i).bits.lqIdx.value
when (canEnqueue(i) && !enqCancel(i)) {
uop(index).robIdx := io.enq.req(i).bits.robIdx
allocated(index) := true.B
datavalid(index) := false.B
writebacked(index) := false.B
@@ -158,6 +163,8 @@ class LoadQueue(implicit p: Parameters) extends XSModule
miss(index) := false.B
pending(index) := false.B
error(index) := false.B
XSError(!io.enq.canAccept || !io.enq.sqCanAccept, s"must accept $i\n")
XSError(index =/= lqIdx.value, s"must be the same entry $i\n")
}
io.enq.resp(i) := lqIdx
}
@@ -227,10 +234,11 @@ class LoadQueue(implicit p: Parameters) extends XSModule
val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i) && !io.needReplayFromRS(i)
pending(loadWbIndex) := io.loadIn(i).bits.mmio
// dirty code for load instr
uop(loadWbIndex).pdest := io.loadIn(i).bits.uop.pdest
uop(loadWbIndex).cf := io.loadIn(i).bits.uop.cf
uop(loadWbIndex).ctrl := io.loadIn(i).bits.uop.ctrl
uop(loadWbIndex).debugInfo := io.loadIn(i).bits.uop.debugInfo
// update replayInst (replay from fetch) bit,
// for replayInst may be set to true in load pipeline
uop(loadWbIndex).ctrl.replayInst := io.loadIn(i).bits.uop.ctrl.replayInst
}
// vaddrModule write is delayed, as vaddrModule will not be read right after write
vaddrModule.io.waddr(i) := RegNext(loadWbIndex)
@@ -378,6 +386,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
(0 until CommitWidth).map(i => {
when(commitCount > i.U){
allocated((deqPtrExt+i.U).value) := false.B
XSError(!allocated((deqPtrExt+i.U).value), s"why commit invalid entry $i?\n")
}
})
@@ -755,19 +764,19 @@ class LoadQueue(implicit p: Parameters) extends XSModule
for (i <- 0 until LoadQueueSize) {
needCancel(i) := uop(i).robIdx.needFlush(io.brqRedirect) && allocated(i)
when (needCancel(i)) {
allocated(i) := false.B
allocated(i) := false.B
}
}
/**
* update pointers
*/
val lastEnqCancel = PopCount(RegNext(VecInit(canEnqueue.zip(enqCancel).map(x => x._1 && x._2))))
val lastCycleCancelCount = PopCount(RegNext(needCancel))
// when io.brqRedirect.valid, we don't allow enqueue even though it may fire.
val enqNumber = Mux(io.enq.canAccept && io.enq.sqCanAccept && !io.brqRedirect.valid, PopCount(io.enq.req.map(_.valid)), 0.U)
val enqNumber = Mux(io.enq.canAccept && io.enq.sqCanAccept, PopCount(io.enq.req.map(_.valid)), 0.U)
when (lastCycleRedirect.valid) {
// we recover the pointers in the next cycle after redirect
enqPtrExt := VecInit(enqPtrExt.map(_ - lastCycleCancelCount))
enqPtrExt := VecInit(enqPtrExt.map(_ - (lastCycleCancelCount + lastEnqCancel)))
}.otherwise {
enqPtrExt := VecInit(enqPtrExt.map(_ + enqNumber))
}
@@ -775,9 +784,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule
deqPtrExtNext := deqPtrExt + commitCount
deqPtrExt := deqPtrExtNext
val validCount = distanceBetween(enqPtrExt(0), deqPtrExt)
allowEnqueue := validCount + enqNumber <= (LoadQueueSize - io.enq.req.length).U
io.lqCancelCnt := RegNext(lastCycleCancelCount + lastEnqCancel)
/**
* misc
@@ -79,6 +79,8 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val sqempty = Output(Bool())
val issuePtrExt = Output(new SqPtr) // used to wake up delayed load/store
val sqFull = Output(Bool())
val sqCancelCnt = Output(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Output(UInt(2.W))
})
println("StoreQueue: size:" + StoreQueueSize)
@@ -130,12 +132,14 @@ class StoreQueue(implicit p: Parameters) extends XSModule
val cmtPtrExt = RegInit(VecInit((0 until CommitWidth).map(_.U.asTypeOf(new SqPtr))))
val issuePtrExt = RegInit(0.U.asTypeOf(new SqPtr))
val validCounter = RegInit(0.U(log2Ceil(LoadQueueSize + 1).W))
val allowEnqueue = RegInit(true.B)
val enqPtr = enqPtrExt(0).value
val deqPtr = deqPtrExt(0).value
val cmtPtr = cmtPtrExt(0).value
val validCount = distanceBetween(enqPtrExt(0), deqPtrExt(0))
val allowEnqueue = validCount <= (StoreQueueSize - 2).U
val deqMask = UIntToMask(deqPtr, StoreQueueSize)
val enqMask = UIntToMask(enqPtr, StoreQueueSize)
@@ -151,12 +155,15 @@ class StoreQueue(implicit p: Parameters) extends XSModule
)
))
// deqPtrExtNext traces which inst is about to leave store queue
val deqPtrExtNext = WireInit(Mux(io.sbuffer(1).fire(),
val deqPtrExtNext = Mux(io.sbuffer(1).fire(),
VecInit(deqPtrExt.map(_ + 2.U)),
Mux(io.sbuffer(0).fire() || io.mmioStout.fire(),
VecInit(deqPtrExt.map(_ + 1.U)),
deqPtrExt
)
)
io.sqDeq := RegNext(Mux(io.sbuffer(1).fire(), 2.U,
Mux(io.sbuffer(0).fire() || io.mmioStout.fire(), 1.U, 0.U)
))
for (i <- 0 until StorePipelineWidth) {
dataModule.io.raddr(i) := rdataPtrExtNext(i).value
@@ -173,17 +180,21 @@ class StoreQueue(implicit p: Parameters) extends XSModule
* Currently, StoreQueue only allows enqueue when #emptyEntries > EnqWidth
*/
io.enq.canAccept := allowEnqueue
val canEnqueue = io.enq.req.map(_.valid)
val enqCancel = io.enq.req.map(_.bits.robIdx.needFlush(io.brqRedirect))
for (i <- 0 until io.enq.req.length) {
val offset = if (i == 0) 0.U else PopCount(io.enq.needAlloc.take(i))
val sqIdx = enqPtrExt(offset)
val index = sqIdx.value
when (io.enq.req(i).valid && io.enq.canAccept && io.enq.lqCanAccept && !io.brqRedirect.valid) {
uop(index) := io.enq.req(i).bits
val index = io.enq.req(i).bits.sqIdx.value
when (canEnqueue(i) && !enqCancel(i)) {
uop(index).robIdx := io.enq.req(i).bits.robIdx
allocated(index) := true.B
datavalid(index) := false.B
addrvalid(index) := false.B
committed(index) := false.B
pending(index) := false.B
XSError(!io.enq.canAccept || !io.enq.lqCanAccept, s"must accept $i\n")
XSError(index =/= sqIdx.value, s"must be the same entry $i\n")
}
io.enq.resp(i) := sqIdx
}
@@ -255,6 +266,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule
// mmio(stWbIndex) := io.storeIn(i).bits.mmio
uop(stWbIndex).ctrl := io.storeIn(i).bits.uop.ctrl
uop(stWbIndex).debugInfo := io.storeIn(i).bits.uop.debugInfo
XSInfo("store addr write to sq idx %d pc 0x%x miss:%d vaddr %x paddr %x mmio %x\n",
io.storeIn(i).bits.uop.sqIdx.value,
@@ -585,20 +597,20 @@ class StoreQueue(implicit p: Parameters) extends XSModule
for (i <- 0 until StoreQueueSize) {
needCancel(i) := uop(i).robIdx.needFlush(io.brqRedirect) && allocated(i) && !committed(i)
when (needCancel(i)) {
allocated(i) := false.B
allocated(i) := false.B
}
}
/**
* update pointers
*/
val lastEnqCancel = PopCount(RegNext(VecInit(canEnqueue.zip(enqCancel).map(x => x._1 && x._2))))
val lastCycleRedirect = RegNext(io.brqRedirect.valid)
val lastCycleCancelCount = PopCount(RegNext(needCancel))
// when io.brqRedirect.valid, we don't allow enqueue even though it may fire.
val enqNumber = Mux(io.enq.canAccept && io.enq.lqCanAccept && !io.brqRedirect.valid, PopCount(io.enq.req.map(_.valid)), 0.U)
val enqNumber = Mux(io.enq.canAccept && io.enq.lqCanAccept, PopCount(io.enq.req.map(_.valid)), 0.U)
when (lastCycleRedirect) {
// we recover the pointers in the next cycle after redirect
enqPtrExt := VecInit(enqPtrExt.map(_ - lastCycleCancelCount))
enqPtrExt := VecInit(enqPtrExt.map(_ - (lastCycleCancelCount + lastEnqCancel)))
}.otherwise {
enqPtrExt := VecInit(enqPtrExt.map(_ + enqNumber))
}
@@ -607,9 +619,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule
rdataPtrExt := rdataPtrExtNext
val dequeueCount = Mux(io.sbuffer(1).fire(), 2.U, Mux(io.sbuffer(0).fire() || io.mmioStout.fire(), 1.U, 0.U))
val validCount = distanceBetween(enqPtrExt(0), deqPtrExt(0))
allowEnqueue := validCount + enqNumber <= (StoreQueueSize - io.enq.req.length).U
// If the redirect happens at T0, sqCancelCnt is available at T2
io.sqCancelCnt := RegNext(lastCycleCancelCount + lastEnqCancel)
// io.sqempty will be used by sbuffer
// We delay it for 1 cycle for better timing