Commit 8b2adfb7 authored by Lingrui98

Merge remote-tracking branch 'origin/master' into ftq

Subproject commit ca387163b32f20406d443bdab34bc034d5281b51
Subproject commit 54a97b8b9325921ea7cdaa45db7519d9a3666da5
......@@ -100,6 +100,12 @@ SUITE = cache.L2CacheTest
unit-test:
cd .. && mill XiangShan.test.testOnly -o -s $(SUITE)
tlc-test:
cd .. && mill XiangShan.test.testOnly -o -s cache.TLCTest.TLCCacheTest
l1-test:
cd .. && mill XiangShan.test.testOnly -o -s cache.L1DTest.L1DCacheTest
unit-test-all:
cd .. && mill XiangShan.test.test -P$(P)
......
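For example, "make tlc-test" runs only the TileLink consistency test and "make l1-test" only the L1D cache test, while "make unit-test-all" runs the whole suite; the -P$(P) flag appears to set the ScalaTest parallelism level, with P supplied on the make command line (e.g. make unit-test-all P=4).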
......@@ -58,6 +58,8 @@ object OneHot {
def OH1ToUInt(x: UInt): UInt = OHToUInt(OH1ToOH(x))
def UIntToOH1(x: UInt, width: Int): UInt = ~((-1).S(width.W).asUInt << x)(width-1, 0)
def UIntToOH1(x: UInt): UInt = UIntToOH1(x, (1 << x.getWidth) - 1)
def checkOneHot(in: Bits): Unit = assert(PopCount(in) <= 1.U)
def checkOneHot(in: Iterable[Bool]): Unit = assert(PopCount(in) <= 1.U)
}
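A minimal usage sketch for the new checkOneHot helpers (hypothetical module, not part of this patch):
import chisel3._
import utils.OneHot

class GrantChecker extends Module {
  val io = IO(new Bundle { val grants = Input(Vec(4, Bool())) })
  // Simulation-time check: at most one grant may be asserted per cycle.
  OneHot.checkOneHot(io.grants)
}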
object LowerMask {
......
......@@ -26,7 +26,7 @@ class DebugIdentityNode()(implicit p: Parameters) extends LazyModule {
val channels = Seq(t.a, t.b, t.c, t.d, t.e)
channels.foreach(c =>
when(fire(c)){
XSDebug(" ")
XSDebug(" isFire:%d ",c.fire())
c.bits.dump
}
)
......
......@@ -20,8 +20,7 @@ object XSLogLevel extends Enumeration {
object XSLog {
val MagicStr = "9527"
def apply(debugLevel: XSLogLevel)
(prefix: Boolean, cond: Bool, pable: Printable)
(implicit name: String): Any =
(prefix: Boolean, cond: Bool, pable: Printable): Any =
{
val logEnable = WireInit(false.B)
val logTimestamp = WireInit(0.U(64.W))
......@@ -53,15 +52,15 @@ object XSLog {
sealed abstract class LogHelper(val logLevel: XSLogLevel) extends HasXSParameter {
def apply(cond: Bool, fmt: String, data: Bits*)(implicit name: String): Any =
def apply(cond: Bool, fmt: String, data: Bits*): Any =
apply(cond, Printable.pack(fmt, data:_*))
def apply(cond: Bool, pable: Printable)(implicit name: String): Any = apply(true, cond, pable)
def apply(fmt: String, data: Bits*)(implicit name: String): Any =
def apply(cond: Bool, pable: Printable): Any = apply(true, cond, pable)
def apply(fmt: String, data: Bits*): Any =
apply(Printable.pack(fmt, data:_*))
def apply(pable: Printable)(implicit name: String): Any = apply(true.B, pable)
def apply(prefix: Boolean, cond: Bool, fmt: String, data: Bits*)(implicit name: String): Any =
def apply(pable: Printable): Any = apply(true.B, pable)
def apply(prefix: Boolean, cond: Bool, fmt: String, data: Bits*): Any =
apply(prefix, cond, Printable.pack(fmt, data:_*))
def apply(prefix: Boolean, cond: Bool, pable: Printable)(implicit name: String): Any =
def apply(prefix: Boolean, cond: Bool, pable: Printable): Any =
XSLog(logLevel)(prefix, cond, pable)
// trigger log or not
......@@ -70,7 +69,7 @@ sealed abstract class LogHelper(val logLevel: XSLogLevel) extends HasXSParameter
XSLog.displayLog
}
def printPrefix()(implicit name: String): Unit = {
def printPrefix(): Unit = {
val commonInfo = p"[$logLevel][time=${GTimer()}] ${XSLog.MagicStr}: "
when (trigger) {
printf(commonInfo)
......@@ -78,7 +77,7 @@ sealed abstract class LogHelper(val logLevel: XSLogLevel) extends HasXSParameter
}
// dump with a certain prefix
def exec(dump: () => Unit)(implicit name: String): Unit = {
def exec(dump: () => Unit): Unit = {
when (trigger) {
printPrefix
dump
......@@ -86,7 +85,7 @@ sealed abstract class LogHelper(val logLevel: XSLogLevel) extends HasXSParameter
}
// dump under a certain condition and with a certain prefix
def exec(cond: Bool, dump: () => Unit)(implicit name: String): Unit = {
def exec(cond: Bool, dump: () => Unit): Unit = {
when (trigger && cond) {
printPrefix
dump
......
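With the implicit name parameter removed, a log call site needs nothing extra in scope; a minimal sketch (signal names assumed, not from this patch):
import chisel3._
import utils.XSDebug

class LogExample extends Module {
  val io = IO(new Bundle {
    val valid = Input(Bool())
    val addr  = Input(UInt(32.W))
  })
  // Before this patch, an implicit name: String had to be in scope here.
  XSDebug(io.valid, "received req addr: %x\n", io.addr)
}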
......@@ -388,6 +388,8 @@ class TlbFeedback extends XSBundle {
val hit = Bool()
}
class RSFeedback extends TlbFeedback
class FrontendToBackendIO extends XSBundle {
// to the backend
val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow))
......
......@@ -208,8 +208,9 @@ trait HasXSParameter {
tagECC = Some("secded"),
dataECC = Some("secded"),
nMissEntries = 16,
nLoadMissEntries = 8,
nStoreMissEntries = 8
nProbeEntries = 16,
nReleaseEntries = 16,
nStoreReplayEntries = 16
)
val LRSCCycles = 100
......
......@@ -6,7 +6,7 @@ import xiangshan._
import utils._
import xiangshan.backend.regfile.Regfile
import xiangshan.backend.exu._
import xiangshan.backend.issue.{ReservationStationCtrl, ReservationStationData}
import xiangshan.backend.issue.{ReservationStation}
class FpBlockToCtrlIO extends XSBundle {
......@@ -71,68 +71,63 @@ class FloatBlock
val inBlockWbData = exeUnits.filter(e => e.config.hasCertainLatency && readFpRf).map(_.io.toFp.bits.data)
val writeBackData = inBlockWbData ++ io.wakeUpIn.fast.map(_.bits.data)
val wakeupCnt = writeBackData.length
val fastPortsCnt = writeBackData.length
val inBlockListenPorts = exeUnits.filter(e => e.config.hasUncertainlatency && readFpRf).map(_.io.toFp)
val extraListenPorts = inBlockListenPorts ++ io.wakeUpIn.slow
val extraListenPortsCnt = extraListenPorts.length
val slowPorts = inBlockListenPorts ++ io.wakeUpIn.slow
val slowPortsCnt = slowPorts.length
println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} " +
s"extraListenPorts: ${extraListenPortsCnt} " +
println(s"${i}: exu:${cfg.name} fastPortsCnt: ${fastPortsCnt} " +
s"slowPorts: ${slowPortsCnt} " +
s"delay:${certainLatency}"
)
val rsCtrl = Module(new ReservationStationCtrl(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = false))
val rsData = Module(new ReservationStationData(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = false))
val rs = Module(new ReservationStation(cfg, fastPortsCnt, slowPortsCnt, fixedDelay = certainLatency, fastWakeup = certainLatency >= 0, feedback = false))
rsCtrl.io.data <> rsData.io.ctrl
rsCtrl.io.redirect <> redirect // TODO: remove it
rsCtrl.io.flush <> flush // TODO: remove it
rsCtrl.io.numExist <> io.toCtrlBlock.numExist(i)
rsCtrl.io.enqCtrl <> io.fromCtrlBlock.enqIqCtrl(i)
rs.io.redirect <> redirect // TODO: remove it
rs.io.flush <> flush // TODO: remove it
rs.io.numExist <> io.toCtrlBlock.numExist(i)
rs.io.fromDispatch <> io.fromCtrlBlock.enqIqCtrl(i)
rsData.io.srcRegValue := DontCare
rs.io.srcRegValue := DontCare
val src1Value = VecInit((0 until 4).map(i => fpRf.io.readPorts(i * 3).data))
val src2Value = VecInit((0 until 4).map(i => fpRf.io.readPorts(i * 3 + 1).data))
val src3Value = VecInit((0 until 4).map(i => fpRf.io.readPorts(i * 3 + 2).data))
rsData.io.srcRegValue(0) := src1Value(readPortIndex(i))
rsData.io.srcRegValue(1) := src2Value(readPortIndex(i))
if (cfg.fpSrcCnt > 2) rsData.io.srcRegValue(2) := src3Value(readPortIndex(i))
rsData.io.redirect <> redirect
rsData.io.flush <> flush
rsData.io.writeBackedData <> writeBackData
for ((x, y) <- rsData.io.extraListenPorts.zip(extraListenPorts)) {
rs.io.srcRegValue(0) := src1Value(readPortIndex(i))
rs.io.srcRegValue(1) := src2Value(readPortIndex(i))
if (cfg.fpSrcCnt > 2) rs.io.srcRegValue(2) := src3Value(readPortIndex(i))
rs.io.fastDatas <> writeBackData
for ((x, y) <- rs.io.slowPorts.zip(slowPorts)) {
x.valid := y.fire()
x.bits := y.bits
}
exeUnits(i).io.redirect <> redirect
exeUnits(i).io.flush <> flush
exeUnits(i).io.fromFp <> rsData.io.deq
rsData.io.feedback := DontCare
exeUnits(i).io.fromFp <> rs.io.deq
rs.io.feedback := DontCare
rsCtrl.suggestName(s"rsc_${cfg.name}")
rsData.suggestName(s"rsd_${cfg.name}")
rs.suggestName(s"rs_${cfg.name}")
rsData
rs
})
for(rs <- reservedStations){
val inBlockUops = reservedStations.filter(x =>
x.exuCfg.hasCertainLatency && x.exuCfg.writeFpRf
).map(x => {
val raw = WireInit(x.io.selectedUop)
raw.valid := x.io.selectedUop.valid && raw.bits.ctrl.fpWen
val raw = WireInit(x.io.fastUopOut)
raw.valid := x.io.fastUopOut.valid && raw.bits.ctrl.fpWen
raw
})
rs.io.broadcastedUops <> inBlockUops ++ io.wakeUpIn.fastUops
rs.io.fastUopsIn <> inBlockUops ++ io.wakeUpIn.fastUops
}
io.wakeUpFpOut.fastUops <> reservedStations.filter(
rs => fpFastFilter(rs.exuCfg)
).map(_.io.selectedUop).map(fpValid)
).map(_.io.fastUopOut).map(fpValid)
io.wakeUpFpOut.fast <> exeUnits.filter(
x => fpFastFilter(x.config)
......@@ -144,7 +139,7 @@ class FloatBlock
io.wakeUpIntOut.fastUops <> reservedStations.filter(
rs => intFastFilter(rs.exuCfg)
).map(_.io.selectedUop).map(intValid)
).map(_.io.fastUopOut).map(intValid)
io.wakeUpIntOut.fast <> exeUnits.filter(
x => intFastFilter(x.config)
......
......@@ -6,7 +6,7 @@ import xiangshan._
import xiangshan.backend.exu.Exu.{ldExeUnitCfg, stExeUnitCfg}
import xiangshan.backend.exu._
import xiangshan.backend.fu.FenceToSbuffer
import xiangshan.backend.issue.{ReservationStationCtrl, ReservationStationData}
import xiangshan.backend.issue.{ReservationStation}
import xiangshan.backend.regfile.Regfile
import xiangshan.backend.roq.RoqExceptionInfo
......@@ -151,72 +151,64 @@ class IntegerBlock
val readIntRf = cfg.readIntRf
val inBlockWbData = exeUnits.filter(e => e.config.hasCertainLatency && readIntRf).map(_.io.toInt.bits.data)
val writeBackData = inBlockWbData ++ io.wakeUpIn.fast.map(_.bits.data)
val wakeupCnt = writeBackData.length
val fastDatas = inBlockWbData ++ io.wakeUpIn.fast.map(_.bits.data)
val wakeupCnt = fastDatas.length
val inBlockListenPorts = exeUnits.filter(e => e.config.hasUncertainlatency && readIntRf).map(_.io.toInt)
val extraListenPorts = inBlockListenPorts ++ io.wakeUpIn.slow
val extraListenPortsCnt = extraListenPorts.length
val slowPorts = inBlockListenPorts ++ io.wakeUpIn.slow
val extraListenPortsCnt = slowPorts.length
val feedback = (cfg == ldExeUnitCfg) || (cfg == stExeUnitCfg)
println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} extraListenPorts: ${extraListenPortsCnt} delay:${certainLatency} feedback:${feedback}")
println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} slowPorts: ${extraListenPortsCnt} delay:${certainLatency} feedback:${feedback}")
// val rs = Module(new ReservationStationNew(
// cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback
// ))
val rsCtrl = Module(new ReservationStationCtrl(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback))
val rsData = Module(new ReservationStationData(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback))
val rs = Module(new ReservationStation(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, fastWakeup = certainLatency >= 0, feedback = feedback))
rsCtrl.io.data <> rsData.io.ctrl
rsCtrl.io.redirect <> redirect // TODO: remove it
rsCtrl.io.flush <> flush // TODO: remove it
rsCtrl.io.numExist <> io.toCtrlBlock.numExist(i)
rsCtrl.io.enqCtrl <> io.fromCtrlBlock.enqIqCtrl(i)
rs.io.redirect <> redirect
rs.io.flush <> flush // TODO: remove it
rs.io.numExist <> io.toCtrlBlock.numExist(i)
rs.io.fromDispatch <> io.fromCtrlBlock.enqIqCtrl(i)
rsData.io.srcRegValue := DontCare
rs.io.srcRegValue := DontCare
val src1Value = VecInit((0 until 4).map(i => intRf.io.readPorts(i * 2).data))
val src2Value = VecInit((0 until 4).map(i => intRf.io.readPorts(i * 2 + 1).data))
rsData.io.srcRegValue(0) := src1Value(readPortIndex(i))
if (cfg.intSrcCnt > 1) rsData.io.srcRegValue(1) := src2Value(readPortIndex(i))
rs.io.srcRegValue(0) := src1Value(readPortIndex(i))
if (cfg.intSrcCnt > 1) rs.io.srcRegValue(1) := src2Value(readPortIndex(i))
if (cfg == Exu.jumpExeUnitCfg) {
rsData.io.jumpPc := io.fromCtrlBlock.jumpPc
rsData.io.jalr_target := io.fromCtrlBlock.jalr_target
rs.io.jumpPc := io.fromCtrlBlock.jumpPc
rs.io.jalr_target := io.fromCtrlBlock.jalr_target
}
rsData.io.redirect <> redirect
rsData.io.flush <> flush
rsData.io.writeBackedData <> writeBackData
for ((x, y) <- rsData.io.extraListenPorts.zip(extraListenPorts)) {
rs.io.fastDatas <> fastDatas
for ((x, y) <- rs.io.slowPorts.zip(slowPorts)) {
x.valid := y.fire()
x.bits := y.bits
}
exeUnits(i).io.redirect <> redirect
exeUnits(i).io.fromInt <> rs.io.deq
exeUnits(i).io.flush <> flush
exeUnits(i).io.fromInt <> rsData.io.deq
rsData.io.feedback := DontCare
rs.io.feedback := DontCare
rsCtrl.suggestName(s"rsc_${cfg.name}")
rsData.suggestName(s"rsd_${cfg.name}")
rs.suggestName(s"rs_${cfg.name}")
rsData
rs
})
for(rs <- reservationStations){
val inBlockUops = reservationStations.filter(x =>
x.exuCfg.hasCertainLatency && x.exuCfg.writeIntRf
).map(x => {
val raw = WireInit(x.io.selectedUop)
raw.valid := x.io.selectedUop.valid && raw.bits.ctrl.rfWen
val raw = WireInit(x.io.fastUopOut)
raw.valid := x.io.fastUopOut.valid && raw.bits.ctrl.rfWen
raw
})
rs.io.broadcastedUops <> inBlockUops ++ io.wakeUpIn.fastUops
rs.io.fastUopsIn <> inBlockUops ++ io.wakeUpIn.fastUops
}
io.wakeUpFpOut.fastUops <> reservationStations.filter(
rs => fpFastFilter(rs.exuCfg)
).map(_.io.selectedUop).map(fpValid)
).map(_.io.fastUopOut).map(fpValid)
io.wakeUpFpOut.fast <> exeUnits.filter(
x => fpFastFilter(x.config)
......@@ -228,7 +220,7 @@ class IntegerBlock
io.wakeUpIntOut.fastUops <> reservationStations.filter(
rs => intFastFilter(rs.exuCfg)
).map(_.io.selectedUop).map(intValid)
).map(_.io.fastUopOut).map(intValid)
io.wakeUpIntOut.fast <> exeUnits.filter(
x => intFastFilter(x.config)
......
......@@ -12,7 +12,7 @@ import xiangshan.backend.exu._
import xiangshan.cache._
import xiangshan.mem._
import xiangshan.backend.fu.{HasExceptionNO, FenceToSbuffer}
import xiangshan.backend.issue.{ReservationStationCtrl, ReservationStationData}
import xiangshan.backend.issue.{ReservationStation}
import xiangshan.backend.regfile.RfReadPort
class LsBlockToCtrlIO extends XSBundle {
......@@ -132,60 +132,55 @@ class MemBlockImp
val readFpRf = cfg.readFpRf
// load has uncertain latency, so only use external wake up data
val writeBackData = fastWakeUpIn.zip(io.wakeUpIn.fast)
val fastDatas = fastWakeUpIn.zip(io.wakeUpIn.fast)
.filter(x => (x._1.writeIntRf && readIntRf) || (x._1.writeFpRf && readFpRf))
.map(_._2.bits.data)
val wakeupCnt = writeBackData.length
val wakeupCnt = fastDatas.length
val inBlockListenPorts = intExeWbReqs ++ fpExeWbReqs
val extraListenPorts = inBlockListenPorts ++
val slowPorts = inBlockListenPorts ++
slowWakeUpIn.zip(io.wakeUpIn.slow)
.filter(x => (x._1.writeIntRf && readIntRf) || (x._1.writeFpRf && readFpRf))
.map(_._2)
val extraListenPortsCnt = extraListenPorts.length
val slowPortsCnt = slowPorts.length
// if tlb miss, replay
val feedback = true
println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} extraListenPorts: ${extraListenPortsCnt} delay:${certainLatency} feedback:${feedback}")
println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} slowPorts: ${slowPortsCnt} delay:${certainLatency} feedback:${feedback}")
val rsCtrl = Module(new ReservationStationCtrl(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback))
val rsData = Module(new ReservationStationData(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback))
val rs = Module(new ReservationStation(cfg, wakeupCnt, slowPortsCnt, fixedDelay = certainLatency, fastWakeup = certainLatency >= 0, feedback = feedback))
rsCtrl.io.data <> rsData.io.ctrl
rsCtrl.io.redirect <> redirect // TODO: remove it
rsCtrl.io.flush <> io.fromCtrlBlock.flush // TODO: remove it
rsCtrl.io.numExist <> io.toCtrlBlock.numExist(i)
rsCtrl.io.enqCtrl <> io.fromCtrlBlock.enqIqCtrl(i)
rs.io.redirect <> redirect // TODO: remove it
rs.io.flush <> io.fromCtrlBlock.flush // TODO: remove it
rs.io.numExist <> io.toCtrlBlock.numExist(i)
rs.io.fromDispatch <> io.fromCtrlBlock.enqIqCtrl(i)
val src2IsFp = RegNext(io.fromCtrlBlock.enqIqCtrl(i).bits.ctrl.src2Type === SrcType.fp)
rsData.io.srcRegValue := DontCare
rsData.io.srcRegValue(0) := io.fromIntBlock.readIntRf(readPortIndex(i)).data
rs.io.srcRegValue := DontCare
rs.io.srcRegValue(0) := io.fromIntBlock.readIntRf(readPortIndex(i)).data
if (i >= exuParameters.LduCnt) {
rsData.io.srcRegValue(1) := Mux(src2IsFp, io.fromFpBlock.readFpRf(i - exuParameters.LduCnt).data, io.fromIntBlock.readIntRf(readPortIndex(i) + 1).data)
rs.io.srcRegValue(1) := Mux(src2IsFp, io.fromFpBlock.readFpRf(i - exuParameters.LduCnt).data, io.fromIntBlock.readIntRf(readPortIndex(i) + 1).data)
}
rsData.io.redirect <> redirect
rsData.io.flush <> io.fromCtrlBlock.flush
rsData.io.writeBackedData <> writeBackData
for ((x, y) <- rsData.io.extraListenPorts.zip(extraListenPorts)) {
rs.io.fastDatas <> fastDatas
for ((x, y) <- rs.io.slowPorts.zip(slowPorts)) {
x.valid := y.fire()
x.bits := y.bits
}
// exeUnits(i).io.redirect <> redirect
// exeUnits(i).io.fromInt <> rsData.io.deq
rsData.io.feedback := DontCare
// exeUnits(i).io.fromInt <> rs.io.deq
rs.io.feedback := DontCare
rsCtrl.suggestName(s"rsc_${cfg.name}")
rsData.suggestName(s"rsd_${cfg.name}")
rs.suggestName(s"rsd_${cfg.name}")
rsData
rs
})
for(rs <- reservationStations){
rs.io.broadcastedUops <> fastWakeUpIn.zip(io.wakeUpIn.fastUops)
rs.io.fastUopsIn <> fastWakeUpIn.zip(io.wakeUpIn.fastUops)
.filter(x => (x._1.writeIntRf && rs.exuCfg.readIntRf) || (x._1.writeFpRf && rs.exuCfg.readFpRf))
.map(_._2)
}
......
......@@ -172,7 +172,7 @@ class Ftq extends XSModule with HasCircularQueuePtrHelper {
when(wb.bits.redirectValid) {
mispredict_vec(wbIdx)(offset) := cfiUpdate.isMisPred
when(cfiUpdate.taken && offset < cfiIndex_vec(wbIdx).bits) {
cfiIndex_vec(wbIdx).valid := true.B
cfiIndex_vec(wbIdx).bits := offset
cfiIsCall(wbIdx) := wb.bits.uop.cf.pd.isCall
cfiIsRet(wbIdx) := wb.bits.uop.cf.pd.isRet
......
......@@ -18,6 +18,7 @@ class IntToFP extends FPUSubModule {
val src1 = RegEnable(io.in.bits.src(0)(XLEN-1, 0), io.in.fire())
val uopReg = RegEnable(io.in.bits.uop, io.in.fire())
val rmReg = RegEnable(rm, io.in.fire())
switch(state){
is(s_idle){
......@@ -63,7 +64,7 @@ class IntToFP extends FPUSubModule {
val i2f = Module(new INToRecFN(XLEN, t.exp, t.sig))
i2f.io.signedIn := ~typ(0)
i2f.io.in := intValue
i2f.io.roundingMode := rm
i2f.io.roundingMode := rmReg
i2f.io.detectTininess := hardfloat.consts.tininess_afterRounding
(sanitizeNaN(i2f.io.out, t), i2f.io.exceptionFlags)
}
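// Note on the fix above: src1, uopReg and rmReg are all captured with
// RegEnable at io.in.fire(), so the conversion, which happens a cycle after
// the operands were latched, must use rmReg rather than the live rm input.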
......
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
class AtomicsReplayEntry extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheWordIO)
val pipe_req = Decoupled(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_pipe_req :: s_pipe_resp :: s_resp :: Nil = Enum(4)
val state = RegInit(s_invalid)
val req = Reg(new DCacheWordReq)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("AtomicsReplayEntry: state: %d block_addr: %x\n", state, io.block_addr.bits)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
req := io.lsu.req.bits
state := s_pipe_req
}
}
// --------------------------------------------
// replay
when (state === s_pipe_req) {
io.pipe_req.valid := true.B
val pipe_req = io.pipe_req.bits
pipe_req := DontCare
pipe_req.miss := false.B
pipe_req.probe := false.B
pipe_req.source := AMO_SOURCE.U
pipe_req.cmd := req.cmd
pipe_req.addr := get_block_addr(req.addr)
pipe_req.word_idx := get_word(req.addr)
pipe_req.amo_data := req.data
pipe_req.amo_mask := req.mask
when (io.pipe_req.fire()) {
state := s_pipe_resp
}
}
val resp_data = Reg(UInt())
val resp_id = Reg(UInt())
when (state === s_pipe_resp) {
// when not miss
// everything is OK, simply send response back to sbuffer
// when miss and not replay
// wait for missQueue to handle the miss and replay our request
// when miss and replay
// req missed and failed to enter missQueue, manually replay it later
// TODO: add assertions:
// 1. add a replay delay counter?
// 2. when req gets into MissQueue, it should not miss any more
when (io.pipe_resp.fire()) {
when (io.pipe_resp.bits.miss) {
when (io.pipe_resp.bits.replay) {
state := s_pipe_req
}
} .otherwise {
resp_data := io.pipe_resp.bits.data
resp_id := io.pipe_resp.bits.id
state := s_resp
}
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := DontCare
io.lsu.resp.bits.data := resp_data
io.lsu.resp.bits.id := resp_id
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
io.lsu.req.bits.dump()
}
when (io.lsu.resp.fire()) {
io.lsu.resp.bits.dump()
}
when (io.pipe_req.fire()) {
io.pipe_req.bits.dump()
}
when (io.pipe_resp.fire()) {
io.pipe_resp.bits.dump()
}
}
......@@ -20,12 +20,10 @@ case class DCacheParameters
tagECC: Option[String] = None,
dataECC: Option[String] = None,
nMissEntries: Int = 1,
nLoadMissEntries: Int = 1,
nStoreMissEntries: Int = 1,
nMiscMissEntries: Int = 1,
nProbeEntries: Int = 1,
nReleaseEntries: Int = 1,
nStoreReplayEntries: Int = 1,
nMMIOEntries: Int = 1,
nSDQ: Int = 17,
nRPQ: Int = 16,
nMMIOs: Int = 1,
blockBytes: Int = 64
) extends L1CacheParameters {
......@@ -48,23 +46,12 @@ trait HasDCacheParameters extends HasL1CacheParameters {
def nIOMSHRs = cacheParams.nMMIOs
def maxUncachedInFlight = cacheParams.nMMIOs
def missQueueEntryIdWidth = log2Up(cfg.nMissEntries)
def loadMissQueueEntryIdWidth = log2Up(cfg.nLoadMissEntries)
def storeMissQueueEntryIdWidth = log2Up(cfg.nStoreMissEntries)
def miscMissQueueEntryIdWidth = log2Up(cfg.nMiscMissEntries)
def clientMissQueueEntryIdWidth = max(
max(loadMissQueueEntryIdWidth,
storeMissQueueEntryIdWidth),
miscMissQueueEntryIdWidth)
// clients: ldu 0, ldu1, stu, atomics
def nClientMissQueues = 4
def clientIdWidth = log2Up(nClientMissQueues)
def missQueueClientIdWidth = clientIdWidth + clientMissQueueEntryIdWidth
def clientIdMSB = missQueueClientIdWidth - 1
def clientIdLSB = clientMissQueueEntryIdWidth
def entryIdMSB = clientMissQueueEntryIdWidth - 1
def entryIdLSB = 0
def nSourceType = 3
def sourceTypeWidth = log2Up(nSourceType)
def LOAD_SOURCE = 0
def STORE_SOURCE = 1
def AMO_SOURCE = 2
// each source uses an id to distinguish its multiple reqs
def reqIdWidth = 64
require(isPow2(nSets), s"nSets($nSets) must be pow2")
......@@ -73,6 +60,7 @@ trait HasDCacheParameters extends HasL1CacheParameters {
require(full_divide(beatBits, rowBits), s"beatBits($beatBits) must be multiple of rowBits($rowBits)")
// this is a VIPT L1 cache
require(pgIdxBits >= untagBits, s"page aliasing problem: pgIdxBits($pgIdxBits) < untagBits($untagBits)")
require(rowWords == 1, "Our DCache Implementation assumes rowWords == 1")
}
abstract class DCacheModule extends L1CacheModule
......@@ -218,7 +206,7 @@ class DuplicatedDataArray extends AbstractDataArray
val ren = io.read(j).valid && io.read(j).bits.way_en(w) && io.read(j).bits.rmask(r)
array.io.r.req.valid := ren
array.io.r.req.bits.apply(setIdx=raddr)
resp(k) := RegNext(array.io.r.resp.data(0))
resp(k) := array.io.r.resp.data(0)
}
}
}
......
package xiangshan.cache
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes}
import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters,
TLMasterParameters, TLMasterPortParameters, TLArbiter, TLMessages}
// memory request in word granularity (load, mmio, lr/sc, atomics)
class DCacheWordReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt(DataBits.W)
val mask = UInt((DataBits/8).W)
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheWordReq: cmd: %x addr: %x data: %x mask: %x id: %d\n",
cmd, addr, data, mask, id)
}
}
// memory request in cache line granularity (store)
class DCacheLineReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
val mask = UInt(cfg.blockBytes.W)
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheLineReq: cmd: %x addr: %x data: %x mask: %x id: %d\n",
cmd, addr, data, mask, id)
}
}
class DCacheWordResp extends DCacheBundle
{
val data = UInt(DataBits.W)
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheWordResp: data: %x id: %d miss: %b replay: %b\n",
data, id, miss, replay)
}
}
class DCacheLineResp extends DCacheBundle
{
val data = UInt((cfg.blockBytes * 8).W)
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("DCacheLineResp: data: %x id: %d miss: %b replay: %b\n",
data, id, miss, replay)
}
}
class Refill extends DCacheBundle
{
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
def dump() = {
XSDebug("Refill: addr: %x data: %x\n", addr, data)
}
}
class DCacheWordIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheWordReq)
val resp = Flipped(DecoupledIO(new DCacheWordResp))
}
// used by load unit
class DCacheLoadIO extends DCacheWordIO
{
// kill previous cycle's req
val s1_kill = Output(Bool())
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr = Output(UInt(PAddrBits.W))
val s1_data = Input(Vec(nWays, UInt(DataBits.W)))
val s2_hit_way = Input(UInt(nWays.W))
}
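// Timing sketch implied by the fields above:
//   cycle 0: req carries the virtual address (req.addr)
//   cycle 1: s1_paddr supplies the physical address; s1_kill may cancel the
//            access; s1_data returns one decoded word per way
//   cycle 2: s2_hit_way reports which way (if any) hit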
class DCacheLineIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheLineReq )
val resp = Flipped(DecoupledIO(new DCacheLineResp))
}
class DCacheToLsuIO extends DCacheBundle {
val load = Vec(LoadPipelineWidth, Flipped(new DCacheLoadIO)) // for speculative load
val lsq = ValidIO(new Refill) // refill to load queue, wake up load misses
val store = Flipped(new DCacheLineIO) // for sbuffer
val atomics = Flipped(new DCacheWordIO) // atomics reqs
}
class DCacheIO extends DCacheBundle {
val lsu = new DCacheToLsuIO
val prefetch = DecoupledIO(new MissReq)
}
class DCache()(implicit p: Parameters) extends LazyModule with HasDCacheParameters {
val clientParameters = TLMasterPortParameters.v1(
Seq(TLMasterParameters.v1(
name = "dcache",
sourceId = IdRange(0, cfg.nMissEntries+1),
supportsProbe = TransferSizes(cfg.blockBytes)
))
)
val clientNode = TLClientNode(Seq(clientParameters))
lazy val module = new DCacheImp(this)
}
class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParameters with HasXSLog {
val io = IO(new DCacheIO)
val (bus, edge) = outer.clientNode.out.head
require(bus.d.bits.data.getWidth == l1BusDataWidth, "DCache: tilelink width does not match")
//----------------------------------------
// core data structures
val dataArray = Module(new DuplicatedDataArray)
val metaArray = Module(new DuplicatedMetaArray)
/*
dataArray.dump()
metaArray.dump()
*/
//----------------------------------------
// core modules
val ldu = Seq.fill(LoadPipelineWidth) { Module(new LoadPipe) }
val storeReplayUnit = Module(new StoreReplayQueue)
val atomicsReplayUnit = Module(new AtomicsReplayEntry)
val mainPipe = Module(new MainPipe)
val missQueue = Module(new MissQueue(edge))
val probeQueue = Module(new ProbeQueue(edge))
val wb = Module(new WritebackQueue(edge))
//----------------------------------------
// meta array
val MetaWritePortCount = 1
val MainPipeMetaWritePort = 0
metaArray.io.write <> mainPipe.io.meta_write
// MainPipe contends with Load 0 for MetaRead
// give priority to MainPipe
val MetaReadPortCount = 2
val MainPipeMetaReadPort = 0
val LoadPipeMetaReadPort = 1
val metaReadArb = Module(new Arbiter(new L1MetaReadReq, MetaReadPortCount))
metaReadArb.io.in(LoadPipeMetaReadPort) <> ldu(0).io.meta_read
metaReadArb.io.in(MainPipeMetaReadPort) <> mainPipe.io.meta_read
metaArray.io.read(0) <> metaReadArb.io.out
ldu(0).io.meta_resp <> metaArray.io.resp(0)
mainPipe.io.meta_resp <> metaArray.io.resp(0)
for (w <- 1 until LoadPipelineWidth) {
metaArray.io.read(w) <> ldu(w).io.meta_read
ldu(w).io.meta_resp <> metaArray.io.resp(w)
}
//----------------------------------------
// data array
val DataWritePortCount = 1
val MainPipeDataWritePort = 0
dataArray.io.write <> mainPipe.io.data_write
// give priority to MainPipe
val DataReadPortCount = 2
val MainPipeDataReadPort = 0
val LoadPipeDataReadPort = 1
val dataReadArb = Module(new Arbiter(new L1DataReadReq, DataReadPortCount))
dataReadArb.io.in(LoadPipeDataReadPort) <> ldu(0).io.data_read
dataReadArb.io.in(MainPipeDataReadPort) <> mainPipe.io.data_read
dataArray.io.read(0) <> dataReadArb.io.out
dataArray.io.resp(0) <> ldu(0).io.data_resp
dataArray.io.resp(0) <> mainPipe.io.data_resp
for (w <- 1 until LoadPipelineWidth) {
dataArray.io.read(w) <> ldu(w).io.data_read
dataArray.io.resp(w) <> ldu(w).io.data_resp
}
//----------------------------------------
// load pipe
// the s1 kill signal
// only lsu uses this, replay never kills
for (w <- 0 until LoadPipelineWidth) {
ldu(w).io.lsu <> io.lsu.load(w)
// replay and nack not needed anymore
// TODO: remove replay and nack
ldu(w).io.nack := false.B
}
//----------------------------------------
// store pipe and store miss queue
storeReplayUnit.io.lsu <> io.lsu.store
//----------------------------------------
// atomics
// atomics not finished yet
io.lsu.atomics <> atomicsReplayUnit.io.lsu
//----------------------------------------
// miss queue
val MissReqPortCount = LoadPipelineWidth + 1
val MainPipeMissReqPort = 0
// Request
val missReqArb = Module(new RRArbiter(new MissReq, MissReqPortCount))
missReqArb.io.in(MainPipeMissReqPort) <> mainPipe.io.miss_req
for (w <- 0 until LoadPipelineWidth) { missReqArb.io.in(w + 1) <> ldu(w).io.miss_req }
wb.io.miss_req.valid := missReqArb.io.out.valid
wb.io.miss_req.bits := missReqArb.io.out.bits.addr
block_decoupled(missReqArb.io.out, missQueue.io.req, wb.io.block_miss_req)
// refill to load queue
io.lsu.lsq <> missQueue.io.refill
// tilelink stuff
bus.a <> missQueue.io.mem_acquire
bus.e <> missQueue.io.mem_finish
//----------------------------------------
// probe
probeQueue.io.mem_probe <> bus.b
//----------------------------------------
// mainPipe
val MainPipeReqPortCount = 4
val MissMainPipeReqPort = 0
val StoreMainPipeReqPort = 1
val AtomicsMainPipeReqPort = 2
val ProbeMainPipeReqPort = 3
val mainPipeReqArb = Module(new RRArbiter(new MainPipeReq, MainPipeReqPortCount))
mainPipeReqArb.io.in(MissMainPipeReqPort) <> missQueue.io.pipe_req
mainPipeReqArb.io.in(StoreMainPipeReqPort) <> storeReplayUnit.io.pipe_req
mainPipeReqArb.io.in(AtomicsMainPipeReqPort) <> atomicsReplayUnit.io.pipe_req
mainPipeReqArb.io.in(ProbeMainPipeReqPort) <> probeQueue.io.pipe_req
mainPipe.io.req <> mainPipeReqArb.io.out
missQueue.io.pipe_resp <> mainPipe.io.miss_resp
storeReplayUnit.io.pipe_resp <> mainPipe.io.store_resp
atomicsReplayUnit.io.pipe_resp <> mainPipe.io.amo_resp
probeQueue.io.lrsc_locked_block <> mainPipe.io.lrsc_locked_block
//----------------------------------------
// wb
// add a queue between MainPipe and WritebackUnit to reduce MainPipe stalls due to WritebackUnit busy
wb.io.req <> mainPipe.io.wb_req
bus.c <> wb.io.mem_release
// connect bus d
missQueue.io.mem_grant.valid := false.B
missQueue.io.mem_grant.bits := DontCare
wb.io.mem_grant.valid := false.B
wb.io.mem_grant.bits := DontCare
// in L1DCache, we only expect Grant[Data] and ReleaseAck
bus.d.ready := false.B
when (bus.d.bits.opcode === TLMessages.Grant || bus.d.bits.opcode === TLMessages.GrantData) {
missQueue.io.mem_grant <> bus.d
} .elsewhen (bus.d.bits.opcode === TLMessages.ReleaseAck) {
wb.io.mem_grant <> bus.d
} .otherwise {
assert (!bus.d.fire())
}
// dcache should only deal with DRAM addresses
when (bus.a.fire()) {
assert(bus.a.bits.address >= 0x80000000L.U)
}
when (bus.b.fire()) {
assert(bus.b.bits.address >= 0x80000000L.U)
}
when (bus.c.fire()) {
assert(bus.c.bits.address >= 0x80000000L.U)
}
io.prefetch.valid := missQueue.io.req.fire()
io.prefetch.bits := missQueue.io.req.bits
def block_decoupled[T <: Data](source: DecoupledIO[T], sink: DecoupledIO[T], block_signal: Bool) = {
sink.valid := source.valid && !block_signal
source.ready := sink.ready && !block_signal
sink.bits := source.bits
}
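// block_decoupled gates a Decoupled handshake with an external condition:
// neither side sees the other as ready while block_signal is high. It is
// used above to hold off miss requests while the writeback queue still owns
// the block: block_decoupled(missReqArb.io.out, missQueue.io.req, wb.io.block_miss_req)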
}
......@@ -2,6 +2,7 @@ package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink.ClientMetadata
import utils.XSDebug
......@@ -24,9 +25,6 @@ class LoadPipe extends DCacheModule
})
// LSU requests
// replayed req should never be nacked
assert(!(io.lsu.req.valid && io.lsu.req.bits.meta.replay && io.nack))
// if you got nacked, you can directly pass down
val not_nacked_ready = io.meta_read.ready && io.data_read.ready
val nacked_ready = true.B
......@@ -50,6 +48,7 @@ class LoadPipe extends DCacheModule
data_read.rmask := UIntToOH(get_row(io.lsu.req.bits.addr))
// Pipeline
// --------------------------------------------------------------------------------
// stage 0
val s0_valid = io.lsu.req.fire()
val s0_req = io.lsu.req.bits
......@@ -58,6 +57,8 @@ class LoadPipe extends DCacheModule
dump_pipeline_reqs("LoadPipe s0", s0_valid, s0_req)
// --------------------------------------------------------------------------------
// stage 1
val s1_req = RegNext(s0_req)
val s1_valid = RegNext(s0_valid, init = false.B)
......@@ -73,80 +74,68 @@ class LoadPipe extends DCacheModule
val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_addr))).asUInt
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt
val s1_tag_match = s1_tag_match_way.orR
val s1_hit_meta = Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w)))
val s1_hit_state = s1_hit_meta.coh
// replacement policy
val replacer = cacheParams.replacement
val s1_repl_way_en = UIntToOH(replacer.way)
val s1_repl_meta = Mux1H(s1_repl_way_en, wayMap((w: Int) => meta_resp(w)))
when (io.miss_req.fire()) {
replacer.miss
val s1_fake_meta = Wire(new L1Metadata)
s1_fake_meta.tag := get_tag(s1_addr)
s1_fake_meta.coh := ClientMetadata.onReset
// when there is no tag match, we give it a fake meta
// this simplifies our logic in the s2 stage
val s1_hit_meta = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w))), s1_fake_meta)
val s1_hit_coh = s1_hit_meta.coh
// select the row we are interested in
val s1_data = Wire(Vec(nWays, UInt(encRowBits.W)))
val data_resp = io.data_resp
for (w <- 0 until nWays) { s1_data(w) := data_resp(w)(get_row(s1_addr)) }
// select the word
// the index of word in a row, in case rowBits != wordBits
val s1_word_idx = if (rowWords == 1) 0.U else s1_addr(log2Up(rowWords*wordBytes)-1, log2Up(wordBytes))
// load data gen
val s1_data_words = Wire(Vec(nWays, Vec(rowWords, UInt(encWordBits.W))))
for (w <- 0 until nWays) {
for (r <- 0 until rowWords) {
s1_data_words(w)(r) := s1_data(w)(encWordBits * (r + 1) - 1, encWordBits * r)
}
}
assert(!(s1_valid && s1_req.meta.replay && io.lsu.s1_kill),
"lsq tried to kill an replayed request!")
val s1_words = (0 until nWays) map (i => s1_data_words(i)(s1_word_idx))
val s1_decoded = (0 until nWays) map (i => cacheParams.dataCode.decode(s1_words(i)))
val s1_word_decoded = VecInit((0 until nWays) map (i => s1_decoded(i).corrected))
(0 until nWays) map (i => assert (!(s1_valid && s1_tag_match && (i.U === OHToUInt(s1_tag_match_way)) && s1_decoded(i).uncorrectable)))
io.lsu.s1_data := s1_word_decoded
// --------------------------------------------------------------------------------
// stage 2
val s2_req = RegNext(s1_req)
val s2_valid = RegNext(s1_valid && !io.lsu.s1_kill, init = false.B)
val s2_addr = RegNext(s1_addr)
dump_pipeline_reqs("LoadPipe s2", s2_valid, s2_req)
val s2_addr = RegNext(s1_addr)
// hit, miss, nack, permission checking
val s2_tag_match_way = RegNext(s1_tag_match_way)
val s2_tag_match = RegNext(s1_tag_match)
val s2_hit_meta = RegNext(s1_hit_meta)
val s2_hit_state = RegNext(s1_hit_state)
val s2_has_permission = s2_hit_state.onAccess(s2_req.cmd)._1
val s2_new_hit_state = s2_hit_state.onAccess(s2_req.cmd)._3
val s2_repl_meta = RegNext(s1_repl_meta)
val s2_repl_way_en = RegNext(s1_repl_way_en)
val s2_old_meta = Mux(s2_tag_match, s2_hit_meta, s2_repl_meta)
val s2_way_en = Mux(s2_tag_match, s2_tag_match_way, s2_repl_way_en)
// we not only need permissions
// we also require that state does not change on hit
// thus we require new_hit_state === old_hit_state
//
// If state changes on hit,
// we should treat it as not hit, and let mshr deal with it,
// since we can not write meta data on the main pipeline.
// It's possible that we had permission but state changes on hit:
// eg: write to exclusive but clean block
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_state === s2_new_hit_state
// nacked or not
val s2_nack = Wire(Bool())
val s2_data = Wire(Vec(nWays, UInt(encRowBits.W)))
val data_resp = io.data_resp
for (w <- 0 until nWays) {
s2_data(w) := data_resp(w)(get_row(s2_addr))
}
val s2_hit_coh = RegNext(s1_hit_coh)
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3
val s2_data_muxed = Mux1H(s2_tag_match_way, s2_data)
// the index of word in a row, in case rowBits != wordBits
val s2_word_idx = if (rowWords == 1) 0.U else s2_addr(log2Up(rowWords*wordBytes)-1, log2Up(wordBytes))
// load data gen
val s2_data_words = Wire(Vec(rowWords, UInt(encWordBits.W)))
for (w <- 0 until rowWords) {
s2_data_words(w) := s2_data_muxed(encWordBits * (w + 1) - 1, encWordBits * w)
}
val s2_data_word = s2_data_words(s2_word_idx)
val s2_decoded = cacheParams.dataCode.decode(s2_data_word)
val s2_data_word_decoded = s2_decoded.corrected
// this assertion is commented out
// when TLB misses, s2_hit may still be true
// which may cause unnecessary assertion failures
// assert(!(s2_valid && s2_hit && !s2_nack && s2_decoded.uncorrectable))
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_coh === s2_new_hit_coh
// generate data
val s2_data = RegNext(s1_word_decoded)
// select the way out
val s2_data_muxed = Mux1H(s2_tag_match_way, s2_data)
// when req got nacked, upper levels should replay this request
// the same set is busy
// nacked or not
val s2_nack_hit = RegNext(s1_nack)
// cannot allocate an mshr for this load miss
val s2_nack_no_mshr = io.miss_req.valid && !io.miss_req.ready
......@@ -154,7 +143,7 @@ class LoadPipe extends DCacheModule
// For now, we use DuplicatedDataArray, so no bank conflicts
val s2_nack_data = false.B
s2_nack := s2_nack_hit || s2_nack_no_mshr || s2_nack_data
val s2_nack = s2_nack_hit || s2_nack_no_mshr || s2_nack_data
// only dump these signals when they are actually valid
dump_pipeline_valids("LoadPipe s2", "s2_hit", s2_valid && s2_hit)
......@@ -163,19 +152,18 @@ class LoadPipe extends DCacheModule
dump_pipeline_valids("LoadPipe s2", "s2_nack_no_mshr", s2_valid && s2_nack_no_mshr)
// send load miss to miss queue
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_addr)
io.miss_req.bits.tag_match := s2_tag_match
io.miss_req.bits.way_en := s2_way_en
io.miss_req.bits.old_meta := s2_old_meta
io.miss_req.bits.client_id := 0.U
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits := DontCare
io.miss_req.bits.source := LOAD_SOURCE.U
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_addr)
io.miss_req.bits.coh := s2_hit_coh
// send back response
val resp = Wire(ValidIO(new DCacheWordResp))
resp.valid := s2_valid
resp.bits.data := s2_data_word_decoded
resp.bits.meta := s2_req.meta
resp.bits := DontCare
resp.bits.data := s2_data_muxed
// on miss or nack, upper level should replay request
// but if we successfully sent the request to miss queue
// upper level does not need to replay request
......@@ -188,17 +176,18 @@ class LoadPipe extends DCacheModule
assert(!(resp.valid && !io.lsu.resp.ready))
when (resp.valid) {
XSDebug(s"LoadPipe resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
resp.bits.data, resp.bits.meta.id, resp.bits.meta.replay, resp.bits.miss, resp.bits.replay)
resp.bits.dump()
}
io.lsu.s2_hit_way := s2_tag_match_way
// -------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool,
req: DCacheWordReq ) = {
when (valid) {
XSDebug(s"$pipeline_stage_name cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
req.cmd, req.addr, req.data, req.mask, req.meta.id, req.meta.replay)
XSDebug(s"$pipeline_stage_name: ")
req.dump()
}
}
......
This diff is collapsed.
package xiangshan.cache
import chisel3._
import chisel3.util._
import chisel3.ExcitingUtils._
import freechips.rocketchip.tilelink.{TLEdgeOut, TLBundleA, TLBundleD, TLBundleE, TLPermissions, TLArbiter, ClientMetadata}
import utils.{HasTLDump, XSDebug, BoolStopWatch, OneHot}
class MissReq extends DCacheBundle
{
val source = UInt(sourceTypeWidth.W)
val cmd = UInt(M_SZ.W)
// must be aligned to block
val addr = UInt(PAddrBits.W)
// store
val store_data = UInt((cfg.blockBytes * 8).W)
val store_mask = UInt(cfg.blockBytes.W)
// which word does amo work on?
val word_idx = UInt(log2Up(blockWords).W)
val amo_data = UInt(DataBits.W)
val amo_mask = UInt((DataBits/8).W)
// coherence state
val coh = new ClientMetadata
val id = UInt(reqIdWidth.W)
def dump() = {
XSDebug("MissReq source: %d cmd: %d addr: %x store_data: %x store_mask: %x word_idx: %d amo_data: %x amo_mask: %x coh: %d id: %d\n",
source, cmd, addr, store_data, store_mask, word_idx, amo_data, amo_mask, coh.state, id)
}
}
// One miss entry deals with one missed block
class MissEntry(edge: TLEdgeOut) extends DCacheModule
{
val io = IO(new Bundle {
// MSHR ID
val id = Input(UInt())
// client requests
val req_valid = Input(Bool())
// this entry is free and can be allocated to new reqs
val primary_ready = Output(Bool())
// this entry is busy, but it can merge the new req
val secondary_ready = Output(Bool())
// this entry is busy and it can not merge the new req
val secondary_reject = Output(Bool())
val req = Input((new MissReq))
val refill = ValidIO(new Refill)
// bus
val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val mem_finish = DecoupledIO(new TLBundleE(edge.bundle))
val pipe_req = DecoupledIO(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
})
// MSHR:
// 1. receive req
// 2. send acquire req
// 3. receive grant resp
// 4. let main pipe do refill and replace
// 5. wait for resp
// 6. send finish to end the tilelink transaction
// We only send finish after data is written into cache.
// This prevents L2 from probing the block down.
// See Tilelink spec 1.8.1 page 69
// A slave should not issue a Probe if there is a pending GrantAck on the block. Once the Probe is
// issued, the slave should not issue further Probes on that block until it receives a ProbeAck.
val s_invalid :: s_refill_req :: s_refill_resp :: s_main_pipe_req :: s_main_pipe_resp :: s_mem_finish :: Nil = Enum(6)
val state = RegInit(s_invalid)
// --------------------------------------------
// internal registers
val req = Reg(new MissReq)
// param of grant
val grant_param = Reg(UInt(TLPermissions.bdWidth.W))
// recording the source/sink info from Grant
// so that we can use it in the GrantAck
val grantack = Reg(Valid(new TLBundleE(edge.bundle)))
// should we refill the data to load queue to wake up any missed load?
val should_refill_data = Reg(Bool())
// --------------------------------------------
// merge reqs
// see whether we can merge requests
// do not count the s_invalid state in
// since we can not merge requests in that state
val acquire_not_sent = state === s_refill_req && !io.mem_acquire.ready
val data_not_refilled = state === s_refill_req || state === s_refill_resp
def can_merge(new_req: MissReq): Bool = {
// caution: do not merge with AMO
// we can not do amoalu calculation in MissQueue
// so, we do not know the result after AMO calculation
// so do not merge with AMO
// before read acquire is fired, we can merge read or write
val before_read_sent = acquire_not_sent && req.source === LOAD_SOURCE.U && (new_req.source === LOAD_SOURCE.U || new_req.source === STORE_SOURCE.U)
// before read/write refills data to LoadQueue, we can merge any read
val before_data_refill = data_not_refilled && (req.source === LOAD_SOURCE.U || req.source === STORE_SOURCE.U) && new_req.source === LOAD_SOURCE.U
before_read_sent || before_data_refill
}
def should_merge(new_req: MissReq): Bool = {
val block_match = req.addr === new_req.addr
block_match && can_merge(new_req)
}
def should_reject(new_req: MissReq): Bool = {
val block_match = req.addr === new_req.addr
// do not reject any req when we are in s_invalid
block_match && !can_merge(new_req) && state =/= s_invalid
}
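// Worked example: suppose this entry holds a LOAD miss on block A whose
// Acquire has not fired yet (acquire_not_sent). A new LOAD or STORE to A
// merges (secondary_ready); an AMO to A is rejected (secondary_reject);
// a req to any other block simply does not match this entry.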
io.primary_ready := state === s_invalid
io.secondary_ready := should_merge(io.req)
io.secondary_reject := should_reject(io.req)
// should not allocate, merge or reject at the same time
// one at a time
OneHot.checkOneHot(Seq(io.primary_ready, io.secondary_ready, io.secondary_reject))
// --------------------------------------------
// assign default values to output signals
io.refill.valid := false.B
io.refill.bits := DontCare
io.mem_acquire.valid := false.B
io.mem_acquire.bits := DontCare
io.mem_grant.ready := false.B
io.mem_finish.valid := false.B
io.mem_finish.bits := DontCare
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
when (state =/= s_invalid) {
XSDebug("entry: %d state: %d\n", io.id, state)
req.dump()
}
// --------------------------------------------
// State Machine
// --------------------------------------------
// receive requests
// primary request: allocate for a new request
when (io.req_valid && io.primary_ready) {
assert (state === s_invalid)
// re init some fields
req := io.req
grantack.valid := false.B
// only miss req from load needs a refill to LoadQueue
should_refill_data := io.req.source === LOAD_SOURCE.U
state := s_refill_req
}
// secondary request: merge with existing request
when (io.req_valid && io.secondary_ready) {
// The merged req should never have higher permissions,
// which would mean the cache silently upgraded the permission of our block
// without merging with this miss queue request!
// Either our req came in with stale meta, or the req that upgraded the permission did not merge with this req.
// Both cases are bugs of DCache.
//
// DCache can silently drop permissions (e.g., probed or evicted);
// it should never silently upgrade permissions.
//
// TODO: please check Tilelink Metadata.scala
// and make sure that lower permissions are encoded as smaller numbers
assert (io.req.coh.state <= req.coh.state)
// use the most up-to-date meta
req.coh := io.req.coh
// when merging with a store
// we should record its info in our req
// or we will not be able to replay the store
when (io.req.source === STORE_SOURCE.U) {
req := io.req
}
should_refill_data := io.req.source === LOAD_SOURCE.U
}
// --------------------------------------------
// refill
// for full overwrite, we can use AcquirePerm to save memory bandwidth
val full_overwrite = req.source === STORE_SOURCE.U && req.store_mask.andR
when (state === s_refill_req) {
val grow_param = req.coh.onAccess(req.cmd)._2
val acquireBlock = edge.AcquireBlock(
fromSource = io.id,
toAddress = req.addr,
lgSize = (log2Up(cfg.blockBytes)).U,
growPermissions = grow_param)._2
val acquirePerm = edge.AcquirePerm(
fromSource = io.id,
toAddress = req.addr,
lgSize = (log2Up(cfg.blockBytes)).U,
growPermissions = grow_param)._2
io.mem_acquire.valid := true.B
io.mem_acquire.bits := Mux(full_overwrite, acquirePerm, acquireBlock)
when (io.mem_acquire.fire()) {
state := s_refill_resp
}
}
val (_, _, refill_done, refill_count) = edge.count(io.mem_grant)
// raw data
val refill_data = Reg(Vec(blockRows, UInt(rowBits.W)))
val new_data = Wire(Vec(blockRows, UInt(rowBits.W)))
val new_mask = Wire(Vec(blockRows, UInt(rowBytes.W)))
for (i <- 0 until blockRows) {
new_data(i) := req.store_data(rowBits * (i + 1) - 1, rowBits * i)
// we only need to merge data for Store
new_mask(i) := Mux(req.source === STORE_SOURCE.U,
req.store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U(rowBytes.W))
}
def mergePutData(old_data: UInt, new_data: UInt, wmask: UInt): UInt = {
val full_wmask = FillInterleaved(8, wmask)
((~full_wmask & old_data) | (full_wmask & new_data))
}
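// Example: with rowBits = 64 and wmask = 0x01, FillInterleaved(8, wmask)
// expands to 0x00000000000000ff, so byte 0 comes from new_data and bytes
// 7..1 are kept from old_data.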
when (state === s_refill_resp) {
io.mem_grant.ready := true.B
when (io.mem_grant.fire()) {
when (edge.hasData(io.mem_grant.bits)) {
// GrantData
for (i <- 0 until beatRows) {
val idx = (refill_count << log2Floor(beatRows)) + i.U
refill_data(idx) := mergePutData(io.mem_grant.bits.data(rowBits * (i + 1) - 1, rowBits * i), new_data(idx), new_mask(idx))
}
} .otherwise {
// Grant
// since we do not sync between MissQueue and WritebackQueue
// for an AcquireBlock BtoT, we can not protect our block from being replaced by another miss and written back by WritebackQueue
// so for AcquireBlock BtoT, we need L2 to give us GrantData, not Grant,
// so that whether our block is replaced or not, we can always refill the block with valid data
// So, if we enter here,
// we must be an AcquirePerm, not an AcquireBlock!!!
assert (full_overwrite)
// when we only acquire perm, not data
// use Store's data
for (i <- 0 until blockRows) {
refill_data(i) := new_data(i)
}
}
}
when (refill_done) {
grantack.valid := edge.isRequest(io.mem_grant.bits)
grantack.bits := edge.GrantAck(io.mem_grant.bits)
grant_param := io.mem_grant.bits.param
state := s_main_pipe_req
}
}
// put should_refill_data out of RegNext
// so that when load misses are merged at refill_done
// we can still refill data back
io.refill.valid := RegNext(state === s_refill_resp && refill_done) && should_refill_data
io.refill.bits.addr := req.addr
io.refill.bits.data := refill_data.asUInt
when (state === s_main_pipe_req) {
io.pipe_req.valid := true.B
val pipe_req = io.pipe_req.bits
pipe_req.miss := true.B
pipe_req.miss_id := io.id
pipe_req.miss_param := grant_param
pipe_req.probe := false.B
pipe_req.probe_param := DontCare
pipe_req.source := req.source
pipe_req.cmd := req.cmd
pipe_req.addr := req.addr
pipe_req.store_data := refill_data.asUInt
// full overwrite
pipe_req.store_mask := Fill(cfg.blockBytes, "b1".U)
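// refill_data already has the store's bytes merged in (see mergePutData
// above), so the main pipe can write back the whole block with a full mask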
pipe_req.word_idx := req.word_idx
pipe_req.amo_data := req.amo_data
pipe_req.amo_mask := req.amo_mask
pipe_req.id := req.id
when (io.pipe_req.fire()) {
state := s_main_pipe_resp
}
}
when (state === s_main_pipe_resp) {
when (io.pipe_resp.fire()) {
state := s_mem_finish
}
}
when (state === s_mem_finish) {
io.mem_finish.valid := grantack.valid
io.mem_finish.bits := grantack.bits
when (io.mem_finish.fire()) {
grantack.valid := false.B
state := s_invalid
}
}
}
class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(new MissReq))
val refill = ValidIO(new Refill)
val mem_acquire = Decoupled(new TLBundleA(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val mem_finish = Decoupled(new TLBundleE(edge.bundle))
val pipe_req = DecoupledIO(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
})
val pipe_req_arb = Module(new RRArbiter(new MainPipeReq, cfg.nMissEntries))
val refill_arb = Module(new RRArbiter(new Refill, cfg.nMissEntries))
// dispatch req to MSHR
val primary_ready = Wire(Vec(cfg.nMissEntries, Bool()))
val secondary_ready = Wire(Vec(cfg.nMissEntries, Bool()))
val secondary_reject = Wire(Vec(cfg.nMissEntries, Bool()))
// try merging with existing reqs
val merge = secondary_ready.asUInt.orR
val merge_idx = PriorityEncoder(secondary_ready)
// some entry reports that the request can not be merged
val reject = secondary_reject.asUInt.orR
// allocate a new entry for this req
val allocate = !reject && !merge && primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
// will this req be accepted
val accept = (merge || allocate) && !reject
// if it's accepted, which entry will it enter
val entry_idx = Mux(allocate, alloc_idx, merge_idx)
// for one block, there should be only one MSHR
// one block should not stay in multiple MSHRs
// if a req can not merge with existing reqs
// block it!
OneHot.checkOneHot(secondary_ready)
OneHot.checkOneHot(secondary_reject)
// should not merge and reject at the same time
OneHot.checkOneHot(Seq(merge, reject))
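// Dispatch summary: a req that can merge never allocates a new entry, and
// any reject blocks the req entirely, so a given block never occupies two
// MSHRs at once.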
io.req.ready := accept
io.mem_grant.ready := false.B
val entries = (0 until cfg.nMissEntries) map { i =>
val entry = Module(new MissEntry(edge))
entry.io.id := i.U(log2Up(cfg.nMissEntries).W)
// entry req
entry.io.req_valid := (i.U === entry_idx) && accept && io.req.valid
primary_ready(i) := entry.io.primary_ready
secondary_ready(i) := entry.io.secondary_ready
secondary_reject(i) := entry.io.secondary_reject
entry.io.req := io.req.bits
// entry refill
refill_arb.io.in(i).valid := entry.io.refill.valid
refill_arb.io.in(i).bits := entry.io.refill.bits
// pipe_req
pipe_req_arb.io.in(i) <> entry.io.pipe_req
// pipe_resp
entry.io.pipe_resp.valid := false.B
entry.io.pipe_resp.bits := DontCare
when (io.pipe_resp.bits.id === i.U) {
entry.io.pipe_resp <> io.pipe_resp
}
entry.io.mem_grant.valid := false.B
entry.io.mem_grant.bits := DontCare
when (io.mem_grant.bits.source === i.U) {
entry.io.mem_grant <> io.mem_grant
}
/*
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(
BoolStopWatch(
start = entry.io.req.fire(),
stop = entry.io.resp.fire(),
startHighPriority = true),
"perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10),
Perf
)
}
*/
entry
}
io.refill.valid := refill_arb.io.out.valid
io.refill.bits := refill_arb.io.out.bits
refill_arb.io.out.ready := true.B
// one refill at a time
OneHot.checkOneHot(refill_arb.io.in.map(r => r.valid))
TLArbiter.robin(edge, io.mem_acquire, entries.map(_.io.mem_acquire):_*)
TLArbiter.robin(edge, io.mem_finish, entries.map(_.io.mem_finish):_*)
io.pipe_req <> pipe_req_arb.io.out
// print all input/output requests for debug purpose
when (io.req.fire()) {
io.req.bits.dump()
// sanity check
val source = io.req.bits.source
val cmd = io.req.bits.cmd
when (source === LOAD_SOURCE.U) {
assert (cmd === M_XRD)
}
when (source === STORE_SOURCE.U) {
assert (cmd === M_XWR)
}
when (source === AMO_SOURCE.U) {
assert (
cmd === M_XA_SWAP ||
cmd === M_XLR ||
cmd === M_XSC ||
cmd === M_XA_ADD ||
cmd === M_XA_XOR ||
cmd === M_XA_OR ||
cmd === M_XA_AND ||
cmd === M_XA_MIN ||
cmd === M_XA_MAX ||
cmd === M_XA_MINU ||
cmd === M_XA_MAXU)
}
// req addr must be aligned to block boundary
assert (io.req.bits.addr(blockOffBits - 1, 0) === 0.U)
}
when (io.refill.fire()) {
io.refill.bits.dump()
}
when (io.mem_acquire.fire()) {
XSDebug("mem_acquire ")
io.mem_acquire.bits.dump
}
when (io.mem_grant.fire()) {
XSDebug("mem_grant ")
io.mem_grant.bits.dump
}
when (io.mem_finish.fire()) {
XSDebug("mem_finish ")
io.mem_finish.bits.dump
}
if (!env.FPGAPlatform) {
ExcitingUtils.addSource(io.req.fire(), "perfCntDCacheMiss", Perf)
}
}
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import freechips.rocketchip.tilelink.{TLEdgeOut, TLBundleB, TLMessages, TLPermissions}
import utils.{HasTLDump, XSDebug}
class ProbeReq extends DCacheBundle
{
val source = UInt()
val opcode = UInt()
val addr = UInt(PAddrBits.W)
val param = UInt(TLPermissions.bdWidth.W)
def dump() = {
XSDebug("ProbeReq source: %d opcode: %d addr: %x param: %d\n",
source, opcode, addr, param)
}
}
class ProbeEntry extends DCacheModule {
val io = IO(new Bundle {
val req = Flipped(Decoupled(new ProbeReq))
val pipe_req = DecoupledIO(new MainPipeReq)
val lrsc_locked_block = Input(Valid(UInt()))
// the block we are probing
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_pipe_req :: Nil = Enum(2)
val state = RegInit(s_invalid)
val req = Reg(new ProbeReq)
// assign default values to signals
io.req.ready := false.B
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
when (state =/= s_invalid) {
XSDebug("ProbeEntry: state: %d block_addr: %x\n", state, io.block_addr.bits)
}
when (state === s_invalid) {
io.req.ready := true.B
when (io.req.fire()) {
req := io.req.bits
state := s_pipe_req
}
}
when (state === s_pipe_req) {
val lrsc_blocked = io.lrsc_locked_block.valid && io.lrsc_locked_block.bits === req.addr
io.pipe_req.valid := !lrsc_blocked
val pipe_req = io.pipe_req.bits
pipe_req := DontCare
pipe_req.miss := false.B
pipe_req.probe := true.B
pipe_req.probe_param := req.param
pipe_req.addr := req.addr
when (io.pipe_req.fire()) {
state := s_invalid
}
}
}
class ProbeQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val mem_probe = Flipped(Decoupled(new TLBundleB(edge.bundle)))
val pipe_req = DecoupledIO(new MainPipeReq)
val lrsc_locked_block = Input(Valid(UInt()))
})
val pipe_req_arb = Module(new RRArbiter(new MainPipeReq, cfg.nProbeEntries))
// allocate a free entry for incoming request
val primary_ready = Wire(Vec(cfg.nProbeEntries, Bool()))
val allocate = primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
// translate to inner req
val req = Wire(new ProbeReq)
req.source := io.mem_probe.bits.source
req.opcode := io.mem_probe.bits.opcode
req.addr := io.mem_probe.bits.address
req.param := io.mem_probe.bits.param
io.mem_probe.ready := allocate
val entries = (0 until cfg.nProbeEntries) map { i =>
val entry = Module(new ProbeEntry)
// entry req
entry.io.req.valid := (i.U === alloc_idx) && allocate && io.mem_probe.valid
primary_ready(i) := entry.io.req.ready
entry.io.req.bits := req
// pipe_req
pipe_req_arb.io.in(i) <> entry.io.pipe_req
entry.io.lrsc_locked_block := io.lrsc_locked_block
entry
}
io.pipe_req <> pipe_req_arb.io.out
// print all input/output requests for debug purposes
when (io.mem_probe.valid) {
// before a probe finishes, L2 should not issue further probes for this block
val probe_conflict = VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === io.mem_probe.bits.address)).asUInt.orR
assert (!probe_conflict)
// for now, we can only deal with ProbeBlock
assert (io.mem_probe.bits.opcode === TLMessages.Probe)
}
// debug output
when (io.mem_probe.fire()) {
XSDebug("mem_probe: ")
io.mem_probe.bits.dump
}
when (io.pipe_req.fire()) {
io.pipe_req.bits.dump()
}
when (io.lrsc_locked_block.valid) {
XSDebug("lrsc_locked_block: %x\n", io.lrsc_locked_block.bits)
}
}
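// A hedged sketch (illustrative, not part of the sources) of the free-entry
// allocation scheme ProbeQueue uses and the queues below repeat: each entry
// reports whether it is free, PriorityEncoder picks the lowest free entry,
// and only that entry sees req.valid. The 8-bit payload is an assumption.
import chisel3._
import chisel3.util._

class AllocSketch(n: Int) extends Module {
  val io = IO(new Bundle {
    val busy   = Input(Vec(n, Bool()))         // per-entry busy flags
    val req    = Flipped(Decoupled(UInt(8.W))) // incoming request
    val chosen = Output(Valid(UInt(log2Ceil(n).W)))
  })
  val primary_ready = VecInit(io.busy.map(!_))
  val allocate  = primary_ready.asUInt.orR       // at least one entry is free
  val alloc_idx = PriorityEncoder(primary_ready) // lowest free entry wins
  io.req.ready    := allocate
  io.chosen.valid := io.req.fire()               // entry alloc_idx takes the request
  io.chosen.bits  := alloc_idx
}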
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
import bus.tilelink._
class StoreReplayEntry extends DCacheModule
{
val io = IO(new Bundle {
val id = Input(UInt())
val lsu = Flipped(new DCacheLineIO)
val pipe_req = Decoupled(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_pipe_req :: s_pipe_resp :: s_wait :: s_resp :: Nil = Enum(5)
val state = RegInit(s_invalid)
val req = Reg(new DCacheLineReq)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.pipe_req.valid := false.B
io.pipe_req.bits := DontCare
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("StoreReplayEntry: %d state: %d block_addr: %x\n", io.id, state, io.block_addr.bits)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
req := io.lsu.req.bits
state := s_pipe_req
}
}
// --------------------------------------------
// replay
when (state === s_pipe_req) {
io.pipe_req.valid := true.B
val pipe_req = io.pipe_req.bits
pipe_req := DontCare
pipe_req.miss := false.B
pipe_req.probe := false.B
pipe_req.source := STORE_SOURCE.U
pipe_req.cmd := req.cmd
pipe_req.addr := req.addr
pipe_req.store_data := req.data
pipe_req.store_mask := req.mask
pipe_req.id := io.id
when (io.pipe_req.fire()) {
state := s_pipe_resp
}
}
val ReplayDelayCycles = 16
val delay_counter = Counter(ReplayDelayCycles)
when (state === s_pipe_resp) {
// when not miss
//   everything is OK, simply send response back to sbuffer
// when miss and not replay
//   wait for missQueue to handle the miss and replay our request
// when miss and replay
//   req missed and failed to enter missQueue, manually replay it later
when (io.pipe_resp.valid) {
when (io.pipe_resp.bits.miss) {
when (io.pipe_resp.bits.replay) {
delay_counter.value := 0.U
state := s_wait
}
} .otherwise {
state := s_resp
}
}
}
when (state === s_wait) {
delay_counter.inc()
when (delay_counter.value === (ReplayDelayCycles - 1).U) {
state := s_pipe_req
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := DontCare
io.lsu.resp.bits.id := req.id
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
XSDebug(s"StoreReplayEntryTransaction req %d\n", io.id)
}
when (io.lsu.resp.fire()) {
XSDebug(s"StoreReplayEntryTransaction resp %d\n", io.id)
}
}
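// A small sketch (assumed interface, not the real module) of the fixed-delay
// replay backoff StoreReplayEntry implements above: a request that missed and
// could not enter the miss queue waits ReplayDelayCycles before re-issuing.
import chisel3._
import chisel3.util._

class ReplayBackoffSketch(delayCycles: Int = 16) extends Module {
  val io = IO(new Bundle {
    val start  = Input(Bool())  // pulse: request must be replayed later
    val replay = Output(Bool()) // pulse: delay elapsed, replay now
  })
  val waiting = RegInit(false.B)
  val counter = Counter(delayCycles)
  when (waiting) { counter.inc() }
  when (io.start) {
    waiting := true.B
    counter.value := 0.U        // restart the delay window
  }
  io.replay := waiting && counter.value === (delayCycles - 1).U
  when (io.replay) { waiting := false.B }
}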
class StoreReplayQueue extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheLineIO)
val pipe_req = Decoupled(new MainPipeReq)
val pipe_resp = Flipped(ValidIO(new MainPipeResp))
})
val pipe_req_arb = Module(new RRArbiter(new MainPipeReq, cfg.nStoreReplayEntries))
val resp_arb = Module(new RRArbiter(new DCacheLineResp, cfg.nStoreReplayEntries))
// allocate a free entry for incoming request
val primary_ready = Wire(Vec(cfg.nStoreReplayEntries, Bool()))
val allocate = primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
val req = io.lsu.req
val block_conflict = Wire(Bool())
req.ready := allocate && !block_conflict
val entries = (0 until cfg.nStoreReplayEntries) map { i =>
val entry = Module(new StoreReplayEntry)
entry.io.id := i.U
// entry req
entry.io.lsu.req.valid := (i.U === alloc_idx) && allocate && req.valid && !block_conflict
primary_ready(i) := entry.io.lsu.req.ready
entry.io.lsu.req.bits := req.bits
// lsu req and resp
resp_arb.io.in(i) <> entry.io.lsu.resp
// replay req and resp
pipe_req_arb.io.in(i) <> entry.io.pipe_req
entry.io.pipe_resp.valid := (i.U === io.pipe_resp.bits.id) && io.pipe_resp.valid
entry.io.pipe_resp.bits := io.pipe_resp.bits
entry
}
io.lsu.resp <> resp_arb.io.out
io.pipe_req <> pipe_req_arb.io.out
block_conflict := VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === io.lsu.req.bits.addr)).asUInt.orR
// sanity check
when (io.lsu.req.valid) {
assert(io.lsu.req.bits.cmd === M_XWR)
assert (!block_conflict)
}
// debug output
when (io.lsu.req.fire()) {
io.lsu.req.bits.dump()
}
when (io.lsu.resp.fire()) {
io.lsu.resp.bits.dump()
}
when (io.pipe_req.fire()) {
io.pipe_req.bits.dump()
}
when (io.pipe_resp.valid) {
io.pipe_resp.bits.dump()
}
}
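// A sketch (assumed names and widths) of the block_conflict check above: a
// new request is rejected while any entry already works on the same block
// address, so two entries never race on one cache line.
import chisel3._
import chisel3.util._

class BlockConflictSketch(n: Int, addrW: Int = 32) extends Module {
  val io = IO(new Bundle {
    val entry_valid = Input(Vec(n, Bool()))        // entry holds a request
    val entry_addr  = Input(Vec(n, UInt(addrW.W))) // block address it works on
    val req_addr    = Input(UInt(addrW.W))         // incoming block address
    val conflict    = Output(Bool())
  })
  io.conflict := VecInit((0 until n).map(i =>
    io.entry_valid(i) && io.entry_addr(i) === io.req_addr)).asUInt.orR
}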
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.{XSDebug, HasTLDump}
import freechips.rocketchip.tilelink.{TLBundleC, TLBundleD, TLEdgeOut, TLPermissions, TLArbiter}
class WritebackReq extends DCacheBundle {
val addr = UInt(PAddrBits.W)
val param = UInt(TLPermissions.cWidth.W)
val voluntary = Bool()
val hasData = Bool()
val data = UInt((cfg.blockBytes * 8).W)
def dump() = {
XSDebug("WritebackReq addr: %x param: %d voluntary: %b hasData: %b data: %x\n",
addr, param, voluntary, hasData, data)
}
}
class WritebackEntry(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val id = Input(UInt())
val req = Flipped(DecoupledIO(new WritebackReq))
val mem_release = DecoupledIO(new TLBundleC(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_release_req :: s_release_resp :: Nil = Enum(3)
val state = RegInit(s_invalid)
// internal regs
// remaining beats
val remain = RegInit(0.U(refillCycles.W))
val remain_set = WireInit(0.U(refillCycles.W))
val remain_clr = WireInit(0.U(refillCycles.W))
remain := (remain | remain_set) & ~remain_clr
val busy = remain.orR
val req = Reg(new WritebackReq)
// assign default values to output signals
io.req.ready := false.B
io.mem_release.valid := false.B
io.mem_release.bits := DontCare
io.mem_grant.ready := false.B
io.block_addr.valid := state =/= s_invalid
io.block_addr.bits := req.addr
when (state =/= s_invalid) {
XSDebug("WritebackEntry: %d state: %d block_addr: %x\n", io.id, state, io.block_addr.bits)
}
// --------------------------------------------------------------------------------
// s_invalid: receive requests
// new req entering
io.req.ready := state === s_invalid
when (io.req.fire()) {
assert (remain === 0.U)
remain_set := Mux(io.req.bits.hasData, ~0.U(refillCycles.W), 1.U(refillCycles.W))
req := io.req.bits
state := s_release_req
}
// --------------------------------------------------------------------------------
// while there are beats remaining to be sent, we keep sending
// which beat to send in this cycle?
val beat = PriorityEncoder(remain)
val beat_data = Wire(Vec(refillCycles, UInt(beatBits.W)))
for (i <- 0 until refillCycles) {
beat_data(i) := req.data((i + 1) * beatBits - 1, i * beatBits)
}
val probeResponse = edge.ProbeAck(
fromSource = io.id,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
reportPermissions = req.param
)
val probeResponseData = edge.ProbeAck(
fromSource = io.id,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
reportPermissions = req.param,
data = beat_data(beat)
)
val voluntaryRelease = edge.Release(
fromSource = io.id,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
shrinkPermissions = req.param
)._2
val voluntaryReleaseData = edge.Release(
fromSource = io.id,
toAddress = req.addr,
lgSize = log2Ceil(cfg.blockBytes).U,
shrinkPermissions = req.param,
data = beat_data(beat)
)._2
io.mem_release.valid := busy
io.mem_release.bits := Mux(req.voluntary,
Mux(req.hasData, voluntaryReleaseData, voluntaryRelease),
Mux(req.hasData, probeResponseData, probeResponse))
when (io.mem_release.fire()) { remain_clr := PriorityEncoderOH(remain) }
val (_, _, release_done, _) = edge.count(io.mem_release)
when (state === s_release_req && release_done) {
state := Mux(req.voluntary, s_release_resp, s_invalid)
}
// --------------------------------------------------------------------------------
// receive ReleaseAck for Releases
when (state === s_release_resp) {
io.mem_grant.ready := true.B
when (io.mem_grant.fire()) {
state := s_invalid
}
}
}
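// A self-contained sketch (illustrative only) of the beat bookkeeping in
// WritebackEntry: `remain` holds one bit per outstanding beat, a set/clr
// pair updates it, PriorityEncoder picks the next beat to send, and
// PriorityEncoderOH clears exactly that bit when the beat fires.
import chisel3._
import chisel3.util._

class BeatTrackerSketch(beats: Int) extends Module {
  val io = IO(new Bundle {
    val start = Input(Bool())  // load a full set of beats
    val fire  = Input(Bool())  // one beat accepted downstream
    val beat  = Output(UInt(log2Ceil(beats).W))
    val busy  = Output(Bool())
  })
  val remain = RegInit(0.U(beats.W))
  val remain_set = Mux(io.start, ~0.U(beats.W), 0.U)
  val remain_clr = Mux(io.fire, PriorityEncoderOH(remain), 0.U)
  remain := (remain | remain_set) & ~remain_clr
  io.beat := PriorityEncoder(remain)
  io.busy := remain.orR
}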
class WritebackQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
val io = IO(new Bundle {
val req = Flipped(DecoupledIO(new WritebackReq))
val mem_release = DecoupledIO(new TLBundleC(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
val miss_req = Flipped(Valid(UInt()))
val block_miss_req = Output(Bool())
})
// allocate a free entry for incoming request
val primary_ready = Wire(Vec(cfg.nReleaseEntries, Bool()))
val allocate = primary_ready.asUInt.orR
val alloc_idx = PriorityEncoder(primary_ready)
val req = io.req
val block_conflict = Wire(Bool())
req.ready := allocate && !block_conflict
// assign default values to output signals
io.mem_release.valid := false.B
io.mem_release.bits := DontCare
io.mem_grant.ready := false.B
val entries = (0 until cfg.nReleaseEntries) map { i =>
val entry = Module(new WritebackEntry(edge))
entry.io.id := i.U
// entry req
entry.io.req.valid := (i.U === alloc_idx) && allocate && req.valid && !block_conflict
primary_ready(i) := entry.io.req.ready
entry.io.req.bits := req.bits
entry.io.mem_grant.valid := (i.U === io.mem_grant.bits.source) && io.mem_grant.valid
entry.io.mem_grant.bits := io.mem_grant.bits
when (i.U === io.mem_grant.bits.source) {
io.mem_grant.ready := entry.io.mem_grant.ready
}
entry
}
block_conflict := VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === io.req.bits.addr)).asUInt.orR
val miss_req_conflict = VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === io.miss_req.bits)).asUInt.orR
io.block_miss_req := io.miss_req.valid && miss_req_conflict
TLArbiter.robin(edge, io.mem_release, entries.map(_.io.mem_release):_*)
// sanity check
// print all input/output requests for debug purposes
// print req
when (io.req.fire()) {
io.req.bits.dump()
}
when (io.mem_release.fire()) {
io.mem_release.bits.dump
}
when (io.mem_grant.fire()) {
io.mem_grant.bits.dump
}
when (io.miss_req.valid) {
XSDebug("miss_req: addr: %x\n", io.miss_req.bits)
}
when (io.block_miss_req) {
XSDebug("block_miss_req\n")
}
}
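// A sketch (assumed bundle layout) of the grant routing WritebackQueue does
// above: the D-channel source field selects which entry consumes the
// ReleaseAck, and only the selected entry's ready is reflected upstream.
import chisel3._
import chisel3.util._

class GrantRouteSketch(n: Int, w: Int = 8) extends Module {
  val io = IO(new Bundle {
    val grant = Flipped(Decoupled(new Bundle {
      val source = UInt(log2Ceil(n).W) // which entry this grant belongs to
      val data   = UInt(w.W)
    }))
    val entry = Vec(n, Decoupled(UInt(w.W)))
  })
  io.grant.ready := false.B
  for (i <- 0 until n) {
    io.entry(i).valid := io.grant.valid && io.grant.bits.source === i.U
    io.entry(i).bits  := io.grant.bits.data
    when (io.grant.bits.source === i.U) {
      io.grant.ready := io.entry(i).ready
    }
  }
}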
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.{XSDebug}
// this is a traditional cache pipeline:
// it handles atomics: amo/lr/sc
class AtomicsPipe extends DCacheModule
{
val io = IO(new DCacheBundle{
val lsu = Flipped(new DCacheWordIO)
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val data_write = DecoupledIO(new L1DataWriteReq)
val meta_read = DecoupledIO(new L1MetaReadReq)
val meta_resp = Input(Vec(nWays, new L1Metadata))
val inflight_req_idxes = Output(Vec(3, Valid(UInt())))
val inflight_req_block_addrs = Output(Vec(3, Valid(UInt())))
val block_probe_addr = Output(Valid(UInt()))
val wb_invalidate_lrsc = Input(Valid(UInt()))
// send miss request to miss queue
val miss_req = DecoupledIO(new MissReq)
})
// LSU requests
io.lsu.req.ready := io.meta_read.ready && io.data_read.ready
io.meta_read.valid := io.lsu.req.valid
io.data_read.valid := io.lsu.req.valid
val meta_read = io.meta_read.bits
val data_read = io.data_read.bits
// Tag read for new requests
meta_read.idx := get_idx(io.lsu.req.bits.addr)
meta_read.way_en := ~0.U(nWays.W)
meta_read.tag := DontCare
// Data read for new requests
data_read.addr := io.lsu.req.bits.addr
data_read.way_en := ~0.U(nWays.W)
// only need to read the specific row
data_read.rmask := UIntToOH(get_row(io.lsu.req.bits.addr))
// Pipeline
// ---------------------------------------
// stage 0
val s0_valid = io.lsu.req.fire()
val s0_req = io.lsu.req.bits
dump_pipeline_reqs("AtomicsPipe s0", s0_valid, s0_req)
// ---------------------------------------
// stage 1
val s1_req = RegNext(s0_req)
val s1_valid = RegNext(s0_valid, init = false.B)
val s1_addr = s1_req.addr
val s1_nack = false.B
dump_pipeline_reqs("AtomicsPipe s1", s1_valid, s1_req)
// tag check
val meta_resp = io.meta_resp
def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_addr))).asUInt
val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt
val s1_tag_match = s1_tag_match_way.orR
val s1_hit_meta = Mux1H(s1_tag_match_way, wayMap((w: Int) => meta_resp(w)))
val s1_hit_state = s1_hit_meta.coh
// replacement policy
val replacer = cacheParams.replacement
val s1_repl_way_en = UIntToOH(replacer.way)
val s1_repl_meta = Mux1H(s1_repl_way_en, wayMap((w: Int) => meta_resp(w)))
when (io.miss_req.fire()) {
replacer.miss
}
// ---------------------------------------
// stage 2
val s2_req = RegNext(s1_req)
val s2_valid = RegNext(s1_valid, init = false.B)
dump_pipeline_reqs("AtomicsPipe s2", s2_valid, s2_req)
val s2_tag_match_way = RegNext(s1_tag_match_way)
val s2_tag_match = s2_tag_match_way.orR
val s2_hit_meta = RegNext(s1_hit_meta)
val s2_hit_state = Mux1H(s2_tag_match_way, wayMap((w: Int) => RegNext(meta_resp(w).coh)))
val s2_has_permission = s2_hit_state.onAccess(s2_req.cmd)._1
val s2_new_hit_state = s2_hit_state.onAccess(s2_req.cmd)._3
val s2_repl_meta = RegNext(s1_repl_meta)
val s2_repl_way_en = RegNext(s1_repl_way_en)
val s2_old_meta = Mux(s2_tag_match, s2_hit_meta, s2_repl_meta)
val s2_way_en = Mux(s2_tag_match, s2_tag_match_way, s2_repl_way_en)
// we not only need permission,
// we also require that the coherence state does not change on a hit;
// thus we require new_hit_state === old_hit_state
//
// If the state changes on a hit,
// we should treat it as a miss and let the mshr deal with it,
// since we cannot write meta data on the main pipeline.
// It is possible to have permission while the state still changes on a hit,
// e.g. a write to an exclusive but clean block
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_state === s2_new_hit_state
val s2_nack = Wire(Bool())
// when req got nacked, upper levels should replay this request
// the same set is busy
val s2_nack_hit = RegNext(s1_nack)
// cannot allocate mshr for store miss
val s2_nack_no_mshr = io.miss_req.valid && !io.miss_req.ready
// Bank conflict on data arrays
// For now, we use DuplicatedDataArray, so no bank conflicts
val s2_nack_data = false.B
s2_nack := s2_nack_hit || s2_nack_no_mshr || s2_nack_data
// lr/sc
val debug_sc_fail_addr = RegInit(0.U)
val debug_sc_fail_cnt = RegInit(0.U(8.W))
val lrsc_count = RegInit(0.U(log2Ceil(lrscCycles).W))
val lrsc_valid = lrsc_count > lrscBackoff.U
val lrsc_addr = Reg(UInt())
val s2_lr = s2_req.cmd === M_XLR && !s2_nack
val s2_sc = s2_req.cmd === M_XSC && !s2_nack
val s2_lrsc_addr_match = lrsc_valid && lrsc_addr === get_block_addr(s2_req.addr)
val s2_sc_fail = s2_sc && !s2_lrsc_addr_match
val s2_sc_resp = Mux(s2_sc_fail, 1.U, 0.U)
// we have permission on this block
// but we can not finish in this pass
// we need to go to miss queue to update meta and set dirty first
val s2_set_dirty = s2_tag_match && s2_has_permission && s2_hit_state =/= s2_new_hit_state
// this sc should succeed, but we need to set dirty first
// do not treat it as an sc failure or reset the lr/sc counter
val sc_set_dirty = s2_set_dirty && !s2_nack && s2_sc && s2_lrsc_addr_match
when (s2_valid && !sc_set_dirty) {
when (s2_hit && !s2_nack && s2_lr) {
lrsc_count := (lrscCycles - 1).U
lrsc_addr := get_block_addr(s2_req.addr)
} .otherwise {
lrsc_count := 0.U
}
} .elsewhen (lrsc_count > 0.U) {
lrsc_count := lrsc_count - 1.U
}
io.block_probe_addr.valid := lrsc_valid
io.block_probe_addr.bits := lrsc_addr
// when we release this block,
// we invalidate this reservation set
when (io.wb_invalidate_lrsc.valid) {
when (io.wb_invalidate_lrsc.bits === lrsc_addr) {
lrsc_count := 0.U
}
// when we release this block, there should be no matching lrsc inflight
assert (!(s2_valid && (s2_lr || s2_sc) && io.wb_invalidate_lrsc.bits === get_block_addr(s2_req.addr)))
}
when (s2_valid) {
when (s2_req.addr === debug_sc_fail_addr) {
when (s2_sc_fail) {
debug_sc_fail_cnt := debug_sc_fail_cnt + 1.U
} .elsewhen (s2_sc) {
debug_sc_fail_cnt := 0.U
}
} .otherwise {
when (s2_sc_fail) {
debug_sc_fail_addr := s2_req.addr
debug_sc_fail_cnt := 1.U
}
}
}
assert(debug_sc_fail_cnt < 100.U, "L1DCache failed too many SCs in a row")
// only dump these signals when they are actually valid
dump_pipeline_valids("AtomicsPipe s2", "s2_hit", s2_valid && s2_hit)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack", s2_valid && s2_nack)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack_hit", s2_valid && s2_nack_hit)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack_no_mshr", s2_valid && s2_nack_no_mshr)
dump_pipeline_valids("AtomicsPipe s2", "s2_nack_data", s2_valid && s2_nack_data)
when (s2_valid) {
XSDebug("lrsc_count: %d lrsc_valid: %b lrsc_addr: %x\n",
lrsc_count, lrsc_valid, lrsc_addr)
XSDebug("s2_lr: %b s2_sc: %b s2_lrsc_addr_match: %b s2_sc_fail: %b s2_sc_resp: %x\n",
s2_lr, s2_sc, s2_lrsc_addr_match, s2_sc_fail, s2_sc_resp)
XSDebug("debug_sc_fail_addr: %x debug_sc_fail_cnt: %d\n",
debug_sc_fail_addr, debug_sc_fail_cnt)
}
// load data gen
val s2_data = Wire(Vec(nWays, UInt(encRowBits.W)))
val data_resp = io.data_resp
for (w <- 0 until nWays) {
s2_data(w) := data_resp(w)(get_row(s2_req.addr))
}
val s2_data_muxed = Mux1H(s2_tag_match_way, s2_data)
// the index of word in a row, in case rowBits != wordBits
val s2_word_idx = if (rowWords == 1) 0.U else s2_req.addr(log2Up(rowWords*wordBytes)-1, log2Up(wordBytes))
val s2_data_words = Wire(Vec(rowWords, UInt(encWordBits.W)))
for (w <- 0 until rowWords) {
s2_data_words(w) := s2_data_muxed(encWordBits * (w + 1) - 1, encWordBits * w)
}
val s2_data_word = s2_data_words(s2_word_idx)
val s2_decoded = cacheParams.dataCode.decode(s2_data_word)
val s2_data_word_decoded = s2_decoded.corrected
assert(!(s2_valid && s2_hit && !s2_nack && s2_decoded.uncorrectable))
// send load miss to miss queue
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_req.addr)
io.miss_req.bits.tag_match := s2_tag_match
io.miss_req.bits.way_en := s2_way_en
io.miss_req.bits.old_meta := s2_old_meta
io.miss_req.bits.client_id := s2_req.meta.id
val resp = Wire(ValidIO(new DCacheWordResp))
resp.valid := s2_valid
resp.bits.data := Mux(s2_sc, s2_sc_resp, s2_data_word)
resp.bits.meta := s2_req.meta
// reuse this field to pass lr/sc valid to commit
// nemu uses this to see whether the lr/sc counter is still valid
resp.bits.meta.id := lrsc_valid
resp.bits.miss := !s2_hit || s2_nack
resp.bits.replay := resp.bits.miss && (!io.miss_req.fire() || s2_nack)
io.lsu.resp.valid := resp.valid
io.lsu.resp.bits := resp.bits
assert(!(resp.valid && !io.lsu.resp.ready))
when (resp.valid) {
XSDebug(s"AtomicsPipe resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
resp.bits.data, resp.bits.meta.id, resp.bits.meta.replay, resp.bits.miss, resp.bits.replay)
}
// ---------------------------------------
// s3: do data write
// Store/amo hits
val amoalu = Module(new AMOALU(wordBits))
amoalu.io.mask := s2_req.mask
amoalu.io.cmd := s2_req.cmd
amoalu.io.lhs := s2_data_word_decoded
amoalu.io.rhs := s2_req.data
val s3_req = RegNext(s2_req)
val s3_valid = RegNext(s2_valid && s2_hit && isWrite(s2_req.cmd) && !s2_nack && !s2_sc_fail)
val s3_tag_match_way = RegNext(s2_tag_match_way)
val wdata_encoded = cacheParams.dataCode.encode(amoalu.io.out)
val s3_wdata = Reg(UInt())
s3_wdata := wdata_encoded
// write dcache if hit
// only need to write the specific row
val wmask = WireInit(VecInit((0 until blockRows) map (i => 0.U(rowWords.W))))
val wdata = WireInit(VecInit((0 until blockRows) map (i => Cat(
(0 until rowWords) map { w => s3_wdata }))))
wmask(get_row(s3_req.addr)) := ~0.U(rowWords.W)
val data_write = io.data_write.bits
io.data_write.valid := s3_valid
data_write.rmask := DontCare
data_write.way_en := s3_tag_match_way
data_write.addr := s3_req.addr
data_write.wmask := wmask
data_write.data := wdata
assert(!(io.data_write.valid && !io.data_write.ready))
dump_pipeline_reqs("AtomicsPipe s3", s3_valid, s3_req)
// -------
// wire out signals for synchronization
io.inflight_req_idxes(0).valid := io.lsu.req.valid
io.inflight_req_idxes(1).valid := s1_valid
io.inflight_req_idxes(2).valid := s2_valid
io.inflight_req_idxes(0).bits := get_idx(s0_req.addr)
io.inflight_req_idxes(1).bits := get_idx(s1_req.addr)
io.inflight_req_idxes(2).bits := get_idx(s2_req.addr)
io.inflight_req_block_addrs(0).valid := io.lsu.req.valid
io.inflight_req_block_addrs(1).valid := s1_valid
io.inflight_req_block_addrs(2).valid := s2_valid
io.inflight_req_block_addrs(0).bits := get_block_addr(s0_req.addr)
io.inflight_req_block_addrs(1).bits := get_block_addr(s1_req.addr)
io.inflight_req_block_addrs(2).bits := get_block_addr(s2_req.addr)
// -------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool,
req: DCacheWordReq ) = {
when (valid) {
XSDebug(s"$pipeline_stage_name cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
req.cmd, req.addr, req.data, req.mask, req.meta.id, req.meta.replay)
}
}
def dump_pipeline_valids(pipeline_stage_name: String, signal_name: String, valid: Bool) = {
when (valid) {
XSDebug(s"$pipeline_stage_name $signal_name\n")
}
}
}
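// A hedged sketch of the LR/SC reservation tracking AtomicsPipe implements:
// LR arms a down-counter and records the block address; SC succeeds only
// while the counter is above the backoff threshold and the address matches.
// The constants and the 32-bit address are illustrative assumptions.
import chisel3._
import chisel3.util._

class LrScSketch(lrscCycles: Int = 64, lrscBackoff: Int = 8, addrW: Int = 32) extends Module {
  val io = IO(new Bundle {
    val lr      = Input(Bool())        // load-reserved fires this cycle
    val sc      = Input(Bool())        // store-conditional fires this cycle
    val addr    = Input(UInt(addrW.W)) // block address of the access
    val sc_fail = Output(Bool())
  })
  val count   = RegInit(0.U(log2Ceil(lrscCycles).W))
  val resAddr = Reg(UInt(addrW.W))
  val valid   = count > lrscBackoff.U  // reservation still live
  when (io.lr) {
    count   := (lrscCycles - 1).U
    resAddr := io.addr
  } .elsewhen (count > 0.U) {
    count := count - 1.U
  }
  io.sc_fail := io.sc && !(valid && resAddr === io.addr)
}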
package xiangshan.cache
import chisel3._
import chisel3.util._
import utils.XSDebug
// wraps around AtomicsPipe
// when a request misses, send a miss req to missQueue and replay the request
class AtomicsMissQueue extends DCacheModule
{
val io = IO(new DCacheBundle {
val lsu = Flipped(new DCacheWordIO)
val replay = new DCacheWordIO
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
})
val s_invalid :: s_replay_req :: s_replay_resp :: s_resp :: s_miss_resp :: s_miss_finish :: Nil = Enum(6)
val state = RegInit(s_invalid)
val id = 0.U
val req = Reg(new DCacheWordReq)
val resp = Reg(new DCacheWordResp)
val req_block_addr = get_block_addr(req.addr)
val reg_miss_resp = Reg(new MissResp)
// assign default values to output signals
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.replay.req.valid := false.B
io.replay.req.bits := DontCare
io.replay.resp.ready := false.B
io.miss_finish.valid := false.B
io.miss_finish.bits := DontCare
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
when (io.lsu.req.fire()) {
assert(!io.lsu.req.bits.meta.replay)
req := io.lsu.req.bits
state := s_replay_req
}
}
// --------------------------------------------
// replay
when (state === s_replay_req) {
io.replay.req.valid := true.B
io.replay.req.bits := req
when (io.replay.req.fire()) {
state := s_replay_resp
}
}
when (state === s_replay_resp) {
io.replay.resp.ready := true.B
when (io.replay.resp.fire()) {
// req missed
when (io.replay.resp.bits.miss) {
// replayed reqs should not miss
assert(!req.meta.replay)
// the req missed and did not enter mshr
// so replay it until it hits or enters mshr
when (io.replay.resp.bits.replay) {
state := s_replay_req
} .otherwise {
// the req missed and enters mshr
// wait for miss response
state := s_miss_resp
}
} .otherwise {
// req hits, everything OK
resp := io.replay.resp.bits
when (!req.meta.replay) {
state := s_resp
} .otherwise {
// if it's a replayed request
// we need to tell mshr, we are done
state := s_miss_finish
}
}
}
}
when (state === s_miss_resp) {
when (io.miss_resp.valid) {
reg_miss_resp := io.miss_resp.bits
// mark req as replayed req
req.meta.replay := true.B
state := s_replay_req
}
}
when (state === s_miss_finish) {
io.miss_finish.valid := true.B
io.miss_finish.bits.client_id := id
io.miss_finish.bits.entry_id := reg_miss_resp.entry_id
when (io.miss_finish.fire()) {
state := s_resp
}
}
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := resp
when (io.lsu.resp.fire()) {
state := s_invalid
}
}
// debug output
when (io.lsu.req.fire()) {
XSDebug(s"io.lsu.req cmd: %x addr: %x data: %x mask: %x id: %d replayed_req: %b\n",
io.lsu.req.bits.cmd, io.lsu.req.bits.addr, io.lsu.req.bits.data, io.lsu.req.bits.mask, io.lsu.req.bits.meta.id, io.lsu.req.bits.meta.replay)
}
val replay = io.replay.req
when (replay.fire()) {
XSDebug(s"replay cmd: %x addr: %x data: %x mask: %x id: %d replayed_req: %b\n",
replay.bits.cmd, replay.bits.addr, replay.bits.data, replay.bits.mask, replay.bits.meta.id, replay.bits.meta.replay)
}
when (io.lsu.resp.fire()) {
XSDebug(s"io.lsu.resp: data: %x id: %d replayed_req: %b miss: %b need_replay: %b\n",
io.lsu.resp.bits.data, io.lsu.resp.bits.meta.id, io.lsu.resp.bits.meta.replay, io.lsu.resp.bits.miss, io.lsu.resp.bits.replay)
}
val miss_resp = io.miss_resp
XSDebug(miss_resp.valid, "miss_resp client_id: %d entry_id: %d\n",
miss_resp.bits.client_id, miss_resp.bits.entry_id)
val miss_finish = io.miss_finish
XSDebug(miss_finish.fire(), "miss_finish client_id: %d entry_id: %d\n",
miss_finish.bits.client_id, miss_finish.bits.entry_id)
when (io.lsu.req.fire()) {
XSDebug(s"AtomicsMissEntryTransaction req 0\n")
}
when (io.lsu.resp.fire()) {
XSDebug(s"AtomicsMissEntryTransaction resp 0\n")
}
}
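// A speculative sketch (abstract event inputs, not the real interfaces) of
// the client <-> MissQueue protocol AtomicsMissQueue follows above: replay
// until the request hits or enters the mshr, wait for MissResp, replay the
// request marked as replayed, then send MissFinish to free the mshr entry.
import chisel3._
import chisel3.util._

class MissClientFsmSketch extends Module {
  val io = IO(new Bundle {
    val miss         = Input(Bool())  // replay pipeline reported a miss
    val entered_mshr = Input(Bool())  // the miss entered the miss queue
    val miss_resp    = Input(Bool())  // MissResp arrived
    val finish_fire  = Input(Bool())  // MissFinish accepted
    val state_out    = Output(UInt(2.W))
  })
  val s_replay :: s_miss_resp :: s_miss_finish :: s_done :: Nil = Enum(4)
  val state    = RegInit(s_replay)
  val replayed = RegInit(false.B)
  switch (state) {
    is (s_replay) {
      when (io.miss && io.entered_mshr) { state := s_miss_resp }
        .elsewhen (!io.miss && replayed) { state := s_miss_finish }
        .elsewhen (!io.miss)             { state := s_done }
      // miss without entering the mshr: stay here and replay again
    }
    is (s_miss_resp)   { when (io.miss_resp)   { replayed := true.B; state := s_replay } }
    is (s_miss_finish) { when (io.finish_fire) { state := s_done } }
  }
  io.state_out := state
}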
package xiangshan.cache
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes}
import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters, TLMasterParameters, TLMasterPortParameters, TLArbiter}
// Meta data for dcache requests
// anything that should go with reqs and resps goes here
class DCacheMeta extends DCacheBundle {
val id = UInt(reqIdWidth.W)
val vaddr = UInt(VAddrBits.W) // maybe we should use VAddrBits?
val paddr = UInt(PAddrBits.W)
val uop = new MicroOp //FIXME: opt data width
val mmio = Bool()
val tlb_miss = Bool()
// dcache request id
// master uses id to correlate resps to reqs
// different master can allocate and free ids independently
// as long as they do not share resp
val mask = UInt((DataBits/8).W)
val replay = Bool() // whether it's a replayed request?
}
// memory request in word granularity (load, mmio, lr/sc, atomics)
class DCacheWordReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt(DataBits.W)
val mask = UInt((DataBits/8).W)
val meta = new DCacheMeta
}
// memory request in cache-line granularity (store)
class DCacheLineReq extends DCacheBundle
{
val cmd = UInt(M_SZ.W)
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
val mask = UInt(cfg.blockBytes.W)
val meta = new DCacheMeta
}
class DCacheWordResp extends DCacheBundle
{
val data = UInt(DataBits.W)
val meta = new DCacheMeta
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
}
class DCacheLineResp extends DCacheBundle
{
val data = UInt((cfg.blockBytes * 8).W)
val meta = new DCacheMeta
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val replay = Bool()
}
class Refill extends DCacheBundle
{
val addr = UInt(PAddrBits.W)
val data = UInt((cfg.blockBytes * 8).W)
}
class DCacheWordIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheWordReq)
val resp = Flipped(DecoupledIO(new DCacheWordResp))
}
// used by load unit
class DCacheLoadIO extends DCacheWordIO
{
// kill previous cycle's req
val s1_kill = Output(Bool())
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr = Output(UInt(PAddrBits.W))
}
class DCacheLineIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheLineReq )
val resp = Flipped(DecoupledIO(new DCacheLineResp))
}
class DCacheToLsuIO extends DCacheBundle {
val load = Vec(LoadPipelineWidth, Flipped(new DCacheLoadIO)) // for speculative load
val lsq = ValidIO(new Refill) // refill to load queue, wake up load misses
val store = Flipped(new DCacheLineIO) // for sbuffer
val atomics = Flipped(new DCacheWordIO) // atomics reqs
}
class DCacheIO extends DCacheBundle {
val lsu = new DCacheToLsuIO
val prefetch = DecoupledIO(new MissReq)
}
class DCache()(implicit p: Parameters) extends LazyModule with HasDCacheParameters {
val clientParameters = TLMasterPortParameters.v1(
Seq(TLMasterParameters.v1(
name = "dcache",
sourceId = IdRange(0, cfg.nMissEntries+1),
supportsProbe = TransferSizes(cfg.blockBytes)
))
)
val clientNode = TLClientNode(Seq(clientParameters))
lazy val module = new DCacheImp(this)
}
class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParameters with HasXSLog {
val io = IO(new DCacheIO)
val (bus, edge) = outer.clientNode.out.head
require(bus.d.bits.data.getWidth == l1BusDataWidth, "DCache: tilelink width does not match")
//----------------------------------------
// core data structures
val dataArray = Module(new DuplicatedDataArray)
val metaArray = Module(new DuplicatedMetaArray)
/*
dataArray.dump()
metaArray.dump()
*/
//----------------------------------------
// core modules
val ldu = Seq.fill(LoadPipelineWidth) { Module(new LoadPipe) }
val stu = Module(new StorePipe)
val atomics = Module(new AtomicsPipe)
val storeMissQueue = Module(new StoreMissQueue)
val atomicsMissQueue = Module(new AtomicsMissQueue)
val missQueue = Module(new MissQueue(edge))
val wb = Module(new WritebackUnit(edge))
val prober = Module(new ProbeUnit(edge))
//----------------------------------------
// meta array
val MetaWritePortCount = 2
val MissQueueMetaWritePort = 0
val ProberMetaWritePort = 1
val metaWriteArb = Module(new Arbiter(new L1MetaWriteReq, MetaWritePortCount))
metaWriteArb.io.in(MissQueueMetaWritePort) <> missQueue.io.meta_write
metaWriteArb.io.in(ProberMetaWritePort) <> prober.io.meta_write
metaArray.io.write <> metaWriteArb.io.out
// To simplify port arbitration,
// the Prober, StorePipe, LoadPipe 0 and AtomicsPipe share meta read port 0
// if contention gets severe, consider load balancing across the two ports
val MetaReadPortCount = 4
val ProberMetaReadPort = 0
val StorePipeMetaReadPort = 1
val LoadPipeMetaReadPort = 2
val AtomicsPipeMetaReadPort = 3
val metaReadArb = Module(new Arbiter(new L1MetaReadReq, MetaReadPortCount))
metaReadArb.io.in(ProberMetaReadPort) <> prober.io.meta_read
metaReadArb.io.in(StorePipeMetaReadPort) <> stu.io.meta_read
metaReadArb.io.in(LoadPipeMetaReadPort) <> ldu(0).io.meta_read
metaReadArb.io.in(AtomicsPipeMetaReadPort) <> atomics.io.meta_read
metaArray.io.read(0) <> metaReadArb.io.out
prober.io.meta_resp <> metaArray.io.resp(0)
stu.io.meta_resp <> metaArray.io.resp(0)
ldu(0).io.meta_resp <> metaArray.io.resp(0)
atomics.io.meta_resp <> metaArray.io.resp(0)
for (w <- 1 until LoadPipelineWidth) {
metaArray.io.read(w) <> ldu(w).io.meta_read
ldu(w).io.meta_resp <> metaArray.io.resp(w)
}
//----------------------------------------
// data array
val DataWritePortCount = 3
val StorePipeDataWritePort = 0
val AtomicsPipeDataWritePort = 1
val MissQueueDataWritePort = 2
val dataWriteArb = Module(new Arbiter(new L1DataWriteReq, DataWritePortCount))
dataWriteArb.io.in(StorePipeDataWritePort) <> stu.io.data_write
dataWriteArb.io.in(MissQueueDataWritePort) <> missQueue.io.data_write
dataWriteArb.io.in(AtomicsPipeDataWritePort) <> atomics.io.data_write
dataArray.io.write <> dataWriteArb.io.out
// To simplify port arbitration,
// the WritebackUnit, StorePipe, LoadPipe 0 and AtomicsPipe share data read port 0
val DataReadPortCount = 4
val WritebackDataReadPort = 0
val StorePipeDataReadPort = 1
val LoadPipeDataReadPort = 2
val AtomicsPipeDataReadPort = 3
val dataReadArb = Module(new Arbiter(new L1DataReadReq, DataReadPortCount))
dataReadArb.io.in(WritebackDataReadPort) <> wb.io.data_req
dataReadArb.io.in(StorePipeDataReadPort) <> stu.io.data_read
dataReadArb.io.in(LoadPipeDataReadPort) <> ldu(0).io.data_read
dataReadArb.io.in(AtomicsPipeDataReadPort) <> atomics.io.data_read
dataArray.io.read(0) <> dataReadArb.io.out
dataArray.io.resp(0) <> wb.io.data_resp
dataArray.io.resp(0) <> stu.io.data_resp
dataArray.io.resp(0) <> atomics.io.data_resp
dataArray.io.resp(0) <> ldu(0).io.data_resp
for (w <- 1 until LoadPipelineWidth) {
dataArray.io.read(w) <> ldu(w).io.data_read
dataArray.io.resp(w) <> ldu(w).io.data_resp
}
//----------------------------------------
// load pipe and load miss queue
// the s1 kill signal
// only lsu uses this, replay never kills
for (w <- 0 until LoadPipelineWidth) {
val load_w_nack = nack_load(io.lsu.load(w).req.bits.addr)
ldu(w).io.lsu.req <> io.lsu.load(w).req
ldu(w).io.lsu.s1_paddr <> io.lsu.load(w).s1_paddr
ldu(w).io.nack := load_w_nack
XSDebug(load_w_nack, s"LoadUnit $w nacked\n")
ldu(w).io.lsu.resp <> io.lsu.load(w).resp
ldu(w).io.lsu.s1_kill <> io.lsu.load(w).s1_kill
assert(!(io.lsu.load(w).req.fire() && io.lsu.load(w).req.bits.meta.replay), "LSU should not replay requests")
}
for (w <- 0 until LoadPipelineWidth) {
assert(!(io.lsu.load(w).req.fire() && io.lsu.load(w).req.bits.meta.mmio), "MMIO requests should not go to cache")
assert(!(io.lsu.load(w).req.fire() && io.lsu.load(w).req.bits.meta.tlb_miss), "TLB missed requests should not go to cache")
}
//----------------------------------------
// store pipe and store miss queue
storeMissQueue.io.lsu <> io.lsu.store
/*
assert(!(storeMissQueue.io.replay.req.fire() && !storeMissQueue.io.replay.req.bits.meta.replay),
"StoreMissQueue should replay requests")
*/
assert(!(io.lsu.store.req.fire() && io.lsu.store.req.bits.meta.replay),
"Sbuffer should not should replay requests")
assert(!(io.lsu.store.req.fire() && io.lsu.store.req.bits.meta.mmio),
"MMIO requests should not go to cache")
assert(!(io.lsu.store.req.fire() && io.lsu.store.req.bits.meta.tlb_miss),
"TLB missed requests should not go to cache")
val store_block = block_store(storeMissQueue.io.replay.req.bits.addr)
block_decoupled(storeMissQueue.io.replay.req, stu.io.lsu.req, store_block && !storeMissQueue.io.replay.req.bits.meta.replay)
storeMissQueue.io.replay.resp <> stu.io.lsu.resp
XSDebug(store_block, "StorePipe blocked\n")
//----------------------------------------
// atomics pipe
atomics.io.wb_invalidate_lrsc := wb.io.inflight_addr
atomicsMissQueue.io.lsu <> io.lsu.atomics
atomicsMissQueue.io.replay <> atomics.io.lsu
val atomics_block = block_atomics(atomicsMissQueue.io.replay.req.bits.addr)
block_decoupled(atomicsMissQueue.io.replay.req, atomics.io.lsu.req, atomics_block && !atomicsMissQueue.io.replay.req.bits.meta.replay)
XSDebug(atomics_block, "AtomicsPipe blocked\n")
// when atomics are in flight, there should be no load or store in flight
// so atomics and store should not show up at the same time
val atomics_inflight = VecInit(atomics.io.inflight_req_block_addrs map (entry => entry.valid)).reduce(_||_)
val store_inflight = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid)).reduce(_||_)
assert(!(atomics_inflight && store_inflight))
// some other stuff
val atomicsReq = io.lsu.atomics.req
assert(!(atomicsReq.fire() && atomicsReq.bits.meta.replay),
"Atomics does not support request replay")
assert(!(atomicsReq.fire() && atomicsReq.bits.meta.mmio),
"MMIO requests should not go to cache")
assert(!(atomicsReq.fire() && atomicsReq.bits.meta.tlb_miss),
"TLB missed requests should not go to cache")
//----------------------------------------
// miss queue
require(LoadPipelineWidth == 2, "We hard code the number of load misses")
val loadMissQueueClientId_0 = 0.U(clientIdWidth.W)
val loadMissQueueClientId_1 = 1.U(clientIdWidth.W)
val storeMissQueueClientId = 2.U(clientIdWidth.W)
val atomicsMissQueueClientId = 3.U(clientIdWidth.W)
// Request
val missReqArb = Module(new Arbiter(new MissReq, nClientMissQueues))
val missReq = missQueue.io.req
val loadMissReq_0 = ldu(0).io.miss_req
val loadMissReq_1 = ldu(1).io.miss_req
val storeMissReq = stu.io.miss_req
val atomicsMissReq = atomics.io.miss_req
missReqArb.io.in(0) <> loadMissReq_0
missReqArb.io.in(0).bits.client_id := Cat(loadMissQueueClientId_0,
loadMissReq_0.bits.client_id(entryIdMSB, entryIdLSB))
missReqArb.io.in(1) <> loadMissReq_1
missReqArb.io.in(1).bits.client_id := Cat(loadMissQueueClientId_1,
loadMissReq_1.bits.client_id(entryIdMSB, entryIdLSB))
missReqArb.io.in(2).valid := storeMissReq.valid
storeMissReq.ready := missReqArb.io.in(2).ready
missReqArb.io.in(2).bits := storeMissReq.bits
missReqArb.io.in(2).bits.client_id := Cat(storeMissQueueClientId,
storeMissReq.bits.client_id(entryIdMSB, entryIdLSB))
missReqArb.io.in(3).valid := atomicsMissReq.valid
atomicsMissReq.ready := missReqArb.io.in(3).ready
missReqArb.io.in(3).bits := atomicsMissReq.bits
missReqArb.io.in(3).bits.client_id := Cat(atomicsMissQueueClientId,
atomicsMissReq.bits.client_id(entryIdMSB, entryIdLSB))
val miss_block = block_miss(missReqArb.io.out.bits.addr)
block_decoupled(missReqArb.io.out, missReq, miss_block)
XSDebug(miss_block, "MissQueue blocked\n")
// Response
// store and atomics wait for miss queue responses
val missResp = missQueue.io.resp
val storeMissResp = storeMissQueue.io.miss_resp
val atomicsMissResp = atomicsMissQueue.io.miss_resp
val clientId = missResp.bits.client_id(clientIdMSB, clientIdLSB)
val isStoreMissResp = clientId === storeMissQueueClientId
storeMissResp.valid := missResp.valid && isStoreMissResp
storeMissResp.bits := missResp.bits
storeMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
val isAtomicsMissResp = clientId === atomicsMissQueueClientId
atomicsMissResp.valid := missResp.valid && isAtomicsMissResp
atomicsMissResp.bits := missResp.bits
atomicsMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
// Finish
val missFinish = missQueue.io.finish
val storeMissFinish = storeMissQueue.io.miss_finish
val atomicsMissFinish = atomicsMissQueue.io.miss_finish
val missFinishArb = Module(new Arbiter(new MissFinish, 2))
missFinishArb.io.in(0).valid := storeMissFinish.valid
storeMissFinish.ready := missFinishArb.io.in(0).ready
missFinishArb.io.in(0).bits.entry_id := storeMissFinish.bits.entry_id
missFinishArb.io.in(0).bits.client_id := Cat(storeMissQueueClientId,
storeMissFinish.bits.client_id(entryIdMSB, entryIdLSB))
missFinishArb.io.in(1).valid := atomicsMissFinish.valid
atomicsMissFinish.ready := missFinishArb.io.in(1).ready
missFinishArb.io.in(1).bits.entry_id := atomicsMissFinish.bits.entry_id
missFinishArb.io.in(1).bits.client_id := Cat(atomicsMissQueueClientId,
atomicsMissFinish.bits.client_id(entryIdMSB, entryIdLSB))
missFinish <> missFinishArb.io.out
// refill to load queue
io.lsu.lsq <> missQueue.io.refill
// tilelink stuff
bus.a <> missQueue.io.mem_acquire
bus.e <> missQueue.io.mem_finish
when (bus.d.bits.source === cfg.nMissEntries.U) {
// This should be ReleaseAck
bus.d.ready := true.B
missQueue.io.mem_grant.valid := false.B
missQueue.io.mem_grant.bits := DontCare
} .otherwise {
// This should be GrantData
missQueue.io.mem_grant <> bus.d
}
// sync with prober
missQueue.io.probe_wb_req.valid := prober.io.wb_req.fire()
missQueue.io.probe_wb_req.bits := prober.io.wb_req.bits
missQueue.io.probe_active := prober.io.inflight_req_idx
//----------------------------------------
// prober
prober.io.req.valid := bus.b.valid && !block_probe(get_block_addr(bus.b.bits.address))
bus.b.ready := prober.io.req.ready && !block_probe(get_block_addr(bus.b.bits.address))
prober.io.req.bits := bus.b.bits
//----------------------------------------
// wb
// 0 goes to prober, 1 goes to missQueue evictions
val wbArb = Module(new Arbiter(new WritebackReq(edge.bundle.sourceBits), 2))
wbArb.io.in(0) <> prober.io.wb_req
wbArb.io.in(1) <> missQueue.io.wb_req
wb.io.req <> wbArb.io.out
missQueue.io.wb_resp := wb.io.resp
prober.io.wb_resp := wb.io.resp
wb.io.mem_grant := bus.d.fire() && bus.d.bits.source === cfg.nMissEntries.U
TLArbiter.lowestFromSeq(edge, bus.c, Seq(prober.io.rep, wb.io.release))
// dcache should only deal with DRAM addresses
when (bus.a.fire()) {
assert(bus.a.bits.address >= 0x80000000L.U)
}
when (bus.b.fire()) {
assert(bus.b.bits.address >= 0x80000000L.U)
}
when (bus.c.fire()) {
assert(bus.c.bits.address >= 0x80000000L.U)
}
io.prefetch.valid := missQueue.io.req.fire()
io.prefetch.bits := missQueue.io.req.bits
// synchronization stuff
def nack_load(addr: UInt) = {
val store_addr_matches = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid && entry.bits === get_block_addr(addr)))
val store_addr_match = store_addr_matches.reduce(_||_)
val atomics_addr_matches = VecInit(atomics.io.inflight_req_block_addrs map (entry => entry.valid && entry.bits === get_block_addr(addr)))
val atomics_addr_match = atomics_addr_matches.reduce(_||_)
val prober_idx_match = prober.io.inflight_req_block_addr.valid && get_idx(prober.io.inflight_req_block_addr.bits) === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
store_addr_match || atomics_addr_match || prober_idx_match || miss_idx_match
}
def block_store(addr: UInt) = {
val prober_idx_match = prober.io.inflight_req_block_addr.valid && get_idx(prober.io.inflight_req_block_addr.bits) === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
prober_idx_match || miss_idx_match
}
def block_atomics(addr: UInt) = {
val prober_idx_match = prober.io.inflight_req_block_addr.valid && get_idx(prober.io.inflight_req_block_addr.bits) === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
prober_idx_match || miss_idx_match
}
def block_miss(addr: UInt) = {
val prober_idx_match = prober.io.inflight_req_idx.valid && prober.io.inflight_req_idx.bits === get_idx(addr)
val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
prober_idx_match || miss_idx_match
}
def block_probe(addr: UInt) = {
val store_idx_matches = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid && get_idx(entry.bits) === get_idx(addr)))
val store_idx_match = store_idx_matches.reduce(_||_)
val atomics_idx_matches = VecInit(atomics.io.inflight_req_block_addrs map (entry => entry.valid && get_idx(entry.bits) === get_idx(addr)))
val atomics_idx_match = atomics_idx_matches.reduce(_||_)
val lrsc_addr_match = atomics.io.block_probe_addr.valid && atomics.io.block_probe_addr.bits === get_block_addr(addr)
val miss_idx_matches = VecInit(missQueue.io.block_probe_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
val miss_idx_match = miss_idx_matches.reduce(_||_)
// the missed req
val miss_req_idx_match = missReq.fire() && get_idx(missReq.bits.addr) === get_idx(addr)
store_idx_match || atomics_idx_match || lrsc_addr_match || miss_idx_match || miss_req_idx_match
}
def block_decoupled[T <: Data](source: DecoupledIO[T], sink: DecoupledIO[T], block_signal: Bool) = {
sink.valid := source.valid && !block_signal
source.ready := sink.ready && !block_signal
sink.bits := source.bits
}
}
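// A usage sketch for the block_decoupled helper defined above: it forwards
// a Decoupled channel but deasserts both valid and ready while the block
// condition holds, so neither side ever observes a fire. Widths are assumed.
import chisel3._
import chisel3.util._

class BlockDecoupledSketch extends Module {
  val io = IO(new Bundle {
    val in    = Flipped(Decoupled(UInt(8.W)))
    val out   = Decoupled(UInt(8.W))
    val block = Input(Bool())
  })
  def block_decoupled[T <: Data](source: DecoupledIO[T], sink: DecoupledIO[T], block_signal: Bool) = {
    sink.valid   := source.valid && !block_signal
    source.ready := sink.ready && !block_signal
    sink.bits    := source.bits
  }
  block_decoupled(io.in, io.out, io.block)
}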
......@@ -140,10 +140,10 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle {
val insideLevel = level.getOrElse(0.U)
val a = tag(vpnnLen*3-1, vpnnLen*2) === vpn(vpnnLen*3-1, vpnnLen*2)
val b = tag(vpnnLen*2-1, vpnnLen*1) === vpn(vpnnLen*2-1, vpnnLen*1)
XSDebug(Mux(insideLevel.asBool, a&b, a), p"Hit superpage: hit:${Mux(insideLevel.asBool, a&b, a)} tag:${Hexadecimal(tag)} level:${insideLevel} data:${data} a:${a} b:${b} vpn:${Hexadecimal(vpn)}\n")("TlbEntrySuperpage")
XSDebug(Mux(insideLevel.asBool, a&b, a), p"Hit superpage: hit:${Mux(insideLevel.asBool, a&b, a)} tag:${Hexadecimal(tag)} level:${insideLevel} data:${data} a:${a} b:${b} vpn:${Hexadecimal(vpn)}\n")
Mux(insideLevel.asBool, a&b, a)
} else {
XSDebug(tag === vpn, p"Hit normalpage: hit:${tag === vpn} tag:${Hexadecimal(tag)} data:${data} vpn:${Hexadecimal(vpn)}\n")("TlbEntryNormalpage")
XSDebug(tag === vpn, p"Hit normalpage: hit:${tag === vpn} tag:${Hexadecimal(tag)} data:${data} vpn:${Hexadecimal(vpn)}\n")
tag === vpn
}
}
......
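// A sketch of the superpage hit check in the TlbEntry diff above: a 3-level
// Sv39-style VPN is compared piecewise; an entry deeper in the page table
// must match the top two VPN slices, a level-0 (1 GiB) entry only the top
// slice. vpnnLen = 9 is the usual Sv39 value and an assumption here.
import chisel3._
import chisel3.util._

class SuperpageHitSketch(vpnnLen: Int = 9) extends Module {
  val io = IO(new Bundle {
    val tag         = Input(UInt((vpnnLen * 3).W))
    val vpn         = Input(UInt((vpnnLen * 3).W))
    val insideLevel = Input(Bool()) // true: 2 MiB entry, false: 1 GiB entry
    val hit         = Output(Bool())
  })
  val a = io.tag(vpnnLen * 3 - 1, vpnnLen * 2) === io.vpn(vpnnLen * 3 - 1, vpnnLen * 2)
  val b = io.tag(vpnnLen * 2 - 1, vpnnLen)     === io.vpn(vpnnLen * 2 - 1, vpnnLen)
  io.hit := Mux(io.insideLevel, a && b, a)
}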
package xiangshan.cache
import chisel3._
import chisel3.util._
import freechips.rocketchip.tilelink._
import utils.{HasTLDump, XSDebug}
class ProbeUnit(edge: TLEdgeOut) extends DCacheModule with HasTLDump {
val io = IO(new Bundle {
val req = Flipped(Decoupled(new TLBundleB(edge.bundle)))
val rep = Decoupled(new TLBundleC(edge.bundle))
val meta_read = Decoupled(new L1MetaReadReq)
val meta_resp = Input(Vec(nWays, new L1Metadata))
val meta_write = Decoupled(new L1MetaWriteReq)
val wb_req = Decoupled(new WritebackReq(edge.bundle.sourceBits))
val wb_resp = Input(Bool())
val inflight_req_idx = Output(Valid(UInt()))
val inflight_req_block_addr = Output(Valid(UInt()))
})
val s_invalid :: s_meta_read_req :: s_meta_read_resp :: s_decide_next_state :: s_release :: s_wb_req :: s_wb_resp :: s_meta_write_req :: Nil = Enum(8)
val state = RegInit(s_invalid)
val req = Reg(new TLBundleB(edge.bundle))
val req_idx = get_idx(req.address)
val req_tag = get_tag(req.address)
val req_block_addr = get_block_addr(req.address)
val req_way_en = Reg(UInt())
val tag_matches = req_way_en.orR
val old_coh = Reg(new ClientMetadata)
val miss_coh = ClientMetadata.onReset
val reply_coh = Mux(tag_matches, old_coh, miss_coh)
val (is_dirty, report_param, new_coh) = reply_coh.onProbe(req.param)
// assign default values to signals
io.req.ready := false.B
io.rep.valid := false.B
io.rep.bits := DontCare
io.meta_read.valid := false.B
io.meta_read.bits := DontCare
io.meta_write.valid := false.B
io.meta_write.bits := DontCare
io.wb_req.valid := false.B
io.wb_req.bits := DontCare
io.inflight_req_idx.valid := state =/= s_invalid
io.inflight_req_idx.bits := req_idx
io.inflight_req_block_addr.valid := state =/= s_invalid
io.inflight_req_block_addr.bits := req_block_addr
when (state =/= s_invalid) {
XSDebug("state: %d\n", state)
}
when (state === s_invalid) {
io.req.ready := true.B
when (io.req.fire()) {
req := io.req.bits
state := s_meta_read_req
}
}
when (state === s_meta_read_req) {
io.meta_read.valid := true.B
val meta_read = io.meta_read.bits
meta_read.idx := req_idx
meta_read.way_en := ~0.U(nWays.W)
meta_read.tag := DontCare
when (io.meta_read.fire()) {
state := s_meta_read_resp
}
}
when (state === s_meta_read_resp) {
// tag check
def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
val tag_eq_way = wayMap((w: Int) => io.meta_resp(w).tag === (req_tag)).asUInt
val tag_match_way = wayMap((w: Int) => tag_eq_way(w) && io.meta_resp(w).coh.isValid()).asUInt
val hit_state = Mux1H(tag_match_way, wayMap((w: Int) => io.meta_resp(w).coh))
old_coh := hit_state
req_way_en := tag_match_way
state := s_decide_next_state
}
when (state === s_decide_next_state) {
// decide next state
state := Mux(tag_matches && is_dirty, s_wb_req, s_release)
}
// no need to write back, just release
when (state === s_release) {
io.rep.valid := true.B
io.rep.bits := edge.ProbeAck(req, report_param)
when (io.rep.fire()) {
state := Mux(tag_matches, s_meta_write_req, s_invalid)
}
}
when (state === s_wb_req) {
io.wb_req.valid := true.B
io.wb_req.bits.tag := req_tag
io.wb_req.bits.idx := req_idx
io.wb_req.bits.param := report_param
io.wb_req.bits.way_en := req_way_en
io.wb_req.bits.source := req.source
io.wb_req.bits.voluntary := false.B
when (io.wb_req.fire()) {
state := s_wb_resp
}
}
when (state === s_wb_resp) {
when (io.wb_resp) {
state := s_meta_write_req
}
}
when (state === s_meta_write_req) {
io.meta_write.valid := true.B
io.meta_write.bits.idx := req_idx
io.meta_write.bits.data.coh := new_coh
io.meta_write.bits.data.tag := req_tag
io.meta_write.bits.way_en := req_way_en
when (io.meta_write.fire()) {
state := s_invalid
}
}
// print wb_req
XSDebug(io.wb_req.fire(), "wb_req idx %x tag: %x source: %d param: %x way_en: %x voluntary: %b\n",
io.wb_req.bits.idx, io.wb_req.bits.tag,
io.wb_req.bits.source, io.wb_req.bits.param,
io.wb_req.bits.way_en, io.wb_req.bits.voluntary)
// print tilelink messages
when (io.req.fire()) {
XSDebug("mem_probe ")
io.req.bits.dump
}
when (io.rep.fire()) {
XSDebug("mem_release ")
io.rep.bits.dump
}
}
This diff is collapsed.
......@@ -106,10 +106,10 @@ class MMIOEntry(edge: TLEdgeOut) extends DCacheModule
// --------------------------------------------
when (state === s_send_resp) {
io.resp.valid := true.B
io.resp.bits.data := resp_data
// meta data should go with the response
io.resp.bits.meta := req.meta
io.resp.bits.miss := false.B
io.resp.bits.id := req.id
io.resp.bits.miss := false.B
io.resp.bits.replay := false.B
when (io.resp.fire()) {
......
This diff is collapsed.
......@@ -17,8 +17,8 @@ class ExceptionAddrIO extends XSBundle {
}
class FwdEntry extends XSBundle {
val mask = Vec(8, Bool())
val data = Vec(8, UInt(8.W))
val valid = Bool()
val data = UInt(8.W)
}
// inflight miss block reqs
......
......@@ -535,14 +535,7 @@ class LoadQueue extends XSModule
io.uncache.req.bits.data := dataModule.io.uncache.rdata.data
io.uncache.req.bits.mask := dataModule.io.uncache.rdata.mask
io.uncache.req.bits.meta.id := DontCare
io.uncache.req.bits.meta.vaddr := DontCare
io.uncache.req.bits.meta.paddr := dataModule.io.uncache.rdata.paddr
io.uncache.req.bits.meta.uop := uop(deqPtr)
io.uncache.req.bits.meta.mmio := true.B
io.uncache.req.bits.meta.tlb_miss := false.B
io.uncache.req.bits.meta.mask := dataModule.io.uncache.rdata.mask
io.uncache.req.bits.meta.replay := false.B
io.uncache.req.bits.id := DontCare
io.uncache.resp.ready := true.B
......
......@@ -57,7 +57,7 @@ class FakeSbuffer extends XSModule {
dcache_req.bits.addr := block_addr(req.addr)
dcache_req.bits.data := wdataVec.asUInt
dcache_req.bits.mask := wmaskVec.asUInt
dcache_req.bits.meta := DontCare
dcache_req.bits.id := DontCare
when (dcache_req.fire()) {
state := s_resp
......
......@@ -368,7 +368,7 @@ class Sbuffer extends XSModule with HasSBufferConst {
io.dcache.req.bits.data := dcacheData
io.dcache.req.bits.mask := dcacheMask
io.dcache.req.bits.cmd := MemoryOpConstants.M_XWR
io.dcache.req.bits.meta := DontCare // NOT USED
io.dcache.req.bits.id := DontCare // NOT USED
io.dcache.resp.ready := false.B
wb_arb.io.out.ready := false.B
......
This diff is collapsed.
......@@ -30,7 +30,7 @@ class SbufferWapper extends XSModule {
// fake dcache
sbuffer.io.dcache.req.ready := true.B
sbuffer.io.dcache.resp.valid := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.valid))))
sbuffer.io.dcache.resp.bits.meta.id := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.bits.meta.id))))
sbuffer.io.dcache.resp.bits.id := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.bits.id))))
}
class SbufferTest extends AnyFlatSpec
......