Commit b35479a0 authored by William Wang

Merge remote-tracking branch 'origin/master' into constantin

......@@ -161,7 +161,8 @@ class MinimalConfig(n: Int = 1) extends Config(
l3nWays = 8,
spSize = 2,
),
L2CacheParamsOpt = None // remove L2 Cache
L2CacheParamsOpt = None, // remove L2 Cache
prefetcher = None // if L2 pf_recv_node does not exist, disable SMS prefetcher
)
)
case SoCParamsKey =>
......@@ -244,7 +245,7 @@ class WithNKBL2
)),
reqField = Seq(PreferCacheField()),
echoField = Seq(DirtyField()),
prefetch = Some(huancun.prefetch.BOPParameters()),
prefetch = Some(huancun.prefetch.PrefetchReceiverParams()),
enablePerf = true,
sramDepthDiv = 2,
tagECC = Some("secded"),
......
package utils
import chisel3._
import chisel3.util._
class OverrideableQueue[T <: Data](gen: T, n: Int) extends Module {
val io = IO(new Bundle() {
val in = Flipped(ValidIO(gen))
val out = Decoupled(gen)
})
val entries = Seq.fill(n){ Reg(gen) }
val valids = Seq.fill(n){ RegInit(false.B) }
val rd_ptr = RegInit(0.U(log2Up(n).W))
val wr_ptr = RegInit(0.U(log2Up(n).W))
when(io.in.valid){
wr_ptr := wr_ptr + 1.U
}
when(io.out.fire){
rd_ptr := rd_ptr + 1.U
}
val w_mask = (0 until n).map(i => i.U === wr_ptr)
val r_mask = (0 until n).map(i => i.U === rd_ptr)
for((v, r) <- valids.zip(r_mask)){
when(r && io.out.fire){
v := false.B
}
}
for(((v, e), w) <- valids.zip(entries).zip(w_mask)){
when(io.in.valid && w){
v := true.B
e := io.in.bits
}
}
io.out.valid := Mux1H(r_mask, valids)
io.out.bits := Mux1H(r_mask, entries)
}
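A minimal usage sketch for the new queue (illustration only, not part of this commit; the wrapper module name is hypothetical). The input side is a ValidIO with no backpressure: when all n entries are occupied, a new in.valid overwrites the slot at wr_ptr, which is then the oldest entry, hence "overrideable".
class OverrideableQueueExample extends Module {
  val io = IO(new Bundle() {
    val in = Flipped(ValidIO(UInt(8.W)))
    val out = Decoupled(UInt(8.W))
  })
  // 4-entry byte queue; the producer is never stalled
  val q = Module(new OverrideableQueue(UInt(8.W), 4))
  q.io.in <> io.in
  io.out <> q.io.out
}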
......@@ -477,6 +477,14 @@ class CustomCSRCtrlIO(implicit p: Parameters) extends XSBundle {
// Prefetcher
val l1I_pf_enable = Output(Bool())
val l2_pf_enable = Output(Bool())
val l1D_pf_enable = Output(Bool())
val l1D_pf_train_on_hit = Output(Bool())
val l1D_pf_enable_agt = Output(Bool())
val l1D_pf_enable_pht = Output(Bool())
val l1D_pf_active_threshold = Output(UInt(4.W))
val l1D_pf_active_stride = Output(UInt(6.W))
val l1D_pf_enable_stride = Output(Bool())
val l2_pf_store_only = Output(Bool())
// ICache
val icache_parity_enable = Output(Bool())
// Labeled XiangShan
......
......@@ -30,6 +30,8 @@ import freechips.rocketchip.diplomacy.AddressSet
import system.SoCParamsKey
import huancun._
import huancun.debug._
import xiangshan.mem.prefetch.{PrefetcherParams, SMSParams}
import scala.math.min
case object XSTileKey extends Field[Seq[XSCoreParameters]]
......@@ -152,6 +154,7 @@ case class XSCoreParameters
LduCnt = 2,
StuCnt = 2
),
prefetcher: Option[PrefetcherParams] = Some(SMSParams()),
LoadPipelineWidth: Int = 2,
StorePipelineWidth: Int = 2,
VecMemSrcInWidth: Int = 2,
......@@ -237,7 +240,7 @@ case class XSCoreParameters
level = 2,
ways = 8,
sets = 1024, // default 512KB L2
prefetch = Some(huancun.prefetch.BOPParameters())
prefetch = Some(huancun.prefetch.PrefetchReceiverParams())
)),
L2NBanks: Int = 1,
usePTWRepeater: Boolean = false,
......
......@@ -31,6 +31,7 @@ import xiangshan.backend._
import xiangshan.backend.exu.{ExuConfig, Wb2Ctrl, WbArbiterWrapper}
import xiangshan.cache.mmu._
import xiangshan.frontend._
import xiangshan.mem.L1PrefetchFuzzer
import scala.collection.mutable.ListBuffer
......@@ -327,6 +328,13 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
exuBlocks(0).io.scheExtra.fpRfReadIn.get <> exuBlocks(1).io.scheExtra.fpRfReadOut.get
exuBlocks(0).io.scheExtra.fpStateReadIn.get <> exuBlocks(1).io.scheExtra.fpStateReadOut.get
for((c, e) <- ctrlBlock.io.ld_pc_read.zip(exuBlocks(0).io.issue.get)){
// read load pc at load s0
c.ptr := e.bits.uop.cf.ftqPtr
c.offset := e.bits.uop.cf.ftqOffset
}
// return load pc at load s2
memBlock.io.loadPc <> VecInit(ctrlBlock.io.ld_pc_read.map(_.data))
memBlock.io.issue <> exuBlocks(0).io.issue.get
// By default, instructions do not have exceptions when they enter the function units.
memBlock.io.issue.map(_.bits.uop.clearExceptions())
......
......@@ -126,6 +126,10 @@ class XSTile()(implicit p: Parameters) extends LazyModule
l2cache match {
case Some(l2) =>
misc.l2_binder.get :*= l2.node :*= TLBuffer() :*= TLBuffer() :*= misc.l1_xbar
l2.pf_recv_node.map(recv => {
println("Connecting L1 prefetcher to L2!")
recv := core.memBlock.pf_sender_opt.get
})
case None =>
}
......
......@@ -28,7 +28,7 @@ import xiangshan.backend.dispatch.{Dispatch, Dispatch2Rs, DispatchQueue}
import xiangshan.backend.fu.PFEvent
import xiangshan.backend.rename.{Rename, RenameTableWrapper}
import xiangshan.backend.rob.{Rob, RobCSRIO, RobLsqIO}
import xiangshan.frontend.{FtqRead, Ftq_RF_Components}
import xiangshan.frontend.{FtqPtr, FtqRead, Ftq_RF_Components}
import xiangshan.mem.mdp.{LFST, SSIT, WaitTable}
import xiangshan.ExceptionNO._
import xiangshan.backend.exu.ExuConfig
......@@ -214,6 +214,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val lqCancelCnt = Input(UInt(log2Up(LoadQueueSize + 1).W))
val sqCancelCnt = Input(UInt(log2Up(StoreQueueSize + 1).W))
val sqDeq = Input(UInt(log2Ceil(EnsbufferWidth + 1).W))
val ld_pc_read = Vec(exuParameters.LduCnt, Flipped(new FtqRead(UInt(VAddrBits.W))))
// from int block
val exuRedirect = Vec(exuParameters.AluCnt + exuParameters.JmpCnt, Flipped(ValidIO(new ExuOutput)))
val stIn = Vec(exuParameters.StuCnt, Flipped(ValidIO(new ExuInput)))
......@@ -267,8 +268,11 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val fpDq = Module(new DispatchQueue(dpParams.FpDqSize, RenameWidth, dpParams.FpDqDeqWidth))
val lsDq = Module(new DispatchQueue(dpParams.LsDqSize, RenameWidth, dpParams.LsDqDeqWidth))
val redirectGen = Module(new RedirectGenerator)
// jumpPc (2) + redirects (1) + loadPredUpdate (1) + jalr_target (1) + robFlush (1)
val pcMem = Module(new SyncDataModuleTemplate(new Ftq_RF_Components, FtqSize, 6, 1, "BackendPC"))
// jumpPc (2) + redirects (1) + loadPredUpdate (1) + jalr_target (1) + [ld pc (LduCnt)] + robFlush (1)
val pcMem = Module(new SyncDataModuleTemplate(
new Ftq_RF_Components, FtqSize,
6 + exuParameters.LduCnt, 1, "CtrlPcMem")
)
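// pcMem read port map, per the comment above and the reads below:
// 0-1: jumpPc, 2: redirects, 3: loadPredUpdate, 4: jalr_target,
// 5 .. 5+LduCnt-1: load pc, last: robFlush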
val rob = outer.rob.module
pcMem.io.wen.head := RegNext(io.frontend.fromFtq.pc_mem_wen)
......@@ -538,6 +542,11 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
val jalrTargetRead = pcMem.io.rdata(4).startAddr
val read_from_newest_entry = RegNext(jalrTargetReadPtr) === RegNext(io.frontend.fromFtq.newest_entry_ptr)
io.jalr_target := Mux(read_from_newest_entry, RegNext(io.frontend.fromFtq.newest_entry_target), jalrTargetRead)
for(i <- 0 until exuParameters.LduCnt){
// load s0 -> get rdata (s1) -> reg next (s2) -> output (s2)
pcMem.io.raddr(i + 5) := io.ld_pc_read(i).ptr.value
io.ld_pc_read(i).data := pcMem.io.rdata(i + 5).getPc(RegNext(io.ld_pc_read(i).offset))
}
rob.io.hartId := io.hartId
io.cpu_halt := DelayN(rob.io.cpu_halt, 5)
......
......@@ -19,8 +19,9 @@ package xiangshan.backend
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp}
import freechips.rocketchip.diplomacy.{BundleBridgeSource, LazyModule, LazyModuleImp}
import freechips.rocketchip.tile.HasFPUParameters
import huancun.PrefetchRecv
import utils._
import utility._
import xiangshan._
......@@ -30,6 +31,7 @@ import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache._
import xiangshan.cache.mmu.{VectorTlbPtwIO, TLBNonBlock, TlbReplace}
import xiangshan.mem._
import xiangshan.mem.prefetch.{BasePrefecher, SMSParams, SMSPrefetcher}
class Std(implicit p: Parameters) extends FunctionUnit {
io.in.ready := true.B
......@@ -43,6 +45,9 @@ class MemBlock()(implicit p: Parameters) extends LazyModule
val dcache = LazyModule(new DCacheWrapper())
val uncache = LazyModule(new Uncache())
val pf_sender_opt = coreParams.prefetcher.map(_ =>
BundleBridgeSource(() => new PrefetchRecv)
)
lazy val module = new MemBlockImp(this)
......@@ -69,6 +74,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val loadFastMatch = Vec(exuParameters.LduCnt, Input(UInt(exuParameters.LduCnt.W)))
val loadFastImm = Vec(exuParameters.LduCnt, Input(UInt(12.W)))
val rsfeedback = Vec(exuParameters.StuCnt, new MemRSFeedbackIO)
val loadPc = Vec(exuParameters.LduCnt, Input(UInt(VAddrBits.W))) // for hw prefetch
val stIssuePtr = Output(new SqPtr())
val int2vlsu = Flipped(new Int2VLSUIO)
val vec2vlsu = Flipped(new Vec2VLSUIO)
......@@ -79,10 +85,12 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val vlsu2vec = new VLSU2VecIO
val vlsu2int = new VLSU2IntIO
val vlsu2ctrl = new VLSU2CtrlIO
// prefetch to l1 req
val prefetch_req = Flipped(DecoupledIO(new L1PrefetchReq))
// misc
val stIn = Vec(exuParameters.StuCnt, ValidIO(new ExuInput))
val memoryViolation = ValidIO(new Redirect)
val ptw = new VectorTlbPtwIO(exuParameters.LduCnt + exuParameters.StuCnt)
val ptw = new VectorTlbPtwIO(exuParameters.LduCnt + exuParameters.StuCnt + 1) // load + store + hw prefetch
val sfence = Input(new SfenceBundle)
val tlbCsr = Input(new TlbCsrBundle)
val fenceToSbuffer = Flipped(new FenceToSbuffer)
......@@ -119,6 +127,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val csrCtrl = DelayN(io.csrCtrl, 2)
dcache.io.csr.distribute_csr <> csrCtrl.distribute_csr
dcache.io.l2_pf_store_only := RegNext(io.csrCtrl.l2_pf_store_only, false.B)
io.csrUpdate := RegNext(dcache.io.csr.update)
io.error <> RegNext(RegNext(dcache.io.error))
when(!csrCtrl.cache_error_enable){
......@@ -131,6 +140,31 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val stdExeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StdExeUnit))
val stData = stdExeUnits.map(_.io.out)
val exeUnits = loadUnits ++ storeUnits
val l1_pf_req = Wire(Decoupled(new L1PrefetchReq()))
val prefetcherOpt: Option[BasePrefecher] = coreParams.prefetcher.map {
case _: SMSParams =>
val sms = Module(new SMSPrefetcher())
sms.io_agt_en := RegNextN(io.csrCtrl.l1D_pf_enable_agt, 2, Some(false.B))
sms.io_pht_en := RegNextN(io.csrCtrl.l1D_pf_enable_pht, 2, Some(false.B))
sms.io_act_threshold := RegNextN(io.csrCtrl.l1D_pf_active_threshold, 2, Some(12.U))
sms.io_act_stride := RegNextN(io.csrCtrl.l1D_pf_active_stride, 2, Some(30.U))
sms.io_stride_en := RegNextN(io.csrCtrl.l1D_pf_enable_stride, 2, Some(true.B))
sms
}
prefetcherOpt.foreach(pf => {
val pf_to_l2 = ValidIODelay(pf.io.pf_addr, 2)
outer.pf_sender_opt.get.out.head._1.addr_valid := pf_to_l2.valid
outer.pf_sender_opt.get.out.head._1.addr := pf_to_l2.bits
outer.pf_sender_opt.get.out.head._1.l2_pf_en := RegNextN(io.csrCtrl.l2_pf_enable, 2, Some(true.B))
pf.io.enable := RegNextN(io.csrCtrl.l1D_pf_enable, 2, Some(false.B))
})
prefetcherOpt match {
case Some(pf) => l1_pf_req <> pf.io.l1_req
case None =>
l1_pf_req.valid := false.B
l1_pf_req.bits := DontCare
}
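// For reference, the prefetcher-side interface assumed by the wiring in this
// diff (reconstructed from the connections in this file; the actual
// BasePrefecher definition in xiangshan.mem.prefetch may carry more fields):
//   io.enable  - Bool, L1D prefetch enable from csr
//   io.ld_in   - per-load-pipe training inputs (see the loadUnits loop below)
//   io.tlb_req - dtlb requestor, hooked to the extra dtlb port
//   io.pf_addr - ValidIO prefetch address, forwarded to L2 via pf_sender_opt
//   io.l1_req  - DecoupledIO(L1PrefetchReq), prefetch request into load pipes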
val pf_train_on_hit = RegNextN(io.csrCtrl.l1D_pf_train_on_hit, 2, Some(true.B))
loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2))
storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2))
......@@ -159,6 +193,35 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.otherFastWakeup.take(2).zip(loadUnits.map(_.io.fastUop)).foreach{case(a,b)=> a := b}
val stOut = io.writeback.drop(exuParameters.LduCnt).dropRight(exuParameters.StuCnt)
// prefetch to l1 req
loadUnits.foreach(load_unit => {
load_unit.io.prefetch_req.valid <> l1_pf_req.valid
load_unit.io.prefetch_req.bits <> l1_pf_req.bits
})
// force low confidence for loadUnits(0): when its stage 0 is busy, hw prefetch will never use that pipeline
loadUnits(0).io.prefetch_req.bits.confidence := 0.U
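// a low confidence (0) req is only accepted when some load pipeline s0 is
// idle this cycle; a high confidence req is always accepted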
l1_pf_req.ready := (l1_pf_req.bits.confidence > 0.U) ||
loadUnits.map(!_.io.ldin.valid).reduce(_ || _)
// l1 pf fuzzer interface
val DebugEnableL1PFFuzzer = false
if (DebugEnableL1PFFuzzer) {
// l1 pf req fuzzer
val fuzzer = Module(new L1PrefetchFuzzer())
fuzzer.io.vaddr := DontCare
fuzzer.io.paddr := DontCare
// override load_unit prefetch_req
loadUnits.foreach(load_unit => {
load_unit.io.prefetch_req.valid <> fuzzer.io.req.valid
load_unit.io.prefetch_req.bits <> fuzzer.io.req.bits
})
fuzzer.io.req.ready := l1_pf_req.ready
}
// TODO: fast load wakeup
val lsq = Module(new LsqWrappper)
val vlsq = Module(new DummyVectorLsq)
val sbuffer = Module(new Sbuffer)
......@@ -182,7 +245,11 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val tlb_st = Module(new TLBNonBlock(exuParameters.StuCnt, 1, sttlbParams))
tlb_st.io // let the module have a name in the waveform
})
val dtlb = dtlb_ld ++ dtlb_st
val dtlb_prefetch = VecInit(Seq.fill(1){
val tlb_prefetch = Module(new TLBNonBlock(1, 2, sttlbParams))
tlb_prefetch.io // let the module have a name in the waveform
})
val dtlb = dtlb_ld ++ dtlb_st ++ dtlb_prefetch
val dtlb_reqs = dtlb.map(_.requestor).flatten
val dtlb_pmps = dtlb.map(_.pmp).flatten
dtlb.map(_.sfence := sfence)
......@@ -192,7 +259,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
require(ldtlbParams.outReplace == sttlbParams.outReplace)
require(ldtlbParams.outReplace)
val replace = Module(new TlbReplace(exuParameters.LduCnt + exuParameters.StuCnt, ldtlbParams))
val replace = Module(new TlbReplace(exuParameters.LduCnt + exuParameters.StuCnt + 1, ldtlbParams))
replace.io.apply_sep(dtlb_ld.map(_.replace) ++ dtlb_st.map(_.replace), io.ptw.resp.bits.data.entry.tag)
} else {
if (ldtlbParams.outReplace) {
......@@ -209,10 +276,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val ptw_resp_v = RegNext(io.ptw.resp.valid && !(sfence.valid && tlbcsr.satp.changed), init = false.B)
io.ptw.resp.ready := true.B
(dtlb.map(a => a.ptw.req.map(b => b)))
.flatten
dtlb.flatMap(a => a.ptw.req)
.zipWithIndex
.map{ case (tlb, i) =>
.foreach{ case (tlb, i) =>
tlb <> io.ptw.req(i)
val vector_hit = if (refillBothTlb) Cat(ptw_resp_next.vector).orR
else if (i < exuParameters.LduCnt) Cat(ptw_resp_next.vector.take(exuParameters.LduCnt)).orR
......@@ -220,12 +286,13 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
io.ptw.req(i).valid := tlb.valid && !(ptw_resp_v && vector_hit &&
ptw_resp_next.data.entry.hit(tlb.bits.vpn, tlbcsr.satp.asid, allType = true, ignoreAsid = true))
}
dtlb.map(_.ptw.resp.bits := ptw_resp_next.data)
dtlb.foreach(_.ptw.resp.bits := ptw_resp_next.data)
if (refillBothTlb) {
dtlb.map(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector).orR)
dtlb.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector).orR)
} else {
dtlb_ld.map(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.take(exuParameters.LduCnt)).orR)
dtlb_st.map(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.drop(exuParameters.LduCnt)).orR)
dtlb_ld.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.take(exuParameters.LduCnt)).orR)
dtlb_st.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.drop(exuParameters.LduCnt).take(exuParameters.StuCnt)).orR)
dtlb_prefetch.foreach(_.ptw.resp.valid := ptw_resp_v && Cat(ptw_resp_next.vector.drop(exuParameters.LduCnt + exuParameters.StuCnt)).orR)
}
......@@ -233,7 +300,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val pmp = Module(new PMP())
pmp.io.distribute_csr <> csrCtrl.distribute_csr
val pmp_check = VecInit(Seq.fill(exuParameters.LduCnt + exuParameters.StuCnt)(Module(new PMPChecker(3)).io))
val pmp_check = VecInit(Seq.fill(exuParameters.LduCnt + exuParameters.StuCnt + 1)(Module(new PMPChecker(3)).io))
for ((p,d) <- pmp_check zip dtlb_pmps) {
p.apply(tlbcsr.priv.dmode, pmp.io.pmp, pmp.io.pma, d)
require(p.req.bits.size.getWidth == d.bits.size.getWidth)
......@@ -285,6 +352,18 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
for (s <- 0 until StorePipelineWidth) {
loadUnits(i).io.reExecuteQuery(s) := storeUnits(s).io.reExecuteQuery
}
// prefetch
prefetcherOpt.foreach(pf => {
pf.io.ld_in(i).valid := Mux(pf_train_on_hit,
loadUnits(i).io.prefetch_train.valid,
loadUnits(i).io.prefetch_train.valid && loadUnits(i).io.prefetch_train.bits.isFirstIssue && (
loadUnits(i).io.prefetch_train.bits.miss || loadUnits(i).io.prefetch_train.bits.meta_prefetch
)
)
pf.io.ld_in(i).bits := loadUnits(i).io.prefetch_train.bits
pf.io.ld_in(i).bits.uop.cf.pc := Mux(loadUnits(i).io.s2IsPointerChasing, io.loadPc(i), RegNext(io.loadPc(i)))
})
// load to load fast forward: load(i) prefers data(i)
val fastPriority = (i until exuParameters.LduCnt) ++ (0 until i)
val fastValidVec = fastPriority.map(j => loadUnits(j).io.fastpathOut.valid)
......@@ -351,6 +430,13 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
p"has trigger hit vec ${io.writeback(i).bits.uop.cf.trigger.backendHit}\n")
}
// Prefetcher
val PrefetcherDTLBPortIndex = exuParameters.LduCnt + exuParameters.StuCnt
dtlb_reqs(PrefetcherDTLBPortIndex) := DontCare
dtlb_reqs(PrefetcherDTLBPortIndex).req.valid := false.B
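// chisel last-connect semantics: when a prefetcher exists, its tlb_req
// overrides the tie-off above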
prefetcherOpt.foreach(pf => {
dtlb_reqs(PrefetcherDTLBPortIndex) <> pf.io.tlb_req
})
// StoreUnit
for (i <- 0 until exuParameters.StuCnt) {
......@@ -539,9 +625,14 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// for atomicsUnit, it uses loadUnit(0)'s TLB port
when (state =/= s_normal) {
// use store wb port instead of load
loadUnits(0).io.ldout.ready := false.B
// use load_0's TLB
atomicsUnit.io.dtlb <> amoTlb
// hw prefetch should be disabled while executing atomic insts
loadUnits.map(i => i.io.prefetch_req.valid := false.B)
// make sure there are no in-flight uops in the load unit
assert(!loadUnits(0).io.ldout.valid)
}
......
......@@ -483,9 +483,35 @@ class CSR(implicit p: Parameters) extends FunctionUnit with HasCSRConst with PMP
// spfctl Bit 0: L1I Cache Prefetcher Enable
// spfctl Bit 1: L2Cache Prefetcher Enable
val spfctl = RegInit(UInt(XLEN.W), "b11".U)
// spfctl Bit 2: L1D Cache Prefetcher Enable
// spfctl Bit 3: L1D train prefetch on hit
// spfctl Bit 4: L1D prefetch enable agt
// spfctl Bit 5: L1D prefetch enable pht
// spfctl Bit [9:6]: L1D prefetch active page threshold
// spfctl Bit [15:10]: L1D prefetch active page stride
// turn off L2 BOP, turn on L1 SMS by default
val spfctl = RegInit(UInt(XLEN.W), Seq(
0 << 17, // L2 pf store only [17] init: false
1 << 16, // L1D pf enable stride [16] init: true
30 << 10, // L1D active page stride [15:10] init: 30
12 << 6, // L1D active page threshold [9:6] init: 12
1 << 5, // L1D enable pht [5] init: true
1 << 4, // L1D enable agt [4] init: true
0 << 3, // L1D train on hit [3] init: false
1 << 2, // L1D pf enable [2] init: true
1 << 1, // L2 pf enable [1] init: true
1 << 0, // L1I pf enable [0] init: true
).reduce(_|_).U(XLEN.W))
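// assuming the field layout above, the reduced init value is 0x17B37:
// L1I/L2/L1D pf on, train-on-hit off, agt/pht/stride on, threshold = 12,
// stride = 30, L2 store-only off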
csrio.customCtrl.l1I_pf_enable := spfctl(0)
csrio.customCtrl.l2_pf_enable := spfctl(1)
csrio.customCtrl.l1D_pf_enable := spfctl(2)
csrio.customCtrl.l1D_pf_train_on_hit := spfctl(3)
csrio.customCtrl.l1D_pf_enable_agt := spfctl(4)
csrio.customCtrl.l1D_pf_enable_pht := spfctl(5)
csrio.customCtrl.l1D_pf_active_threshold := spfctl(9, 6)
csrio.customCtrl.l1D_pf_active_stride := spfctl(15, 10)
csrio.customCtrl.l1D_pf_enable_stride := spfctl(16)
csrio.customCtrl.l2_pf_store_only := spfctl(17)
// sfetchctl Bit 0: L1I Cache Parity check enable
val sfetchctl = RegInit(UInt(XLEN.W), "b0".U)
......
......@@ -100,12 +100,21 @@ trait HasDCacheParameters extends HasL1CacheParameters {
def blockProbeAfterGrantCycles = 8 // give the processor some time to issue a request after a grant
def nSourceType = 3
def nSourceType = 10
def sourceTypeWidth = log2Up(nSourceType)
// non-prefetch source < 3
def LOAD_SOURCE = 0
def STORE_SOURCE = 1
def AMO_SOURCE = 2
def SOFT_PREFETCH = 3
// prefetch source >= 3
def DCACHE_PREFETCH_SOURCE = 3
def SOFT_PREFETCH = 4
def HW_PREFETCH_AGT = 5
def HW_PREFETCH_PHT_CUR = 6
def HW_PREFETCH_PHT_INC = 7
def HW_PREFETCH_PHT_DEC = 8
def HW_PREFETCH_BOP = 9
def HW_PREFETCH_STRIDE = 10
// each source uses an id to distinguish its multiple reqs
def reqIdWidth = log2Up(nEntries) max log2Up(StoreBufferSize)
......@@ -142,6 +151,10 @@ trait HasDCacheParameters extends HasL1CacheParameters {
// uncache
val uncacheIdxBits = log2Up(StoreQueueSize) max log2Up(LoadQueueSize)
// hardware prefetch parameters
// high confidence hardware prefetch port
val HighConfHWPFLoadPort = LoadPipelineWidth - 1 // use the last load port by default
val IgnorePrefetchConfidence = false
// parameters about duplicating regs to solve fanout
// In Main Pipe:
......@@ -274,6 +287,15 @@ class ReplacementWayReqIO(implicit p: Parameters) extends DCacheBundle {
val way = Input(UInt(log2Up(nWays).W))
}
class DCacheExtraMeta(implicit p: Parameters) extends DCacheBundle
{
val error = Bool() // cache line has been marked as corrupted by l2, or an ecc error was detected on store
val prefetch = Bool() // cache line is first required by prefetch
val access = Bool() // cache line has been accessed by load / store
// val debug_access_timestamp = UInt(64.W) // last time a load / store / refill accessed that cache line
}
// memory request in word granularity(load, mmio, lr/sc, atomics)
class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
{
......@@ -336,6 +358,8 @@ class BaseDCacheWordResp(implicit p: Parameters) extends DCacheBundle
class DCacheWordResp(implicit p: Parameters) extends BaseDCacheWordResp
{
val meta_prefetch = Bool()
val meta_access = Bool()
// 1 cycle after data resp
val error_delayed = Bool() // all kinds of errors, include tag error
}
......@@ -461,6 +485,7 @@ class DCacheLoadIO(implicit p: Parameters) extends DCacheWordIO
// kill previous cycle's req
val s1_kill = Output(Bool())
val s2_kill = Output(Bool())
val s2_pc = Output(UInt(VAddrBits.W))
// cycle 0: virtual address: req.addr
// cycle 1: physical address: s1_paddr
val s1_paddr_dup_lsu = Output(UInt(PAddrBits.W)) // lsu side paddr
......@@ -618,6 +643,7 @@ class DCacheToLsuIO(implicit p: Parameters) extends DCacheBundle {
class DCacheIO(implicit p: Parameters) extends DCacheBundle {
val hartId = Input(UInt(8.W))
val l2_pf_store_only = Input(Bool())
val lsu = new DCacheToLsuIO
val csr = new L1CacheToCsrIO
val error = new L1CacheErrorInfo
......@@ -664,8 +690,10 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// core data structures
val bankedDataArray = Module(new BankedDataArray)
val metaArray = Module(new AsynchronousMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val errorArray = Module(new ErrorArray(readPorts = LoadPipelineWidth + 1, writePorts = 2)) // TODO: add it to meta array
val metaArray = Module(new L1CohMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val errorArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2))
val prefetchArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 2)) // prefetch flag array
val accessArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = LoadPipelineWidth + 2))
val tagArray = Module(new DuplicatedTagArray(readPorts = LoadPipelineWidth + 1))
bankedDataArray.dump()
......@@ -680,6 +708,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
val wb = Module(new WritebackQueue(edge))
missQueue.io.hartId := io.hartId
missQueue.io.l2_pf_store_only := RegNext(io.l2_pf_store_only, false.B)
val errors = ldu.map(_.io.error) ++ // load error
Seq(mainPipe.io.error) // store / misc error
......@@ -687,6 +716,8 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// meta array
// read / write coh meta
val meta_read_ports = ldu.map(_.io.meta_read) ++
Seq(mainPipe.io.meta_read)
val meta_resp_ports = ldu.map(_.io.meta_resp) ++
......@@ -699,16 +730,41 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
meta_resp_ports.zip(metaArray.io.resp).foreach { case (p, r) => p := r }
meta_write_ports.zip(metaArray.io.write).foreach { case (p, w) => w <> p }
val error_flag_resp_ports = ldu.map(_.io.error_flag_resp) ++
Seq(mainPipe.io.error_flag_resp)
// read extra meta
meta_read_ports.zip(errorArray.io.read).foreach { case (p, r) => r <> p }
meta_read_ports.zip(prefetchArray.io.read).foreach { case (p, r) => r <> p }
meta_read_ports.zip(accessArray.io.read).foreach { case (p, r) => r <> p }
val extra_meta_resp_ports = ldu.map(_.io.extra_meta_resp) ++
Seq(mainPipe.io.extra_meta_resp)
extra_meta_resp_ports.zip(errorArray.io.resp).foreach { case (p, r) => {
(0 until nWays).map(i => { p(i).error := r(i) })
}}
extra_meta_resp_ports.zip(prefetchArray.io.resp).foreach { case (p, r) => {
(0 until nWays).map(i => { p(i).prefetch := r(i) })
}}
extra_meta_resp_ports.zip(accessArray.io.resp).foreach { case (p, r) => {
(0 until nWays).map(i => { p(i).access := r(i) })
}}
// write extra meta
val error_flag_write_ports = Seq(
mainPipe.io.error_flag_write,
refillPipe.io.error_flag_write
mainPipe.io.error_flag_write, // error flag generated by corrupted store
refillPipe.io.error_flag_write // corrupted signal from l2
)
meta_read_ports.zip(errorArray.io.read).foreach { case (p, r) => r <> p }
error_flag_resp_ports.zip(errorArray.io.resp).foreach { case (p, r) => p := r }
error_flag_write_ports.zip(errorArray.io.write).foreach { case (p, w) => w <> p }
val prefetch_flag_write_ports = Seq(
mainPipe.io.prefetch_flag_write, // set prefetch_flag to false if coh is set to Nothing
refillPipe.io.prefetch_flag_write // refill required by prefetch will set prefetch_flag
)
prefetch_flag_write_ports.zip(prefetchArray.io.write).foreach { case (p, w) => w <> p }
val access_flag_write_ports = ldu.map(_.io.access_flag_write) ++ Seq(
mainPipe.io.access_flag_write,
refillPipe.io.access_flag_write
)
access_flag_write_ports.zip(accessArray.io.write).foreach { case (p, w) => w <> p }
//----------------------------------------
// tag array
require(tagArray.io.read.size == (ldu.size + 1))
......
......@@ -34,7 +34,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// meta and data array read port
val meta_read = DecoupledIO(new MetaReadReq)
val meta_resp = Input(Vec(nWays, new Meta))
val error_flag_resp = Input(Vec(nWays, Bool()))
val extra_meta_resp = Input(Vec(nWays, new DCacheExtraMeta))
val tag_read = DecoupledIO(new TagReadReq)
val tag_resp = Input(Vec(nWays, UInt(encTagBits.W)))
......@@ -43,6 +43,9 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val banked_data_resp = Input(new L1BankedDataReadResult())
val read_error_delayed = Input(Bool())
// access bit update
val access_flag_write = DecoupledIO(new FlagMetaWriteReq)
// banked data read conflict
val bank_conflict_slow = Input(Bool())
val bank_conflict_fast = Input(Bool())
......@@ -183,13 +186,16 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// this simplifies our logic in s2 stage
val s1_hit_meta = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => meta_resp(w))), s1_fake_meta)
val s1_hit_coh = s1_hit_meta.coh
val s1_hit_error = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.error_flag_resp(w))), false.B)
val s1_hit_error = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.extra_meta_resp(w).error)), false.B)
val s1_hit_prefetch = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.extra_meta_resp(w).prefetch)), false.B)
val s1_hit_access = Mux(s1_tag_match_dup_dc, Mux1H(s1_tag_match_way_dup_dc, wayMap((w: Int) => io.extra_meta_resp(w).access)), false.B)
io.replace_way.set.valid := RegNext(s0_fire)
io.replace_way.set.bits := get_idx(s1_vaddr)
val s1_repl_way_en = UIntToOH(io.replace_way.way)
val s1_repl_tag = Mux1H(s1_repl_way_en, wayMap(w => tag_resp(w)))
val s1_repl_coh = Mux1H(s1_repl_way_en, wayMap(w => meta_resp(w).coh))
val s1_repl_extra_meta = Mux1H(s1_repl_way_en, wayMap(w => io.extra_meta_resp(w)))
val s1_need_replacement = !s1_tag_match_dup_dc
val s1_way_en = Mux(s1_need_replacement, s1_repl_way_en, s1_tag_match_way_dup_dc)
......@@ -232,6 +238,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
dump_pipeline_reqs("LoadPipe s2", s2_valid, s2_req)
// hit, miss, nack, permission checking
// dcache side tag match
val s2_tag_match_way = RegEnable(s1_tag_match_way_dup_dc, s1_fire)
......@@ -244,12 +251,13 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s2_hit_meta = RegEnable(s1_hit_meta, s1_fire)
val s2_hit_coh = RegEnable(s1_hit_coh, s1_fire)
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1 // redundant
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3 // redundant
val s2_has_permission = s2_hit_coh.onAccess(s2_req.cmd)._1 // for write prefetch
val s2_new_hit_coh = s2_hit_coh.onAccess(s2_req.cmd)._3 // for write prefetch
val s2_way_en = RegEnable(s1_way_en, s1_fire)
val s2_repl_coh = RegEnable(s1_repl_coh, s1_fire)
val s2_repl_tag = RegEnable(s1_repl_tag, s1_fire)
val s2_repl_extra_meta = RegEnable(s1_repl_extra_meta, s1_fire) // not used for now
val s2_encTag = RegEnable(s1_encTag, s1_fire)
// when req got nacked, upper levels should replay this request
......@@ -269,9 +277,10 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
val s2_tag_error = dcacheParameters.tagCode.decode(s2_encTag).error // error reported by tag ecc check
val s2_flag_error = RegEnable(s1_flag_error, s1_fire)
val s2_hit_prefetch = RegEnable(s1_hit_prefetch, s1_fire)
val s2_hit_access = RegEnable(s1_hit_access, s1_fire)
val s2_hit = s2_tag_match && s2_has_permission && s2_hit_coh === s2_new_hit_coh && !s2_wpu_pred_fail
// assert(!RegNext(s2_valid && (s2_tag_match && !s2_hit)))
// assert(!RegNext(s2_valid && (s2_hit_dup_lsu =/= s2_hit)))
// only dump these signals when they are actually valid
dump_pipeline_valids("LoadPipe s2", "s2_hit", s2_valid && s2_hit)
......@@ -293,6 +302,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.miss_req.bits.replace_coh := s2_repl_coh
io.miss_req.bits.replace_tag := s2_repl_tag
io.miss_req.bits.cancel := io.lsu.s2_kill || s2_tag_error
io.miss_req.bits.pc := io.lsu.s2_pc
// send back response
val resp = Wire(ValidIO(new DCacheWordResp))
......@@ -312,11 +322,15 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
resp.bits.replay := (resp.bits.miss && (!io.miss_req.fire() || s2_nack)) || io.bank_conflict_slow || s2_wpu_pred_fail
resp.bits.replayCarry.valid := resp.bits.miss
resp.bits.replayCarry.real_way_en := s2_real_way_en
resp.bits.meta_prefetch := s2_hit_prefetch
resp.bits.meta_access := s2_hit_access
resp.bits.tag_error := s2_tag_error // report tag_error in load s2
resp.bits.mshr_id := io.miss_resp.id
XSPerfAccumulate("dcache_read_bank_conflict", io.bank_conflict_slow && s2_valid)
XSPerfAccumulate("wpu_pred_fail", s2_wpu_pred_fail && s2_valid)
XSPerfAccumulate("dcache_read_bank_conflict", io.bank_conflict_slow && s2_valid)
XSPerfAccumulate("dcache_read_from_prefetched_line", s2_valid && s2_hit_prefetch && !resp.bits.miss)
XSPerfAccumulate("dcache_first_read_from_prefetched_line", s2_valid && s2_hit_prefetch && !resp.bits.miss && !s2_hit_access)
io.lsu.resp.valid := resp.valid
io.lsu.resp.bits := resp.bits
......@@ -337,11 +351,13 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// report ecc error and get selected dcache data
val s3_valid = RegNext(s2_valid)
val s3_vaddr = RegEnable(s2_vaddr, s2_fire)
val s3_paddr = RegEnable(s2_paddr, s2_fire)
val s3_hit = RegEnable(s2_hit, s2_fire)
val s3_tag_match_way = RegEnable(s2_tag_match_way, s2_fire)
val s3_banked_data_resp_word = io.banked_data_resp.raw_data
val s3_data_error = io.read_error_delayed // banked_data_resp_word.error && !bank_conflict
val s3_data_error = io.read_error_delayed && s3_hit // banked_data_resp_word.error && !bank_conflict
val s3_tag_error = RegEnable(s2_tag_error, s2_fire)
val s3_flag_error = RegEnable(s2_flag_error, s2_fire)
val s3_error = s3_tag_error || s3_flag_error || s3_data_error
......@@ -361,12 +377,17 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
// report tag error / l2 corrupted to CACHE_ERROR csr
io.error.valid := s3_error && s3_valid
// update plru, report error in s3
// update plru in s3
io.replace_access.valid := RegNext(RegNext(RegNext(io.meta_read.fire()) && s1_valid && !io.lsu.s1_kill) && !s2_nack_no_mshr)
io.replace_access.bits.set := RegNext(RegNext(get_idx(s1_req.addr)))
io.replace_access.bits.way := RegNext(RegNext(Mux(s1_tag_match_dup_dc, OHToUInt(s1_tag_match_way_dup_dc), io.replace_way.way)))
// update access bit
io.access_flag_write.valid := s3_valid && s3_hit
io.access_flag_write.bits.idx := get_idx(s3_vaddr)
io.access_flag_write.bits.way_en := s3_tag_match_way
io.access_flag_write.bits.flag := true.B
// --------------------------------------------------------------------------------
// Debug logging functions
def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool,
......
......@@ -131,9 +131,11 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
// meta array
val meta_read = DecoupledIO(new MetaReadReq)
val meta_resp = Input(Vec(nWays, new Meta))
val meta_write = DecoupledIO(new MetaWriteReq)
val error_flag_resp = Input(Vec(nWays, Bool()))
val error_flag_write = DecoupledIO(new ErrorWriteReq)
val meta_write = DecoupledIO(new CohMetaWriteReq)
val extra_meta_resp = Input(Vec(nWays, new DCacheExtraMeta))
val error_flag_write = DecoupledIO(new FlagMetaWriteReq)
val prefetch_flag_write = DecoupledIO(new FlagMetaWriteReq)
val access_flag_write = DecoupledIO(new FlagMetaWriteReq)
// tag sram
val tag_read = DecoupledIO(new TagReadReq)
......@@ -282,9 +284,13 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
val s1_hit_tag = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => tag_resp(w))), get_tag(s1_req.addr))
val s1_hit_coh = ClientMetadata(Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => meta_resp(w))), 0.U))
val s1_encTag = Mux1H(s1_tag_match_way, wayMap((w: Int) => enc_tag_resp(w)))
val s1_flag_error = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => io.error_flag_resp(w))), false.B)
val s1_flag_error = Mux(s1_tag_match, Mux1H(s1_tag_match_way, wayMap(w => io.extra_meta_resp(w).error)), false.B)
val s1_extra_meta = Mux1H(s1_tag_match_way, wayMap(w => io.extra_meta_resp(w)))
val s1_l2_error = s1_req.error
XSPerfAccumulate("probe_unused_prefetch", s1_req.probe && s1_extra_meta.prefetch && !s1_extra_meta.access) // may not be accurate
XSPerfAccumulate("replace_unused_prefetch", s1_req.replace && s1_extra_meta.prefetch && !s1_extra_meta.access) // may not be accurate
// replacement policy
val s1_repl_way_en = WireInit(0.U(nWays.W))
s1_repl_way_en := Mux(RegNext(s0_fire), UIntToOH(io.replace_way.way), RegNext(s1_repl_way_en))
......@@ -1412,6 +1418,7 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
miss_req.replace_tag := s2_repl_tag
miss_req.id := s2_req.id
miss_req.cancel := false.B
miss_req.pc := DontCare
io.store_replay_resp.valid := s2_valid_dup(5) && s2_can_go_to_mq_dup(1) && replay && s2_req.isStore
io.store_replay_resp.bits.data := DontCare
......@@ -1470,7 +1477,22 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents {
io.error_flag_write.valid := s3_fire_dup_for_err_w_valid && update_meta_dup_for_err_w_valid && s3_l2_error
io.error_flag_write.bits.idx := s3_idx_dup(3)
io.error_flag_write.bits.way_en := s3_way_en_dup(1)
io.error_flag_write.bits.error := s3_l2_error
io.error_flag_write.bits.flag := s3_l2_error
// if we use (prefetch_flag && meta =/= ClientStates.Nothing) for the prefetch check,
// prefetch_flag_write can be omitted
// io.prefetch_flag_write.valid := io.meta_write.valid && new_coh === ClientStates.Nothing
// io.prefetch_flag_write.bits.idx := s3_idx_dup(3)
// io.prefetch_flag_write.bits.way_en := s3_way_en_dup(1)
// io.prefetch_flag_write.bits.flag := false.B
io.prefetch_flag_write.valid := false.B
io.prefetch_flag_write.bits := DontCare
// probe / replace will not update access bit
io.access_flag_write.valid := s3_fire_dup_for_meta_w_valid && !s3_req.probe && !s3_req.replace
io.access_flag_write.bits.idx := s3_idx_dup(3)
io.access_flag_write.bits.way_en := s3_way_en_dup(1)
io.access_flag_write.bits.flag := true.B
io.tag_write.valid := s3_fire_dup_for_tag_w_valid && s3_req_miss_dup_for_tag_w_valid
io.tag_write.bits.idx := s3_idx_dup(4)
......
......@@ -30,6 +30,7 @@ import difftest._
import huancun.{AliasKey, DirtyKey, PreferCacheKey, PrefetchKey}
import utility.FastArbiter
import mem.{AddPipelineReg}
import mem.trace._
class MissReqWoStoreData(implicit p: Parameters) extends DCacheBundle {
val source = UInt(sourceTypeWidth.W)
......@@ -37,6 +38,7 @@ class MissReqWoStoreData(implicit p: Parameters) extends DCacheBundle {
val addr = UInt(PAddrBits.W)
val vaddr = UInt(VAddrBits.W)
val way_en = UInt(DCacheWays.W)
val pc = UInt(VAddrBits.W)
// store
val full_overwrite = Bool()
......@@ -61,9 +63,13 @@ class MissReqWoStoreData(implicit p: Parameters) extends DCacheBundle {
// 2. pmp check failed
val cancel = Bool() // cancel is slow to generate, it will cancel missreq.valid
def isLoad = source === LOAD_SOURCE.U
def isStore = source === STORE_SOURCE.U
def isAMO = source === AMO_SOURCE.U
// Req source decode
// Note that req source is NOT cmd type
// For instance, a req which isFromPrefetch may have R or W cmd
def isFromLoad = source === LOAD_SOURCE.U
def isFromStore = source === STORE_SOURCE.U
def isFromAMO = source === AMO_SOURCE.U
def isFromPrefetch = source >= DCACHE_PREFETCH_SOURCE.U
def hit = req_coh.isValid()
}
......@@ -103,6 +109,7 @@ class MissReq(implicit p: Parameters) extends MissReqWoStoreData {
out.replace_tag := replace_tag
out.id := id
out.cancel := cancel
out.pc := pc
out
}
}
......@@ -160,6 +167,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val req_handled_by_this_entry = Output(Bool())
val forwardInfo = Output(new MissEntryForwardIO)
val l2_pf_store_only = Input(Bool())
})
assert(!RegNext(io.primary_valid && !io.primary_ready))
......@@ -169,6 +177,8 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val req_valid = RegInit(false.B)
val set = addr_to_dcache_set(req.vaddr)
val input_req_is_prefetch = isPrefetch(io.req.bits.cmd)
val s_acquire = RegInit(true.B)
val s_grantack = RegInit(true.B)
val s_replace_req = RegInit(true.B)
......@@ -188,11 +198,13 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val data_not_refilled = !w_grantfirst
val error = RegInit(false.B)
val prefetch = RegInit(false.B)
val access = RegInit(false.B)
val should_refill_data_reg = Reg(Bool())
val should_refill_data = WireInit(should_refill_data_reg)
// val full_overwrite = req.isStore && req_store_mask.andR
// val full_overwrite = req.isFromStore && req_store_mask.andR
val full_overwrite = Reg(Bool())
val (_, _, refill_done, refill_count) = edge.count(io.mem_grant)
......@@ -235,46 +247,51 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
w_grantfirst := false.B
w_grantlast := false.B
s_write_storedata := !io.req.bits.isStore // only store need to wait for data
full_overwrite := io.req.bits.isStore && io.req.bits.full_overwrite
s_write_storedata := !io.req.bits.isFromStore // only store needs to wait for data
full_overwrite := io.req.bits.isFromStore && io.req.bits.full_overwrite
when (!io.req.bits.isAMO) {
when (!io.req.bits.isFromAMO) {
s_refill := false.B
w_refill_resp := false.B
}
when (!io.req.bits.hit && io.req.bits.replace_coh.isValid() && !io.req.bits.isAMO) {
when (!io.req.bits.hit && io.req.bits.replace_coh.isValid() && !io.req.bits.isFromAMO) {
s_replace_req := false.B
w_replace_resp := false.B
}
when (io.req.bits.isAMO) {
when (io.req.bits.isFromAMO) {
s_mainpipe_req := false.B
w_mainpipe_resp := false.B
}
should_refill_data_reg := io.req.bits.isLoad
should_refill_data_reg := io.req.bits.isFromLoad
error := false.B
prefetch := input_req_is_prefetch
access := false.B
}
when (secondary_fire) {
assert(io.req.bits.req_coh.state <= req.req_coh.state)
assert(!(io.req.bits.isAMO || req.isAMO))
assert(io.req.bits.req_coh.state <= req.req_coh.state || (prefetch && !access))
assert(!(io.req.bits.isFromAMO || req.isFromAMO))
// use the most up-to-date meta
req.req_coh := io.req.bits.req_coh
when (io.req.bits.isStore) {
when (io.req.bits.isFromStore) {
req := io.req.bits
req.addr := get_block_addr(io.req.bits.addr)
req.way_en := req.way_en
req.replace_coh := req.replace_coh
req.replace_tag := req.replace_tag
s_write_storedata := false.B // only store needs to wait for data
full_overwrite := io.req.bits.isStore && io.req.bits.full_overwrite
full_overwrite := io.req.bits.isFromStore && io.req.bits.full_overwrite
}
should_refill_data := should_refill_data_reg || io.req.bits.isLoad
should_refill_data := should_refill_data_reg || io.req.bits.isFromLoad
should_refill_data_reg := should_refill_data
when (!input_req_is_prefetch) {
access := true.B // when merge non-prefetch req, set access bit
}
}
when (io.mem_acquire.fire()) {
......@@ -301,7 +318,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// new_data(i) := req.store_data(rowBits * (i + 1) - 1, rowBits * i)
new_data(i) := refill_and_store_data(i)
// we only need to merge data for Store
new_mask(i) := Mux(req.isStore, req_store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U)
new_mask(i) := Mux(req.isFromStore, req_store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U)
}
val hasData = RegInit(true.B)
......@@ -362,19 +379,21 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
w_mainpipe_resp := true.B
}
def before_read_sent_can_merge(new_req: MissReqWoStoreData): Bool = {
acquire_not_sent && req.isLoad && (new_req.isLoad || new_req.isStore)
def before_req_sent_can_merge(new_req: MissReqWoStoreData): Bool = {
acquire_not_sent && (req.isFromLoad || req.isFromPrefetch) && (new_req.isFromLoad || new_req.isFromStore)
}
def before_data_refill_can_merge(new_req: MissReqWoStoreData): Bool = {
data_not_refilled && (req.isLoad || req.isStore) && new_req.isLoad
data_not_refilled && (req.isFromLoad || req.isFromStore || req.isFromPrefetch) && new_req.isFromLoad
}
// Note that late prefetch will be ignored
def should_merge(new_req: MissReqWoStoreData): Bool = {
val block_match = get_block(req.addr) === get_block(new_req.addr)
block_match &&
(
before_read_sent_can_merge(new_req) ||
before_req_sent_can_merge(new_req) ||
before_data_refill_can_merge(new_req)
)
}
......@@ -392,7 +411,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
req_valid &&
Mux(
block_match,
!before_read_sent_can_merge(new_req) &&
!before_req_sent_can_merge(new_req) &&
!before_data_refill_can_merge(new_req),
set_match && new_req.way_en === req.way_en
)
......@@ -437,7 +456,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// resolve cache alias by L2
io.mem_acquire.bits.user.lift(AliasKey).foreach( _ := req.vaddr(13, 12))
// trigger prefetch
io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := true.B)
io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := Mux(io.l2_pf_store_only, req.isFromStore, true.B))
// prefer not to cache data in L2 by default
io.mem_acquire.bits.user.lift(PreferCacheKey).foreach(_ := false.B)
require(nSets <= 256)
......@@ -471,7 +490,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
refill.addr := req.addr
refill.way_en := req.way_en
refill.wmask := Mux(
hasData || req.isLoad,
hasData || req.isFromLoad,
~0.U(DCacheBanks.W),
VecInit((0 until DCacheBanks).map(i => get_mask_of_bank(i, req_store_mask).orR)).asUInt
)
......@@ -493,6 +512,8 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
}
refill.meta.coh := ClientMetadata(missCohGen(req.cmd, grant_param, isDirty))
refill.error := error
refill.prefetch := prefetch
refill.access := access
refill.alias := req.vaddr(13, 12) // TODO
io.main_pipe_req.valid := !s_mainpipe_req && w_grantlast
......@@ -535,13 +556,14 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
XSPerfAccumulate("penalty_waiting_for_channel_D", s_acquire && !w_grantlast && !io.mem_grant.valid)
XSPerfAccumulate("penalty_waiting_for_channel_E", io.mem_finish.valid && !io.mem_finish.ready)
XSPerfAccumulate("penalty_from_grant_to_refill", !w_refill_resp && w_grantlast)
XSPerfAccumulate("soft_prefetch_number", primary_fire && io.req.bits.source === SOFT_PREFETCH.U)
XSPerfAccumulate("prefetch_req_primary", primary_fire && io.req.bits.source === DCACHE_PREFETCH_SOURCE.U)
XSPerfAccumulate("prefetch_req_merged", secondary_fire && io.req.bits.source === DCACHE_PREFETCH_SOURCE.U)
val (mshr_penalty_sample, mshr_penalty) = TransactionLatencyCounter(RegNext(primary_fire), release_entry)
XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 0, 20, 1, true, true)
XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 20, 100, 10, true, false)
val load_miss_begin = primary_fire && io.req.bits.isLoad
val load_miss_begin = primary_fire && io.req.bits.isFromLoad
val refill_finished = RegNext(!w_grantlast && refill_done) && should_refill_data
val (load_miss_penalty_sample, load_miss_penalty) = TransactionLatencyCounter(load_miss_begin, refill_finished) // not real refill finish time
XSPerfHistogram("load_miss_penalty_to_use", load_miss_penalty, load_miss_penalty_sample, 0, 20, 1, true, true)
......@@ -590,6 +612,7 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
// forward missqueue
val forward = Vec(LoadPipelineWidth, new LduToMissqueueForwardIO)
val l2_pf_store_only = Input(Bool())
})
// 128KBL1: FIXME: provide vaddr for l2
......@@ -656,6 +679,7 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
e.io.hartId := io.hartId
e.io.id := i.U
e.io.l2_pf_store_only := io.l2_pf_store_only
e.io.req.valid := io.req.valid
e.io.primary_valid := io.req.valid &&
!merge &&
......@@ -707,6 +731,17 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
io.full := ~Cat(entries.map(_.io.primary_ready)).andR
// L1MissTrace Chisel DB
val debug_miss_trace = Wire(new L1MissTrace)
debug_miss_trace.vaddr := io.req.bits.vaddr
debug_miss_trace.paddr := io.req.bits.addr
debug_miss_trace.source := io.req.bits.source
debug_miss_trace.pc := io.req.bits.pc
val table = ChiselDB.createTable("L1MissQMissTrace_hart"+ p(XSCoreParamsKey).HartId.toString, new L1MissTrace)
table.log(debug_miss_trace, io.req.valid && !io.req.bits.cancel && alloc, "MissQueue", clock, reset)
// Difftest
if (env.EnableDifftest) {
val difftest = Module(new DifftestRefillEvent)
difftest.io.clock := clock
......@@ -717,11 +752,14 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule wi
difftest.io.data := io.refill_to_ldq.bits.data_raw.asTypeOf(difftest.io.data)
}
// Perf count
XSPerfAccumulate("miss_req", io.req.fire())
XSPerfAccumulate("miss_req_allocate", io.req.fire() && alloc)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && io.req.bits.isLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isLoad)
XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && io.req.bits.isFromLoad)
XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isFromLoad)
XSPerfAccumulate("probe_blocked_by_miss", io.probe_block)
XSPerfAccumulate("prefetch_primary_fire", io.req.fire() && alloc && io.req.bits.isFromPrefetch)
XSPerfAccumulate("prefetch_secondary_fire", io.req.fire() && merge && io.req.bits.isFromPrefetch)
val max_inflight = RegInit(0.U((log2Up(cfg.nMissEntries) + 1).W))
val num_valids = PopCount(~Cat(primary_ready_vec).asUInt)
when (num_valids > max_inflight) {
......
......@@ -30,6 +30,8 @@ class RefillPipeReqCtrl(implicit p: Parameters) extends DCacheBundle {
val id = UInt(reqIdWidth.W)
val error = Bool()
val prefetch = Bool()
val access = Bool()
def paddrWithVirtualAlias: UInt = {
Cat(alias, addr(DCacheSameVPAddrLength - 1, 0))
......@@ -51,6 +53,8 @@ class RefillPipeReq(implicit p: Parameters) extends RefillPipeReqCtrl {
ctrl.miss_id := miss_id
ctrl.id := id
ctrl.error := error
ctrl.prefetch := prefetch
ctrl.access := access
ctrl
}
}
......@@ -67,8 +71,10 @@ class RefillPipe(implicit p: Parameters) extends DCacheModule {
val data_write = DecoupledIO(new L1BankedDataWriteReq)
val data_write_dup = Vec(DCacheBanks, Valid(new L1BankedDataWriteReqCtrl))
val meta_write = DecoupledIO(new MetaWriteReq)
val error_flag_write = DecoupledIO(new ErrorWriteReq)
val meta_write = DecoupledIO(new CohMetaWriteReq)
val error_flag_write = DecoupledIO(new FlagMetaWriteReq)
val prefetch_flag_write = DecoupledIO(new FlagMetaWriteReq)
val access_flag_write = DecoupledIO(new FlagMetaWriteReq)
val tag_write = DecoupledIO(new TagWriteReq)
val store_resp = ValidIO(new DCacheLineResp)
val release_wakeup = ValidIO(UInt(log2Up(cfg.nMissEntries).W))
......@@ -113,7 +119,17 @@ class RefillPipe(implicit p: Parameters) extends DCacheModule {
io.error_flag_write.valid := io.req_dup_for_err_w.valid
io.error_flag_write.bits.idx := req_dup_for_err_w.idx
io.error_flag_write.bits.way_en := req_dup_for_err_w.way_en
io.error_flag_write.bits.error := refill_w_req.error
io.error_flag_write.bits.flag := refill_w_req.error
io.prefetch_flag_write.valid := io.req_dup_for_err_w.valid
io.prefetch_flag_write.bits.idx := req_dup_for_err_w.idx
io.prefetch_flag_write.bits.way_en := req_dup_for_err_w.way_en
io.prefetch_flag_write.bits.flag := refill_w_req.prefetch
io.access_flag_write.valid := io.req_dup_for_err_w.valid
io.access_flag_write.bits.idx := req_dup_for_err_w.idx
io.access_flag_write.bits.way_en := req_dup_for_err_w.way_en
io.access_flag_write.bits.flag := refill_w_req.access
io.tag_write.valid := io.req_dup_for_tag_w.valid
io.tag_write.bits.idx := req_dup_for_tag_w.idx
......
......@@ -40,19 +40,19 @@ class MetaReadReq(implicit p: Parameters) extends DCacheBundle {
val way_en = UInt(nWays.W)
}
class MetaWriteReq(implicit p: Parameters) extends MetaReadReq {
class CohMetaWriteReq(implicit p: Parameters) extends MetaReadReq {
val meta = new Meta
}
class ErrorWriteReq(implicit p: Parameters) extends MetaReadReq {
val error = Bool()
class FlagMetaWriteReq(implicit p: Parameters) extends MetaReadReq {
val flag = Bool()
}
class AsynchronousMetaArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
class L1CohMetaArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
val io = IO(new Bundle() {
val read = Vec(readPorts, Flipped(DecoupledIO(new MetaReadReq)))
val resp = Output(Vec(readPorts, Vec(nWays, new Meta)))
val write = Vec(writePorts, Flipped(DecoupledIO(new MetaWriteReq)))
val write = Vec(writePorts, Flipped(DecoupledIO(new CohMetaWriteReq)))
})
val meta_array = RegInit(
......@@ -103,12 +103,12 @@ class AsynchronousMetaArray(readPorts: Int, writePorts: Int)(implicit p: Paramet
}
}
class ErrorArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
class L1FlagMetaArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends DCacheModule {
val io = IO(new Bundle() {
val read = Vec(readPorts, Flipped(DecoupledIO(new MetaReadReq)))
val resp = Output(Vec(readPorts, Vec(nWays, Bool())))
val write = Vec(writePorts, Flipped(DecoupledIO(new ErrorWriteReq)))
// customized cache op port
val write = Vec(writePorts, Flipped(DecoupledIO(new FlagMetaWriteReq)))
// customized cache op port
// val cacheOp = Flipped(new L1CacheInnerOpIO)
})
......@@ -152,7 +152,7 @@ class ErrorArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extend
s0_way_wen(way)(wport) := write.valid && wen
s1_way_wen(way)(wport) := RegNext(s0_way_wen(way)(wport))
s1_way_waddr(way)(wport) := RegEnable(write.bits.idx, s0_way_wen(way)(wport))
s1_way_wdata(way)(wport) := RegEnable(write.bits.error, s0_way_wen(way)(wport))
s1_way_wdata(way)(wport) := RegEnable(write.bits.flag, s0_way_wen(way)(wport))
when (s1_way_wen(way)(wport)) {
meta_array(s1_way_waddr(way)(wport))(way) := s1_way_wdata(way)(wport)
}
......
......@@ -359,6 +359,8 @@ class TlbReq(implicit p: Parameters) extends TlbBundle {
val cmd = Output(TlbCmd())
val size = Output(UInt(log2Ceil(log2Ceil(XLEN/8)+1).W))
val kill = Output(Bool()) // Used by blocked tlb that needs to sync with other modules like icache
// do not translate, but still do pmp/pma check
val no_translate = Output(Bool())
val debug = new Bundle {
val pc = Output(UInt(XLEN.W))
val robIdx = Output(new RobPtr)
......
......@@ -72,6 +72,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
// val vmEnable = satp.mode === 8.U // && (mode < ModeM) // FIXME: fix me when boot xv6/linux...
val vmEnable = if (EnbaleTlbDebug) (satp.mode === 8.U)
else (satp.mode === 8.U && (mode < ModeM))
val portTranslateEnable = (0 until Width).map(i => vmEnable && !req(i).bits.no_translate)
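// a no_translate request keeps paddr := vaddr on its port but still goes
// through the pmp/pma check (see TlbReq.no_translate)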
val req_in = req
val req_out = req.map(a => RegEnable(a.bits, a.fire()))
......@@ -118,10 +119,11 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
def TLBRead(i: Int) = {
val (e_hit, e_ppn, e_perm, e_super_hit, e_super_ppn, static_pm) = entries.io.r_resp_apply(i)
val (p_hit, p_ppn, p_perm) = ptw_resp_bypass(get_pn(req_in(i).bits.vaddr))
val enable = portTranslateEnable(i)
val hit = e_hit || p_hit
val miss = !hit && vmEnable
val fast_miss = !(e_super_hit || p_hit) && vmEnable
val miss = !hit && enable
val fast_miss = !(e_super_hit || p_hit) && enable
hit.suggestName(s"hit_read_${i}")
miss.suggestName(s"miss_read_${i}")
......@@ -138,15 +140,15 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
perm(d) := Mux(p_hit, p_perm, e_perm(d))
val paddr = Cat(ppn(d), get_off(req_out(i).vaddr))
resp(i).bits.paddr(d) := Mux(vmEnable, paddr, vaddr)
resp(i).bits.paddr(d) := Mux(enable, paddr, vaddr)
}
XSDebug(req_out_v(i), p"(${i.U}) hit:${hit} miss:${miss} ppn:${Hexadecimal(ppn(0))} perm:${perm(0)}\n")
val pmp_paddr = Mux(vmEnable, Cat(Mux(p_hit, p_ppn, e_super_ppn), get_off(req_out(i).vaddr)), vaddr)
val pmp_paddr = Mux(enable, Cat(Mux(p_hit, p_ppn, e_super_ppn), get_off(req_out(i).vaddr)), vaddr)
// pmp_paddr is functionally the same as paddr. It abandons normal_ppn for timing optimization.
// val pmp_paddr = Mux(vmEnable, paddr, vaddr)
val static_pm_valid = !(e_super_hit || p_hit) && vmEnable && q.partialStaticPMP.B
// val pmp_paddr = Mux(enable, paddr, vaddr)
val static_pm_valid = !(e_super_hit || p_hit) && enable && q.partialStaticPMP.B
(hit, miss, pmp_paddr, static_pm, static_pm_valid, perm)
}
......@@ -174,7 +176,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
val ldPf = (ldPermFail || pf) && (TlbCmd.isRead(cmd) && !TlbCmd.isAmo(cmd))
val stPf = (stPermFail || pf) && (TlbCmd.isWrite(cmd) || TlbCmd.isAmo(cmd))
val instrPf = (instrPermFail || pf) && TlbCmd.isExec(cmd)
val fault_valid = vmEnable
val fault_valid = portTranslateEnable(idx)
resp(idx).bits.excp(nDups).pf.ld := (ldPf || ldUpdate) && fault_valid && !af
resp(idx).bits.excp(nDups).pf.st := (stPf || stUpdate) && fault_valid && !af
resp(idx).bits.excp(nDups).pf.instr := (instrPf || instrUpdate) && fault_valid && !af
......@@ -218,8 +220,8 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
io.ptw.req(idx).fire() || resp(idx).fire(), flush_pipe(idx))
// when ptw resp, check if hit, reset miss_v, resp to lsu/ifu
resp(idx).valid := req_out_v(idx) && !(miss_v && vmEnable)
when (io.ptw.resp.fire() && hit && req_out_v(idx) && vmEnable) {
resp(idx).valid := req_out_v(idx) && !(miss_v && portTranslateEnable(idx))
when (io.ptw.resp.fire() && hit && req_out_v(idx) && portTranslateEnable(idx)) {
val pte = io.ptw.resp.bits
resp(idx).valid := true.B
resp(idx).bits.miss := false.B // for blocked tlb, this is useless
......@@ -242,7 +244,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
// however, some outside modules like icache don't care about flushPipe and are still waiting for the tlb resp;
// just assert resp valid and raise a page fault to let the request go through. The pipe (ifu) will abandon it.
if (!q.outsideRecvFlush) {
when (req_out_v(idx) && flush_pipe(idx) && vmEnable) {
when (req_out_v(idx) && flush_pipe(idx) && portTranslateEnable(idx)) {
resp(idx).valid := true.B
for (d <- 0 until nRespDups) {
resp(idx).bits.excp(d).pf.ld := true.B // sfence happened, pf for not to use this addr
......@@ -271,21 +273,21 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
val result_ok = req_in.map(a => RegNext(a.fire()))
val perfEvents =
Seq(
("access", PopCount((0 until Width).map{i => if (Block(i)) io.requestor(i).req.fire() else vmEnable && result_ok(i) })),
("miss ", PopCount((0 until Width).map{i => if (Block(i)) vmEnable && result_ok(i) && missVec(i) else ptw.req(i).fire() })),
("access", PopCount((0 until Width).map{i => if (Block(i)) io.requestor(i).req.fire() else portTranslateEnable(i) && result_ok(i) })),
("miss ", PopCount((0 until Width).map{i => if (Block(i)) portTranslateEnable(i) && result_ok(i) && missVec(i) else ptw.req(i).fire() })),
)
generatePerfEvent()
// perf log
for (i <- 0 until Width) {
if (Block(i)) {
XSPerfAccumulate(s"access${i}",result_ok(i) && vmEnable)
XSPerfAccumulate(s"access${i}",result_ok(i) && portTranslateEnable(i))
XSPerfAccumulate(s"miss${i}", result_ok(i) && missVec(i))
} else {
XSPerfAccumulate("first_access" + Integer.toString(i, 10), result_ok(i) && vmEnable && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("access" + Integer.toString(i, 10), result_ok(i) && vmEnable)
XSPerfAccumulate("first_miss" + Integer.toString(i, 10), result_ok(i) && vmEnable && missVec(i) && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("miss" + Integer.toString(i, 10), result_ok(i) && vmEnable && missVec(i))
XSPerfAccumulate("first_access" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i) && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("access" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i))
XSPerfAccumulate("first_miss" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i) && missVec(i) && RegNext(req(i).bits.debug.isFirstIssue))
XSPerfAccumulate("miss" + Integer.toString(i, 10), result_ok(i) && portTranslateEnable(i) && missVec(i))
}
}
XSPerfAccumulate("ptw_resp_count", ptw.resp.fire())
......@@ -322,7 +324,7 @@ class TLB(Width: Int, nRespDups: Int = 1, Block: Seq[Boolean], q: TLBParameters)
val difftest = Module(new DifftestL1TLBEvent)
difftest.io.clock := clock
difftest.io.coreid := p(XSCoreParamsKey).HartId.asUInt
difftest.io.valid := RegNext(io.requestor(i).req.fire) && !RegNext(io.requestor(i).req_kill) && io.requestor(i).resp.fire && !io.requestor(i).resp.bits.miss && !pf && !af && vmEnable
difftest.io.valid := RegNext(io.requestor(i).req.fire) && !RegNext(io.requestor(i).req_kill) && io.requestor(i).resp.fire && !io.requestor(i).resp.bits.miss && !pf && !af && portTranslateEnable(i)
difftest.io.index := i.U
difftest.io.l1tlbid := l1tlbid
difftest.io.satp := io.csr.satp.ppn
......
......@@ -551,6 +551,7 @@ class NewIFU(implicit p: Parameters) extends XSModule
io.iTLBInter.req.bits.kill := false.B // IFU use itlb for mmio, doesn't need sync, set it to false
io.iTLBInter.req.bits.cmd := TlbCmd.exec
io.iTLBInter.req.bits.debug.robIdx := DontCare
io.iTLBInter.req.bits.no_translate := false.B
io.iTLBInter.req.bits.debug.isFirstIssue := DontCare
io.pmp.req.valid := (mmio_state === m_sendPMP) && f3_req_is_mmio
......
......@@ -196,6 +196,7 @@ class ICacheMainPipe(implicit p: Parameters) extends ICacheModule
toITLB.map{port =>
port.bits.cmd := TlbCmd.exec
port.bits.debug.robIdx := DontCare
port.bits.no_translate := false.B
port.bits.debug.isFirstIssue := DontCare
}
......
......@@ -107,6 +107,7 @@ class IPrefetchPipe(implicit p: Parameters) extends IPrefetchModule
toITLB.bits.kill := DontCare
toITLB.bits.cmd := TlbCmd.exec
toITLB.bits.debug.robIdx := DontCare
toITLB.bits.no_translate := false.B
toITLB.bits.debug.isFirstIssue := DontCare
......
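The IFU, ICacheMainPipe and IPrefetchPipe hunks all tie the new no_translate field low, since instruction-side requests always want translation. A sketch of how the added field plausibly sits in TlbReq; the field name comes from the diff, the rest of the bundle shape is an assumption:

  class TlbReq(implicit p: Parameters) extends TlbBundle {
    // ... existing fields: vaddr, cmd, size, kill, debug ...
    // new: when true, the TLB port returns vaddr as paddr and skips the lookup
    val no_translate = Bool()
  }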
......@@ -68,8 +68,10 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundleWithMicroOp with
val forwardMask = Vec(8, Bool())
val forwardData = Vec(8, UInt(8.W))
//softprefetch
val isSoftPrefetch = Bool()
// prefetch
val isPrefetch = Bool()
val isHWPrefetch = Bool()
def isSWPrefetch = isPrefetch && !isHWPrefetch
// For debug usage
val isFirstIssue = Bool()
......@@ -84,6 +86,37 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundleWithMicroOp with
val forward_tlDchannel = Bool()
}
class LdPrefetchTrainBundle(implicit p: Parameters) extends LsPipelineBundle {
val meta_prefetch = Bool()
val meta_access = Bool()
def fromLsPipelineBundle(input: LsPipelineBundle) = {
vaddr := input.vaddr
paddr := input.paddr
mask := input.mask
data := input.data
uop := input.uop
wlineflag := input.wlineflag
miss := input.miss
tlbMiss := input.tlbMiss
ptwBack := input.ptwBack
mmio := input.mmio
rsIdx := input.rsIdx
forwardMask := input.forwardMask
forwardData := input.forwardData
isPrefetch := input.isPrefetch
isHWPrefetch := input.isHWPrefetch
isFirstIssue := input.isFirstIssue
meta_prefetch := DontCare
meta_access := DontCare
forward_tlDchannel := DontCare
mshrid := DontCare
replayCarry := DontCare
atomic := DontCare
isLoadReplay := DontCare
}
}
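A possible use of LdPrefetchTrainBundle: a load unit repackages its stage-2 result and drives the prefetcher's training port. All names below (s2_out, s2_valid, the meta sources, io.prefetch_train) are assumptions for illustration:

  val train = Wire(new LdPrefetchTrainBundle())
  train.fromLsPipelineBundle(s2_out)        // s2_out: an LsPipelineBundle from load s2
  train.meta_prefetch := s2_hit_prefetched  // assumed: hit block was brought in by a prefetch
  train.meta_access := s2_dcache_accessed   // assumed: dcache access outcome
  io.prefetch_train.valid := s2_valid && !s2_out.isHWPrefetch
  io.prefetch_train.bits := train

Note that fromLsPipelineBundle sets meta_prefetch/meta_access to DontCare, so the two overrides after the call take effect (last connect wins).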
class LqWriteBundle(implicit p: Parameters) extends LsPipelineBundle {
// queue entry data, except flag bits, will be updated if writeQueue is true,
// valid bit in LqWriteBundle will be ignored
......@@ -104,7 +137,8 @@ class LqWriteBundle(implicit p: Parameters) extends LsPipelineBundle {
rsIdx := input.rsIdx
forwardMask := input.forwardMask
forwardData := input.forwardData
isSoftPrefetch := input.isSoftPrefetch
isPrefetch := input.isPrefetch
isHWPrefetch := input.isHWPrefetch
isFirstIssue := input.isFirstIssue
isLoadReplay := input.isLoadReplay
mshrid := input.mshrid
......
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem.trace
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
class L1MissTrace extends Bundle {
val vaddr = UInt(39.W)
val paddr = UInt(36.W)
val source = UInt(4.W)
val pc = UInt(39.W)
}
\ No newline at end of file
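A trace entry would be filled at the point a miss leaves the L1; a minimal sketch with assumed signal sources:

  val mtrace = Wire(new L1MissTrace)
  mtrace.vaddr := miss_req_vaddr    // assumed
  mtrace.paddr := miss_req_paddr    // assumed
  mtrace.source := miss_req_source  // assumed: which pipe/requestor missed
  mtrace.pc := miss_req_pc          // assumed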
......@@ -347,7 +347,8 @@ class LoadQueue(implicit p: Parameters) extends XSModule
})
(0 until LoadPipelineWidth).map(i => {
vaddrModule.io.raddr(LoadPipelineWidth + i) := loadReplaySelGen(i)
// vaddrModule read ports 0 and 1 are used by exception and mmio
vaddrModule.io.raddr(2 + i) := loadReplaySelGen(i)
})
(0 until LoadPipelineWidth).map(i => {
......
......@@ -55,6 +55,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
val in = Reg(new ExuInput())
val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
val atom_override_xtval = RegInit(false.B)
val have_sent_first_tlb_req = RegInit(false.B)
val isLr = in.uop.ctrl.fuOpType === LSUOpType.lr_w || in.uop.ctrl.fuOpType === LSUOpType.lr_d
// paddr after translation
val paddr = Reg(UInt())
......@@ -100,6 +101,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
in := io.in.bits
in.src(1) := in.src(1) // leave src2 unchanged
state := s_tlb_and_flush_sbuffer_req
have_sent_first_tlb_req := false.B
}
}
......@@ -136,7 +138,12 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstant
// send req to sbuffer to flush it if it is not empty
io.flush_sbuffer.valid := Mux(sbuffer_empty, false.B, true.B)
when(io.dtlb.resp.fire){
// do not accept the tlb resp in the first cycle
// this limitation is for the hw prefetcher:
// when !have_sent_first_tlb_req, the tlb resp may come from the hw prefetcher
have_sent_first_tlb_req := true.B
when(io.dtlb.resp.fire && have_sent_first_tlb_req){
paddr := io.dtlb.resp.bits.paddr(0)
// exception handling
val addrAligned = LookupTree(in.uop.ctrl.fuOpType(1,0), List(
......
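The have_sent_first_tlb_req guard matters when the atomics unit shares its dtlb port with a hardware prefetcher: a response that shows up before the unit's own first request has gone out can only belong to the other master. A hedged sketch of the kind of port sharing that creates the hazard; module and port names are assumptions:

  // two masters muxed onto one dtlb port; the resp is seen by both,
  // so each must ignore responses triggered by the other
  val tlb_arb = Module(new Arbiter(new TlbReq, 2))
  tlb_arb.io.in(0) <> atomicsUnit.io.dtlb.req   // assumed
  tlb_arb.io.in(1) <> prefetcher.io.tlb_req.req // assumed
  dtlb.io.requestor(0).req <> tlb_arb.io.out    // assumed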
......@@ -53,6 +53,7 @@ class StoreUnit_S0(implicit p: Parameters) extends XSModule {
io.dtlbReq.bits.size := LSUOpType.size(io.in.bits.uop.ctrl.fuOpType)
io.dtlbReq.bits.kill := DontCare
io.dtlbReq.bits.debug.robIdx := io.in.bits.uop.robIdx
io.dtlbReq.bits.no_translate := false.B
io.dtlbReq.bits.debug.pc := io.in.bits.uop.cf.pc
io.dtlbReq.bits.debug.isFirstIssue := io.isFirstIssue
......
package xiangshan.mem.prefetch
import chisel3._
import chisel3.util._
import chipsalliance.rocketchip.config.Parameters
import xiangshan._
import xiangshan.cache.mmu.TlbRequestIO
import xiangshan.mem.{LdPrefetchTrainBundle, L1PrefetchReq}
class PrefetcherIO()(implicit p: Parameters) extends XSBundle {
val ld_in = Flipped(Vec(exuParameters.LduCnt, ValidIO(new LdPrefetchTrainBundle())))
val tlb_req = new TlbRequestIO(nRespDups = 2)
val pf_addr = ValidIO(UInt(PAddrBits.W))
val l1_req = DecoupledIO(new L1PrefetchReq())
val enable = Input(Bool())
}
trait PrefetcherParams
abstract class BasePrefecher()(implicit p: Parameters) extends XSModule {
val io = IO(new PrefetcherIO())
}
\ No newline at end of file
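BasePrefecher (the spelling is the source's) pins down the interface every L1 prefetcher implements: training inputs, a tlb port, and two request channels. A minimal do-nothing subclass to show the contract; everything beyond the inherited io is illustrative:

  class NullPrefetcher()(implicit p: Parameters) extends BasePrefecher {
    io.tlb_req.req.valid := false.B
    io.tlb_req.req.bits := DontCare
    io.tlb_req.req_kill := false.B
    io.tlb_req.resp.ready := true.B
    io.pf_addr.valid := false.B
    io.pf_addr.bits := 0.U
    io.l1_req.valid := false.B
    io.l1_req.bits := DontCare
  }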
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan.ExceptionNO._
import xiangshan._
import xiangshan.backend.fu.PMPRespBundle
import xiangshan.cache._
import xiangshan.cache.mmu.{TlbCmd, TlbReq, TlbRequestIO, TlbResp}
class L1PrefetchReq (implicit p: Parameters) extends XSBundle with HasDCacheParameters{
val paddr = UInt(PAddrBits.W)
val alias = UInt(2.W)
val confidence = UInt(1.W)
val is_store = Bool()
// only the index bits are used, do not use the tag bits
def getVaddr(): UInt = {
Cat(alias, paddr(DCacheSameVPAddrLength-1, 0))
}
// when l1 cache prefetch req arrives at load unit:
// if (confidence == 1)
// override load unit 2 load req
// else if (load unit 1/2 is available)
// send prefetch req
// else
// report prefetch !ready
}
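The comment block above is the intended arbitration policy at the load unit, restated as a sketch; pf_req, ld1_idle and ld2_idle are assumed names:

  // confidence == 1: the prefetch is considered hot and may steal
  // load unit 2's issue slot; otherwise it only uses an idle unit
  pf_req.ready := Mux(pf_req.bits.confidence === 1.U,
    true.B,
    ld1_idle || ld2_idle)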
class L1PrefetchHint (implicit p: Parameters) extends XSBundle with HasDCacheParameters{
val loadbusy = Bool()
val missqbusy = Bool()
}
class L1PrefetchFuzzer(implicit p: Parameters) extends DCacheModule{
val io = IO(new Bundle() {
// prefetch req interface
val req = Decoupled(new L1PrefetchReq())
// for fuzzer address gen
val vaddr = Input(UInt(VAddrBits.W))
val paddr = Input(UInt(PAddrBits.W))
})
// a prefetch req queue is not provided here, so the prefetcher must
// maintain its own prefetch req queue.
val rand_offset = LFSR64(seed=Some(123L))(5,0) << 6
val rand_addr_select = LFSR64(seed=Some(567L))(3,0) === 0.U
// use valid vaddr and paddr
val rand_vaddr = DelayN(io.vaddr, 2)
val rand_paddr = DelayN(io.paddr, 2)
io.req.bits.paddr := 0x80000000L.U + rand_offset
io.req.bits.alias := io.req.bits.paddr(13,12)
io.req.bits.confidence := LFSR64(seed=Some(789L))(4,0) === 0.U
io.req.bits.is_store := LFSR64(seed=Some(890L))(4,0) === 0.U
io.req.valid := LFSR64(seed=Some(901L))(3,0) === 0.U
}
\ No newline at end of file
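A possible hookup for the fuzzer, which stands in for a real prefetcher during stress testing; the sampled vaddr/paddr sources and the sink port are assumptions:

  val fuzzer = Module(new L1PrefetchFuzzer())
  fuzzer.io.vaddr := loadUnits(0).io.debug_ls.vaddr // assumed: any live vaddr to sample
  fuzzer.io.paddr := loadUnits(0).io.debug_ls.paddr // assumed: any live paddr to sample
  loadUnits(1).io.prefetch_req <> fuzzer.io.req     // assumed prefetch issue port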