Unverified · Commit 6008d57d · Authored by: happy-lx · Committed by: GitHub

dcache: optimize the ready signal of missqueue (#1965)

Add a custom arbiter. When multiple sources request the same cache
block address, the arbiter allocates only one entry in the missqueue
but asserts ready for all requests to that cache block address.

This reduces the number of replays of load instructions that cannot
enter the missqueue.
Parent ece4c195
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package utils
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import chisel3.internal.naming.chiselName
import xiangshan.cache._
object ArbiterCtrl {
  def apply(request: Seq[Bool]): Seq[Bool] = request.length match {
    case 0 => Seq()
    case 1 => Seq(true.B)
    case _ => true.B +: request.tail.init.scanLeft(request.head)(_ || _).map(!_)
  }
}
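// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this commit): ArbiterCtrl
// above builds the usual priority-grant vector -- grant(i) is high exactly
// when no lower-indexed request is valid, and grant(0) is always high. The
// pure-Scala model below (ArbiterCtrlModel is a hypothetical name) mirrors
// that computation with software Booleans so the priority order can be
// checked at a glance.
object ArbiterCtrlModel {
  def apply(request: Seq[Boolean]): Seq[Boolean] = request.length match {
    case 0 => Seq()
    case 1 => Seq(true)
    case _ => true +: request.tail.init.scanLeft(request.head)(_ || _).map(!_)
  }

  def main(args: Array[String]): Unit = {
    // requests (false, true, true) -> grants (true, true, false):
    // input 1 wins because input 0 is idle; input 2 loses to input 1.
    println(ArbiterCtrlModel(Seq(false, true, true)))
  }
}
// ---------------------------------------------------------------------------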
/** Hardware module that is used to sequence n producers into 1 consumer.
 * Priority is given to the lower-indexed producer.
 * In addition, if a producer's cache block address matches that of the chosen producer,
 * that producer is also served in the same cycle.
 *
 * @param gen data type; must carry an `addr` field holding the physical address
 * @param n number of inputs
 * @param offset_width cache line offset width
 * @param paddr_bits number of bits in the physical address
 *
 * @example {{{
 * val arb = Module(new ArbiterFilterByCacheLineAddr(new MissReqWoStoreData, 2, blockOffBits, PAddrBits))
 * arb.io.in(0) <> producer0.io.out
 * arb.io.in(1) <> producer1.io.out
 * consumer.io.in <> arb.io.out
 * }}}
 */
@chiselName
class ArbiterFilterByCacheLineAddr[T <: MissReqWoStoreData](val gen: T, val n: Int, val offset_width: Int, val paddr_bits: Int) extends Module {
  val io = IO(new ArbiterIO(gen, n))

  // Priority selection: the lowest-indexed valid input is chosen.
  io.chosen := (n - 1).asUInt
  io.out.bits := io.in(n - 1).bits
  for (i <- n - 2 to 0 by -1) {
    when(io.in(i).valid) {
      io.chosen := i.asUInt
      io.out.bits := io.in(i).bits
    }
  }

  // An input is ready if it wins the priority grant, or if its cache line address
  // (physical-address bits above the block offset) matches that of the chosen input.
  val grant = ArbiterCtrl(io.in.map(_.valid))
  for ((in, g) <- io.in.zip(grant))
    in.ready := (g || (in.bits.addr(paddr_bits - 1, offset_width) === io.out.bits.addr(paddr_bits - 1, offset_width))) && io.out.ready
  io.out.valid := !grant.last || io.in.last.valid
}
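The essential difference from the stock Chisel `Arbiter` is the `ready` assignment: an input is acknowledged either because it wins the priority grant or because its cache-line address (the physical-address bits above the block offset) equals that of the chosen input. Several load misses to the same cache block are therefore drained in one cycle while the miss queue sees only the single chosen request, which is what the commit message means by "allocate only one entry but ready for all same cache block address requests". The sketch below is a small software model of that ready rule under assumed parameters (a hypothetical 6-bit block offset, i.e. 64-byte lines); `CacheLineFilterModel` and `readyMask` are illustrative names, not part of this commit or of XiangShan.

```scala
// Minimal software model of the ready rule above.
// Hypothetical parameters: 64-byte cache lines (offset_width = 6).
object CacheLineFilterModel {
  val offsetWidth = 6

  def lineAddr(paddr: BigInt): BigInt = paddr >> offsetWidth

  // valids/paddrs: one element per arbiter input, lower index = higher priority.
  // Returns the ready mask for a cycle in which the consumer is ready.
  def readyMask(valids: Seq[Boolean], paddrs: Seq[BigInt]): Seq[Boolean] = {
    // chosen: lowest-indexed valid input (defaults to the last input, as in the RTL)
    val chosen = valids.indexWhere(identity) match {
      case -1 => valids.length - 1
      case i  => i
    }
    // grant(i): no lower-indexed input is valid
    val grants = valids.indices.map(i => !valids.take(i).exists(identity))
    valids.indices.map { i =>
      grants(i) || lineAddr(paddrs(i)) == lineAddr(paddrs(chosen))
    }
  }

  def main(args: Array[String]): Unit = {
    // Same 64-byte line (0x80001000 and 0x80001020): both inputs become ready.
    println(readyMask(Seq(true, true), Seq(BigInt(0x80001000L), BigInt(0x80001020L))))
    // Different lines: only the priority winner (input 0) is ready.
    println(readyMask(Seq(true, true), Seq(BigInt(0x80001000L), BigInt(0x80002000L))))
  }
}
```

Running the model prints `List(true, true)` for the same-line case and `List(true, false)` for the different-line case, matching the intended single-allocation, multi-acknowledge behaviour.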
@@ -861,7 +861,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParameters
val MainPipeMissReqPort = 0
// Request
- val missReqArb = Module(new Arbiter(new MissReq, MissReqPortCount))
+ val missReqArb = Module(new ArbiterFilterByCacheLineAddr(new MissReq, MissReqPortCount, blockOffBits, PAddrBits))
missReqArb.io.in(MainPipeMissReqPort) <> mainPipe.io.miss_req
for (w <- 0 until LoadPipelineWidth) { missReqArb.io.in(w + 1) <> ldu(w).io.miss_req }
@@ -879,6 +879,9 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParameters
missReqArb.io.out.ready := false.B
}
XSPerfAccumulate("miss_queue_fire", PopCount(VecInit(missReqArb.io.in.map(_.fire))) >= 1.U)
XSPerfAccumulate("miss_queue_muti_fire", PopCount(VecInit(missReqArb.io.in.map(_.fire))) > 1.U)
// forward missqueue
(0 until LoadPipelineWidth).map(i => io.lsu.forward_mshr(i).connect(missQueue.io.forward(i)))