From 0dc4893d7629b8ec8a9c78fd371177de2d6aac94 Mon Sep 17 00:00:00 2001 From: Yinan Xu Date: Sun, 10 Jul 2022 15:14:08 +0800 Subject: [PATCH] core: optimize redirect timing (#1630) This commit adds separate redirect registers in ExuBlock and MemBlock. They are delayed by one cycle relative to the redirect in CtrlBlock. This helps reduce the fanout of the redirect registers. --- src/main/scala/utils/BitUtils.scala | 14 +++++++++ .../scala/xiangshan/backend/CtrlBlock.scala | 15 +++++----- .../scala/xiangshan/backend/ExuBlock.scala | 12 ++++---- .../scala/xiangshan/backend/MemBlock.scala | 14 +++++---- .../backend/dispatch/DispatchQueue.scala | 3 +- .../xiangshan/backend/exu/WbArbiter.scala | 29 ++++++++++--------- .../scala/xiangshan/backend/rob/Rob.scala | 1 + 7 files changed, 55 insertions(+), 33 deletions(-) diff --git a/src/main/scala/utils/BitUtils.scala b/src/main/scala/utils/BitUtils.scala index 7625fb869..4b91b0e21 100644 --- a/src/main/scala/utils/BitUtils.scala +++ b/src/main/scala/utils/BitUtils.scala @@ -20,6 +20,20 @@ import chisel3._ import chisel3.util._ import scala.math.min +object RegNextWithEnable { + def apply[T <: Data](data: Valid[T], hasInit: Boolean = true): Valid[T] = { + val next = Wire(data.cloneType) + if (hasInit) { + next.valid := RegNext(data.valid, false.B) + } + else { + next.valid := RegNext(data.valid) + } + next.bits := RegEnable(data.bits, data.valid) + next + } +} + class CircularShift(data: UInt) { private def helper(step: Int, isLeft: Boolean): UInt = { if (step == 0) { diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala index 3f21fd61c..f47baf3d2 100644 --- a/src/main/scala/xiangshan/backend/CtrlBlock.scala +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -268,7 +268,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI val exuOutput = WireInit(writeback) val timer = GTimer() for ((wb_next, wb) <- exuOutput.zip(writeback)) { - 
wb_next.valid := RegNext(wb.valid && !wb.bits.uop.robIdx.needFlush(stage2Redirect)) + wb_next.valid := RegNext(wb.valid && !wb.bits.uop.robIdx.needFlush(Seq(stage2Redirect, redirectForExu))) wb_next.bits := RegNext(wb.bits) wb_next.bits.uop.debugInfo.writebackTime := timer } @@ -302,11 +302,12 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI flushRedirectReg.bits := RegEnable(flushRedirect.bits, flushRedirect.valid) val stage2Redirect = Mux(flushRedirect.valid, flushRedirect, redirectGen.io.stage2Redirect) - // val stage3Redirect = Mux(flushRedirectReg.valid, flushRedirectReg, redirectGen.io.stage3Redirect) + // Redirect will be RegNext at ExuBlocks. + val redirectForExu = RegNextWithEnable(stage2Redirect) val exuRedirect = io.exuRedirect.map(x => { val valid = x.valid && x.bits.redirectValid - val killedByOlder = x.bits.uop.robIdx.needFlush(stage2Redirect) + val killedByOlder = x.bits.uop.robIdx.needFlush(Seq(stage2Redirect, redirectForExu)) val delayed = Wire(Valid(new ExuOutput)) delayed.valid := RegNext(valid && !killedByOlder, init = false.B) delayed.bits := RegEnable(x.bits, x.valid) @@ -314,7 +315,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI }) val loadReplay = Wire(Valid(new Redirect)) loadReplay.valid := RegNext(io.memoryViolation.valid && - !io.memoryViolation.bits.robIdx.needFlush(stage2Redirect), + !io.memoryViolation.bits.robIdx.needFlush(Seq(stage2Redirect, redirectForExu)), init = false.B ) loadReplay.bits := RegEnable(io.memoryViolation.bits, io.memoryViolation.valid) @@ -471,9 +472,9 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI dispatch.io.allocPregs <> io.allocPregs dispatch.io.singleStep := RegNext(io.csrCtrl.singlestep) - intDq.io.redirect <> stage2Redirect - fpDq.io.redirect <> stage2Redirect - lsDq.io.redirect <> stage2Redirect + intDq.io.redirect <> redirectForExu + fpDq.io.redirect <> redirectForExu + lsDq.io.redirect <> 
redirectForExu io.dispatch <> intDq.io.deq ++ lsDq.io.deq ++ fpDq.io.deq diff --git a/src/main/scala/xiangshan/backend/ExuBlock.scala b/src/main/scala/xiangshan/backend/ExuBlock.scala index 311fce783..8b63fc159 100644 --- a/src/main/scala/xiangshan/backend/ExuBlock.scala +++ b/src/main/scala/xiangshan/backend/ExuBlock.scala @@ -81,9 +81,11 @@ class ExuBlockImp(outer: ExuBlock)(implicit p: Parameters) extends LazyModuleImp }) override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.fuWriteback)) + val redirect = RegNextWithEnable(io.redirect) + // IO for the scheduler scheduler.io.hartId := io.hartId - scheduler.io.redirect <> io.redirect + scheduler.io.redirect <> redirect scheduler.io.allocPregs <> io.allocPregs scheduler.io.in <> io.in scheduler.io.fastUopOut <> io.fastUopOut @@ -101,7 +103,7 @@ class ExuBlockImp(outer: ExuBlock)(implicit p: Parameters) extends LazyModuleImp } // IO for the function units - fuBlock.io.redirect <> io.redirect + fuBlock.io.redirect <> redirect fuBlock.io.writeback <> io.fuWriteback fuBlock.io.extra <> io.fuExtra @@ -134,7 +136,7 @@ class ExuBlockImp(outer: ExuBlock)(implicit p: Parameters) extends LazyModuleImp scheWb.valid := wb.valid scheWb.bits := wb.bits if (cfg.hasFastUopOut) { - val isFlushed = wb.bits.uop.robIdx.needFlush(io.redirect) + val isFlushed = wb.bits.uop.robIdx.needFlush(redirect) scheWb.valid := RegNext(wb.valid && !isFlushed) scheWb.bits.uop := RegNext(wb.bits.uop) } @@ -164,7 +166,7 @@ class ExuBlockImp(outer: ExuBlock)(implicit p: Parameters) extends LazyModuleImp wbOut.bits.uop := fastWakeup.bits } else { - val isFlushed = fastWakeup.bits.robIdx.needFlush(io.redirect) + val isFlushed = fastWakeup.bits.robIdx.needFlush(redirect) wbOut.valid := RegNext(fastWakeup.valid && !isFlushed) wbOut.bits.uop := RegNext(fastWakeup.bits) } @@ -188,7 +190,7 @@ class ExuBlockImp(outer: ExuBlock)(implicit p: Parameters) extends LazyModuleImp require(wakeupIdx.length == wbIdx.length) for ((i, j) <- 
wakeupIdx.zip(wbIdx)) { val scheWb = scheduler.io.writeback(j) - val isFlushed = scheduler.io.fastUopOut(i).bits.robIdx.needFlush(io.redirect) + val isFlushed = scheduler.io.fastUopOut(i).bits.robIdx.needFlush(redirect) scheWb.valid := RegNext(scheduler.io.fastUopOut(i).valid && !isFlushed) scheWb.bits.uop := RegNext(scheduler.io.fastUopOut(i).bits) } diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index aeab88acb..b9bd6bfd3 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -101,6 +101,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.writeback)) + val redirect = RegNextWithEnable(io.redirect) + val dcache = outer.dcache.module val uncache = outer.uncache.module @@ -248,7 +250,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) // LoadUnit for (i <- 0 until exuParameters.LduCnt) { - loadUnits(i).io.redirect <> io.redirect + loadUnits(i).io.redirect <> redirect loadUnits(i).io.feedbackSlow <> io.rsfeedback(i).feedbackSlow loadUnits(i).io.feedbackFast <> io.rsfeedback(i).feedbackFast loadUnits(i).io.rsIdx := io.rsfeedback(i).rsIdx @@ -333,12 +335,12 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) for (i <- 0 until exuParameters.StuCnt) { val stu = storeUnits(i) - stdExeUnits(i).io.redirect <> io.redirect + stdExeUnits(i).io.redirect <> redirect stdExeUnits(i).io.fromInt <> io.issue(i + exuParameters.LduCnt + exuParameters.StuCnt) stdExeUnits(i).io.fromFp := DontCare stdExeUnits(i).io.out := DontCare - stu.io.redirect <> io.redirect + stu.io.redirect <> redirect stu.io.feedbackSlow <> io.rsfeedback(exuParameters.LduCnt + i).feedbackSlow stu.io.rsIdx <> io.rsfeedback(exuParameters.LduCnt + i).rsIdx // NOTE: just for dtlb's perf cnt @@ -437,7 +439,7 @@ class MemBlockImp(outer: MemBlock) extends 
LazyModuleImp(outer) // Lsq lsq.io.rob <> io.lsqio.rob lsq.io.enq <> io.enqLsq - lsq.io.brqRedirect <> io.redirect + lsq.io.brqRedirect <> redirect io.memoryViolation <> lsq.io.rollback lsq.io.uncache <> uncache.io.lsq // delay dcache refill for 1 cycle for better timing @@ -499,7 +501,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) st_data_atomics(i) -> stData(i).bits)) atomicsUnit.io.rsIdx := Mux1H(Seq.tabulate(exuParameters.StuCnt)(i => st_atomics(i) -> io.rsfeedback(atomic_rs(i)).rsIdx)) - atomicsUnit.io.redirect <> io.redirect + atomicsUnit.io.redirect <> redirect // TODO: complete amo's pmp support val amoTlb = dtlb_ld(0).requestor(0) @@ -533,7 +535,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer) // Exception address is used serveral cycles after flush. // We delay it by 10 cycles to ensure its flush safety. val atomicsException = RegInit(false.B) - when (DelayN(io.redirect.valid, 10) && atomicsException) { + when (DelayN(redirect.valid, 10) && atomicsException) { atomicsException := false.B }.elsewhen (atomicsUnit.io.exceptionAddr.valid) { atomicsException := true.B diff --git a/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala b/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala index ee9f57c37..755c665c0 100644 --- a/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala +++ b/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala @@ -68,7 +68,6 @@ class DispatchQueue(size: Int, enqnum: Int, deqnum: Int)(implicit p: Parameters) val isTrueEmpty = !VecInit(stateEntries.map(_ === s_valid)).asUInt.orR val canEnqueue = allowEnqueue - val canActualEnqueue = canEnqueue && !io.redirect.valid /** * Part 1: update states and uops when enqueue, dequeue, commit, redirect/replay @@ -87,7 +86,7 @@ class DispatchQueue(size: Int, enqnum: Int, deqnum: Int)(implicit p: Parameters) val enqIndexOH = (0 until enqnum).map(i => tailPtrOHVec(PopCount(io.enq.needAlloc.take(i)))) for (i <- 0 until size) 
{ val validVec = io.enq.req.map(_.valid).zip(enqIndexOH).map{ case (v, oh) => v && oh(i) } - when (VecInit(validVec).asUInt.orR && canActualEnqueue) { + when (VecInit(validVec).asUInt.orR && canEnqueue) { data(i) := Mux1H(validVec, io.enq.req.map(_.bits)) stateEntries(i) := s_valid } diff --git a/src/main/scala/xiangshan/backend/exu/WbArbiter.scala b/src/main/scala/xiangshan/backend/exu/WbArbiter.scala index ab27107b1..58d06b683 100644 --- a/src/main/scala/xiangshan/backend/exu/WbArbiter.scala +++ b/src/main/scala/xiangshan/backend/exu/WbArbiter.scala @@ -75,7 +75,7 @@ class ExuWbArbiter(n: Int, hasFastUopOut: Boolean, fastVec: Seq[Boolean])(implic val sel = VecInit(io.in.map(_.fire)).asUInt io.out.bits := Mux1H(RegNext(sel), dataVec) // uop comes at the same cycle with valid and only RegNext is needed. - io.out.bits.uop := RegNext(uop) + io.out.bits.uop := RegEnable(uop, ctrl_arb.io.out.valid) } } @@ -156,6 +156,8 @@ class WbArbiterImp(outer: WbArbiter)(implicit p: Parameters) extends LazyModuleI val out = Vec(outer.numOutPorts, ValidIO(new ExuOutput)) }) + val redirect = RegNextWithEnable(io.redirect) + val exclusiveIn = outer.exclusivePorts.map(io.in(_)) val sharedIn = outer.sharedPorts.map(io.in(_)) @@ -168,12 +170,12 @@ class WbArbiterImp(outer: WbArbiter)(implicit p: Parameters) extends LazyModuleI require(!hasFastUopOut || !outer.needRegNext(i)) if (hasFastUopOut) { // When hasFastUopOut, only uop comes at the same cycle with valid. 
- out.valid := RegNext(in.valid && !in.bits.uop.robIdx.needFlush(io.redirect)) - out.bits.uop := RegNext(in.bits.uop) + out.valid := RegNext(in.valid && !in.bits.uop.robIdx.needFlush(redirect)) + out.bits.uop := RegEnable(in.bits.uop, in.valid) } if (outer.needRegNext(i)) { - out.valid := RegNext(in.valid && !in.bits.uop.robIdx.needFlush(io.redirect)) - out.bits := RegNext(in.bits) + out.valid := RegNext(in.valid && !in.bits.uop.robIdx.needFlush(redirect)) + out.bits := RegEnable(in.bits, in.valid) } in.ready := true.B } @@ -190,7 +192,7 @@ class WbArbiterImp(outer: WbArbiter)(implicit p: Parameters) extends LazyModuleI val flushFunc = (o: ExuOutput, r: Valid[Redirect]) => o.uop.robIdx.needFlush(r) if (outer.cfgHasFast(i)) { val ctrl_pipe = Wire(io.in(i).cloneType) - val buffer = PipelineConnect(io.in(i), ctrl_pipe, flushFunc, io.redirect, io.in(i).bits, 1) + val buffer = PipelineConnect(io.in(i), ctrl_pipe, flushFunc, redirect, io.in(i).bits, 1) buffer.extra.in := io.in(i).bits val buffer_out = Wire(io.in(i).cloneType) ctrl_pipe.ready := buffer_out.ready @@ -200,7 +202,7 @@ class WbArbiterImp(outer: WbArbiter)(implicit p: Parameters) extends LazyModuleI buffer_out } else { - PipelineNext(io.in(i), flushFunc, io.redirect) + PipelineNext(io.in(i), flushFunc, redirect) } } else io.in(i) @@ -208,7 +210,7 @@ class WbArbiterImp(outer: WbArbiter)(implicit p: Parameters) extends LazyModuleI val hasFastUopOut = outer.hasFastUopOut(portIndex) val fastVec = outer.hasFastUopOutVec(portIndex) val arb = Module(new ExuWbArbiter(shared.size, hasFastUopOut, fastVec)) - arb.io.redirect <> io.redirect + arb.io.redirect <> redirect arb.io.in <> shared out.valid := arb.io.out.valid out.bits := arb.io.out.bits @@ -347,8 +349,8 @@ class Wb2Ctrl(configs: Seq[ExuConfig])(implicit p: Parameters) extends LazyModul module.io.in := sink._1.zip(sink._2).zip(sourceMod).flatMap(x => x._1._1.writebackSource1(x._2)(x._1._2)) } - lazy val module = new LazyModuleImp(this) - with 
HasWritebackSourceImp + lazy val module = new LazyModuleImp(this) + with HasWritebackSourceImp with HasXSParameter { val io = IO(new Bundle { @@ -357,13 +359,14 @@ class Wb2Ctrl(configs: Seq[ExuConfig])(implicit p: Parameters) extends LazyModul val out = Vec(configs.length, ValidIO(new ExuOutput)) val delayedLoadError = Vec(LoadPipelineWidth, Input(Bool())) // Dirty fix of data ecc error timing }) + val redirect = RegNextWithEnable(io.redirect) for (((out, in), config) <- io.out.zip(io.in).zip(configs)) { out.valid := in.fire out.bits := in.bits if (config.hasFastUopOut || config.hasLoadError) { - out.valid := RegNext(in.fire && !in.bits.uop.robIdx.needFlush(io.redirect)) - out.bits.uop := RegNext(in.bits.uop) + out.valid := RegNext(in.fire && !in.bits.uop.robIdx.needFlush(redirect)) + out.bits.uop := RegEnable(in.bits.uop, in.fire) } } @@ -374,7 +377,7 @@ class Wb2Ctrl(configs: Seq[ExuConfig])(implicit p: Parameters) extends LazyModul ){ // overwrite load exception writeback out.bits.uop.cf.exceptionVec(loadAccessFault) := delayed_error || - RegNext(in.bits.uop.cf.exceptionVec(loadAccessFault)) + RegEnable(in.bits.uop.cf.exceptionVec(loadAccessFault), in.valid) } } diff --git a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index a48f2493f..477f29730 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -35,6 +35,7 @@ class RobPtr(implicit p: Parameters) extends CircularQueuePtr[RobPtr]( redirect.valid && (flushItself || isAfter(this, redirect.bits.robIdx)) } + def needFlush(redirect: Seq[Valid[Redirect]]): Bool = VecInit(redirect.map(needFlush)).asUInt.orR } object RobPtr { -- GitLab