未验证 提交 fa9d712c 编写于 作者: Y Yinan Xu 提交者: GitHub

dp2: add a pipeline for load/store (#1597)

* dp2: add a pipeline for load/store

Load/store Dispatch2 has bad timing because it requires the fuType
to distinguish the out ports. This brings timing issues because the
instruction has to read busyTable after the port arbitration.

This commit adds a pipeline in dp2Ls, which may cause performance
degradation. Instructions are dispatched according to out, and they
leave dp2 at the next cycle.

* bump difftest trying to fix vcs
上级 92816bbc
Subproject commit 7204a60b2e52453fdac0877402a8bbc09bec6e44
Subproject commit bafef94dde8cffc036093ef01544481c4efb3f92
......@@ -27,15 +27,7 @@ class PipelineConnectPipe[T <: Data](gen: T) extends Module {
val isFlush = Input(Bool())
})
val valid = RegInit(false.B)
val leftFire = io.in.valid && io.out.ready
when (io.rightOutFire) { valid := false.B }
when (leftFire) { valid := true.B }
when (io.isFlush) { valid := false.B }
io.in.ready := io.out.ready
io.out.bits := RegEnable(io.in.bits, leftFire)
io.out.valid := valid
PipelineConnect.connect(io.in, io.out, io.rightOutFire, io.isFlush, false.B)
}
class PipelineConnectBuffer[T <: Data, FlushT <: Data](gen: T, flushGen: FlushT, flushFunc: (T, FlushT) => Bool)
......@@ -106,6 +98,24 @@ class PipelineConnectBufferWithExtraData[T <: Data, FlushT <: Data, ExtraT <: Da
}
object PipelineConnect {
/** Connects `left` to `right` through one register stage, without instantiating a Module.
  *
  * The registers are created directly by this call: a `valid` register (reset to false)
  * and a data register that captures `left.bits` whenever `leftFire` is asserted.
  *
  * @param left         upstream decoupled interface; its `ready` is driven here
  * @param right        downstream decoupled interface; its `bits` and `valid` are driven here,
  *                     and its `ready` is read
  * @param rightOutFire clears the valid register, unless `leftFire` sets it in the same cycle
  * @param isFlush      clears the valid register, overriding both `rightOutFire` and `leftFire`
  * @param block        when asserted, forces `left.ready` low and suppresses `leftFire`
  */
def connect[T <: Data](
left: DecoupledIO[T],
right: DecoupledIO[T],
rightOutFire: Bool,
isFlush: Bool,
block: Bool
): Unit = {
// Tracks whether the stage currently presents data on `right`.
val valid = RegInit(false.B)
// A new item enters the stage only when upstream is valid, downstream is ready, and not blocked.
val leftFire = left.valid && right.ready && !block
// The order of the following `when` blocks is significant: under Chisel's last-connect
// semantics a later assignment wins, so the priority is isFlush > leftFire > rightOutFire.
when (rightOutFire) { valid := false.B }
when (leftFire) { valid := true.B }
when (isFlush) { valid := false.B }
// Upstream back-pressure mirrors the downstream ready signal, gated by `block`.
left.ready := right.ready && !block
// The payload register only updates when a new item is accepted.
right.bits := RegEnable(left.bits, leftFire)
right.valid := valid
}
def apply[T <: Data](
left: DecoupledIO[T],
right: DecoupledIO[T],
......@@ -114,13 +124,19 @@ object PipelineConnect {
block: Bool = false.B,
moduleName: Option[String] = None
): Unit = {
val pipeline = Module(new PipelineConnectPipe(left.bits))
if(moduleName.nonEmpty) pipeline.suggestName(moduleName.get)
pipeline.io.in <> left
pipeline.io.rightOutFire := rightOutFire
pipeline.io.isFlush := isFlush
pipeline.io.out <> right
pipeline.io.out.ready := right.ready && !block
if (moduleName.isDefined) {
val pipeline = Module(new PipelineConnectPipe(left.bits))
pipeline.suggestName(moduleName.get)
pipeline.io.in <> left
pipeline.io.rightOutFire := rightOutFire
pipeline.io.isFlush := isFlush
pipeline.io.out <> right
pipeline.io.out.ready := right.ready && !block
}
else {
// do not use module here to please DCE
connect(left, right, rightOutFire, isFlush, block)
}
}
def apply[T <: Data, FlushT <: Data](
......@@ -137,7 +153,6 @@ object PipelineConnect {
pipe_buffer.io.flush := flush
}
def apply[T <: Data, FlushT <: Data, ExtraT <: Data](
left: DecoupledIO[T],
right: DecoupledIO[T],
......@@ -161,7 +176,7 @@ object PipelineNext {
isFlush: Bool
): DecoupledIO[T] = {
val right = Wire(Decoupled(left.bits.cloneType))
PipelineConnect(left, right, rightOutFire, isFlush, moduleName = Some("pipeline"))
PipelineConnect(left, right, rightOutFire, isFlush)
right
}
......
......@@ -287,6 +287,7 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
})
val dispatch2 = outer.dispatch2.map(_.module)
dispatch2.foreach(_.io.redirect := io.redirect)
// dirty code for ls dp
dispatch2.foreach(dp => if (dp.io.enqLsq.isDefined) {
......
......@@ -57,6 +57,7 @@ class Dispatch2RsImp(outer: Dispatch2Rs)(implicit p: Parameters) extends LazyMod
val numFpStateRead = outer.numFpStateRead
val io = IO(new Bundle() {
val redirect = Flipped(ValidIO(new Redirect))
val in = Flipped(Vec(outer.numIn, DecoupledIO(new MicroOp)))
val readIntState = if (numIntStateRead > 0) Some(Vec(numIntStateRead, Flipped(new BusyTableReadIO))) else None
val readFpState = if (numFpStateRead > 0) Some(Vec(numFpStateRead, Flipped(new BusyTableReadIO))) else None
......@@ -64,12 +65,12 @@ class Dispatch2RsImp(outer: Dispatch2Rs)(implicit p: Parameters) extends LazyMod
val enqLsq = if (outer.hasLoadStore) Some(Flipped(new LsqEnqIO)) else None
})
val numInFire = PopCount(io.in.map(_.fire()))
val numStaFire = PopCount(io.out.zip(outer.configs).filter(_._2.contains(StaExeUnitCfg)).map(_._1.fire()))
val numStdFire = PopCount(io.out.zip(outer.configs).filter(_._2.contains(StdExeUnitCfg)).map(_._1.fire()))
XSError(numStaFire =/= numStdFire, "sta_fire != std_fire\n")
val numOutFire = PopCount(io.out.map(_.fire())) - numStdFire
XSError(numInFire =/= numOutFire, "in != out\n")
val numInFire = PopCount(io.in.map(_.fire))
val numStaFire = PopCount(io.out.zip(outer.configs).filter(_._2.contains(StaExeUnitCfg)).map(_._1.fire))
val numStdFire = PopCount(io.out.zip(outer.configs).filter(_._2.contains(StdExeUnitCfg)).map(_._1.fire))
// XSError(numStaFire =/= numStdFire, "sta_fire != std_fire\n")
val numOutFire = PopCount(io.out.map(_.fire)) - numStdFire
// XSError(numInFire =/= numOutFire, "in != out\n")
XSPerfAccumulate("in_valid", PopCount(io.in.map(_.valid)))
XSPerfAccumulate("in_fire", PopCount(io.in.map(_.fire)))
......@@ -179,6 +180,8 @@ class Dispatch2RsDistinctImp(outer: Dispatch2Rs)(implicit p: Parameters) extends
in.foreach(_.ready := false.B)
io.in.zip(in).foreach(x => x._1.ready := x._2.ready)
// add one pipeline before out
val s0_out = Wire(io.out.cloneType)
// dirty code for lsq enq
val is_blocked = WireDefault(VecInit(Seq.fill(io.in.length)(false.B)))
if (io.enqLsq.isDefined) {
......@@ -207,7 +210,7 @@ class Dispatch2RsDistinctImp(outer: Dispatch2Rs)(implicit p: Parameters) extends
in(i).bits.lqIdx := enqLsq.resp(i).lqIdx
in(i).bits.sqIdx := enqLsq.resp(i).sqIdx
enqLsq.req(i).valid := in(i).valid && VecInit(io.out.map(_.ready)).asUInt.andR
enqLsq.req(i).valid := in(i).valid && VecInit(s0_out.map(_.ready)).asUInt.andR
}
}
}
......@@ -219,24 +222,49 @@ class Dispatch2RsDistinctImp(outer: Dispatch2Rs)(implicit p: Parameters) extends
val select = SelectOne("naive", canAccept, numOfThisExu)
for ((idx, j) <- outIndices.zipWithIndex) {
val (selectValid, selectIdxOH) = select.getNthOH(j + 1)
io.out(idx).valid := selectValid && !Mux1H(selectIdxOH, is_blocked)
io.out(idx).bits := Mux1H(selectIdxOH, in.map(_.bits))
s0_out(idx).valid := selectValid && !Mux1H(selectIdxOH, is_blocked)
s0_out(idx).bits := Mux1H(selectIdxOH, in.map(_.bits))
// Special case for STD
if (config.contains(StdExeUnitCfg)) {
val sta = io.out(idx - StorePipelineWidth)
sta.valid := io.out(idx).valid
io.out(idx).bits.ctrl.srcType(0) := io.out(idx).bits.ctrl.srcType(1)
io.out(idx).bits.psrc(0) := io.out(idx).bits.psrc(1)
XSPerfAccumulate(s"st_rs_not_ready_$idx", selectValid && (!sta.ready || !io.out(idx).ready))
XSPerfAccumulate(s"sta_rs_not_ready_$idx", selectValid && !sta.ready && io.out(idx).ready)
XSPerfAccumulate(s"std_rs_not_ready_$idx", selectValid && sta.ready && !io.out(idx).ready)
val sta = s0_out(idx - StorePipelineWidth)
sta.valid := s0_out(idx).valid
s0_out(idx).bits.ctrl.srcType(0) := s0_out(idx).bits.ctrl.srcType(1)
s0_out(idx).bits.psrc(0) := s0_out(idx).bits.psrc(1)
XSPerfAccumulate(s"st_rs_not_ready_$idx", selectValid && (!sta.ready || !s0_out(idx).ready))
XSPerfAccumulate(s"sta_rs_not_ready_$idx", selectValid && !sta.ready && s0_out(idx).ready)
XSPerfAccumulate(s"std_rs_not_ready_$idx", selectValid && sta.ready && !s0_out(idx).ready)
}
else {
in.zip(selectIdxOH).foreach{ case (in, v) => when (v) { in.ready := io.out(idx).ready }}
in.zip(selectIdxOH).foreach{ case (in, v) => when (v) { in.ready := s0_out(idx).ready }}
}
}
}
// dispatch is allowed when lsq and rs can accept all the instructions
// TODO: better algorithm here?
if (io.enqLsq.isDefined) {
when (!VecInit(s0_out.map(_.ready)).asUInt.andR || !io.enqLsq.get.canAccept) {
in.foreach(_.ready := false.B)
s0_out.foreach(_.valid := false.B)
}
}
// agreement with dispatch queue: don't enqueue when io.redirect.valid
when (io.redirect.valid) {
s0_out.foreach(_.valid := false.B)
}
// Note: the dispatch queue must not dequeue when io.redirect.valid
val s1_rightFire = Wire(Vec(s0_out.length, Bool()))
val s1_flush = Wire(Vec(s0_out.length, Bool()))
val s1_out = io.out.indices.map(i => PipelineNext(s0_out(i), s1_rightFire(i), s1_flush(i)))
for (i <- io.out.indices) {
io.out(i).valid := s1_out(i).valid
io.out(i).bits := s1_out(i).bits
s1_out(i).ready := !s1_out(i).valid || io.out(i).ready
s1_rightFire(i) := io.out(i).ready
s1_flush(i) := s1_out(i).valid && s1_out(i).bits.robIdx.needFlush(io.redirect)
}
if (io.readIntState.isDefined) {
val stateReadReq = io.out.zip(outer.numIntSrc).flatMap(x => x._1.bits.psrc.take(x._2))
io.readIntState.get.map(_.req).zip(stateReadReq).foreach(x => x._1 := x._2)
......@@ -258,12 +286,4 @@ class Dispatch2RsDistinctImp(outer: Dispatch2Rs)(implicit p: Parameters) extends
}
}
// dispatch is allowed when lsq and rs can accept all the instructions
// TODO: better algorithm here?
if (io.enqLsq.isDefined) {
when (!VecInit(io.out.map(_.ready)).asUInt.andR || !io.enqLsq.get.canAccept) {
in.foreach(_.ready := false.B)
io.out.foreach(_.valid := false.B)
}
}
}
......@@ -61,6 +61,7 @@ case class RSParams
def needScheduledBit: Boolean = hasFeedback || delayedRf || hasMidState
def needBalance: Boolean = exuCfg.get.needLoadBalance
def numSelect: Int = numDeq + (if (oldestFirst._1) 1 else 0)
def dropOnRedirect: Boolean = !(isLoad || isStore || isStoreData)
override def toString: String = {
s"type ${exuCfg.get.name}, size $numEntries, enq $numEnq, deq $numDeq, numSrc $numSrc, fast $numFastWakeup, wakeup $numWakeup"
......@@ -221,8 +222,8 @@ class ReservationStationIO(params: RSParams)(implicit p: Parameters) extends XSB
val jumpPc = Input(UInt(VAddrBits.W))
val jalr_target = Input(UInt(VAddrBits.W))
}) else None
val feedback = if (params.hasFeedback) Some(Vec(params.numDeq,
Flipped(new MemRSFeedbackIO)
val feedback = if (params.hasFeedback) Some(Vec(params.numDeq,
Flipped(new MemRSFeedbackIO)
)) else None
val checkwait = if (params.checkWaitBit) Some(new Bundle {
val stIssuePtr = Input(new SqPtr())
......@@ -262,15 +263,20 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
*/
// enqueue from dispatch
select.io.validVec := statusArray.io.isValid
// agreement with dispatch: don't enqueue when io.redirect.valid
val doEnqueue = VecInit(io.fromDispatch.map(_.fire && !io.redirect.valid))
val enqShouldNotFlushed = io.fromDispatch.map(d => d.fire && !d.bits.robIdx.needFlush(io.redirect))
XSPerfAccumulate("wrong_stall", Mux(io.redirect.valid, PopCount(enqShouldNotFlushed), 0.U))
val doEnqueue = Wire(Vec(params.numEnq, Bool()))
val enqNotFlushed = io.fromDispatch.map(d => d.fire && !d.bits.robIdx.needFlush(io.redirect))
if (params.dropOnRedirect) {
doEnqueue := io.fromDispatch.map(_.fire && !io.redirect.valid)
XSPerfAccumulate("wrong_stall", Mux(io.redirect.valid, PopCount(enqNotFlushed), 0.U))
}
else {
doEnqueue := enqNotFlushed
}
val needFpSource = io.fromDispatch.map(_.bits.needRfRPort(0, true, false))
for (i <- 0 until params.numEnq) {
io.fromDispatch(i).ready := select.io.allocate(i).valid
// for better timing, we update statusArray no matter there's a flush or not
statusArray.io.update(i).enable := io.fromDispatch(i).fire()
statusArray.io.update(i).enable := io.fromDispatch(i).fire
statusArray.io.update(i).addr := select.io.allocate(i).bits
statusArray.io.update(i).data.valid := true.B
statusArray.io.update(i).data.scheduled := params.delayedRf.B && needFpSource(i)
......@@ -764,4 +770,3 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
def size: Int = params.numEntries
}
......@@ -172,7 +172,8 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
val isFlushed = status.valid && status.robIdx.needFlush(io.redirect)
val (deqRespValid, deqRespSucc, deqRespType, deqRespDataInvalidSqIdx) = deqResp(i)
flushedVec(i) := isFlushed || (deqRespValid && deqRespSucc)
val realUpdateValid = updateValid(i) && !io.redirect.valid
val enqFlushed = if (params.dropOnRedirect) io.redirect.valid else statusNext.robIdx.needFlush(io.redirect)
val realUpdateValid = updateValid(i) && !enqFlushed
statusNext.valid := !flushedVec(i) && (realUpdateValid || status.valid)
XSError(updateValid(i) && status.valid, p"should not update a valid entry $i\n")
......@@ -196,14 +197,14 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
if (params.checkWaitBit) {
val blockNotReleased = isAfter(statusNext.sqIdx, io.stIssuePtr)
val storeAddrWaitforIsIssuing = VecInit((0 until StorePipelineWidth).map(i => {
io.memWaitUpdateReq.staIssue(i).valid &&
io.memWaitUpdateReq.staIssue(i).valid &&
io.memWaitUpdateReq.staIssue(i).bits.uop.robIdx.value === statusNext.waitForRobIdx.value
})).asUInt.orR && !statusNext.waitForStoreData && !statusNext.strictWait // is waiting for store addr ready
val storeDataWaitforIsIssuing = VecInit((0 until StorePipelineWidth).map(i => {
io.memWaitUpdateReq.stdIssue(i).valid &&
io.memWaitUpdateReq.stdIssue(i).valid &&
io.memWaitUpdateReq.stdIssue(i).bits.uop.sqIdx.value === statusNext.waitForSqIdx.value
})).asUInt.orR && statusNext.waitForStoreData
statusNext.blocked := Mux(updateValid(i), updateVal(i).blocked, status.blocked) &&
statusNext.blocked := Mux(updateValid(i), updateVal(i).blocked, status.blocked) &&
!storeAddrWaitforIsIssuing &&
!storeDataWaitforIsIssuing &&
blockNotReleased
......
......@@ -850,7 +850,7 @@ class RobImp(outer: Rob)(implicit p: Parameters) extends LazyModuleImp(outer)
exceptionGen.io.enq(i).bits.exceptionVec := ExceptionNO.selectFrontend(io.enq.req(i).bits.cf.exceptionVec)
exceptionGen.io.enq(i).bits.flushPipe := io.enq.req(i).bits.ctrl.flushPipe
exceptionGen.io.enq(i).bits.replayInst := false.B
assert(io.enq.req(i).bits.ctrl.replayInst === false.B)
XSError(canEnqueue(i) && io.enq.req(i).bits.ctrl.replayInst, "enq should not set replayInst")
exceptionGen.io.enq(i).bits.singleStep := io.enq.req(i).bits.ctrl.singleStep
exceptionGen.io.enq(i).bits.crossPageIPFFix := io.enq.req(i).bits.cf.crossPageIPFFix
exceptionGen.io.enq(i).bits.trigger.clear()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册