未验证 提交 65e2f311 编写于 作者: Y Yinan Xu 提交者: GitHub

rs, fma: separate fadd and fmul issue (#1042)

This commit splits FMA instructions into FMUL and FADD for execution.

When the first two operands are ready, an FMA instruction can be issued
and the intermediate result will be written back to RS after two cycles.
Since RS currently has DataArray to store the operands, we reuse it to
store the intermediate FMUL result.

When an FMA enters deq stage and leaves RS with only two operands, we
mark it as midState ready at this clock cycle T0.

If the instruction's third operand becomes ready at T0, it can be
selected at T1 and issued at T2, when FMUL is also finished. The
intermediate result will be sent to FADD instead of writing back to RS.
If the instruction's third operand becomes ready later, we have the data
in DataArray or at DataArray's write port. Thus, it's ok to set midState
ready at clock cycle T0.

The separation of FMA instructions will increase issue pressure since RS
needs to issue more times. However, it largely reduces FMA latency if many
FMA instructions are waiting for the third operand.
上级 7bb7bf3d
......@@ -73,6 +73,9 @@ class ExuBlockImp(outer: ExuBlock)(implicit p: Parameters) extends LazyModuleImp
// the scheduler issues instructions to function units
scheduler.io.issue <> fuBlock.io.issue
if (scheduler.io.fmaMid.isDefined) {
scheduler.io.fmaMid.get <> fuBlock.io.fmaMid.get
}
// IO for the function units
fuBlock.io.redirect <> io.redirect
......
......@@ -23,6 +23,7 @@ import utils._
import xiangshan._
import xiangshan.backend.exu._
import xiangshan.backend.fu.CSRFileIO
import xiangshan.backend.fu.fpu.FMAMidResultIO
import xiangshan.mem.StoreDataBundle
class WakeUpBundle(numFast: Int, numSlow: Int)(implicit p: Parameters) extends XSBundle {
......@@ -101,7 +102,7 @@ class FUBlockExtraIO(configs: Seq[(ExuConfig, Int)])(implicit p: Parameters) ext
class FUBlock(configs: Seq[(ExuConfig, Int)])(implicit p: Parameters) extends XSModule {
val numIn = configs.map(_._2).sum
val numFma = configs.filter(_._1 == FmacExeUnitCfg).map(_._2).sum
val io = IO(new Bundle {
val redirect = Flipped(ValidIO(new Redirect))
......@@ -112,6 +113,7 @@ class FUBlock(configs: Seq[(ExuConfig, Int)])(implicit p: Parameters) extends XS
val writeback = Vec(numIn, DecoupledIO(new ExuOutput))
// misc
val extra = new FUBlockExtraIO(configs)
val fmaMid = if (numFma > 0) Some(Vec(numFma, new FMAMidResultIO)) else None
})
val exeUnits = configs.map(c => Seq.fill(c._2)(ExeUnit(c._1))).reduce(_ ++ _)
......@@ -155,6 +157,10 @@ class FUBlock(configs: Seq[(ExuConfig, Int)])(implicit p: Parameters) extends XS
io.extra.stData.get := VecInit(exeUnits.map(_.stData).filter(_.isDefined).map(_.get))
}
if (io.fmaMid.isDefined) {
io.fmaMid.get <> exeUnits.map(_.fmaMid).filter(_.isDefined).map(_.get)
}
for ((iss, i) <- io.issue.zipWithIndex) {
XSPerfAccumulate(s"issue_count_$i", iss.fire())
}
......
......@@ -24,6 +24,7 @@ import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp}
import xiangshan._
import utils._
import xiangshan.backend.exu.ExuConfig
import xiangshan.backend.fu.fpu.FMAMidResultIO
import xiangshan.backend.issue.{ReservationStation, ReservationStationWrapper}
import xiangshan.backend.regfile.{Regfile, RfReadPort, RfWritePort}
import xiangshan.mem.{SqPtr, StoreDataBundle}
......@@ -192,6 +193,8 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
new SchedulerExtraIO().asInstanceOf[this.type]
}
val numFma = outer.reservationStations.map(_.module.io.fmaMid.getOrElse(Seq()).length).sum
val io = IO(new Bundle {
// global control
val redirect = Flipped(ValidIO(new Redirect))
......@@ -205,8 +208,13 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
val fastUopIn = Vec(intRfWritePorts + fpRfWritePorts, Flipped(ValidIO(new MicroOp)))
// feedback ports
val extra = new SchedulerExtraIO
val fmaMid = if (numFma > 0) Some(Vec(numFma, Flipped(new FMAMidResultIO))) else None
})
if (io.fmaMid.isDefined) {
io.fmaMid.get <> outer.reservationStations.flatMap(_.module.io.fmaMid.getOrElse(Seq()))
}
def extraReadRf(numRead: Seq[Int]): Seq[UInt] = {
require(numRead.length == io.allocate.length)
val enq = io.allocate.map(_.bits.psrc)
......
......@@ -20,10 +20,10 @@ package xiangshan.backend.exu
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils.{XSDebug, XSPerfAccumulate}
import utils._
import xiangshan._
import xiangshan.backend.Std
import xiangshan.backend.fu.fpu.FPUSubModule
import xiangshan.backend.fu.fpu.{FPUSubModule, FMA}
import xiangshan.backend.fu.{CSR, FUWithRedirect, Fence, FenceToSbuffer}
class FenceIO(implicit p: Parameters) extends XSBundle {
......@@ -83,6 +83,12 @@ class ExeUnit(config: ExuConfig)(implicit p: Parameters) extends Exu(config: Exu
io.out.bits.fflags := Mux1H(fflagsSel.map(_._1), fflagsSel.map(_._2))
}
// Collect the FMA function units of this ExeUnit with a type-safe pattern
// match instead of an isInstanceOf/asInstanceOf pair.
val fmaModules = functionUnits.collect { case fma: FMA => fma }
if (fmaModules.nonEmpty) {
  // Only a single intermediate-result port (fmaMid) exists per ExeUnit,
  // so more than one FMA cannot be wired up here.
  require(fmaModules.length == 1, s"expected exactly one FMA in this ExeUnit, found ${fmaModules.length}")
  // NOTE(review): fmaMid is declared as an Option; this relies on it being
  // defined whenever an FMA is present in functionUnits — confirm.
  fmaModules.head.midResult <> fmaMid.get
}
if (config.fuConfigs.contains(stdCfg)) {
val std = functionUnits.collectFirst {
case s: Std => s
......
......@@ -22,6 +22,7 @@ import chisel3.util._
import utils.XSPerfAccumulate
import xiangshan._
import xiangshan.backend.fu._
import xiangshan.backend.fu.fpu.FMAMidResultIO
import xiangshan.mem.StoreDataBundle
case class ExuParameters
......@@ -110,6 +111,7 @@ abstract class Exu(val config: ExuConfig)(implicit p: Parameters) extends XSModu
val csrio = if (config == JumpCSRExeUnitCfg) Some(IO(new CSRFileIO)) else None
val fenceio = if (config == JumpCSRExeUnitCfg) Some(IO(new FenceIO)) else None
val frm = if (config == FmacExeUnitCfg || config == FmiscExeUnitCfg) Some(IO(Input(UInt(3.W)))) else None
val fmaMid = if (config == FmacExeUnitCfg) Some(IO(new FMAMidResultIO)) else None
val stData = if (config == StdExeUnitCfg) Some(IO(ValidIO(new StoreDataBundle))) else None
val functionUnits = config.fuConfigs.map(cfg => {
......
......@@ -28,6 +28,9 @@ class MulToAddIO(val ftypes: Seq[FPU.FType])(implicit val p: Parameters) extends
val mul_out = MixedVec(ftypes.map(t => new FMULToFADD(t.expWidth, t.precision)))
val addend = UInt(ftypes.map(_.len).max.W)
val uop = new MicroOp
def getFloat = mul_out.head
def getDouble = mul_out.last
}
class FMUL_pipe(val mulLat: Int = 2)(implicit p: Parameters)
......@@ -159,7 +162,35 @@ class FADD_pipe(val addLat: Int = 2)(implicit p: Parameters) extends FPUPipeline
fflags := stages.last.exc
}
// Intermediate FMUL result bundle, sized with the parameters of the last
// entry of FPU.ftypes (the same entry MulToAddIO.getDouble selects).
// It is an FMULToFADD with helpers to convert to and from the FMULToFADD
// sized with the first entry of FPU.ftypes (the one getFloat selects).
class FMAMidResult extends FMULToFADD(FPU.ftypes.last.expWidth, FPU.ftypes.last.precision) {
// Returns a new Wire, sized with FPU.ftypes.head, whose fp_prod fields and
// inter_flags are driven from this bundle's fields of the same name.
// NOTE(review): the source fields are presumably wider than the destination
// ones, so this relies on how := handles the width mismatch — confirm.
def toFloat: FMULToFADD = {
val floatMidResult = Wire(new FMULToFADD(FPU.ftypes.head.expWidth, FPU.ftypes.head.precision))
floatMidResult.fp_prod.sign := fp_prod.sign
floatMidResult.fp_prod.exp := fp_prod.exp
floatMidResult.fp_prod.sig := fp_prod.sig
floatMidResult.inter_flags := inter_flags
floatMidResult
}
// Drives this bundle's fp_prod fields and inter_flags from `float` and
// returns this bundle. These are connections, not a copy: `this` must be
// a hardware node that may be assigned at the call site.
def fromFloat(float: FMULToFADD): FMULToFADD = {
fp_prod.sign := float.fp_prod.sign
fp_prod.exp := float.fp_prod.exp
fp_prod.sig := float.fp_prod.sig
inter_flags := float.inter_flags
this
}
}
// Port bundle for exchanging the intermediate FMUL result.
// Directions are given from the side that instantiates it unflipped
// (the FMA module); the reservation station uses it Flipped.
class FMAMidResultIO extends Bundle {
// Intermediate result coming into the unit, with a valid bit.
val in = Flipped(ValidIO(new FMAMidResult))
// Intermediate result produced by the unit, with a valid bit.
val out = ValidIO(new FMAMidResult)
// Input flag; the reservation station drives it with the negation of the
// registered allSrcReady of the issued entry.
val waitForAdd = Input(Bool())
}
class FMA(implicit p: Parameters) extends FPUSubModule {
val midResult = IO(new FMAMidResultIO)
override val dataModule = null
val mul_pipe = Module(new FMUL_pipe())
val add_pipe = Module(new FADD_pipe())
......@@ -167,33 +198,46 @@ class FMA(implicit p: Parameters) extends FPUSubModule {
mul_pipe.io.redirectIn := io.redirectIn
mul_pipe.io.flushIn := io.flushIn
mul_pipe.rm := rm
add_pipe.io.redirectIn := io.redirectIn
add_pipe.io.flushIn := io.flushIn
add_pipe.rm := rm
val fpCtrl = io.in.bits.uop.ctrl.fpu
mul_pipe.rm := rm
mul_pipe.io.in <> io.in
mul_pipe.io.in.valid := io.in.valid && !fpCtrl.isAddSub
mul_pipe.io.in.valid := io.in.valid && !fpCtrl.isAddSub && !midResult.in.valid
// For better timing, we let out.valid be true even if it's flushed.
val isFMA = mul_pipe.io.out.valid && mul_pipe.io.out.bits.uop.ctrl.fpu.ren3
val waitAddOperand = RegEnable(midResult.waitForAdd, !mul_pipe.io.out.valid || mul_pipe.io.out.ready)
val isFMA = mul_pipe.io.out.valid && mul_pipe.io.out.bits.uop.ctrl.fpu.ren3 && !waitAddOperand
// However, when sending instructions to add_pipe, we need to determine whether it's flushed.
val isFMAReg = RegNext(isFMA && !mul_pipe.io.out.bits.uop.roqIdx.needFlush(io.redirectIn, io.flushIn))
val mulFlushed = mul_pipe.io.out.bits.uop.roqIdx.needFlush(io.redirectIn, io.flushIn)
val isFMAReg = RegNext(isFMA && !mulFlushed)
add_pipe.mulToAdd <> mul_pipe.toAdd
add_pipe.isFMA := isFMAReg
add_pipe.rm := rm
midResult.out.valid := RegNext(mul_pipe.io.out.valid && waitAddOperand && !mulFlushed)
midResult.out.bits := mul_pipe.toAdd.getDouble
when (RegNext(mul_pipe.io.out.bits.uop.ctrl.fpu.typeTagIn === FPU.S)) {
midResult.out.bits.fromFloat(mul_pipe.toAdd.getFloat)
}
when (midResult.in.valid && !isFMAReg) {
add_pipe.mulToAdd.getDouble := midResult.in.bits
add_pipe.mulToAdd.getFloat := midResult.in.bits.toFloat
add_pipe.mulToAdd.addend := io.in.bits.src(2)
add_pipe.mulToAdd.uop := io.in.bits.uop
}
// For FADD, it accepts instructions from io.in and FMUL.
// When FMUL gives an FMA, FADD accepts this instead of io.in.
// Since FADD gets FMUL data from add_pipe.mulToAdd, only uop needs Mux.
add_pipe.io.in.valid := io.in.valid && fpCtrl.isAddSub || isFMAReg
add_pipe.io.in.valid := io.in.valid && (fpCtrl.isAddSub || midResult.in.valid) || isFMAReg
add_pipe.io.in.bits.src := io.in.bits.src
add_pipe.io.in.bits.uop := Mux(isFMAReg, add_pipe.mulToAdd.uop, io.in.bits.uop)
add_pipe.isFMA := io.in.valid && midResult.in.valid || isFMAReg
// When the in uop is Add/Sub, we check FADD, otherwise fmul is checked.
io.in.ready := Mux(fpCtrl.isAddSub,
io.in.ready := Mux(fpCtrl.isAddSub || midResult.in.valid,
!isFMAReg && add_pipe.io.in.ready,
mul_pipe.io.in.ready
)
......@@ -202,7 +246,7 @@ class FMA(implicit p: Parameters) extends FPUSubModule {
// (1) It always accept FMA from FADD (if an FMA wants FMUL, it's never blocked).
// (2) It has lower writeback arbitration priority than FADD (and may be blocked when FMUL.out.valid).
XSError(isFMA && !add_pipe.io.in.ready, "FMA should not be blocked\n")
mul_pipe.io.out.ready := isFMA || (io.out.ready && !add_pipe.io.out.valid)
mul_pipe.io.out.ready := isFMA || (io.out.ready && !add_pipe.io.out.valid) || waitAddOperand
add_pipe.io.out.ready := io.out.ready
io.out.bits.uop := Mux(add_pipe.io.out.valid,
......@@ -217,5 +261,6 @@ class FMA(implicit p: Parameters) extends FPUSubModule {
add_pipe.fflags,
mul_pipe.fflags
)
io.out.valid := add_pipe.io.out.valid || (mul_pipe.io.out.valid && !isFMA)
io.out.valid := add_pipe.io.out.valid || (mul_pipe.io.out.valid && !isFMA && !waitAddOperand)
}
......@@ -56,6 +56,7 @@ class DataArrayIO(params: RSParams)(implicit p: Parameters) extends XSBundle {
val write = Vec(params.numEnq, new DataArrayWriteIO(params.numEntries, params.numSrc, params.dataBits))
val multiWrite = Vec(params.numWakeup, new DataArrayMultiWriteIO(params.numEntries, params.numSrc, params.dataBits))
val delayedWrite = if (params.delayedRf) Vec(params.numEnq, Flipped(ValidIO(UInt(params.dataBits.W)))) else null
val partialWrite = if (params.hasMidState) Vec(params.numDeq, new DataArrayWriteIO(params.numEntries, params.numSrc - 1, params.dataBits)) else null
override def cloneType: DataArrayIO.this.type =
new DataArrayIO(params).asInstanceOf[this.type]
......@@ -69,9 +70,13 @@ class DataArray(params: RSParams)(implicit p: Parameters) extends XSModule {
val delayedWaddr = if (i == 1 && params.delayedRf) RegNext(VecInit(io.write.map(_.addr))) else Seq()
val delayedWdata = if (i == 1 && params.delayedRf) io.delayedWrite.map(_.bits) else Seq()
val wen = io.write.map(w => w.enable && w.mask(i)) ++ io.multiWrite.map(_.enable) ++ delayedWen
val waddr = io.write.map(_.addr) ++ io.multiWrite.map(_.addr(i)) ++ delayedWaddr
val wdata = io.write.map(_.data(i)) ++ io.multiWrite.map(_.data) ++ delayedWdata
val partialWen = if (i < 2 && params.hasMidState) io.partialWrite.map(_.enable) else Seq()
val partialWaddr = if (i < 2 && params.hasMidState) io.partialWrite.map(_.addr) else Seq()
val partialWdata = if (i < 2 && params.hasMidState) io.partialWrite.map(_.data(i)) else Seq()
val wen = io.write.map(w => w.enable && w.mask(i)) ++ io.multiWrite.map(_.enable) ++ delayedWen ++ partialWen
val waddr = io.write.map(_.addr) ++ io.multiWrite.map(_.addr(i)) ++ delayedWaddr ++ partialWaddr
val wdata = io.write.map(_.data(i)) ++ io.multiWrite.map(_.data) ++ delayedWdata ++ partialWdata
val dataModule = Module(new SyncRawDataModuleTemplate(UInt(params.dataBits.W), params.numEntries, io.read.length, wen.length))
dataModule.io.rvec := VecInit(io.read.map(_.addr))
......
......@@ -24,6 +24,7 @@ import xiangshan._
import utils._
import xiangshan.backend.exu.ExuConfig
import xiangshan.backend.fu.FuConfig
import xiangshan.backend.fu.fpu.{FMAMidResult, FMAMidResultIO}
import xiangshan.mem.{SqPtr, StoreDataBundle}
import scala.math.max
......@@ -55,7 +56,8 @@ case class RSParams
def indexWidth: Int = log2Up(numEntries)
// oldestFirst: (Enable_or_not, Need_balance, Victim_index)
def oldestFirst: (Boolean, Boolean, Int) = (true, !isLoad, if (isLoad) 0 else numDeq - 1)
def needScheduledBit: Boolean = hasFeedback || delayedRf
def hasMidState: Boolean = exuCfg.get == FmacExeUnitCfg
def needScheduledBit: Boolean = hasFeedback || delayedRf || hasMidState
def needBalance: Boolean = exuCfg.get.needLoadBalance
override def toString: String = {
......@@ -182,6 +184,9 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with
if (io.load.isDefined) {
io.load.get.fastMatch <> rs.flatMap(_.io.load.get.fastMatch)
}
if (io.fmaMid.isDefined) {
io.fmaMid.get <> rs.flatMap(_.io.fmaMid.get)
}
}
var fastWakeupIdx = 0
......@@ -231,6 +236,7 @@ class ReservationStationIO(params: RSParams)(implicit p: Parameters) extends XSB
val load = if (params.isLoad) Some(new Bundle() {
val fastMatch = Vec(params.numDeq, Output(UInt(exuParameters.LduCnt.W)))
}) else None
val fmaMid = if (params.exuCfg.get == FmacExeUnitCfg) Some(Vec(params.numDeq, Flipped(new FMAMidResultIO))) else None
override def cloneType: ReservationStationIO.this.type =
new ReservationStationIO(params).asInstanceOf[this.type]
......@@ -270,6 +276,7 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
statusArray.io.update(i).data.blocked := params.checkWaitBit.B && io.fromDispatch(i).bits.cf.loadWaitBit
statusArray.io.update(i).data.credit := Mux(params.delayedRf.B && needFpSource(i), 2.U, 0.U)
statusArray.io.update(i).data.srcState := VecInit(io.fromDispatch(i).bits.srcIsReady.take(params.numSrc))
statusArray.io.update(i).data.midState := false.B
statusArray.io.update(i).data.psrc := VecInit(io.fromDispatch(i).bits.psrc.take(params.numSrc))
statusArray.io.update(i).data.srcType := VecInit(io.fromDispatch(i).bits.ctrl.srcType.take(params.numSrc))
statusArray.io.update(i).data.roqIdx := io.fromDispatch(i).bits.roqIdx
......@@ -351,7 +358,9 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
s1_out(i).valid := issueVec(i).valid && !s1_out(i).bits.uop.roqIdx.needFlush(io.redirect, io.flush)
statusArray.io.issueGranted(i).valid := issueVec(i).valid && s1_out(i).ready
statusArray.io.issueGranted(i).bits := issueVec(i).bits
statusArray.io.deqResp(i).valid := issueVec(i).valid && s1_out(i).ready
// For FMAs that can be scheduled multiple times, only when
// all source operands are ready we dequeue the instruction.
statusArray.io.deqResp(i).valid := issueVec(i).valid && s1_out(i).ready && statusArray.io.allSrcReady(i)
statusArray.io.deqResp(i).bits.rsMask := issueVec(i).bits
statusArray.io.deqResp(i).bits.success := s2_deq(i).ready
statusArray.io.deqResp(i).bits.resptype := DontCare
......@@ -375,6 +384,7 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
XSPerfAccumulate(s"fast_blocked_$i", issueVec(i).valid && fuCheck && !s1_out(i).ready)
}
}
statusArray.io.updateMidState := 0.U
// select whether the source is from (whether regfile or imm)
// for read-after-issue, it's done over the selected uop
......@@ -489,6 +499,12 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
io.feedback.get(i).rsIdx := RegEnable(OHToUInt(issueVec(i).bits), pipeline_fire)
io.feedback.get(i).isFirstIssue := RegEnable(statusArray.io.isFirstIssue(i), pipeline_fire)
}
if (params.hasMidState) {
io.fmaMid.get(i).waitForAdd := !RegEnable(statusArray.io.allSrcReady(i), pipeline_fire)
io.fmaMid.get(i).in.valid := !RegEnable(statusArray.io.isFirstIssue(i), pipeline_fire)
XSPerfAccumulate(s"fma_partial2_issue_$i", io.deq(i).fire && io.fmaMid.get(i).waitForAdd)
XSPerfAccumulate(s"fma_final_issue_$i", io.deq(i).fire && io.fmaMid.get(i).in.valid)
}
s2_deq(i).ready := io.deq(i).ready
io.deq(i).valid := s2_deq(i).valid
io.deq(i).bits := s2_deq(i).bits
......@@ -559,6 +575,87 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
}
}
if (params.hasMidState) {
// For FMA instructions whose third operand is not ready, once they are successfully issued (T0),
// the FMUL intermediate result will be ready in two clock cycles (T2).
// If the third operand is ready at T2, this instruction will be selected in T3 and issued at T4.
// Note that at cycle T4, FMUL finishes as well and it is able to proceed to FADD.
// Thus, we can set the midState to true two cycles earlier at T0 and forward the result if possible.
val midFinished2 = io.fmaMid.get.zip(io.deq).map(x => x._1.waitForAdd && x._2.fire)
val issuedRsIdxOH = statusArray.io.issueGranted.map(iss => RegEnable(iss.bits, iss.valid))
val updateMid = midFinished2.zip(issuedRsIdxOH).map(x => Mux(x._1, x._2, 0.U)).reduce(_ | _)
statusArray.io.updateMidState := updateMid
// FMUL intermediate results are ready in two cycles
for (i <- 0 until params.numDeq) {
dataArray.io.partialWrite(i).enable := RegNext(RegNext(midFinished2(i)))
dataArray.io.partialWrite(i).mask := DontCare
dataArray.io.partialWrite(i).addr := RegNext(RegNext(issuedRsIdxOH(i)))
val writeData = io.fmaMid.get(i).out.bits.asUInt
require(writeData.getWidth <= 2 * params.dataBits, s"why ${writeData.getWidth}???")
require(writeData.getWidth > params.dataBits, s"why ${writeData.getWidth}???")
dataArray.io.partialWrite(i).data(0) := writeData(params.dataBits - 1, 0)
dataArray.io.partialWrite(i).data(1) := writeData(writeData.getWidth - 1, params.dataBits)
val readData = Cat(io.deq(i).bits.src(1), io.deq(i).bits.src(0))
io.fmaMid.get(i).in.bits := readData.asTypeOf(io.fmaMid.get(i).in.bits.cloneType)
}
// How to forward intermediate results:
// (1) T0 issued FMA is selected at T1 and issued at T2: forward from FMUL results
// NOTE: In this case, this instruction has been issued and the entry is freed.
// Do NOT write data back to data array.
// (2) T0 issued FMA is selected at T2: RegNext FMUL result at the issue stage
// Thus, at issue stage:
// (1.1) If the instruction matches FMA/FMUL two cycles earlier, we issue it and it goes to FADD
// (1.2) If the instruction matches FMA/FMUL two cycles earlier and it's blocked, we need to hold the result
// At select stage: (2) bypass FMUL intermediate results from write ports if possible.
val selectedRsIdx = statusArray.io.issueGranted.map(iss => OHToUInt(iss.bits))
val issuedRsIdx = statusArray.io.issueGranted.zip(selectedRsIdx).map(x => RegEnable(x._2, x._1.valid))
val issuedAtT0 = midFinished2.zip(issuedRsIdx).map(x => (RegNext(RegNext(x._1)), RegNext(RegNext(x._2))))
for (i <- 0 until params.numDeq) {
// cond11: condition (1.1) from different issue ports
val cond11 = issuedAtT0.map(x => x._1 && x._2 === issuedRsIdx(i))
for ((c, j) <- cond11.zipWithIndex) {
when (c) {
io.fmaMid.get(i).in.bits := io.fmaMid.get(j).out.bits
// We should NOT write the intermediate result back to DataArray,
// when this entry has been selected and arrived at the issue stage.
// This entry may be allocated for new instructions from dispatch.
when (io.deq(i).valid) {
dataArray.io.partialWrite(j).enable := false.B
}
}
}
val cond11Issued = io.deq(i).fire && io.fmaMid.get(i).in.valid && VecInit(cond11).asUInt.orR
XSPerfAccumulate(s"fma_final_issue_cond11_$i", cond11Issued)
// cond12: blocked at the issue stage
val cond12 = cond11.map(_ && io.deq(i).valid && !io.deq(i).ready)
val hasCond12 = VecInit(cond12).asUInt.orR
val hasCond12Reg = RegInit(false.B)
when (hasCond12) {
hasCond12Reg := true.B
}.elsewhen (io.deq(i).ready) {
hasCond12Reg := false.B
}
when (hasCond12Reg) {
// TODO: remove these unnecessary registers (use pipeline registers instead)
io.fmaMid.get(i).in.bits := RegEnable(Mux1H(cond12, io.fmaMid.get.map(_.out.bits)), hasCond12)
}
val cond12Issued = io.deq(i).fire && io.fmaMid.get(i).in.valid && hasCond12Reg
XSPerfAccumulate(s"fma_final_issue_cond12_$i", cond12Issued)
// cond2: selected at the select stage
val cond2 = issuedAtT0.map(x => x._1 && x._2 === selectedRsIdx(i))
for ((c, j) <- cond2.zipWithIndex) {
when (c) {
s1_out(i).bits.src(0) := dataArray.io.partialWrite(j).data(0)
s1_out(i).bits.src(1) := dataArray.io.partialWrite(j).data(1)
}
}
val cond2Selected = statusArray.io.issueGranted(i).valid && VecInit(cond2).asUInt.orR
XSPerfAccumulate(s"fma_final_selected_cond2_$i", cond2Selected)
}
}
// logs
for ((dispatch, i) <- io.fromDispatch.zipWithIndex) {
XSDebug(dispatch.valid && !dispatch.ready, p"enq blocked, roqIdx ${dispatch.bits.roqIdx}\n")
......
......@@ -45,6 +45,7 @@ class StatusEntry(params: RSParams)(implicit p: Parameters) extends XSBundle {
val blocked = Bool()
val credit = UInt(4.W)
val srcState = Vec(params.numSrc, Bool())
val midState = Bool()
// data
val psrc = Vec(params.numSrc, UInt(params.dataIdBits.W))
val srcType = Vec(params.numSrc, SrcType())
......@@ -56,7 +57,14 @@ class StatusEntry(params: RSParams)(implicit p: Parameters) extends XSBundle {
def canIssue: Bool = {
val scheduledCond = if (params.needScheduledBit) !scheduled else true.B
val blockedCond = if (params.checkWaitBit) !blocked else true.B
srcState.asUInt.andR && scheduledCond && blockedCond
val checkedSrcState = if (params.numSrc > 2) srcState.take(2) else srcState
val midStateReady = if (params.hasMidState) srcState.last && midState else false.B
(VecInit(checkedSrcState).asUInt.andR && scheduledCond || midStateReady) && blockedCond
}
// True when the entry has everything it needs to leave the RS: either every
// bit of srcState is set, or (only when params.hasMidState) the last source
// is ready and midState is set.
def allSrcReady: Bool = {
// Constant false when the RS has no mid state, so the term drops out.
val midStateReady = if (params.hasMidState) srcState.last && midState else false.B
srcState.asUInt.andR || midStateReady
}
override def cloneType: StatusEntry.this.type =
......@@ -82,6 +90,8 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
val issueGranted = Vec(params.numDeq, Flipped(ValidIO(UInt(params.numEntries.W))))
// TODO: if more info is needed, put them in a bundle
val isFirstIssue = Vec(params.numDeq, Output(Bool()))
val allSrcReady = Vec(params.numDeq, Output(Bool()))
val updateMidState = Input(UInt(params.numEntries.W))
val deqResp = Vec(params.numDeq, Flipped(ValidIO(new Bundle {
val rsMask = UInt(params.numEntries.W)
val success = Bool()
......@@ -206,6 +216,9 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
case ((current, update), wakeup) => wakeup || Mux(updateValid(i), update, current)
})
// midState: reset when enqueue; set when receiving feedback
statusNext.midState := !updateValid(i) && (io.updateMidState(i) || status.midState)
// static data fields (only updated when instructions enqueue)
statusNext.psrc := Mux(updateValid(i), updateVal(i).psrc, status.psrc)
statusNext.srcType := Mux(updateValid(i), updateVal(i).srcType, status.srcType)
......@@ -213,8 +226,8 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
statusNext.sqIdx := Mux(updateValid(i), updateVal(i).sqIdx, status.sqIdx)
// isFirstIssue: indicate whether the entry has been issued before
// When the entry is not granted to leave the RS, set isFirstIssue to false.B
statusNext.isFirstIssue := Mux(deqNotGranted, false.B, updateValid(i) || status.isFirstIssue)
// When the entry is not granted to issue, set isFirstIssue to false.B
statusNext.isFirstIssue := Mux(hasIssued, false.B, updateValid(i) || status.isFirstIssue)
XSDebug(status.valid, p"entry[$i]: $status\n")
}
......@@ -222,6 +235,7 @@ class StatusArray(params: RSParams)(implicit p: Parameters) extends XSModule
io.isValid := VecInit(statusArray.map(_.valid)).asUInt
io.canIssue := VecInit(statusArrayNext.map(_.valid).zip(readyVecNext).map{ case (v, r) => v && r}).asUInt
io.isFirstIssue := VecInit(io.issueGranted.map(iss => Mux1H(iss.bits, statusArray.map(_.isFirstIssue))))
io.allSrcReady := VecInit(io.issueGranted.map(iss => Mux1H(iss.bits, statusArray.map(_.allSrcReady))))
io.flushed := flushedVec.asUInt
val validEntries = PopCount(statusArray.map(_.valid))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册