未验证 提交 a0db5a4b 编写于 作者: Y Yinan Xu 提交者: GitHub

decode: parallel fusion decoder and rat read (#1588)

上级 2f0b133c
......@@ -374,12 +374,14 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
decode.io.in <> io.frontend.cfVec
decode.io.csrCtrl := RegNext(io.csrCtrl)
decode.io.intRat <> rat.io.intReadPorts
decode.io.fpRat <> rat.io.fpReadPorts
// memory dependency prediction
// at decode, send foldpc (the folded PC) to the MDP
for (i <- 0 until DecodeWidth) {
val mdp_foldpc = Mux(
decode.io.out(i).fire(),
decode.io.out(i).fire,
decode.io.in(i).bits.foldpc,
rename.io.in(i).bits.cf.foldpc
)
......@@ -400,19 +402,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
lfst.io.dispatch <> dispatch.io.lfst
rat.io.robCommits := rob.io.commits
for ((r, i) <- rat.io.intReadPorts.zipWithIndex) {
val raddr = decode.io.out(i).bits.ctrl.lsrc.take(2) :+ decode.io.out(i).bits.ctrl.ldest
r.map(_.addr).zip(raddr).foreach(x => x._1 := x._2)
rename.io.intReadPorts(i) := r.map(_.data)
r.foreach(_.hold := !rename.io.in(i).ready)
}
rat.io.intRenamePorts := rename.io.intRenamePorts
for ((r, i) <- rat.io.fpReadPorts.zipWithIndex) {
val raddr = decode.io.out(i).bits.ctrl.lsrc.take(3) :+ decode.io.out(i).bits.ctrl.ldest
r.map(_.addr).zip(raddr).foreach(x => x._1 := x._2)
rename.io.fpReadPorts(i) := r.map(_.data)
r.foreach(_.hold := !rename.io.in(i).ready)
}
rat.io.fpRenamePorts := rename.io.fpRenamePorts
rat.io.debug_int_rat <> io.debug_int_rat
rat.io.debug_fp_rat <> io.debug_fp_rat
......@@ -421,12 +411,17 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
for (i <- 0 until RenameWidth) {
PipelineConnect(decode.io.out(i), rename.io.in(i), rename.io.in(i).ready,
stage2Redirect.valid || pendingRedirect)
rename.io.intReadPorts(i) := rat.io.intReadPorts(i).map(_.data)
rename.io.fpReadPorts(i) := rat.io.fpReadPorts(i).map(_.data)
if (i < RenameWidth - 1) {
rename.io.fusionInfo(i) := RegEnable(decode.io.fusionInfo(i), decode.io.out(i).fire)
}
rename.io.waittable(i) := RegEnable(waittable.io.rdata(i), decode.io.out(i).fire)
}
rename.io.redirect <> stage2Redirect
rename.io.robCommits <> rob.io.commits
rename.io.ssit <> ssit.io.rdata
rename.io.waittable <> RegNext(waittable.io.rdata)
// pipeline between rename and dispatch
for (i <- 0 until RenameWidth) {
......
......@@ -21,13 +21,18 @@ import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import xiangshan.backend.rename.RatReadPort
class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
val io = IO(new Bundle() {
// from Ibuffer
val in = Vec(DecodeWidth, Flipped(DecoupledIO(new CtrlFlow)))
// to DecBuffer
// to Rename
val out = Vec(DecodeWidth, DecoupledIO(new CfCtrl))
val fusionInfo = Vec(DecodeWidth - 1, new FusionDecodeInfo)
// RAT read
val intRat = Vec(RenameWidth, Vec(3, Flipped(new RatReadPort)))
val fpRat = Vec(RenameWidth, Vec(4, Flipped(new RatReadPort)))
// csr control
val csrCtrl = Input(new CustomCSRCtrlIO)
})
......@@ -43,6 +48,19 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
io.out(i).valid := io.in(i).valid
io.out(i).bits := decoders(i).io.deq.cf_ctrl
io.in(i).ready := io.out(i).ready
// We read the RAT with the lsrc/ldest taken before the fusion decoder, for better timing.
io.intRat(i)(0).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(0)
io.intRat(i)(1).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(1)
io.intRat(i)(2).addr := decoders(i).io.deq.cf_ctrl.ctrl.ldest
io.intRat(i).foreach(_.hold := !io.out(i).ready)
// Floating-point instructions cannot be fused at present.
io.fpRat(i)(0).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(0)
io.fpRat(i)(1).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(1)
io.fpRat(i)(2).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(2)
io.fpRat(i)(3).addr := decoders(i).io.deq.cf_ctrl.ctrl.ldest
io.fpRat(i).foreach(_.hold := !io.out(i).ready)
}
// instruction fusion
......@@ -76,6 +94,7 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
v := false.B
}
}
io.fusionInfo := fusionDecoder.io.info
val hasValid = VecInit(io.in.map(_.valid)).asUInt.orR
XSPerfAccumulate("utilization", PopCount(io.in.map(_.valid)))
......
......@@ -46,7 +46,7 @@ abstract trait DecodeConstants {
// | | | | | | | | | | | | selImm
List(SrcType.DC, SrcType.DC, SrcType.DC, FuType.alu, ALUOpType.sll, N, N, N, N, N, N, N, SelImm.INVALID_INSTR) // Use SelImm to indicate invalid instr
val table: Array[(BitPat, List[BitPat])]
val table: Array[(BitPat, List[BitPat])]
}
trait DecodeUnitConstants
......
......@@ -21,7 +21,7 @@ import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import xiangshan.backend.decode.{Imm_I, Imm_LUI_LOAD, Imm_U}
import xiangshan.backend.decode.{FusionDecodeInfo, Imm_I, Imm_LUI_LOAD, Imm_U}
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.rename.freelist._
import xiangshan.mem.mdp._
......@@ -32,6 +32,7 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents {
val robCommits = Flipped(new RobCommitIO)
// from decode
val in = Vec(RenameWidth, Flipped(DecoupledIO(new CfCtrl)))
val fusionInfo = Vec(DecodeWidth - 1, Flipped(new FusionDecodeInfo))
// ssit read result
val ssit = Flipped(Vec(RenameWidth, Output(new SSITEntry)))
// waittable read result
......@@ -104,7 +105,6 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents {
val hasValid = Cat(io.in.map(_.valid)).orR
val isMove = io.in.map(_.bits.ctrl.isMove)
val intPsrc = Wire(Vec(RenameWidth, UInt()))
val intSpecWen = Wire(Vec(RenameWidth, Bool()))
val fpSpecWen = Wire(Vec(RenameWidth, Bool()))
......@@ -135,15 +135,18 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents {
uops(i).robIdx := robIdxHead + PopCount(io.in.take(i).map(_.valid))
val intPhySrcVec = io.intReadPorts(i).take(2)
val intOldPdest = io.intReadPorts(i).last
intPsrc(i) := intPhySrcVec(0)
val fpPhySrcVec = io.fpReadPorts(i).take(3)
val fpOldPdest = io.fpReadPorts(i).last
uops(i).psrc(0) := Mux(uops(i).ctrl.srcType(0) === SrcType.reg, intPhySrcVec(0), fpPhySrcVec(0))
uops(i).psrc(1) := Mux(uops(i).ctrl.srcType(1) === SrcType.reg, intPhySrcVec(1), fpPhySrcVec(1))
uops(i).psrc(2) := fpPhySrcVec(2)
uops(i).old_pdest := Mux(uops(i).ctrl.rfWen, intOldPdest, fpOldPdest)
uops(i).psrc(0) := Mux(uops(i).ctrl.srcType(0) === SrcType.reg, io.intReadPorts(i)(0), io.fpReadPorts(i)(0))
uops(i).psrc(1) := Mux(uops(i).ctrl.srcType(1) === SrcType.reg, io.intReadPorts(i)(1), io.fpReadPorts(i)(1))
// int psrc2 (i.e. psrc(1)) should be bypassed from the next instruction if this one is fused
if (i < RenameWidth - 1) {
when (io.fusionInfo(i).rs2FromRs2 || io.fusionInfo(i).rs2FromRs1) {
uops(i).psrc(1) := Mux(io.fusionInfo(i).rs2FromRs2, io.intReadPorts(i + 1)(1), io.intReadPorts(i + 1)(0))
}.elsewhen(io.fusionInfo(i).rs2FromZero) {
uops(i).psrc(1) := 0.U
}
}
uops(i).psrc(2) := io.fpReadPorts(i)(2)
uops(i).old_pdest := Mux(uops(i).ctrl.rfWen, io.intReadPorts(i).last, io.fpReadPorts(i).last)
uops(i).eliminatedMove := isMove(i)
// update pdest
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册