未验证 提交 a0db5a4b 编写于 作者: Y Yinan Xu 提交者: GitHub

decode: parallel fusion decoder and rat read (#1588)

上级 2f0b133c
...@@ -374,12 +374,14 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI ...@@ -374,12 +374,14 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
decode.io.in <> io.frontend.cfVec decode.io.in <> io.frontend.cfVec
decode.io.csrCtrl := RegNext(io.csrCtrl) decode.io.csrCtrl := RegNext(io.csrCtrl)
decode.io.intRat <> rat.io.intReadPorts
decode.io.fpRat <> rat.io.fpReadPorts
// memory dependency predict // memory dependency predict
// when decode, send fold pc to mdp // when decode, send fold pc to mdp
for (i <- 0 until DecodeWidth) { for (i <- 0 until DecodeWidth) {
val mdp_foldpc = Mux( val mdp_foldpc = Mux(
decode.io.out(i).fire(), decode.io.out(i).fire,
decode.io.in(i).bits.foldpc, decode.io.in(i).bits.foldpc,
rename.io.in(i).bits.cf.foldpc rename.io.in(i).bits.cf.foldpc
) )
...@@ -400,19 +402,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI ...@@ -400,19 +402,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
lfst.io.dispatch <> dispatch.io.lfst lfst.io.dispatch <> dispatch.io.lfst
rat.io.robCommits := rob.io.commits rat.io.robCommits := rob.io.commits
for ((r, i) <- rat.io.intReadPorts.zipWithIndex) {
val raddr = decode.io.out(i).bits.ctrl.lsrc.take(2) :+ decode.io.out(i).bits.ctrl.ldest
r.map(_.addr).zip(raddr).foreach(x => x._1 := x._2)
rename.io.intReadPorts(i) := r.map(_.data)
r.foreach(_.hold := !rename.io.in(i).ready)
}
rat.io.intRenamePorts := rename.io.intRenamePorts rat.io.intRenamePorts := rename.io.intRenamePorts
for ((r, i) <- rat.io.fpReadPorts.zipWithIndex) {
val raddr = decode.io.out(i).bits.ctrl.lsrc.take(3) :+ decode.io.out(i).bits.ctrl.ldest
r.map(_.addr).zip(raddr).foreach(x => x._1 := x._2)
rename.io.fpReadPorts(i) := r.map(_.data)
r.foreach(_.hold := !rename.io.in(i).ready)
}
rat.io.fpRenamePorts := rename.io.fpRenamePorts rat.io.fpRenamePorts := rename.io.fpRenamePorts
rat.io.debug_int_rat <> io.debug_int_rat rat.io.debug_int_rat <> io.debug_int_rat
rat.io.debug_fp_rat <> io.debug_fp_rat rat.io.debug_fp_rat <> io.debug_fp_rat
...@@ -421,12 +411,17 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI ...@@ -421,12 +411,17 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
for (i <- 0 until RenameWidth) { for (i <- 0 until RenameWidth) {
PipelineConnect(decode.io.out(i), rename.io.in(i), rename.io.in(i).ready, PipelineConnect(decode.io.out(i), rename.io.in(i), rename.io.in(i).ready,
stage2Redirect.valid || pendingRedirect) stage2Redirect.valid || pendingRedirect)
rename.io.intReadPorts(i) := rat.io.intReadPorts(i).map(_.data)
rename.io.fpReadPorts(i) := rat.io.fpReadPorts(i).map(_.data)
if (i < RenameWidth - 1) {
rename.io.fusionInfo(i) := RegEnable(decode.io.fusionInfo(i), decode.io.out(i).fire)
}
rename.io.waittable(i) := RegEnable(waittable.io.rdata(i), decode.io.out(i).fire)
} }
rename.io.redirect <> stage2Redirect rename.io.redirect <> stage2Redirect
rename.io.robCommits <> rob.io.commits rename.io.robCommits <> rob.io.commits
rename.io.ssit <> ssit.io.rdata rename.io.ssit <> ssit.io.rdata
rename.io.waittable <> RegNext(waittable.io.rdata)
// pipeline between rename and dispatch // pipeline between rename and dispatch
for (i <- 0 until RenameWidth) { for (i <- 0 until RenameWidth) {
......
...@@ -21,13 +21,18 @@ import chisel3._ ...@@ -21,13 +21,18 @@ import chisel3._
import chisel3.util._ import chisel3.util._
import xiangshan._ import xiangshan._
import utils._ import utils._
import xiangshan.backend.rename.RatReadPort
class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents { class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
val io = IO(new Bundle() { val io = IO(new Bundle() {
// from Ibuffer // from Ibuffer
val in = Vec(DecodeWidth, Flipped(DecoupledIO(new CtrlFlow))) val in = Vec(DecodeWidth, Flipped(DecoupledIO(new CtrlFlow)))
// to DecBuffer // to Rename
val out = Vec(DecodeWidth, DecoupledIO(new CfCtrl)) val out = Vec(DecodeWidth, DecoupledIO(new CfCtrl))
val fusionInfo = Vec(DecodeWidth - 1, new FusionDecodeInfo)
// RAT read
val intRat = Vec(RenameWidth, Vec(3, Flipped(new RatReadPort)))
val fpRat = Vec(RenameWidth, Vec(4, Flipped(new RatReadPort)))
// csr control // csr control
val csrCtrl = Input(new CustomCSRCtrlIO) val csrCtrl = Input(new CustomCSRCtrlIO)
}) })
...@@ -43,6 +48,19 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents { ...@@ -43,6 +48,19 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
io.out(i).valid := io.in(i).valid io.out(i).valid := io.in(i).valid
io.out(i).bits := decoders(i).io.deq.cf_ctrl io.out(i).bits := decoders(i).io.deq.cf_ctrl
io.in(i).ready := io.out(i).ready io.in(i).ready := io.out(i).ready
// To improve timing, the RAT is read with the lsrc/ldest taken before the fusion decoder.
io.intRat(i)(0).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(0)
io.intRat(i)(1).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(1)
io.intRat(i)(2).addr := decoders(i).io.deq.cf_ctrl.ctrl.ldest
io.intRat(i).foreach(_.hold := !io.out(i).ready)
// Floating-point instructions cannot be fused at present.
io.fpRat(i)(0).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(0)
io.fpRat(i)(1).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(1)
io.fpRat(i)(2).addr := decoders(i).io.deq.cf_ctrl.ctrl.lsrc(2)
io.fpRat(i)(3).addr := decoders(i).io.deq.cf_ctrl.ctrl.ldest
io.fpRat(i).foreach(_.hold := !io.out(i).ready)
} }
// instruction fusion // instruction fusion
...@@ -76,6 +94,7 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents { ...@@ -76,6 +94,7 @@ class DecodeStage(implicit p: Parameters) extends XSModule with HasPerfEvents {
v := false.B v := false.B
} }
} }
io.fusionInfo := fusionDecoder.io.info
val hasValid = VecInit(io.in.map(_.valid)).asUInt.orR val hasValid = VecInit(io.in.map(_.valid)).asUInt.orR
XSPerfAccumulate("utilization", PopCount(io.in.map(_.valid))) XSPerfAccumulate("utilization", PopCount(io.in.map(_.valid)))
......
...@@ -21,7 +21,7 @@ import chisel3._ ...@@ -21,7 +21,7 @@ import chisel3._
import chisel3.util._ import chisel3.util._
import xiangshan._ import xiangshan._
import utils._ import utils._
import xiangshan.backend.decode.{Imm_I, Imm_LUI_LOAD, Imm_U} import xiangshan.backend.decode.{FusionDecodeInfo, Imm_I, Imm_LUI_LOAD, Imm_U}
import xiangshan.backend.rob.RobPtr import xiangshan.backend.rob.RobPtr
import xiangshan.backend.rename.freelist._ import xiangshan.backend.rename.freelist._
import xiangshan.mem.mdp._ import xiangshan.mem.mdp._
...@@ -32,6 +32,7 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents { ...@@ -32,6 +32,7 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents {
val robCommits = Flipped(new RobCommitIO) val robCommits = Flipped(new RobCommitIO)
// from decode // from decode
val in = Vec(RenameWidth, Flipped(DecoupledIO(new CfCtrl))) val in = Vec(RenameWidth, Flipped(DecoupledIO(new CfCtrl)))
val fusionInfo = Vec(DecodeWidth - 1, Flipped(new FusionDecodeInfo))
// ssit read result // ssit read result
val ssit = Flipped(Vec(RenameWidth, Output(new SSITEntry))) val ssit = Flipped(Vec(RenameWidth, Output(new SSITEntry)))
// waittable read result // waittable read result
...@@ -104,7 +105,6 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents { ...@@ -104,7 +105,6 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents {
val hasValid = Cat(io.in.map(_.valid)).orR val hasValid = Cat(io.in.map(_.valid)).orR
val isMove = io.in.map(_.bits.ctrl.isMove) val isMove = io.in.map(_.bits.ctrl.isMove)
val intPsrc = Wire(Vec(RenameWidth, UInt()))
val intSpecWen = Wire(Vec(RenameWidth, Bool())) val intSpecWen = Wire(Vec(RenameWidth, Bool()))
val fpSpecWen = Wire(Vec(RenameWidth, Bool())) val fpSpecWen = Wire(Vec(RenameWidth, Bool()))
...@@ -135,15 +135,18 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents { ...@@ -135,15 +135,18 @@ class Rename(implicit p: Parameters) extends XSModule with HasPerfEvents {
uops(i).robIdx := robIdxHead + PopCount(io.in.take(i).map(_.valid)) uops(i).robIdx := robIdxHead + PopCount(io.in.take(i).map(_.valid))
val intPhySrcVec = io.intReadPorts(i).take(2) uops(i).psrc(0) := Mux(uops(i).ctrl.srcType(0) === SrcType.reg, io.intReadPorts(i)(0), io.fpReadPorts(i)(0))
val intOldPdest = io.intReadPorts(i).last uops(i).psrc(1) := Mux(uops(i).ctrl.srcType(1) === SrcType.reg, io.intReadPorts(i)(1), io.fpReadPorts(i)(1))
intPsrc(i) := intPhySrcVec(0) // int psrc2 should be bypassed from next instruction if it is fused
val fpPhySrcVec = io.fpReadPorts(i).take(3) if (i < RenameWidth - 1) {
val fpOldPdest = io.fpReadPorts(i).last when (io.fusionInfo(i).rs2FromRs2 || io.fusionInfo(i).rs2FromRs1) {
uops(i).psrc(0) := Mux(uops(i).ctrl.srcType(0) === SrcType.reg, intPhySrcVec(0), fpPhySrcVec(0)) uops(i).psrc(1) := Mux(io.fusionInfo(i).rs2FromRs2, io.intReadPorts(i + 1)(1), io.intReadPorts(i + 1)(0))
uops(i).psrc(1) := Mux(uops(i).ctrl.srcType(1) === SrcType.reg, intPhySrcVec(1), fpPhySrcVec(1)) }.elsewhen(io.fusionInfo(i).rs2FromZero) {
uops(i).psrc(2) := fpPhySrcVec(2) uops(i).psrc(1) := 0.U
uops(i).old_pdest := Mux(uops(i).ctrl.rfWen, intOldPdest, fpOldPdest) }
}
uops(i).psrc(2) := io.fpReadPorts(i)(2)
uops(i).old_pdest := Mux(uops(i).ctrl.rfWen, io.intReadPorts(i).last, io.fpReadPorts(i).last)
uops(i).eliminatedMove := isMove(i) uops(i).eliminatedMove := isMove(i)
// update pdest // update pdest
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册