Unverified · Commit c163075e authored by sfencevma, committed by GitHub

LDU: fix l2l fwd (#2269)

* fix l2l fwd

* fix l2l fwd mask

* fix s0_l2l_fwd_valid

* fix l2l fwd mask and fuOpType logic

* fix l2l fwd cancel logic

* add fuOpType fast path

* remove useless variable

* fix s1_addr_misaligned

* fix l2l_fwd_out.data
Parent a11e9ab9
@@ -334,6 +334,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
   // By default, instructions do not have exceptions when they enter the function units.
   memBlock.io.ooo_to_mem.issue.map(_.bits.uop.clearExceptions())
   exuBlocks(0).io.scheExtra.loadFastMatch.get <> memBlock.io.ooo_to_mem.loadFastMatch
+  exuBlocks(0).io.scheExtra.loadFastFuOpType.get <> memBlock.io.ooo_to_mem.loadFastFuOpType
   exuBlocks(0).io.scheExtra.loadFastImm.get <> memBlock.io.ooo_to_mem.loadFastImm
   val stdIssue = exuBlocks(0).io.issue.get.takeRight(exuParameters.StuCnt)
@@ -44,6 +44,7 @@ class Std(implicit p: Parameters) extends FunctionUnit {
 class ooo_to_mem(implicit p: Parameters) extends XSBundle{
   val loadFastMatch = Vec(exuParameters.LduCnt, Input(UInt(exuParameters.LduCnt.W)))
+  val loadFastFuOpType = Vec(exuParameters.LduCnt, Input(FuOpType()))
   val loadFastImm = Vec(exuParameters.LduCnt, Input(UInt(12.W)))
   val sfence = Input(new SfenceBundle)
   val tlbCsr = Input(new TlbCsrBundle)
@@ -508,6 +509,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     val fastMatch = ParallelPriorityMux(fastValidVec, fastMatchVec)
     loadUnits(i).io.ld_fast_match := fastMatch
     loadUnits(i).io.ld_fast_imm := io.ooo_to_mem.loadFastImm(i)
+    loadUnits(i).io.ld_fast_fuOpType := io.ooo_to_mem.loadFastFuOpType(i)
     loadUnits(i).io.replay <> lsq.io.replay(i)
     loadUnits(i).io.l2_hint <> io.l2_hint
@@ -254,6 +254,7 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
     val fpRfReadOut = if (outer.outFpRfReadPorts > 0) Some(Vec(outer.outFpRfReadPorts, new RfReadPort(XLEN))) else None
     val fpStateReadOut = if (outer.outFpRfReadPorts > 0) Some(Vec(outer.outFpRfReadPorts, new BusyTableReadIO)) else None
     val loadFastMatch = if (numLoadPorts > 0) Some(Vec(numLoadPorts, Output(UInt(exuParameters.LduCnt.W)))) else None
+    val loadFastFuOpType = if (numLoadPorts > 0) Some(Vec(numLoadPorts, Output(FuOpType()))) else None
     val loadFastImm = if (numLoadPorts > 0) Some(Vec(numLoadPorts, Output(UInt(12.W)))) else None
     // misc
     val jumpPc = Input(UInt(VAddrBits.W))
@@ -474,6 +475,7 @@ class SchedulerImp(outer: Scheduler) extends LazyModuleImp(outer) with HasXSPara
   if (io.extra.loadFastMatch.isDefined) {
     val allLoadRS = outer.reservationStations.map(_.module.io.load).filter(_.isDefined)
     io.extra.loadFastMatch.get := allLoadRS.map(_.get.map(_.fastMatch)).fold(Seq())(_ ++ _)
+    io.extra.loadFastFuOpType.get := allLoadRS.map(_.get.map(_.fastFuOpType)).fold(Seq())(_ ++ _)
     io.extra.loadFastImm.get := allLoadRS.map(_.get.map(_.fastImm)).fold(Seq())(_ ++ _)
   }
@@ -243,6 +243,7 @@ class ReservationStationIO(params: RSParams)(implicit p: Parameters) extends XSB
   }) else None
   val load = if (params.isLoad) Some(Vec(params.numDeq, new Bundle {
     val fastMatch = Output(UInt(exuParameters.LduCnt.W))
+    val fastFuOpType = Output(FuOpType())
     val fastImm = Output(UInt(12.W))
   })) else None
   val fmaMid = if (params.exuCfg.get == FmacExeUnitCfg) Some(Vec(params.numDeq, Flipped(new FMAMidResultIO))) else None
@@ -777,6 +778,7 @@ class ReservationStation(params: RSParams)(implicit p: Parameters) extends XSMod
     io.load.get(i).fastMatch := Mux(s1_issuePtrOH(i).valid, VecInit(
       wakeupBypassMask.drop(exuParameters.AluCnt).take(exuParameters.LduCnt).map(_.asUInt.orR)
     ).asUInt, 0.U)
+    io.load.get(i).fastFuOpType := s1_out(i).bits.uop.ctrl.fuOpType
+    io.load.get(i).fastImm := s1_out(i).bits.uop.ctrl.imm
   }
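Taken together, the scheduler and reservation-station changes above plumb the issuing load's fuOpType down to the load unit next to the existing fast-match and fast-imm hints, so the load pipe knows the width of the chained access it is about to speculate on. A minimal Chisel sketch of that producer side; the bundle/module names and the 7-bit FuOpType width are assumptions for illustration, not XiangShan's actual declarations:

```scala
import chisel3._
import chisel3.util._

// Illustrative stand-in for the per-dequeue fast-path hint an RS exports to a load pipe.
class LoadFastHint(numLdu: Int) extends Bundle {
  val fastMatch    = UInt(numLdu.W) // one-hot: which load pipe produces the base operand
  val fastFuOpType = UInt(7.W)      // op type of the chained load (access size in bits 1:0)
  val fastImm      = UInt(12.W)     // immediate offset of the chained load
}

class FastHintProducer(numLdu: Int) extends Module {
  val io = IO(new Bundle {
    val issueFuOpType = Input(UInt(7.W))
    val issueImm      = Input(UInt(12.W))
    val wakeupMatch   = Input(Vec(numLdu, Bool()))
    val hint          = Output(new LoadFastHint(numLdu))
  })
  // Mirrors the RS-side assignments added in this patch: forward the issuing
  // uop's fuOpType and immediate alongside the existing match vector.
  io.hint.fastMatch    := io.wakeupMatch.asUInt
  io.hint.fastFuOpType := io.issueFuOpType
  io.hint.fastImm      := io.issueImm
}
```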
@@ -128,8 +128,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule
     // load to load fast path
     val l2l_fwd_in = Input(new LoadToLoadIO)
     val l2l_fwd_out = Output(new LoadToLoadIO)
-    val ld_fast_match = Input(Bool())
-    val ld_fast_imm = Input(UInt(12.W))
+    val ld_fast_match = Input(Bool())
+    val ld_fast_fuOpType = Input(UInt())
+    val ld_fast_imm = Input(UInt(12.W))
     // rs feedback
     val feedback_fast = ValidIO(new RSFeedback) // stage 2
@@ -200,7 +202,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   val s0_high_conf_prf_valid = io.prefetch_req.valid && io.prefetch_req.bits.confidence > 0.U
   val s0_int_iss_valid = io.ldin.valid // int flow first issue or software prefetch
   val s0_vec_iss_valid = WireInit(false.B) // TODO
-  val s0_l2l_fwd_valid = io.l2l_fwd_in.valid
+  val s0_l2l_fwd_valid = io.l2l_fwd_in.valid && io.ld_fast_match
   val s0_low_conf_prf_valid = io.prefetch_req.valid && io.prefetch_req.bits.confidence === 0.U
   dontTouch(s0_super_ld_rep_valid)
   dontTouch(s0_ld_fast_rep_valid)
@@ -443,10 +445,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   def fromLoadToLoadSource(src: LoadToLoadIO) = {
     s0_vaddr := Cat(src.data(XLEN-1, 6), s0_ptr_chasing_vaddr(5,0))
-    s0_mask := genVWmask(Cat(s0_ptr_chasing_vaddr(3), 0.U(3.W)), LSUOpType.ld)
+    s0_mask := genVWmask(s0_vaddr, io.ld_fast_fuOpType(1, 0))
     // When there's no valid instruction from RS and LSQ, we try the load-to-load forwarding.
     // Assume the pointer chasing is always ld.
-    s0_uop.ctrl.fuOpType := LSUOpType.ld
+    s0_uop.ctrl.fuOpType := io.ld_fast_fuOpType
     s0_try_l2l := true.B
     // we dont care s0_isFirstIssue and s0_rsIdx and s0_sqIdx in S0 when trying pointchasing
     // because these signals will be updated in S1
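The old code always built an 8-byte mask anchored at bit 3 of the chased address, because the fast path assumed the chained instruction is ld; the new code derives the mask from the real vaddr and fuOpType(1,0). Below is a small, self-contained sketch of how a byte mask can be derived from the size encoding (00/01/10/11 = byte/half/word/double) and the low address bits. genVWmask plays this role in XiangShan; the exact implementation shown here is an assumption for illustration only.

```scala
import chisel3._
import chisel3.util._

// Sketch of size-aware byte-mask generation for a 64-bit (8-byte) lane.
// size = fuOpType(1,0): b00 = 1B, b01 = 2B, b10 = 4B, b11 = 8B.
class MaskGen extends Module {
  val io = IO(new Bundle {
    val addrLow = Input(UInt(3.W))  // vaddr(2,0): byte offset inside the 8-byte lane
    val size    = Input(UInt(2.W))  // fuOpType(1,0)
    val mask    = Output(UInt(8.W))
  })
  // Base mask for the access size, then shift it to the byte offset.
  val baseMask = MuxLookup(io.size, 0.U(8.W), Seq(
    "b00".U -> "b00000001".U, // byte
    "b01".U -> "b00000011".U, // half
    "b10".U -> "b00001111".U, // word
    "b11".U -> "b11111111".U  // double
  ))
  io.mask := (baseMask << io.addrLow)(7, 0)
}
```

For example, a half-word producer at byte offset 6 yields mask b11000000, whereas the old LSUOpType.ld assumption always produced the full 8-byte mask regardless of the chained op's width.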
@@ -620,15 +622,15 @@ class LoadUnit(implicit p: Parameters) extends XSModule
                        (s1_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain
                       })).asUInt.orR && !s1_tlb_miss
-  s1_out := s1_in
-  s1_out.vaddr := s1_vaddr
-  s1_out.paddr := s1_paddr_dup_lsu
-  s1_out.tlbMiss := s1_tlb_miss
-  s1_out.ptwBack := io.tlb.resp.bits.ptwBack
-  s1_out.rsIdx := s1_in.rsIdx
-  s1_out.rep_info.debug := s1_in.uop.debugInfo
-  s1_out.rep_info.nuke := s1_nuke && !s1_sw_prf
-  s1_out.lateKill := s1_late_kill
+  s1_out := s1_in
+  s1_out.vaddr := s1_vaddr
+  s1_out.paddr := s1_paddr_dup_lsu
+  s1_out.tlbMiss := s1_tlb_miss
+  s1_out.ptwBack := io.tlb.resp.bits.ptwBack
+  s1_out.rsIdx := s1_in.rsIdx
+  s1_out.rep_info.debug := s1_in.uop.debugInfo
+  s1_out.rep_info.nuke := s1_nuke && !s1_sw_prf
+  s1_out.lateKill := s1_late_kill
   when (!s1_late_kill) {
     // current ori test will cause the case of ldest == 0, below will be modifeid in the future.
@@ -660,21 +662,23 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   // These can be put at S0 if timing is bad at S1.
   // Case 0: CACHE_SET(base + offset) != CACHE_SET(base) (lowest 6-bit addition has an overflow)
   s1_addr_mismatch := s1_ptr_chasing_vaddr(6) || RegEnable(io.ld_fast_imm(11, 6).orR, s0_do_try_ptr_chasing)
-  // Case 1: the address is not 64-bit aligned or the fuOpType is not LD
-  s1_addr_misaligned := s1_ptr_chasing_vaddr(2, 0).orR
-  s1_fu_op_type_not_ld := io.ldin.bits.uop.ctrl.fuOpType =/= LSUOpType.ld
-  // Case 2: this is not a valid load-load pair
-  s1_not_fast_match := RegEnable(!io.ld_fast_match, s0_try_ptr_chasing)
-  // Case 3: this load-load uop is cancelled
+  // Case 1: the address is misaligned, kill s1
+  s1_addr_misaligned := LookupTree(s1_in.uop.ctrl.fuOpType(1, 0), List(
+    "b00".U -> false.B, //b
+    "b01".U -> (s1_vaddr(0) =/= 0.U), //h
+    "b10".U -> (s1_vaddr(1, 0) =/= 0.U), //w
+    "b11".U -> (s1_vaddr(2, 0) =/= 0.U) //d
+  ))
+  // Case 2: this load-load uop is cancelled
   s1_ptr_chasing_canceled := !io.ldin.valid
   when (s1_try_ptr_chasing) {
-    s1_cancel_ptr_chasing := s1_addr_mismatch || s1_addr_misaligned || s1_fu_op_type_not_ld || s1_not_fast_match || s1_ptr_chasing_canceled
+    s1_cancel_ptr_chasing := s1_addr_mismatch || s1_addr_misaligned || s1_ptr_chasing_canceled
     s1_in.uop := io.ldin.bits.uop
     s1_in.rsIdx := io.rsIdx
     s1_in.isFirstIssue := io.isFirstIssue
-    s1_vaddr_lo := Cat(s1_ptr_chasing_vaddr(5, 3), 0.U(3.W))
+    s1_vaddr_lo := s1_ptr_chasing_vaddr(5, 0)
     s1_paddr_dup_lsu := Cat(io.tlb.resp.bits.paddr(0)(PAddrBits - 1, 6), s1_vaddr_lo)
     s1_paddr_dup_dcache := Cat(io.tlb.resp.bits.paddr(0)(PAddrBits - 1, 6), s1_vaddr_lo)
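The new misalignment test only cancels pointer chasing when the chained access is misaligned for its own width, instead of demanding 64-bit alignment and an exact ld opcode as before. A small plain-Scala model of that LookupTree, assuming the size encoding follows the low two bits of LSUOpType (b/h/w/d), which can serve as a sanity check:

```scala
// Software model of the new s1_addr_misaligned LookupTree (plain Scala, not Chisel).
// sizeBits = fuOpType(1,0): 0 = byte, 1 = half, 2 = word, 3 = double.
object MisalignModel extends App {
  def misaligned(vaddr: Long, sizeBits: Int): Boolean = sizeBits match {
    case 0 => false                 // byte accesses are never misaligned
    case 1 => (vaddr & 0x1L) != 0   // half: bit 0 must be clear
    case 2 => (vaddr & 0x3L) != 0   // word: bits 1..0 must be clear
    case _ => (vaddr & 0x7L) != 0   // double: bits 2..0 must be clear
  }

  // A lw at ...4 is aligned for its own width, but the old check
  // (s1_ptr_chasing_vaddr(2, 0).orR) would have cancelled the fast path.
  assert(!misaligned(0x80001004L, 2))
  assert( misaligned(0x80001002L, 2))
  assert(!misaligned(0x80001000L, 3))
  println("misalignment model checks passed")
}
```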
@@ -1152,7 +1156,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
   // fast load to load forward
   io.l2l_fwd_out.valid := s3_out.valid && !s3_in.lateKill
-  io.l2l_fwd_out.data := Mux(s3_ld_raw_data_frm_cache.addrOffset(3), s3_merged_data_frm_cache(127, 64), s3_merged_data_frm_cache(63, 0)) // load to load is for ld only
+  io.l2l_fwd_out.data := s3_ld_data_frm_cache
   io.l2l_fwd_out.dly_ld_err := s3_dly_ld_err // ecc delayed error
   // trigger
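Previously the forwarded data was one raw 64-bit half of the 128-bit merged cache line, which only equals the architectural result when the producer is a plain ld (as the removed comment noted). Forwarding s3_ld_data_frm_cache instead passes along the offset-selected, width-extended value the load writes back, so the chained s0_vaddr is built from the real pointer even for narrower loads. As a rough, assumption-level sketch of what that per-op selection involves (not XiangShan's actual helper; treating fuOpType(2) as the unsigned bit is an assumption here):

```scala
import chisel3._
import chisel3.util._

// Sketch: turn a raw 64-bit dcache lane into the value the dependent load would
// read from the register file: shift out the byte offset, then sign- or
// zero-extend according to the access size in fuOpType(1,0).
class LoadDataSelect extends Module {
  val io = IO(new Bundle {
    val rawData  = Input(UInt(64.W)) // raw 64-bit lane from the data array
    val addrLow  = Input(UInt(3.W))  // vaddr(2,0)
    val fuOpType = Input(UInt(7.W))
    val result   = Output(UInt(64.W))
  })
  val shifted  = (io.rawData >> (io.addrLow << 3))(63, 0)
  val size     = io.fuOpType(1, 0)
  val unsigned = io.fuOpType(2) // assumption: lbu/lhu/lwu-style encodings set this bit
  val byteV = Mux(unsigned, Cat(0.U(56.W), shifted(7, 0)),  Cat(Fill(56, shifted(7)),  shifted(7, 0)))
  val halfV = Mux(unsigned, Cat(0.U(48.W), shifted(15, 0)), Cat(Fill(48, shifted(15)), shifted(15, 0)))
  val wordV = Mux(unsigned, Cat(0.U(32.W), shifted(31, 0)), Cat(Fill(32, shifted(31)), shifted(31, 0)))
  io.result := MuxLookup(size, shifted, Seq(
    "b00".U -> byteV,
    "b01".U -> halfV,
    "b10".U -> wordV,
    "b11".U -> shifted
  ))
}
```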