提交 3db2cf75 编写于 作者: W William Wang

mem: loadpipe will not miss if fullForward succeeds

A new option `EnableFastForward` is added to the config list.
Enabling `EnableFastForward` reduces L1D$ misses but makes timing worse.

* `forwardMaskFast` is generated at load_s1; it is used to generate
fastUop for fast wakeup
* `forwardMask` is generated at load_s2; it will be used to check whether
the forward result is correct
上级 4887ca7f
......@@ -101,6 +101,7 @@ case class XSCoreParameters
StorePipelineWidth: Int = 2,
StoreBufferSize: Int = 16,
StoreBufferThreshold: Int = 7,
EnableFastForward: Boolean = true,
RefillSize: Int = 512,
TlbEntrySize: Int = 32,
TlbSPEntrySize: Int = 4,
......@@ -235,6 +236,7 @@ trait HasXSParameter {
val StorePipelineWidth = coreParams.StorePipelineWidth
val StoreBufferSize = coreParams.StoreBufferSize
val StoreBufferThreshold = coreParams.StoreBufferThreshold
val EnableFastForward = coreParams.EnableFastForward
val RefillSize = coreParams.RefillSize
val DTLBWidth = coreParams.LoadPipelineWidth + coreParams.StorePipelineWidth
val TlbEntrySize = coreParams.TlbEntrySize
......
......@@ -293,6 +293,9 @@ class CtrlBlock(implicit p: Parameters) extends XSModule
flushPC + 4.U // flush pipe
)
)
when (flushRedirect.valid && RegEnable(roq.io.flushOut.bits.replayInst, flush)) {
XSDebug("replay inst (%x) from rob\n", flushPC);
}
val flushRedirectReg = Wire(Valid(new Redirect))
flushRedirectReg.valid := RegNext(flushRedirect.valid, init = false.B)
flushRedirectReg.bits := RegEnable(flushRedirect.bits, enable = flushRedirect.valid)
......
......@@ -79,6 +79,13 @@ class DataArray(params: RSParams)(implicit p: Parameters) extends XSModule {
dataModule.io.wen := wen
dataModule.io.wvec := waddr
dataModule.io.wdata := wdata
for (i <- 0 until params.numEntries) {
val w = VecInit((0 until wen.length).map(j => dataModule.io.wen(j) && dataModule.io.wvec(j)(i)))
assert(RegNext(PopCount(w) <= 1.U))
when(PopCount(w) > 1.U) {
XSDebug("ERROR: RS DataArray write overlap!\n")
}
}
}
}
......
......@@ -79,6 +79,7 @@ class LoadForwardQueryIO(implicit p: Parameters) extends XSBundle {
val pc = Output(UInt(VAddrBits.W)) //for debug
val valid = Output(Bool()) //for debug
val forwardMaskFast = Input(Vec(8, Bool())) // resp to load_s1
val forwardMask = Input(Vec(8, Bool())) // resp to load_s2
val forwardData = Input(Vec(8, UInt(8.W))) // resp to load_s2
......
......@@ -34,8 +34,9 @@ class ExceptionAddrIO(implicit p: Parameters) extends XSBundle {
}
class FwdEntry extends Bundle {
val valid = Bool()
val data = UInt(8.W)
val validFast = Bool() // validFast is generated the same cycle with query
val valid = Bool() // valid is generated 1 cycle after query request
val data = UInt(8.W) // data is generated 1 cycle after query request
}
// inflight miss block reqs
......
......@@ -283,9 +283,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
*/
// check over all lq entries and forward data from the first matched store
for (i <- 0 until LoadPipelineWidth) {
io.forward(i).forwardMask := 0.U(8.W).asBools
io.forward(i).forwardData := DontCare
// Compare deqPtr (deqPtr) and forward.sqIdx, we have two cases:
// (1) if they have the same flag, we need to check range(tail, sqIdx)
// (2) if they have different flags, we need to check range(tail, LoadQueueSize) and range(0, sqIdx)
......@@ -329,6 +326,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule with HasDCacheParamete
XSPerfAccumulate("vaddr_match_failed", vpmaskNotEqual)
XSPerfAccumulate("vaddr_match_really_failed", vaddrMatchFailed)
// Fast forward mask will be generated immediately (load_s1)
io.forward(i).forwardMaskFast := dataModule.io.forwardMaskFast(i)
// Forward result will be generated 1 cycle later (load_s2)
io.forward(i).forwardMask := dataModule.io.forwardMask(i)
io.forward(i).forwardData := dataModule.io.forwardData(i)
......
......@@ -93,6 +93,7 @@ class SQData8Module(numEntries: Int, numRead: Int, numWrite: Int, numForward: In
}
val needForward = Input(Vec(numForward, Vec(2, UInt(numEntries.W))))
val forwardValidFast = Vec(numForward, Output(Bool()))
val forwardValid = Vec(numForward, Output(Bool()))
val forwardData = Vec(numForward, Output(UInt(8.W)))
})
......@@ -149,31 +150,36 @@ class SQData8Module(numEntries: Int, numRead: Int, numWrite: Int, numForward: In
val l = a.asTypeOf(new FwdEntry)
val r = b.asTypeOf(new FwdEntry)
val res = Wire(new FwdEntry)
res.valid := l.valid || r.valid
res.validFast := l.validFast || r.validFast
// res.valid := l.valid || r.valid
res.valid := RegNext(res.validFast)
res.data := Mux(r.valid, r.data, l.data)
res
})
}
// paddrMatch is now included in io.needForward
// for (j <- 0 until numEntries) {
// paddrMatch(j) := io.forward(i).paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3)
// }
for (j <- 0 until numEntries) {
val needCheck0 = RegNext(io.needForward(i)(0)(j))
val needCheck1 = RegNext(io.needForward(i)(1)(j))
val needCheck0 = io.needForward(i)(0)(j)
val needCheck1 = io.needForward(i)(1)(j)
val needCheck0Reg = RegNext(needCheck0)
val needCheck1Reg = RegNext(needCheck1)
(0 until XLEN / 8).foreach(k => {
matchResultVec(j).valid := needCheck0 && data(j).valid
matchResultVec(j).validFast := needCheck0 && data(j).valid
matchResultVec(j).valid := needCheck0Reg && data(j).valid
matchResultVec(j).data := data(j).data
matchResultVec(numEntries + j).valid := needCheck1 && data(j).valid
matchResultVec(numEntries + j).validFast := needCheck1 && data(j).valid
matchResultVec(numEntries + j).valid := needCheck1Reg && data(j).valid
matchResultVec(numEntries + j).data := data(j).data
})
}
val parallelFwdResult = parallelFwd(matchResultVec).asTypeOf(new FwdEntry)
// validFast is generated the same cycle with query
io.forwardValidFast(i) := parallelFwdResult.validFast
// valid is generated 1 cycle after query request
io.forwardValid(i) := parallelFwdResult.valid
// data is generated 1 cycle after query request
io.forwardData(i) := parallelFwdResult.data
})
......@@ -201,6 +207,7 @@ class SQDataModule(numEntries: Int, numRead: Int, numWrite: Int, numForward: Int
}
val needForward = Input(Vec(numForward, Vec(2, UInt(numEntries.W))))
val forwardMaskFast = Vec(numForward, Output(Vec(8, Bool())))
val forwardMask = Vec(numForward, Output(Vec(8, Bool())))
val forwardData = Vec(numForward, Output(Vec(8, UInt(8.W))))
})
......@@ -245,6 +252,7 @@ class SQDataModule(numEntries: Int, numRead: Int, numWrite: Int, numForward: Int
// parallel fwd logic
for (j <- 0 until 8) {
data8(j).io.needForward(i) <> io.needForward(i)
io.forwardMaskFast(i) := VecInit((0 until 8).map(j => data8(j).io.forwardValidFast(i)))
io.forwardMask(i) := VecInit((0 until 8).map(j => data8(j).io.forwardValid(i)))
io.forwardData(i) := VecInit((0 until 8).map(j => data8(j).io.forwardData(i)))
}
......
......@@ -115,6 +115,7 @@ class LoadUnit_S1(implicit p: Parameters) extends XSModule {
val dtlbResp = Flipped(DecoupledIO(new TlbResp))
val dcachePAddr = Output(UInt(PAddrBits.W))
val dcacheKill = Output(Bool())
val fullForwardFast = Output(Bool())
val sbuffer = new LoadForwardQueryIO
val lsq = new PipeLoadForwardQueryIO
})
......@@ -152,6 +153,11 @@ class LoadUnit_S1(implicit p: Parameters) extends XSModule {
io.lsq.mask := s1_mask
io.lsq.pc := s1_uop.cf.pc // FIXME: remove it
// Generate forwardMaskFast to wake up insts earlier
val forwardMaskFast = io.lsq.forwardMaskFast.asUInt | io.sbuffer.forwardMaskFast.asUInt
io.fullForwardFast := (~forwardMaskFast & s1_mask) === 0.U
io.out.valid := io.in.valid// && !s1_tlb_miss
io.out.bits.paddr := s1_paddr
io.out.bits.mmio := s1_mmio && !s1_exception
......@@ -192,7 +198,13 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
val s2_mmio = io.in.bits.mmio && !s2_exception
val s2_cache_miss = io.dcacheResp.bits.miss
val s2_cache_replay = io.dcacheResp.bits.replay
// val cnt = RegInit(127.U)
// cnt := cnt + io.in.valid.asUInt
// val s2_forward_fail = io.lsq.matchInvalid || io.sbuffer.matchInvalid || cnt === 0.U
val s2_forward_fail = io.lsq.matchInvalid || io.sbuffer.matchInvalid
// assert(!s2_forward_fail)
io.dcacheResp.ready := true.B
......@@ -257,12 +269,16 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
io.out.bits := io.in.bits
io.out.bits.data := rdataPartialLoad
// when exception occurs, set it to not miss and let it write back to roq (via int port)
io.out.bits.miss := s2_cache_miss && !s2_exception && !s2_forward_fail
if (EnableFastForward) {
io.out.bits.miss := s2_cache_miss && !s2_exception && !s2_forward_fail && !fullForward
} else {
io.out.bits.miss := s2_cache_miss && !s2_exception && !s2_forward_fail
}
io.out.bits.uop.ctrl.fpWen := io.in.bits.uop.ctrl.fpWen && !s2_exception
io.out.bits.uop.cf.replayInst := s2_forward_fail && !s2_mmio // if forward fail, repaly this inst
io.out.bits.mmio := s2_mmio
// For timing reasons, we can not let
// For timing reasons, sometimes we can not let
// io.out.bits.miss := s2_cache_miss && !s2_exception && !fullForward
// We use io.dataForwarded instead. It means forward logic have prepared all data needed,
// and dcache query is no longer needed.
......@@ -329,10 +345,12 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper {
load_s2.io.dcacheResp <> io.dcache.resp
load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData
load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask
load_s2.io.lsq.forwardMaskFast <> io.lsq.forward.forwardMaskFast // should not be used in load_s2
load_s2.io.lsq.dataInvalid <> io.lsq.forward.dataInvalid
load_s2.io.lsq.matchInvalid <> io.lsq.forward.matchInvalid
load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData
load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask
load_s2.io.sbuffer.forwardMaskFast <> io.sbuffer.forwardMaskFast // should not be used in load_s2
load_s2.io.sbuffer.dataInvalid <> io.sbuffer.dataInvalid // always false
load_s2.io.sbuffer.matchInvalid <> io.sbuffer.matchInvalid
load_s2.io.dataForwarded <> io.lsq.loadDataForwarded
......@@ -348,8 +366,19 @@ class LoadUnit(implicit p: Parameters) extends XSModule with HasLoadHelper {
// load_s2.io.dcacheResp.bits.data := Mux1H(RegNext(io.dcache.s1_hit_way), RegNext(io.dcache.s1_data))
// assert(load_s2.io.dcacheResp.bits.data === io.dcache.resp.bits.data)
io.fastUop.valid := io.dcache.s1_hit_way.orR && !io.dcache.s1_disable_fast_wakeup && load_s1.io.in.valid &&
!load_s1.io.dcacheKill && !io.lsq.forward.dataInvalidFast
if (EnableFastForward) {
io.fastUop.valid := (io.dcache.s1_hit_way.orR || load_s1.io.fullForwardFast) && // dcache hit || full forward
!io.dcache.s1_disable_fast_wakeup && // load fast wakeup should be disabled when dcache data read is not ready
load_s1.io.in.valid && // valid laod request
!load_s1.io.dcacheKill && // not mmio or tlb miss
!io.lsq.forward.dataInvalidFast // forward failed
} else {
io.fastUop.valid := io.dcache.s1_hit_way.orR && // dcache hit
!io.dcache.s1_disable_fast_wakeup && // load fast wakeup should be disabled when dcache data read is not ready
load_s1.io.in.valid && // valid laod request
!load_s1.io.dcacheKill && // not mmio or tlb miss
!io.lsq.forward.dataInvalidFast // forward failed
}
io.fastUop.bits := load_s1.io.out.bits.uop
XSDebug(load_s0.io.out.valid,
......
......@@ -457,6 +457,9 @@ class NewSbuffer(implicit p: Parameters) extends XSModule with HasSbufferConst {
val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedInflightMaskFast = Mux1H(line_offset_mask, Mux1H(inflight_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedValidMaskFast = Mux1H(line_offset_mask, Mux1H(valid_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
forward.dataInvalid := false.B // data in store line merge buffer is always ready
forward.matchInvalid := tag_mismatch // paddr / vaddr cam result does not match
for (j <- 0 until DataBytes) {
......@@ -472,6 +475,8 @@ class NewSbuffer(implicit p: Parameters) extends XSModule with HasSbufferConst {
forward.forwardMask(j) := true.B
forward.forwardData(j) := selectedValidData(j)
}
forward.forwardMaskFast(j) := selectedInflightMaskFast(j) || selectedValidMaskFast(j)
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册