From 7a919e05a2f06bab282b84b82a947336ef0c9e7b Mon Sep 17 00:00:00 2001 From: William Wang Date: Wed, 3 Aug 2022 19:03:11 +0800 Subject: [PATCH] dcache: delay wbq data update for 1 cycle (#1701) This commit adds an extra cycle for miss queue store data and mask write. For now, there are 18 missqueue entries. Each entry has a 512 bit data reg and a 64 bit mask reg. If we update writeback queue data in 1 cycle, the fanout will be at least 18x(512+64) = 10368. Now writeback queue req meta update is unchanged; however, data and mask update will happen 1 cycle after req fire or release update fire (T0). In T0, data and meta will be written to a buffer in missqueue. In T1, s_data_merge or s_data_override in each missqueue entry will be used as data and mask wen. --- .../dcache/mainpipe/WritebackQueue.scala | 121 ++++++++++++++---- 1 file changed, 95 insertions(+), 26 deletions(-) diff --git a/src/main/scala/xiangshan/cache/dcache/mainpipe/WritebackQueue.scala b/src/main/scala/xiangshan/cache/dcache/mainpipe/WritebackQueue.scala index f023b6eb0..1331c3c70 100644 --- a/src/main/scala/xiangshan/cache/dcache/mainpipe/WritebackQueue.scala +++ b/src/main/scala/xiangshan/cache/dcache/mainpipe/WritebackQueue.scala @@ -24,7 +24,7 @@ import freechips.rocketchip.tilelink.{TLArbiter, TLBundleC, TLBundleD, TLEdgeOut import huancun.DirtyKey import utils.{HasPerfEvents, HasTLDump, XSDebug, XSPerfAccumulate} -class WritebackReq(implicit p: Parameters) extends DCacheBundle { +class WritebackReqWodata(implicit p: Parameters) extends DCacheBundle { val addr = UInt(PAddrBits.W) val addr_dup_0 = UInt(PAddrBits.W) val addr_dup_1 = UInt(PAddrBits.W) @@ -32,15 +32,47 @@ class WritebackReq(implicit p: Parameters) extends DCacheBundle { val voluntary = Bool() val hasData = Bool() val dirty = Bool() - val data = UInt((cfg.blockBytes * 8).W) val delay_release = Bool() val miss_id = UInt(log2Up(cfg.nMissEntries).W) def dump() = { + XSDebug("WritebackReq addr: %x param: %d voluntary: %b hasData: %b\n",
+ addr, param, voluntary, hasData) + } +} + +class WritebackReqData(implicit p: Parameters) extends DCacheBundle { + val data = UInt((cfg.blockBytes * 8).W) +} + +class WritebackReq(implicit p: Parameters) extends WritebackReqWodata { + val data = UInt((cfg.blockBytes * 8).W) + + override def dump() = { XSDebug("WritebackReq addr: %x param: %d voluntary: %b hasData: %b data: %x\n", addr, param, voluntary, hasData, data) } + + def toWritebackReqWodata(): WritebackReqWodata = { + val out = Wire(new WritebackReqWodata) + out.addr := addr + out.addr_dup_0 := addr_dup_0 + out.addr_dup_1 := addr_dup_1 + out.param := param + out.voluntary := voluntary + out.hasData := hasData + out.dirty := dirty + out.delay_release := delay_release + out.miss_id := miss_id + out + } + + def toWritebackReqData(): WritebackReqData = { + val out = Wire(new WritebackReqData) + out.data := data + out + } } // While a Release sleeps and waits for a refill to wake it up, @@ -53,6 +85,16 @@ class ReleaseUpdate(implicit p: Parameters) extends DCacheBundle { val data = UInt((cfg.blockBytes * 8).W) } +// To reduce fanout, miss queue entry data is updated 1 cycle +// after ReleaseUpdate.fire() +class MissQueueEntryReleaseUpdate(implicit p: Parameters) extends DCacheBundle { + // only consider store here + val addr = UInt(PAddrBits.W) + val mask_delayed = UInt(DCacheBanks.W) + val data_delayed = UInt((cfg.blockBytes * 8).W) + val mask_orr = Bool() +} + class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule with HasTLDump { val io = IO(new Bundle { @@ -64,7 +106,8 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu // this entry is busy, but it can merge the new req val secondary_valid = Input(Bool()) val secondary_ready = Output(Bool()) - val req = Flipped(DecoupledIO(new WritebackReq)) + val req = Flipped(DecoupledIO(new WritebackReqWodata)) + val req_data = Input(new WritebackReqData) val mem_release = DecoupledIO(new 
TLBundleC(edge.bundle)) val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle))) @@ -72,7 +115,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu val block_addr = Output(Valid(UInt())) val release_wakeup = Flipped(ValidIO(UInt(log2Up(cfg.nMissEntries).W))) - val release_update = Flipped(ValidIO(new ReleaseUpdate)) + val release_update = Flipped(ValidIO(new MissQueueEntryReleaseUpdate)) }) val s_invalid :: s_sleep :: s_release_req :: s_release_resp :: Nil = Enum(4) @@ -98,9 +141,19 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu remain_dup_0 := (remain_dup_0 | remain_set) & ~remain_clr remain_dup_1 := (remain_dup_1 | remain_set) & ~remain_clr - val busy = remain.orR + // writeback queue data + val data = Reg(UInt((cfg.blockBytes * 8).W)) + + // pending data write + // !s_data_override means there is an in-progress data write + val s_data_override = RegInit(true.B) + // !s_data_merge means there is an in-progress data merge + val s_data_merge = RegInit(true.B) - val req = Reg(new WritebackReq) + // there are valid request that can be sent to release bus + val busy = remain.orR && s_data_override && s_data_merge // have remain beats and data write finished + + val req = Reg(new WritebackReqWodata) // assign default signals to output signals io.req.ready := false.B @@ -110,6 +163,9 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu io.block_addr.valid := state =/= s_invalid io.block_addr.bits := req.addr + s_data_override := true.B // data_override takes only 1 cycle + s_data_merge := true.B // data_merge takes only 1 cycle + when (state =/= s_invalid) { XSDebug("WritebackEntry: %d state: %d block_addr: %x\n", io.id, state, io.block_addr.bits) @@ -126,6 +182,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu when (io.req.valid && io.primary_valid && io.primary_ready) { assert (remain === 0.U) req := io.req.bits + 
s_data_override := false.B when (io.req.bits.delay_release) { state := s_sleep state_dup_0 := s_sleep @@ -148,9 +205,9 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu val update = io.release_update.valid && io.release_update.bits.addr === req.addr when (update) { - req.hasData := req.hasData || io.release_update.bits.mask.orR - req.dirty := req.dirty || io.release_update.bits.mask.orR - req.data := mergeData(req.data, io.release_update.bits.data, io.release_update.bits.mask) + req.hasData := req.hasData || io.release_update.bits.mask_orr + req.dirty := req.dirty || io.release_update.bits.mask_orr + s_data_merge := false.B }.elsewhen (merge) { state := s_release_req state_dup_0 := s_release_req @@ -159,11 +216,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu req.param := req.param req.hasData := req.hasData || io.req.bits.hasData req.dirty := req.dirty || io.req.bits.dirty - req.data := Mux( - io.req.bits.hasData, - io.req.bits.data, - req.data - ) + s_data_override := !io.req.bits.hasData // update data when io.req.bits.hasData req.delay_release := false.B remain_set := Mux(req.hasData || io.req.bits.hasData, ~0.U(refillCycles.W), 1.U(refillCycles.W)) } @@ -174,7 +227,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu state_dup_1 := s_release_req req.delay_release := false.B remain_set := Mux( - req.hasData || update && io.release_update.bits.mask.orR || merge && io.req.bits.hasData, + req.hasData || update && io.release_update.bits.mask_orr || merge && io.req.bits.hasData, ~0.U(refillCycles.W), 1.U(refillCycles.W) ) @@ -188,7 +241,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu val beat_data = Wire(Vec(refillCycles, UInt(beatBits.W))) for (i <- 0 until refillCycles) { - beat_data(i) := req.data((i + 1) * beatBits - 1, i * beatBits) + beat_data(i) := data((i + 1) * beatBits - 1, i * beatBits) } val probeResponse = 
edge.ProbeAck( @@ -255,7 +308,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu def toWritebackReq = { val r = Wire(new WritebackReq()) - r.data := req.data + r.data := data r.addr := req.addr r.addr_dup_0 := req.addr_dup_0 r.addr_dup_1 := req.addr_dup_1 @@ -298,11 +351,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu req.voluntary := false.B req.hasData := req.hasData || io.req.bits.hasData req.dirty := req.dirty || io.req.bits.dirty - req.data := Mux( - io.req.bits.hasData, - io.req.bits.data, - req.data - ) + s_data_override := !io.req.bits.hasData // update data only when io.req.bits.hasData req.delay_release := false.B remain_set := Mux(req.hasData || io.req.bits.hasData, ~0.U(refillCycles.W), 1.U(refillCycles.W)) } @@ -411,6 +460,17 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu io.primary_ready := state_dup_1 === s_invalid io.secondary_ready := state_dup_1 =/= s_invalid && io.req.bits.addr === req.addr + // data update logic + when (!s_data_merge) { + data := mergeData(data, io.release_update.bits.data_delayed, io.release_update.bits.mask_delayed) + } + + when (!s_data_override) { + data := io.req_data.data + } + + assert(!RegNext(!s_data_merge && !s_data_override)) + // performance counters XSPerfAccumulate("wb_req", io.req.fire()) XSPerfAccumulate("wb_release", state === s_release_req && release_done && req.voluntary) @@ -448,6 +508,16 @@ class WritebackQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu io.mem_release.bits := DontCare io.mem_grant.ready := false.B + // delay data write in miss queue release update for 1 cycle + val release_update_bits_for_entry = Wire(new MissQueueEntryReleaseUpdate) + release_update_bits_for_entry.addr := io.release_update.bits.addr + release_update_bits_for_entry.mask_delayed := RegEnable(io.release_update.bits.mask, io.release_update.valid) + release_update_bits_for_entry.data_delayed := RegEnable(io.release_update.bits.data, io.release_update.valid) +
release_update_bits_for_entry.mask_orr := io.release_update.bits.mask.orR + + // delay data write in miss queue req for 1 cycle + val req_data = RegEnable(io.req.bits.toWritebackReqData(), io.req.valid) + require(isPow2(cfg.nMissEntries)) val grant_source = io.mem_grant.bits.source val entries = Seq.fill(cfg.nReleaseEntries)(Module(new WritebackEntry(edge))) @@ -466,6 +536,7 @@ class WritebackQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu primary_ready_vec(i) := entry.io.primary_ready secondary_ready_vec(i) := entry.io.secondary_ready entry.io.req.bits := io.req.bits + entry.io.req_data := req_data entry.io.primary_valid := alloc && !former_primary_ready && @@ -474,12 +545,10 @@ class WritebackQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu entry.io.mem_grant.valid := (entry_id === grant_source) && io.mem_grant.valid entry.io.mem_grant.bits := io.mem_grant.bits -// when (entry_id === grant_source) { -// io.mem_grant.ready := entry.io.mem_grant.ready -// } entry.io.release_wakeup := io.release_wakeup - entry.io.release_update := io.release_update + entry.io.release_update.valid := io.release_update.valid + entry.io.release_update.bits := release_update_bits_for_entry // data write delayed } assert(RegNext(!(io.mem_grant.valid && !io.mem_grant.ready))) io.mem_grant.ready := true.B -- GitLab