未验证 提交 a98b054b 编写于 作者: W William Wang 提交者: GitHub

Optimize memblock timing (#1268)

* sbuffer: do flush correctly while draining sbuffer

* mem: disable EnableFastForward for timing reasons

* sbuffer: optimize forward mask gen timing

* dcache: block main pipe req if refill req is valid

The refill req comes from the refill arbiter. There is no time left for an
index conflict check. Now we block all main pipe reqs when a refill
req comes from the miss queue.

* dcache: delay some resp signals for better timing

* dcache: optimize wbq enq entry select timing

* dcache: decouple missq req.valid to valid & cancel

* valid is fast; it is used to select which miss req will be sent to the
miss queue
* cancel can be slow to generate; it cancels the miss queue req at the
last moment

* sbuffer: optimize noSameBlockInflight check timing
上级 5e9021cf
......@@ -161,7 +161,7 @@ case class XSCoreParameters
StorePipelineWidth: Int = 2,
StoreBufferSize: Int = 16,
StoreBufferThreshold: Int = 7,
EnableFastForward: Boolean = true,
EnableFastForward: Boolean = false,
EnableLdVioCheckAfterReset: Boolean = true,
RefillSize: Int = 512,
MMUAsidLen: Int = 16, // max is 16, 0 is not supported now
......
......@@ -468,7 +468,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// atomics
// atomics not finished yet
io.lsu.atomics <> atomicsReplayUnit.io.lsu
atomicsReplayUnit.io.pipe_resp := mainPipe.io.atomic_resp
atomicsReplayUnit.io.pipe_resp := RegNext(mainPipe.io.atomic_resp)
//----------------------------------------
// miss queue
......@@ -478,13 +478,18 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// Request
val missReqArb = Module(new Arbiter(new MissReq, MissReqPortCount))
missReqArb.io.in(MainPipeMissReqPort) <> mainPipe.io.miss
missReqArb.io.in(MainPipeMissReqPort) <> mainPipe.io.miss_req
for (w <- 0 until LoadPipelineWidth) { missReqArb.io.in(w + 1) <> ldu(w).io.miss_req }
wb.io.miss_req.valid := missReqArb.io.out.valid
wb.io.miss_req.bits := missReqArb.io.out.bits.addr
block_decoupled(missReqArb.io.out, missQueue.io.req, wb.io.block_miss_req)
// block_decoupled(missReqArb.io.out, missQueue.io.req, wb.io.block_miss_req)
missReqArb.io.out <> missQueue.io.req
when(wb.io.block_miss_req) {
missQueue.io.req.bits.cancel := true.B
missReqArb.io.out.ready := false.B
}
// refill to load queue
io.lsu.lsq <> missQueue.io.refill_to_ldq
......@@ -494,7 +499,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bus.e <> missQueue.io.mem_finish
missQueue.io.probe_addr := bus.b.bits.address
missQueue.io.main_pipe_resp := mainPipe.io.atomic_resp
missQueue.io.main_pipe_resp := RegNext(mainPipe.io.atomic_resp)
//----------------------------------------
// probe
......@@ -513,18 +518,17 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
replacePipeStatusS0.valid := replacePipe.io.req.valid
replacePipeStatusS0.bits := get_idx(replacePipe.io.req.bits.vaddr)
val blockMainPipeReqs = Seq(
refillPipeStatus,
replacePipeStatusS0,
replacePipe.io.status.s1_set,
replacePipe.io.status.s2_set
)
val storeShouldBeBlocked = Cat(blockMainPipeReqs.map(r => r.valid && r.bits === io.lsu.store.req.bits.idx)).orR
val probeShouldBeBlocked = Cat(blockMainPipeReqs.map(r => r.valid && r.bits === get_idx(probeQueue.io.pipe_req.bits.vaddr))).orR
val storeShouldBeBlocked = refillPipeStatus.valid || Cat(blockMainPipeReqs.map(r => r.valid && r.bits === io.lsu.store.req.bits.idx)).orR
val probeShouldBeBlocked = refillPipeStatus.valid || Cat(blockMainPipeReqs.map(r => r.valid && r.bits === get_idx(probeQueue.io.pipe_req.bits.vaddr))).orR
block_decoupled(probeQueue.io.pipe_req, mainPipe.io.probe_req, probeShouldBeBlocked)
block_decoupled(io.lsu.store.req, mainPipe.io.store_req, storeShouldBeBlocked)
io.lsu.store.replay_resp := mainPipe.io.store_replay_resp
io.lsu.store.replay_resp := RegNext(mainPipe.io.store_replay_resp)
io.lsu.store.main_pipe_hit_resp := mainPipe.io.store_hit_resp
val mainPipeAtomicReqArb = Module(new Arbiter(new MainPipeReq, 2))
......@@ -532,7 +536,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
mainPipeAtomicReqArb.io.in(1) <> atomicsReplayUnit.io.pipe_req
mainPipe.io.atomic_req <> mainPipeAtomicReqArb.io.out
mainPipe.io.invalid_resv_set := wb.io.req.fire && wb.io.req.bits.addr === mainPipe.io.lrsc_locked_block.bits
mainPipe.io.invalid_resv_set := RegNext(wb.io.req.fire && wb.io.req.bits.addr === mainPipe.io.lrsc_locked_block.bits)
//----------------------------------------
// replace pipe
......@@ -555,7 +559,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
s.bits.way_en === missQueue.io.refill_pipe_req.bits.way_en
)).orR
block_decoupled(missQueue.io.refill_pipe_req, refillPipe.io.req, refillShouldBeBlocked)
io.lsu.store.refill_hit_resp := refillPipe.io.store_resp
io.lsu.store.refill_hit_resp := RegNext(refillPipe.io.store_resp)
//----------------------------------------
// wb
......@@ -566,8 +570,8 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bus.c <> wb.io.mem_release
wb.io.release_wakeup := refillPipe.io.release_wakeup
wb.io.release_update := mainPipe.io.release_update
io.lsu.release.valid := bus.c.fire()
io.lsu.release.bits.paddr := bus.c.bits.address
io.lsu.release.valid := RegNext(bus.c.fire())
io.lsu.release.bits.paddr := RegNext(bus.c.bits.address)
// connect bus d
missQueue.io.mem_grant.valid := false.B
......
......@@ -108,6 +108,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule {
val s1_vaddr = s1_req.addr
val s1_bank_oh = UIntToOH(addr_to_dcache_bank(s1_req.addr))
val s1_nack = RegNext(io.nack)
val s1_nack_data = !io.banked_data_read.ready
val s1_fire = s1_valid && s2_ready
s1_ready := !s1_valid || s1_fire
......@@ -154,8 +155,14 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule {
io.replace_access.bits.set := RegNext(get_idx(s1_req.addr))
io.replace_access.bits.way := RegNext(OHToUInt(s1_tag_match_way))
// TODO: optimize implementation
val s1_has_permission = s1_hit_coh.onAccess(s1_req.cmd)._1
val s1_new_hit_coh = s1_hit_coh.onAccess(s1_req.cmd)._3
val s1_hit = s1_tag_match && s1_has_permission && s1_hit_coh === s1_new_hit_coh
val s1_will_send_miss_req = s1_valid && !s1_nack && !s1_nack_data && !s1_hit
// tag ecc check
// (0 until nWays).foreach(w => assert(!RegNext(s1_valid && s1_tag_match_way(w) && cacheParams.tagCode.decode(io.meta_resp(w)).uncorrectable)))
// (0 until nWays).foreach(w => assert(!RegNext(s1_valid && s1_tag_match_way(w) && cacheParams.tagCode.decode(io.meta_resp(w)).uncorrectable)))
// --------------------------------------------------------------------------------
// stage 2
......@@ -209,8 +216,10 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule {
dump_pipeline_valids("LoadPipe s2", "s2_nack_hit", s2_valid && s2_nack_hit)
dump_pipeline_valids("LoadPipe s2", "s2_nack_no_mshr", s2_valid && s2_nack_no_mshr)
val s2_can_send_miss_req = RegEnable(s1_will_send_miss_req, s1_fire)
// send load miss to miss queue
io.miss_req.valid := s2_valid && !s2_nack_hit && !s2_nack_data && !s2_hit && !io.lsu.s2_kill
io.miss_req.valid := s2_valid && s2_can_send_miss_req
io.miss_req.bits := DontCare
io.miss_req.bits.source := s2_instrtype
io.miss_req.bits.cmd := s2_req.cmd
......@@ -220,6 +229,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule {
io.miss_req.bits.req_coh := s2_hit_coh
io.miss_req.bits.replace_coh := s2_repl_coh
io.miss_req.bits.replace_tag := s2_repl_tag
io.miss_req.bits.cancel := io.lsu.s2_kill
// send back response
val resp = Wire(ValidIO(new DCacheWordResp))
......
......@@ -88,7 +88,7 @@ class MainPipe(implicit p: Parameters) extends DCacheModule {
// probe queue
val probe_req = Flipped(DecoupledIO(new MainPipeReq))
// store miss go to miss queue
val miss = DecoupledIO(new MissReq)
val miss_req = DecoupledIO(new MissReq)
// store buffer
val store_req = Flipped(DecoupledIO(new DCacheLineReq))
val store_replay_resp = ValidIO(new DCacheLineResp)
......@@ -230,6 +230,10 @@ class MainPipe(implicit p: Parameters) extends DCacheModule {
val s1_tag = Mux(s1_need_replacement, s1_repl_tag, s1_hit_tag)
val s1_coh = Mux(s1_need_replacement, s1_repl_coh, s1_hit_coh)
val s1_has_permission = s1_hit_coh.onAccess(s1_req.cmd)._1
val s1_hit = s1_tag_match && s1_has_permission
val s1_pregen_can_go_to_mq = !s1_req.probe && !s1_req.miss && (s1_req.isStore || s1_req.isAMO) && !s1_hit
// s2: select data, return resp if this is a store miss
val s2_valid = RegInit(false.B)
val s2_req = RegEnable(s1_req, s1_fire)
......@@ -254,7 +258,8 @@ class MainPipe(implicit p: Parameters) extends DCacheModule {
// For a store req, it either hits and goes to s3, or miss and enter miss queue immediately
val s2_can_go_to_s3 = (s2_req.probe || s2_req.miss || (s2_req.isStore || s2_req.isAMO) && s2_hit) && s3_ready
val s2_can_go_to_mq = !s2_req.probe && !s2_req.miss && (s2_req.isStore || s2_req.isAMO) && !s2_hit
// val s2_can_go_to_mq = !s2_req.probe && !s2_req.miss && (s2_req.isStore || s2_req.isAMO) && !s2_hit
val s2_can_go_to_mq = RegEnable(s1_pregen_can_go_to_mq, s1_fire)
assert(RegNext(!(s2_valid && s2_can_go_to_s3 && s2_can_go_to_mq)))
val s2_can_go = s2_can_go_to_s3 || s2_can_go_to_mq
val s2_fire = s2_valid && s2_can_go
......@@ -265,7 +270,7 @@ class MainPipe(implicit p: Parameters) extends DCacheModule {
s2_valid := false.B
}
s2_ready := !s2_valid || s2_can_go
val replay = !io.miss.ready
val replay = !io.miss_req.ready
val data_resp = Wire(io.data_resp.cloneType)
data_resp := Mux(RegNext(s1_fire), io.data_resp, RegNext(data_resp))
......@@ -510,23 +515,24 @@ class MainPipe(implicit p: Parameters) extends DCacheModule {
io.data_read.bits.way_en := s1_way_en
io.data_read.bits.addr := s1_req.vaddr
io.miss.valid := s2_valid && s2_can_go_to_mq
val miss = io.miss.bits
miss := DontCare
miss.source := s2_req.source
miss.cmd := s2_req.cmd
miss.addr := s2_req.addr
miss.vaddr := s2_req.vaddr
miss.way_en := s2_way_en
miss.store_data := s2_req.store_data
miss.store_mask := s2_req.store_mask
miss.word_idx := s2_req.word_idx
miss.amo_data := s2_req.amo_data
miss.amo_mask := s2_req.amo_mask
miss.req_coh := s2_hit_coh
miss.replace_coh := s2_repl_coh
miss.replace_tag := s2_repl_tag
miss.id := s2_req.id
io.miss_req.valid := s2_valid && s2_can_go_to_mq
val miss_req = io.miss_req.bits
miss_req := DontCare
miss_req.source := s2_req.source
miss_req.cmd := s2_req.cmd
miss_req.addr := s2_req.addr
miss_req.vaddr := s2_req.vaddr
miss_req.way_en := s2_way_en
miss_req.store_data := s2_req.store_data
miss_req.store_mask := s2_req.store_mask
miss_req.word_idx := s2_req.word_idx
miss_req.amo_data := s2_req.amo_data
miss_req.amo_mask := s2_req.amo_mask
miss_req.req_coh := s2_hit_coh
miss_req.replace_coh := s2_repl_coh
miss_req.replace_tag := s2_repl_tag
miss_req.id := s2_req.id
miss_req.cancel := false.B
io.store_replay_resp.valid := s2_valid && s2_can_go_to_mq && replay && s2_req.isStore
io.store_replay_resp.bits.data := DontCare
......
......@@ -48,6 +48,16 @@ class MissReq(implicit p: Parameters) extends DCacheBundle {
val replace_tag = UInt(tagBits.W)
val id = UInt(reqIdWidth.W)
// For now, miss queue entry req is actually valid when req.valid && !cancel
// * req.valid is fast to generate
// * cancel is slow to generate, it will not be used until the last moment
//
// cancel may come from the following sources:
// 1. miss req blocked by writeback queue:
// a writeback req of the same address is in progress
// 2. pmp check failed
val cancel = Bool() // cancel is slow to generate, it will cancel missreq.valid
def isLoad = source === LOAD_SOURCE.U
def isStore = source === STORE_SOURCE.U
def isAMO = source === AMO_SOURCE.U
......@@ -59,6 +69,7 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
// MSHR ID
val id = Input(UInt(log2Up(cfg.nMissEntries).W))
// client requests
val req = Flipped(ValidIO(new MissReq))
// allocate this entry for new req
val primary_valid = Input(Bool())
// this entry is free and can be allocated to new reqs
......@@ -67,9 +78,8 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val secondary_ready = Output(Bool())
// this entry is busy and it can not merge the new req
val secondary_reject = Output(Bool())
val req = Flipped(ValidIO(new MissReq))
val refill_to_ldq = ValidIO(new Refill)
// TODO: bypass refill data to load pipe
// bus
val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
......@@ -128,7 +138,12 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val grant_beats = RegInit(0.U(beatBits.W))
when (io.req.valid && io.primary_ready && io.primary_valid) {
when (release_entry && req_valid) {
req_valid := false.B
}
val primary_fire = WireInit(io.req.valid && io.primary_ready && io.primary_valid && !io.req.bits.cancel)
when (primary_fire) {
req_valid := true.B
req := io.req.bits
req.addr := get_block_addr(io.req.bits.addr)
......@@ -155,11 +170,10 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
should_refill_data_reg := io.req.bits.isLoad
grant_beats := 0.U
}.elsewhen (release_entry) {
req_valid := false.B
}
when (io.req.valid && io.secondary_ready) {
val secondary_fire = WireInit(io.req.valid && io.secondary_ready && !io.req.bits.cancel)
when (secondary_fire) {
assert(io.req.bits.req_coh.state <= req.req_coh.state)
assert(!(io.req.bits.isAMO || req.isAMO))
// use the most uptodate meta
......@@ -386,24 +400,24 @@ class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
io.debug_early_replace.bits.idx := addr_to_dcache_set(req.vaddr)
io.debug_early_replace.bits.tag := req.replace_tag
XSPerfAccumulate("miss_req_primary", io.req.valid && io.primary_ready && io.primary_valid)
XSPerfAccumulate("miss_req_merged", io.req.valid && io.secondary_ready)
XSPerfAccumulate("miss_req_primary", primary_fire)
XSPerfAccumulate("miss_req_merged", secondary_fire)
XSPerfAccumulate("load_miss_penalty_to_use",
should_refill_data &&
BoolStopWatch(io.req.valid && io.primary_ready && io.primary_valid, io.refill_to_ldq.valid, true)
BoolStopWatch(primary_fire, io.refill_to_ldq.valid, true)
)
XSPerfAccumulate("main_pipe_penalty", BoolStopWatch(io.main_pipe_req.fire(), io.main_pipe_resp))
XSPerfAccumulate("penalty_blocked_by_channel_A", io.mem_acquire.valid && !io.mem_acquire.ready)
XSPerfAccumulate("penalty_waiting_for_channel_D", s_acquire && !w_grantlast && !io.mem_grant.valid)
XSPerfAccumulate("penalty_waiting_for_channel_E", io.mem_finish.valid && !io.mem_finish.ready)
XSPerfAccumulate("penalty_from_grant_to_refill", !s_refill && w_grantlast)
XSPerfAccumulate("soft_prefetch_number", io.req.valid && io.primary_ready && io.primary_valid && io.req.bits.source === SOFT_PREFETCH.U)
XSPerfAccumulate("soft_prefetch_number", primary_fire && io.req.bits.source === SOFT_PREFETCH.U)
val (mshr_penalty_sample, mshr_penalty) = TransactionLatencyCounter(RegNext(io.req.valid && io.primary_ready && io.primary_valid), release_entry)
val (mshr_penalty_sample, mshr_penalty) = TransactionLatencyCounter(RegNext(primary_fire), release_entry)
XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 0, 20, 1, true, true)
XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 20, 100, 10, true, false)
val load_miss_begin = io.req.valid && io.primary_ready && io.primary_valid && io.req.bits.isLoad
val load_miss_begin = primary_fire && io.req.bits.isLoad
val refill_finished = RegNext(!w_grantlast && refill_done) && should_refill_data
val (load_miss_penalty_sample, load_miss_penalty) = TransactionLatencyCounter(load_miss_begin, refill_finished) // not real refill finish time
XSPerfHistogram("load_miss_penalty_to_use", load_miss_penalty, load_miss_penalty_sample, 0, 20, 1, true, true)
......@@ -458,15 +472,9 @@ class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
val probe_block_vec = entries.map { case e => e.io.block_addr.valid && e.io.block_addr.bits === io.probe_addr }
val merge = Cat(secondary_ready_vec).orR
// val merge_idx = PriorityEncoder(secondary_ready_vec)
val reject = Cat(secondary_reject_vec).orR
val reject = Cat(secondary_reject_vec).orR || io.req.bits.cancel
val alloc = !reject && !merge && Cat(primary_ready_vec).orR
// val alloc_idx = PriorityEncoder(primary_ready_vec)
val accept = alloc || merge
// val entry_idx = Mux(alloc, alloc_idx, merge_idx)
assert(RegNext(PopCount(secondary_ready_vec) <= 1.U))
// assert(RegNext(PopCount(secondary_reject_vec) <= 1.U))
......
......@@ -54,9 +54,14 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
{
val io = IO(new Bundle {
val id = Input(UInt())
// allocate this entry for new req
val primary_valid = Input(Bool())
// this entry is free and can be allocated to new reqs
val primary_ready = Output(Bool())
// this entry is busy, but it can merge the new req
val secondary_ready = Output(Bool())
val req = Flipped(DecoupledIO(new WritebackReq))
val merge = Output(Bool())
val mem_release = DecoupledIO(new TLBundleC(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
......@@ -101,7 +106,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
// --------------------------------------------------------------------------------
// s_invalid: receive requests
// new req entering
when (io.req.fire()) {
when (io.req.valid && io.primary_valid && io.primary_ready) {
assert (remain === 0.U)
req := io.req.bits
when (io.req.bits.delay_release) {
......@@ -114,8 +119,6 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
// --------------------------------------------------------------------------------
// s_sleep: wait for refill pipe to inform me that I can keep releasing
val merge_probe = WireInit(false.B)
io.merge := WireInit(false.B)
when (state === s_sleep) {
assert (remain === 0.U)
......@@ -126,9 +129,7 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
req.data := mergeData(req.data, io.release_update.bits.data, io.release_update.bits.mask)
}
io.merge := !io.req.bits.voluntary && io.req.bits.addr === req.addr
merge_probe := io.req.valid && io.merge
when (merge_probe) {
when (io.req.valid && io.secondary_ready) {
state := s_release_req
req.voluntary := false.B
req.param := req.param
......@@ -220,8 +221,8 @@ class WritebackEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
// 2. When this entry wants to release while still waiting for release_wakeup signal,
// and a probe req with the same addr comes. In this case we merge probe with release,
// handle this probe, so we don't need another release.
io.req.ready := state === s_invalid ||
state === s_sleep && !io.req.bits.voluntary && io.req.bits.addr === req.addr
io.primary_ready := state === s_invalid
io.secondary_ready := state === s_sleep && !io.req.bits.voluntary && io.req.bits.addr === req.addr
// performance counters
XSPerfAccumulate("wb_req", io.req.fire())
......@@ -247,14 +248,6 @@ class WritebackQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
require(cfg.nReleaseEntries > cfg.nMissEntries)
// allocate a free entry for incoming request
val primary_ready = Wire(Vec(cfg.nReleaseEntries, Bool()))
val merge_vec = Wire(Vec(cfg.nReleaseEntries, Bool()))
val allocate = primary_ready.asUInt.orR
val merge = merge_vec.asUInt.orR
val alloc_idx = PriorityEncoder(Mux(merge, merge_vec, primary_ready))
// delay writeback req
val DelayWritebackReq = true
val req_delayed = Wire(Flipped(DecoupledIO(new WritebackReq)))
......@@ -284,9 +277,17 @@ class WritebackQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
io.req.ready := !req_delayed_valid || req_delayed.fire()
dontTouch(req_delayed)
val req = req_delayed
// allocate a free entry for incoming request
val block_conflict = Wire(Bool())
val accept = merge || allocate && !block_conflict
val primary_ready_vec = Wire(Vec(cfg.nReleaseEntries, Bool()))
val secondary_ready_vec = Wire(Vec(cfg.nReleaseEntries, Bool()))
val merge = Cat(secondary_ready_vec).orR
val alloc = !merge && Cat(primary_ready_vec).orR && !block_conflict
// Now we block release until last release of that block is finished
// TODO: Is it possible to merge these release req?
val req = req_delayed
val accept = merge || alloc
req.ready := accept
// assign default values to output signals
......@@ -296,28 +297,35 @@ class WritebackQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModu
require(isPow2(cfg.nMissEntries))
val grant_source = io.mem_grant.bits.source
val entries = (0 until cfg.nReleaseEntries) map { i =>
val entry = Module(new WritebackEntry(edge))
val entry_id = (i + releaseIdBase).U
entry.io.id := entry_id
// entry req
entry.io.req.valid := (i.U === alloc_idx) && req.valid && accept
primary_ready(i) := entry.io.req.ready
merge_vec(i) := entry.io.merge
entry.io.req.bits := req.bits
entry.io.mem_grant.valid := (entry_id === grant_source) && io.mem_grant.valid
entry.io.mem_grant.bits := io.mem_grant.bits
when (entry_id === grant_source) {
io.mem_grant.ready := entry.io.mem_grant.ready
}
entry.io.release_wakeup := io.release_wakeup
entry.io.release_update := io.release_update
entry
val entries = Seq.fill(cfg.nReleaseEntries)(Module(new WritebackEntry(edge)))
entries.zipWithIndex.foreach {
case (entry, i) =>
val former_primary_ready = if(i == 0)
false.B
else
Cat((0 until i).map(j => entries(j).io.primary_ready)).orR
val entry_id = (i + releaseIdBase).U
entry.io.id := entry_id
// entry req
entry.io.req.valid := req.valid
primary_ready_vec(i) := entry.io.primary_ready
secondary_ready_vec(i) := entry.io.secondary_ready
entry.io.req.bits := req.bits
entry.io.primary_valid := alloc &&
!former_primary_ready &&
entry.io.primary_ready
entry.io.mem_grant.valid := (entry_id === grant_source) && io.mem_grant.valid
entry.io.mem_grant.bits := io.mem_grant.bits
when (entry_id === grant_source) {
io.mem_grant.ready := entry.io.mem_grant.ready
}
entry.io.release_wakeup := io.release_wakeup
entry.io.release_update := io.release_update
}
block_conflict := VecInit(entries.map(e => e.io.block_addr.valid && e.io.block_addr.bits === req.bits.addr)).asUInt.orR
......
......@@ -386,7 +386,11 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
when (io.in.bits.isSoftPrefetch) {
io.rsFeedback.bits.hit := (!s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception))
}.otherwise {
io.rsFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception || fullForward) && !s2_data_invalid
if (EnableFastForward) {
io.rsFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception || fullForward) && !s2_data_invalid
} else {
io.rsFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception) && !s2_data_invalid
}
}
io.rsFeedback.bits.rsIdx := io.in.bits.rsIdx
io.rsFeedback.bits.flushState := io.in.bits.ptwBack
......@@ -400,7 +404,11 @@ class LoadUnit_S2(implicit p: Parameters) extends XSModule with HasLoadHelper {
io.rsFeedback.bits.dataInvalidSqIdx.flag := DontCare
// s2_cache_replay is quite slow to generate, send it separately to LQ
io.needReplayFromRS := s2_cache_replay && !fullForward
if (EnableFastForward) {
io.needReplayFromRS := s2_cache_replay && !fullForward
} else {
io.needReplayFromRS := s2_cache_replay
}
// fast load to load forward
io.fastpath.valid := io.in.valid // for debug only
......
......@@ -50,12 +50,15 @@ trait HasSbufferConst extends HasXSParameter {
class SbufferEntryState (implicit p: Parameters) extends SbufferBundle {
val state_valid = Bool() // this entry is active
val state_inflight = Bool() // sbuffer is trying to write this entry to dcache
val w_timeout = Bool() // waiting for resend store pipeline req timeout
val w_timeout = Bool() // with timeout resp, waiting for resend store pipeline req timeout
val w_sameblock_inflight = Bool() // same cache block dcache req is inflight
val s_recheck_inflight = Bool() // recheck if same cache block dcache req is inflight
def isInvalid(): Bool = !state_valid
def isValid(): Bool = state_valid
def isActive(): Bool = state_valid && !state_inflight
def isInflight(): Bool = state_inflight
def isDcacheReqCandidate(): Bool = state_valid && !state_inflight && !w_sameblock_inflight
}
class SbufferBundle(implicit p: Parameters) extends XSBundle with HasSbufferConst
......@@ -114,6 +117,7 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
val ptag = Reg(Vec(StoreBufferSize, UInt(PTagWidth.W)))
val vtag = Reg(Vec(StoreBufferSize, UInt(VTagWidth.W)))
val mask = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val waitInflightMask = Reg(Vec(StoreBufferSize, UInt(StoreBufferSize.W)))
val data = dataModule.io.dataOut
val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U.asTypeOf(new SbufferEntryState))))
val cohCount = RegInit(VecInit(Seq.fill(StoreBufferSize)(0.U(EvictCountBits.W))))
......@@ -236,7 +240,12 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
io.in(1).ready := secondCanInsert && !sameWord && io.in(0).ready
def wordReqToBufLine(req: DCacheWordReq, reqptag: UInt, reqvtag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = {
val sameBlockInflightMask = genSameBlockInflightMask(reqptag)
stateVec(insertIdx).state_valid := true.B
stateVec(insertIdx).w_sameblock_inflight := sameBlockInflightMask.orR // set w_sameblock_inflight when a line is first allocated
when(sameBlockInflightMask.orR){
waitInflightMask(insertIdx) := sameBlockInflightMask
}
cohCount(insertIdx) := 0.U
// missqReplayCount(insertIdx) := 0.U
ptag(insertIdx) := reqptag
......@@ -349,7 +358,9 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
}
}
is(x_drain_sbuffer){
when(sbuffer_empty){
when(io.flush.valid){
sbuffer_state := x_drain_all
}.elsewhen(sbuffer_empty){
sbuffer_state := x_idle
}
}
......@@ -370,6 +381,16 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
!Cat(widthMap(i => inflightMask(i) && ptag(idx) === ptag(i))).orR()
}
def genSameBlockInflightMask(ptag_in: UInt): UInt = {
val mask = VecInit(widthMap(i => inflightMask(i) && ptag_in === ptag(i))).asUInt // quite slow, use it with care
assert(!(PopCount(mask) > 1.U))
mask
}
def haveSameBlockInflight(ptag_in: UInt): Bool = {
genSameBlockInflightMask(ptag_in).orR
}
val need_drain = needDrain(sbuffer_state)
val need_replace = do_eviction || (sbuffer_state === x_replace)
val evictionIdx = Mux(missqReplayHasTimeOut,
......@@ -385,7 +406,8 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
current eviction should be blocked.
*/
val prepareValid = missqReplayHasTimeOut ||
activeMask(evictionIdx) && (need_drain || cohHasTimeOut || need_replace) && noSameBlockInflight(evictionIdx)
stateVec(evictionIdx).isDcacheReqCandidate() && (need_drain || cohHasTimeOut || need_replace)
assert(!(stateVec(evictionIdx).isDcacheReqCandidate && !noSameBlockInflight(evictionIdx)))
val prepareValidReg = RegInit(false.B)
// when canSendDcacheReq, send dcache req stored in pipeline reg to dcache
val canSendDcacheReq = io.dcache.req.ready || !prepareValidReg
......@@ -434,12 +456,16 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
p"send buf [$evictionIdxReg] to Dcache, req fire\n"
)
// TODO: for timing reasons, dcache store pipe resp may need to be delayed
// update sbuffer status according to dcache resp source
def id_to_sbuffer_id(id: UInt): UInt = {
require(id.getWidth >= log2Up(StoreBufferSize))
id(log2Up(StoreBufferSize)-1, 0)
}
// hit resp
io.dcache.hit_resps.map(resp => {
val dcache_resp_id = resp.bits.id
val dcache_resp_id = resp.bits.id
when (resp.fire()) {
stateVec(dcache_resp_id).state_inflight := false.B
stateVec(dcache_resp_id).state_valid := false.B
......@@ -447,8 +473,25 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
assert(!resp.bits.miss) // not need to resp if miss, to be opted
assert(stateVec(dcache_resp_id).state_inflight === true.B)
}
// Update w_sameblock_inflight flag is delayed for 1 cycle
//
// When a new req allocate a new line in sbuffer, sameblock_inflight check will ignore
// current dcache.hit_resps. Then, in the next cycle, we have plenty of time to check
// if the same block is still inflight
(0 until StoreBufferSize).map(i => {
when(
stateVec(i).w_sameblock_inflight &&
stateVec(i).state_valid &&
RegNext(resp.fire()) &&
waitInflightMask(i) === UIntToOH(RegNext(id_to_sbuffer_id(dcache_resp_id)))
){
stateVec(i).w_sameblock_inflight := false.B
}
})
})
// replay resp
val replay_resp_id = io.dcache.replay_resp.bits.id
when (io.dcache.replay_resp.fire()) {
......@@ -511,19 +554,26 @@ class Sbuffer(implicit p: Parameters) extends DCacheModule with HasSbufferConst
val valid_tag_match_reg = valid_tag_matches.map(RegNext(_))
val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_))
val line_offset_reg = RegNext(line_offset_mask)
val forward_mask_candidate_reg = RegEnable(
VecInit(mask.map(entry => entry(getWordOffset(forward.paddr)))),
forward.valid
)
val forward_data_candidate_reg = RegEnable(
VecInit(data.map(entry => entry(getWordOffset(forward.paddr)))),
forward.valid
)
val selectedValidMask = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedValidMask = Mux1H(valid_tag_match_reg, forward_mask_candidate_reg)
val selectedValidData = Mux1H(valid_tag_match_reg, forward_data_candidate_reg)
selectedValidMask.suggestName("selectedValidMask_"+i)
selectedValidData.suggestName("selectedValidData_"+i)
val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedInflightMask = Mux1H(inflight_tag_match_reg, forward_mask_candidate_reg)
val selectedInflightData = Mux1H(inflight_tag_match_reg, forward_data_candidate_reg)
selectedInflightMask.suggestName("selectedInflightMask_"+i)
selectedInflightData.suggestName("selectedInflightData_"+i)
// currently not being used
val selectedInflightMaskFast = Mux1H(line_offset_mask, Mux1H(inflight_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedValidMaskFast = Mux1H(line_offset_mask, Mux1H(valid_tag_matches, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册