Commit b3fc7151 authored by Allen

DCache, lsroq: for load miss, do not use replay;

instead, send the data directly to the lsroq to shorten the load miss penalty.
Parent dbc85dae
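For context, a condensed sketch (an editor's summary, not part of this diff) of the new load-miss path on the lsroq side. It paraphrases the Lsroq and DCache changes further down; every name used (DCacheLineIO, missRefillSelVec, missRefillSel, blockWords, DataBits, get_block_addr, get_word, mergeRefillData, listening) is taken from that code:

// the lsroq issues a block-wide read over the new DCacheLineIO for the selected missing load
io.dcache.req.valid     := missRefillSelVec.asUInt.orR
io.dcache.req.bits.cmd  := MemoryOpConstants.M_XRD
io.dcache.req.bits.addr := get_block_addr(data(missRefillSel).paddr)

// when the whole line comes back, slice it into words and fill every entry that was
// listening on this block, merging in any store-forwarded bytes
when (io.dcache.resp.fire()) {
  val words = VecInit((0 until blockWords).map { w =>
    io.dcache.resp.bits.data(DataBits * (w + 1) - 1, DataBits * w)
  })
  for (i <- 0 until LsroqSize) {
    val blockMatch = get_block_addr(data(i).paddr) === io.dcache.resp.bits.meta.paddr
    when (allocated(i) && listening(i) && blockMatch) {
      data(i).data := mergeRefillData(words(get_word(data(i).paddr)),
                                      data(i).fwdData.asUInt, data(i).fwdMask.asUInt)
      valid(i)     := true.B
      listening(i) := false.B
    }
  }
}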
......@@ -22,6 +22,7 @@ trait HasL1CacheParameters extends HasXSParameter
def nSets = cacheParams.nSets
def nWays = cacheParams.nWays
def blockBytes = cacheParams.blockBytes
def blockBits = blockBytes * 8
def idxBits = log2Up(cacheParams.nSets)
def wayBits = log2Up(nWays)
......
......@@ -11,7 +11,7 @@ import utils.{XSDebug}
class AtomicsPipe extends DCacheModule
{
val io = IO(new DCacheBundle{
val lsu = Flipped(new DCacheLoadIO)
val lsu = Flipped(new DCacheWordIO)
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val data_write = DecoupledIO(new L1DataWriteReq)
......@@ -199,7 +199,7 @@ class AtomicsPipe extends DCacheModule
val resp = Wire(ValidIO(new DCacheResp))
val resp = Wire(ValidIO(new DCacheWordResp))
resp.valid := s2_valid
resp.bits.data := Mux(s2_sc, s2_sc_resp, s2_data_word)
resp.bits.meta := s2_req.meta
......
......@@ -10,8 +10,8 @@ import utils.XSDebug
class AtomicsMissQueue extends DCacheModule
{
val io = IO(new DCacheBundle {
val lsu = Flipped(new DCacheLoadIO)
val replay = new DCacheLoadIO
val lsu = Flipped(new DCacheWordIO)
val replay = new DCacheWordIO
val miss_req = DecoupledIO(new MissReq)
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
......@@ -22,7 +22,7 @@ class AtomicsMissQueue extends DCacheModule
val id = 0.U
val req = Reg(new DCacheWordReq)
val resp = Reg(new DCacheResp)
val resp = Reg(new DCacheWordResp)
val req_block_addr = get_block_addr(req.addr)
val reg_miss_resp = Reg(new MissResp)
......
......@@ -45,7 +45,7 @@ class DCacheLineReq extends DCacheBundle
val meta = new DCacheMeta
}
class DCacheResp extends DCacheBundle
class DCacheWordResp extends DCacheBundle
{
val data = UInt(DataBits.W)
val meta = new DCacheMeta
......@@ -55,25 +55,35 @@ class DCacheResp extends DCacheBundle
val nack = Bool()
}
class DCacheLoadIO extends DCacheBundle
class DCacheLineResp extends DCacheBundle
{
val data = UInt((cfg.blockBytes * 8).W)
val meta = new DCacheMeta
// cache req missed, send it to miss queue
val miss = Bool()
// cache req nacked, replay it later
val nack = Bool()
}
class DCacheWordIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheWordReq )
val resp = Flipped(DecoupledIO(new DCacheResp))
val resp = Flipped(DecoupledIO(new DCacheWordResp))
// kill previous cycle's req
val s1_kill = Output(Bool())
}
class DCacheStoreIO extends DCacheBundle
class DCacheLineIO extends DCacheBundle
{
val req = DecoupledIO(new DCacheLineReq )
val resp = Flipped(DecoupledIO(new DCacheResp))
val resp = Flipped(DecoupledIO(new DCacheLineResp))
}
class DCacheToLsuIO extends DCacheBundle {
val load = Vec(LoadPipelineWidth, Flipped(new DCacheLoadIO)) // for speculative load
val lsroq = Flipped(new DCacheLoadIO) // lsroq load/store
val store = Flipped(new DCacheStoreIO) // for sbuffer
val atomics = Flipped(new DCacheLoadIO) // atomics reqs
val load = Vec(LoadPipelineWidth, Flipped(new DCacheWordIO)) // for speculative load
val lsroq = Flipped(new DCacheLineIO) // lsroq load/store
val store = Flipped(new DCacheLineIO) // for sbuffer
val atomics = Flipped(new DCacheWordIO) // atomics reqs
}
class DCacheIO extends DCacheBundle {
......@@ -185,24 +195,27 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// To simplify port arbitration
// WritebackUnit and StorePipe use port 0
val DataReadPortCount = 4
val DataReadPortCount = 5
val WritebackDataReadPort = 0
val StorePipeDataReadPort = 1
val LoadPipeDataReadPort = 2
val AtomicsPipeDataReadPort = 3
val LoadMissDataReadPort = 4
val dataReadArb = Module(new Arbiter(new L1DataReadReq, DataReadPortCount))
dataReadArb.io.in(WritebackDataReadPort) <> wb.io.data_req
dataReadArb.io.in(StorePipeDataReadPort) <> stu.io.data_read
dataReadArb.io.in(AtomicsPipeDataReadPort) <> atomics.io.data_read
dataReadArb.io.in(LoadPipeDataReadPort) <> ldu(0).io.data_read
dataReadArb.io.in(AtomicsPipeDataReadPort) <> atomics.io.data_read
dataReadArb.io.in(LoadMissDataReadPort) <> loadMissQueue.io.data_req
dataArray.io.read(0) <> dataReadArb.io.out
dataArray.io.resp(0) <> wb.io.data_resp
dataArray.io.resp(0) <> stu.io.data_resp
dataArray.io.resp(0) <> atomics.io.data_resp
dataArray.io.resp(0) <> ldu(0).io.data_resp
dataArray.io.resp(0) <> loadMissQueue.io.data_resp
for (w <- 1 until LoadPipelineWidth) {
dataArray.io.read(w) <> ldu(w).io.data_read
......@@ -211,41 +224,9 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// load pipe and load miss queue
// load miss queue replays on ldu 0
val loadArb = Module(new Arbiter(new DCacheWordReq , 2))
val loadReplay = loadMissQueue.io.replay
val lsu_0 = io.lsu.load(0)
val ldu_0 = ldu(0).io.lsu
loadArb.io.in(0) <> loadReplay.req
loadArb.io.in(1) <> lsu_0.req
assert(!(lsu_0.req.fire() && lsu_0.req.bits.meta.replay), "LSU should not replay requests")
assert(!(loadReplay.req.fire() && !loadReplay.req.bits.meta.replay), "LoadMissQueue should replay requests")
val ldu_0_nack = nack_load(loadArb.io.out.bits.addr)
// do not nack replayed reqs
ldu_0.req <> loadArb.io.out
ldu(0).io.nack := ldu_0_nack && !loadArb.io.out.bits.meta.replay
XSDebug(ldu_0_nack, "LoadUnit 0 nacked\n")
ldu_0.resp.ready := false.B
val isReplay = ldu_0.resp.bits.meta.replay
loadReplay.resp.valid := ldu_0.resp.valid && isReplay
loadReplay.resp.bits := ldu_0.resp.bits
when (loadReplay.resp.valid) {
ldu_0.resp.ready := loadReplay.resp.ready
}
lsu_0.resp.valid := ldu_0.resp.valid && !isReplay
lsu_0.resp.bits := ldu_0.resp.bits
when (lsu_0.resp.valid) {
ldu_0.resp.ready := lsu_0.resp.ready
}
// the s1 kill signal
// only lsu uses this, replay never kills
ldu_0.s1_kill := lsu_0.s1_kill
for (w <- 1 until LoadPipelineWidth) {
for (w <- 0 until LoadPipelineWidth) {
val load_w_nack = nack_load(io.lsu.load(w).req.bits.addr)
ldu(w).io.lsu.req <> io.lsu.load(w).req
ldu(w).io.nack := load_w_nack
......@@ -263,7 +244,6 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// load miss queue
loadMissQueue.io.lsu <> io.lsu.lsroq
assert(!io.lsu.lsroq.s1_kill, "Lsroq should never use s1 kill on loadMissQueue")
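// Editor's note (not part of the diff): the replay path through load pipe 0 is gone.
// The load miss queue now talks to the lsroq directly over a DCacheLineIO and fetches
// refilled lines itself through the dedicated LoadMissDataReadPort (data read port 4)
// added to dataReadArb above, so missed loads never re-enter the load pipeline.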
//----------------------------------------
// store pipe and store miss queue
......@@ -358,17 +338,17 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
val isLoadMissResp = clientId === loadMissQueueClientId
loadMissResp.valid := missResp.valid && isLoadMissResp
loadMissResp.bits.entry_id := missResp.bits.entry_id
loadMissResp.bits := missResp.bits
loadMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
val isStoreMissResp = clientId === storeMissQueueClientId
storeMissResp.valid := missResp.valid && isStoreMissResp
storeMissResp.bits.entry_id := missResp.bits.entry_id
storeMissResp.bits := missResp.bits
storeMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
val isAtomicsMissResp = clientId === atomicsMissQueueClientId
atomicsMissResp.valid := missResp.valid && isAtomicsMissResp
atomicsMissResp.bits.entry_id := missResp.bits.entry_id
atomicsMissResp.bits := missResp.bits
atomicsMissResp.bits.client_id := missResp.bits.client_id(entryIdMSB, entryIdLSB)
// Finish
......
......@@ -8,7 +8,7 @@ import utils.XSDebug
class LoadPipe extends DCacheModule
{
val io = IO(new DCacheBundle{
val lsu = Flipped(new DCacheLoadIO)
val lsu = Flipped(new DCacheWordIO)
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val meta_read = DecoupledIO(new L1MetaReadReq)
......@@ -129,7 +129,7 @@ class LoadPipe extends DCacheModule
assert(!(s2_valid && s2_hit && !s2_nack && s2_decoded.uncorrectable))
val resp = Wire(ValidIO(new DCacheResp))
val resp = Wire(ValidIO(new DCacheWordResp))
resp.valid := s2_valid
resp.bits.data := s2_data_word_decoded
resp.bits.meta := s2_req.meta
......
......@@ -11,66 +11,60 @@ class LoadMissEntry extends DCacheModule
val io = IO(new Bundle {
val id = Input(UInt())
val req_pri_val = Input(Bool())
val req_pri_rdy = Output(Bool())
val req_sec_val = Input(Bool())
val req_sec_rdy = Output(Bool())
val req = Input(new DCacheWordReq )
val replay = DecoupledIO(new DCacheWordReq )
val lsu = Flipped(new DCacheLineIO)
val miss_req = DecoupledIO(new MissReq)
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
val data_req = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val idx = Output(Valid(UInt()))
val tag = Output(Valid(UInt()))
})
val s_invalid :: s_miss_req :: s_miss_resp :: s_drain_rpq :: s_replay_resp :: s_miss_finish :: Nil = Enum(6)
val s_invalid :: s_miss_req :: s_miss_resp :: s_miss_finish :: s_data_read_req :: s_data_read_resp :: s_resp :: Nil = Enum(7)
val state = RegInit(s_invalid)
val req = Reg(new DCacheWordReq )
val req = Reg(new DCacheLineReq)
val resp = Reg(new DCacheLineResp)
val req_idx = get_idx(req.addr)
val req_tag = get_tag(req.addr)
val req_block_addr = get_block_addr(req.addr)
val reg_miss_resp = Reg(new MissResp)
val rpq = Module(new Queue(new DCacheWordReq , cfg.nRPQ))
rpq.io.enq.valid := (io.req_pri_val && io.req_pri_rdy) || (io.req_sec_val && io.req_sec_rdy)
rpq.io.enq.bits := io.req
rpq.io.deq.ready := false.B
when (rpq.io.enq.fire()) {
assert(io.req.cmd === M_XRD)
}
io.req_pri_rdy := state === s_invalid
val sec_rdy = state === s_miss_req || state === s_miss_resp
io.req_sec_rdy := sec_rdy && rpq.io.enq.ready
// assign default values to output signals
io.replay.valid := false.B
io.replay.bits := DontCare
io.lsu.req.ready := state === s_invalid
io.lsu.resp.valid := false.B
io.lsu.resp.bits := DontCare
io.miss_req.valid := false.B
io.miss_req.bits := DontCare
io.miss_finish.valid := false.B
io.miss_finish.bits := DontCare
io.data_req.valid := false.B
io.data_req.bits := DontCare
io.idx.valid := state =/= s_invalid
io.tag.valid := state =/= s_invalid
io.idx.bits := req_idx
io.tag.bits := req_tag
XSDebug("entry: %d state: %d\n", io.id, state)
XSDebug("entry: %d state: %d\n", io.id, state)
// --------------------------------------------
// s_invalid: receive requests
when (state === s_invalid) {
assert(rpq.io.enq.ready)
when (io.req_pri_val && io.req_pri_rdy) {
req := io.req
when (io.lsu.req.fire()) {
assert(io.lsu.req.bits.cmd === M_XRD)
assert(!io.lsu.req.bits.meta.replay)
req := io.lsu.req.bits
resp.meta := io.lsu.req.bits.meta
resp.miss := false.B
resp.nack := false.B
state := s_miss_req
}
}
......@@ -90,38 +84,60 @@ class LoadMissEntry extends DCacheModule
when (state === s_miss_resp) {
when (io.miss_resp.fire()) {
reg_miss_resp := io.miss_resp.bits
state := s_drain_rpq
resp.data := io.miss_resp.bits.data
when (io.miss_resp.bits.has_data) {
state := s_resp
} .otherwise {
// miss queue says that data is already in dcache
// so we need to read it
state := s_data_read_req
}
}
}
// --------------------------------------------
// replay
val loadPipelineLatency = 2
val replay_resp_ctr = Reg(UInt(log2Up(loadPipelineLatency).W))
when (state === s_drain_rpq) {
rpq.io.deq.ready := true.B
io.replay <> rpq.io.deq
io.replay.bits.meta.replay := true.B
when (rpq.io.count === 0.U) {
replay_resp_ctr := 0.U
state := s_replay_resp
val dataArrayLatency = 2
val data_array_ctr = Reg(UInt(log2Up(dataArrayLatency).W))
when (state === s_data_read_req) {
// Data read for new requests
io.data_req.valid := true.B
io.data_req.bits.addr := req_block_addr
io.data_req.bits.way_en := reg_miss_resp.way_en
io.data_req.bits.rmask := ~0.U(blockRows.W)
when (io.data_req.fire()) {
state := s_data_read_resp
data_array_ctr := 0.U
}
}
when (state === s_data_read_resp) {
data_array_ctr := data_array_ctr + 1.U
when (data_array_ctr === (dataArrayLatency - 1).U) {
val way_idx = OHToUInt(reg_miss_resp.way_en)
resp.data := Cat((0 until blockRows).reverse map { i =>
val row = io.data_resp(way_idx)(i)
// decode each word in this row
val row_decoded = Cat((0 until rowWords).reverse map { w =>
val data_word = row(encWordBits * (w + 1) - 1, encWordBits * w)
val decoded = cacheParams.dataCode.decode(data_word)
val data_word_decoded = decoded.corrected
assert(!decoded.uncorrectable)
data_word_decoded
})
row_decoded
})
state := s_resp
}
}
//
// we must wait for response here,
// if we do not wait for response here,
// this entry may be freed before its response comes back
// load pipeline latency is 2 cycles
// we send req in s0 and get response in s2
// s_drain_rpq is s0
// when we reach s_replay_resp, load req goes to s1
// we should wait here for another cycle until load req goes to s2
when (state === s_replay_resp) {
replay_resp_ctr := replay_resp_ctr + 1.U
when (replay_resp_ctr === (loadPipelineLatency - 1).U) {
// --------------------------------------------
when (state === s_resp) {
io.lsu.resp.valid := true.B
io.lsu.resp.bits := resp
when (io.lsu.resp.fire()) {
state := s_miss_finish
}
}
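// Editor's summary (not part of the diff) of the new LoadMissEntry flow:
//   s_invalid        accept a block read from the lsroq (io.lsu.req)
//   s_miss_req       forward it to the miss queue
//   s_miss_resp      if miss_resp.has_data, take the refill line from the response;
//                    otherwise the line is already in the data array
//   s_data_read_req / s_data_read_resp   read the line from the data array and ECC-decode it
//   s_resp           return the whole line to the lsroq (io.lsu.resp)
//   s_miss_finish    report completion to the miss queue and release the entry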
......@@ -140,17 +156,21 @@ class LoadMissEntry extends DCacheModule
class LoadMissQueue extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheLoadIO)
val replay = new DCacheLoadIO
val lsu = Flipped(new DCacheLineIO)
val miss_req = DecoupledIO(new MissReq)
val miss_resp = Flipped(ValidIO(new MissResp))
val miss_finish = DecoupledIO(new MissFinish)
val data_req = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
})
val miss_req_arb = Module(new Arbiter(new MissReq, cfg.nLoadMissEntries))
val miss_finish_arb = Module(new Arbiter(new MissFinish, cfg.nLoadMissEntries))
val replay_arb = Module(new Arbiter(new DCacheWordReq , cfg.nLoadMissEntries))
val miss_req_arb = Module(new Arbiter(new MissReq, cfg.nLoadMissEntries))
val miss_finish_arb = Module(new Arbiter(new MissFinish, cfg.nLoadMissEntries))
val data_req_arb = Module(new Arbiter(new L1DataReadReq, cfg.nLoadMissEntries))
val resp_arb = Module(new Arbiter(new DCacheLineResp, cfg.nLoadMissEntries))
val idx_matches = Wire(Vec(cfg.nLoadMissEntries, Bool()))
val tag_matches = Wire(Vec(cfg.nLoadMissEntries, Bool()))
......@@ -158,56 +178,50 @@ class LoadMissQueue extends DCacheModule
val tag_match = Mux1H(idx_matches, tag_matches)
val idx_match = idx_matches.reduce(_||_)
val req = io.lsu.req
val entry_alloc_idx = Wire(UInt())
val pri_rdy = WireInit(false.B)
val pri_val = req.valid && !idx_match
var sec_rdy = false.B
val entry_id_MSB = reqIdWidth - 1
val entry_id_LSB = reqIdWidth - loadMissQueueEntryIdWidth
val entries = (0 until cfg.nLoadMissEntries) map { i =>
val entry = Module(new LoadMissEntry)
entry.io.id := i.U(log2Up(cfg.nLoadMissEntries).W)
entry.io.id := i.U(loadMissQueueEntryIdWidth.W)
idx_matches(i) := entry.io.idx.valid && entry.io.idx.bits === get_idx(req.bits.addr)
tag_matches(i) := entry.io.tag.valid && entry.io.tag.bits === get_tag(req.bits.addr)
when (XSDebug.trigger) {
when (idx_matches(i)) {
XSDebug(s"entry: $i idx_match\n")
}
when (tag_matches(i)) {
XSDebug(s"entry: $i tag_match\n")
}
}
// entry req
entry.io.req_pri_val := (i.U === entry_alloc_idx) && pri_val
// lsu req and resp
val entry_lsu = entry.io.lsu
entry_lsu.req.valid := (i.U === entry_alloc_idx) && pri_val
when (i.U === entry_alloc_idx) {
pri_rdy := entry.io.req_pri_rdy
pri_rdy := entry_lsu.req.ready
}
entry.io.req_sec_val := req.valid && tag_match && idx_matches(i)
sec_rdy = sec_rdy || (entry.io.req_sec_rdy && entry.io.req_sec_val)
entry.io.req := req.bits
entry_lsu.req.bits := req.bits
replay_arb.io.in(i) <> entry.io.replay
miss_req_arb.io.in(i) <> entry.io.miss_req
resp_arb.io.in(i) <> entry_lsu.resp
miss_req_arb.io.in(i) <> entry.io.miss_req
data_req_arb.io.in(i) <> entry.io.data_req
entry.io.miss_resp.valid := (i.U === io.miss_resp.bits.client_id) && io.miss_resp.valid
entry.io.miss_resp.bits := io.miss_resp.bits
entry.io.data_resp := io.data_resp
miss_finish_arb.io.in(i) <> entry.io.miss_finish
entry
}
entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.req_pri_rdy))
entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.lsu.req.ready))
req.ready := Mux(idx_match, tag_match && sec_rdy, pri_rdy)
io.replay.req <> replay_arb.io.out
io.lsu.resp <> io.replay.resp
// replay never kills its previous request
io.replay.s1_kill := false.B
// whenever index matches, do not let it in
req.ready := pri_rdy && !idx_match
io.lsu.resp <> resp_arb.io.out
io.miss_req <> miss_req_arb.io.out
io.data_req <> data_req_arb.io.out
io.miss_finish <> miss_finish_arb.io.out
// debug output
......@@ -216,12 +230,6 @@ class LoadMissQueue extends DCacheModule
req.bits.cmd, req.bits.addr, req.bits.data, req.bits.mask, req.bits.meta.id, req.bits.meta.replay)
}
val replay = io.replay.req
when (replay.fire()) {
XSDebug(s"replay cmd: %x addr: %x data: %x mask: %x id: %d replay: %b\n",
replay.bits.cmd, replay.bits.addr, replay.bits.data, replay.bits.mask, replay.bits.meta.id, replay.bits.meta.replay)
}
val resp = io.lsu.resp
when (resp.fire()) {
XSDebug(s"resp: data: %x id: %d replay: %b miss: %b nack: %b\n",
......
......@@ -14,8 +14,11 @@ class MissReq extends DCacheBundle
class MissResp extends DCacheBundle
{
val client_id = UInt(missQueueClientIdWidth.W)
val entry_id = UInt(missQueueEntryIdWidth.W)
val way_en = Bits(nWays.W)
val has_data = Bool()
val data = UInt(blockBits.W)
}
class MissFinish extends DCacheBundle
......@@ -91,6 +94,15 @@ class MissEntry(edge: TLEdgeOut) extends DCacheModule
val grantack = Reg(Valid(new TLBundleE(edge.bundle)))
val refill_ctr = Reg(UInt(log2Up(refillCycles).W))
val should_refill_data = Reg(Bool())
val needs_writeback = Reg(Bool())
// for read, to shorten latency
// we send back response as soon as possible
//
// for store and amo
// we send back response when we have finished everything
// inform clients to replay requests
val early_response = Reg(Bool())
io.block_idx.valid := state =/= s_invalid
io.block_addr.valid := state =/= s_invalid
......@@ -144,6 +156,8 @@ class MissEntry(edge: TLEdgeOut) extends DCacheModule
grantack.valid := false.B
refill_ctr := 0.U
should_refill_data := false.B
needs_writeback := false.B
early_response := false.B
req := io.req.bits
state := s_meta_read_req
}
......@@ -197,23 +211,33 @@ class MissEntry(edge: TLEdgeOut) extends DCacheModule
val new_state = WireInit(s_invalid)
val old_coh = req_old_meta.coh
val needs_wb = old_coh.onCacheControl(M_FLUSH)._1 // does the line we are evicting need to be written back
early_response := req.cmd === M_XRD
when (req_tag_match) {
val (is_hit, _, coh_on_hit) = old_coh.onAccess(req.cmd)
when (is_hit) { // set dirty bit
// we do not need to assert write any more
// read may go here as well
// eg: when several load miss on the same block
// assert(isWrite(req.cmd))
new_coh := coh_on_hit
new_state := s_meta_write_req
when (req.cmd === M_XRD) {
// normal read
// read hit, no need to update meta
new_state := s_send_resp
} .otherwise {
assert(isWrite(req.cmd))
new_coh := coh_on_hit
new_state := s_meta_write_req
}
} .otherwise { // upgrade permissions
new_coh := old_coh
new_state := s_refill_req
}
} .otherwise { // refill and writeback if necessary
new_coh := ClientMetadata.onReset
should_refill_data := true.B
when (needs_wb) {
new_state := s_wb_req
needs_writeback := true.B
} .otherwise {
new_state := s_refill_req
}
......@@ -270,7 +294,7 @@ class MissEntry(edge: TLEdgeOut) extends DCacheModule
when (edge.hasData(io.mem_grant.bits)) {
when (io.mem_grant.fire()) {
should_refill_data := true.B
assert(should_refill_data)
refill_ctr := refill_ctr + 1.U
for (i <- 0 until beatRows) {
val row = io.mem_grant.bits.data(rowBits * (i + 1) - 1, rowBits * i)
......@@ -307,7 +331,11 @@ class MissEntry(edge: TLEdgeOut) extends DCacheModule
when (!should_refill_data) {
state := s_meta_write_req
} .otherwise {
state := s_data_write_req
when (early_response) {
state := s_send_resp
} .otherwise {
state := s_data_write_req
}
}
}
}
......@@ -340,19 +368,26 @@ class MissEntry(edge: TLEdgeOut) extends DCacheModule
}
// --------------------------------------------
// inform clients to replay requests
when (state === s_send_resp) {
io.resp.valid := true.B
io.resp.bits.client_id := req.client_id
io.resp.bits.entry_id := io.id
io.resp.bits.way_en := req_way_en
io.resp.bits.has_data := should_refill_data
io.resp.bits.data := refill_data.asUInt
when (io.resp.fire()) {
// additional assertion
val (is_hit, _, coh_on_hit) = new_coh.onAccess(req.cmd)
assert(is_hit, "We still don't have permissions for this store")
assert(is_hit, "We still don't have permissions for this block")
assert(new_coh === coh_on_hit, "Incorrect coherence meta data")
state := s_client_finish
// for read, we will write data later
when (early_response && should_refill_data) {
state := s_data_write_req
} .otherwise {
state := s_client_finish
}
}
}
......
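// Editor's summary (not part of the diff) of the early_response ordering in MissEntry:
//   loads (cmd === M_XRD, early_response set): once the refill beats have arrived, respond
//     to the client first (s_send_resp) and only then write the line into the data array
//     (s_data_write_req) — this is what shortens the load miss penalty;
//   stores / AMOs: install the line first and respond only when everything has finished,
//     so the client can safely replay its request.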
......@@ -11,8 +11,8 @@ class StoreMissEntry extends DCacheModule
val io = IO(new Bundle {
val id = Input(UInt())
val lsu = Flipped(new DCacheStoreIO)
val replay = new DCacheStoreIO
val lsu = Flipped(new DCacheLineIO)
val replay = new DCacheLineIO
val miss_req = DecoupledIO(new MissReq)
val miss_resp = Flipped(ValidIO(new MissResp))
......@@ -26,7 +26,7 @@ class StoreMissEntry extends DCacheModule
val state = RegInit(s_invalid)
val req = Reg(new DCacheLineReq )
val resp = Reg(new DCacheResp)
val resp = Reg(new DCacheLineResp)
val req_idx = get_idx(req.addr)
val req_tag = get_tag(req.addr)
......@@ -142,8 +142,8 @@ class StoreMissEntry extends DCacheModule
class StoreMissQueue extends DCacheModule
{
val io = IO(new Bundle {
val lsu = Flipped(new DCacheStoreIO)
val replay = new DCacheStoreIO
val lsu = Flipped(new DCacheLineIO)
val replay = new DCacheLineIO
val miss_req = DecoupledIO(new MissReq)
val miss_resp = Flipped(ValidIO(new MissResp))
......@@ -152,8 +152,8 @@ class StoreMissQueue extends DCacheModule
val miss_req_arb = Module(new Arbiter(new MissReq, cfg.nStoreMissEntries))
val miss_finish_arb = Module(new Arbiter(new MissFinish, cfg.nStoreMissEntries))
val replay_arb = Module(new Arbiter(new DCacheLineReq, cfg.nStoreMissEntries))
val resp_arb = Module(new Arbiter(new DCacheResp, cfg.nStoreMissEntries))
val replay_arb = Module(new Arbiter(new DCacheLineReq, cfg.nStoreMissEntries))
val resp_arb = Module(new Arbiter(new DCacheLineResp, cfg.nStoreMissEntries))
val idx_matches = Wire(Vec(cfg.nLoadMissEntries, Bool()))
val tag_matches = Wire(Vec(cfg.nLoadMissEntries, Bool()))
......
......@@ -8,7 +8,7 @@ import utils.{XSDebug}
class StorePipe extends DCacheModule
{
val io = IO(new DCacheBundle{
val lsu = Flipped(new DCacheStoreIO)
val lsu = Flipped(new DCacheLineIO)
val data_read = DecoupledIO(new L1DataReadReq)
val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
val data_write = DecoupledIO(new L1DataWriteReq)
......@@ -141,7 +141,7 @@ class StorePipe extends DCacheModule
dump_pipeline_valids("StorePipe s2", "s2_nack_hit", s2_valid && s2_nack_hit)
dump_pipeline_valids("StorePipe s2", "s2_nack_set_busy", s2_valid && s2_nack_set_busy)
val resp = Wire(Valid(new DCacheResp))
val resp = Wire(Valid(new DCacheLineResp))
resp.valid := s2_valid
resp.bits.data := DontCare
resp.bits.meta := s2_req.meta
......
......@@ -17,7 +17,7 @@ class MMIOEntry(edge: TLEdgeOut) extends DCacheModule
// client requests
val req = Flipped(DecoupledIO(new DCacheWordReq ))
val resp = DecoupledIO(new DCacheResp)
val resp = DecoupledIO(new DCacheWordResp)
val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
......@@ -119,7 +119,7 @@ class MMIOEntry(edge: TLEdgeOut) extends DCacheModule
}
class UncacheIO extends DCacheBundle {
val lsroq = Flipped(new DCacheLoadIO)
val lsroq = Flipped(new DCacheWordIO)
}
// convert DCacheIO to TileLink
......@@ -150,7 +150,7 @@ class UncacheImp(outer: Uncache)
val (bus, edge) = outer.clientNode.out.head
require(bus.d.bits.data.getWidth == wordBits, "Uncache: tilelink width does not match")
val resp_arb = Module(new Arbiter(new DCacheResp, cfg.nMMIOEntries))
val resp_arb = Module(new Arbiter(new DCacheWordResp, cfg.nMMIOEntries))
val req = io.lsroq.req
val resp = io.lsroq.resp
......
......@@ -4,14 +4,14 @@ import chisel3._
import chisel3.util._
import utils._
import xiangshan._
import xiangshan.cache.{DCacheLoadIO, TlbRequestIO, TlbCmd, MemoryOpConstants}
import xiangshan.cache.{DCacheWordIO, TlbRequestIO, TlbCmd, MemoryOpConstants}
import xiangshan.backend.LSUOpType
class AtomicsUnit extends XSModule with MemoryOpConstants{
val io = IO(new Bundle() {
val in = Flipped(Decoupled(new ExuInput))
val out = Decoupled(new ExuOutput)
val dcache = new DCacheLoadIO
val dcache = new DCacheWordIO
val dtlb = new TlbRequestIO
val flush_sbuffer = new SbufferFlushBundle
val tlbFeedback = ValidIO(new TlbFeedback)
......
......@@ -4,7 +4,7 @@ import chisel3._
import chisel3.util._
import utils._
import xiangshan._
import xiangshan.cache.{DCacheLoadIO, TlbRequestIO, TlbCmd, MemoryOpConstants}
import xiangshan.cache.{DCacheWordIO, TlbRequestIO, TlbCmd, MemoryOpConstants}
import xiangshan.backend.LSUOpType
class LoadToLsroqIO extends XSBundle {
......@@ -19,7 +19,7 @@ class LoadUnit extends XSModule {
val ldout = Decoupled(new ExuOutput)
val redirect = Flipped(ValidIO(new Redirect))
val tlbFeedback = ValidIO(new TlbFeedback)
val dcache = new DCacheLoadIO
val dcache = new DCacheWordIO
val dtlb = new TlbRequestIO()
val sbuffer = new LoadForwardQueryIO
val lsroq = new LoadToLsroqIO
......
......@@ -5,7 +5,7 @@ import chisel3.util._
import utils._
import xiangshan._
import xiangshan.cache._
import xiangshan.cache.{DCacheLoadIO, TlbRequestIO, MemoryOpConstants}
import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants}
import xiangshan.backend.LSUOpType
class LsRoqEntry extends XSBundle {
......@@ -20,8 +20,14 @@ class LsRoqEntry extends XSBundle {
val fwdData = Vec(8, UInt(8.W))
}
// inflight miss block reqs
class InflightBlockInfo extends XSBundle {
val block_addr = UInt(PAddrBits.W)
val valid = Bool()
}
// Load/Store Roq (Lsroq) for XiangShan Out of Order LSU
class Lsroq extends XSModule {
class Lsroq extends XSModule with HasDCacheParameters {
val io = IO(new Bundle() {
val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp)))
val lsroqIdxs = Output(Vec(RenameWidth, UInt(LsroqIdxWidth.W)))
......@@ -34,8 +40,8 @@ class Lsroq extends XSModule {
val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit)))
val rollback = Output(Valid(new Redirect))
val dcache = new DCacheLoadIO
val uncache = new DCacheLoadIO
val dcache = new DCacheLineIO
val uncache = new DCacheWordIO
// val refill = Flipped(Valid(new DCacheLineReq ))
})
......@@ -148,7 +154,9 @@ class Lsroq extends XSModule {
data(io.loadIn(i).bits.uop.lsroqIdx).fwdMask := io.loadIn(i).bits.forwardMask
data(io.loadIn(i).bits.uop.lsroqIdx).fwdData := io.loadIn(i).bits.forwardData
data(io.loadIn(i).bits.uop.lsroqIdx).exception := io.loadIn(i).bits.uop.cf.exceptionVec.asUInt
miss(io.loadIn(i).bits.uop.lsroqIdx) := io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
miss(io.loadIn(i).bits.uop.lsroqIdx) := dcacheMissed
listening(io.loadIn(i).bits.uop.lsroqIdx) := dcacheMissed
store(io.loadIn(i).bits.uop.lsroqIdx) := false.B
pending(io.loadIn(i).bits.uop.lsroqIdx) := io.loadIn(i).bits.mmio
}
......@@ -182,35 +190,58 @@ class Lsroq extends XSModule {
})
// cache miss request
val inflightReqs = RegInit(VecInit(Seq.fill(cfg.nLoadMissEntries)(0.U.asTypeOf(new InflightBlockInfo))))
val inflightReqFull = inflightReqs.map(req => req.valid).reduce(_&&_)
val reqBlockIndex = PriorityEncoder(~VecInit(inflightReqs.map(req => req.valid)).asUInt)
val missRefillSelVec = VecInit(
(0 until LsroqSize).map(i => allocated(i) && miss(i))
)
(0 until LsroqSize).map{ i =>
val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(data(i).paddr)).reduce(_||_)
allocated(i) && miss(i) && !inflight
})
val missRefillSel = getFirstOne(missRefillSelVec, tailMask)
val missRefillBlockAddr = get_block_addr(data(missRefillSel).paddr)
io.dcache.req.valid := missRefillSelVec.asUInt.orR
io.dcache.req.bits.cmd := MemoryOpConstants.M_XRD
io.dcache.req.bits.addr := data(missRefillSel).paddr
io.dcache.req.bits.addr := missRefillBlockAddr
io.dcache.req.bits.data := DontCare
io.dcache.req.bits.mask := data(missRefillSel).mask
io.dcache.req.bits.mask := DontCare
io.dcache.req.bits.meta.id := DontCare // TODO: // FIXME
io.dcache.req.bits.meta.vaddr := DontCare // data(missRefillSel).vaddr
io.dcache.req.bits.meta.paddr := data(missRefillSel).paddr
io.dcache.req.bits.meta.paddr := missRefillBlockAddr
io.dcache.req.bits.meta.uop := uop(missRefillSel)
io.dcache.req.bits.meta.mmio := false.B // data(missRefillSel).mmio
io.dcache.req.bits.meta.tlb_miss := false.B
io.dcache.req.bits.meta.mask := data(missRefillSel).mask
io.dcache.req.bits.meta.mask := DontCare
io.dcache.req.bits.meta.replay := false.B
io.dcache.resp.ready := true.B
io.dcache.s1_kill := false.B
assert(!(data(missRefillSel).mmio && io.dcache.req.valid))
when(io.dcache.req.fire()) {
miss(missRefillSel) := false.B
listening(missRefillSel) := true.B
// mark this block as inflight
inflightReqs(reqBlockIndex).valid := true.B
inflightReqs(reqBlockIndex).block_addr := missRefillBlockAddr
assert(!inflightReqs(reqBlockIndex).valid)
}
when(io.dcache.resp.fire()) {
val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)).reduce(_||_)
assert(inflight)
for (i <- 0 until cfg.nLoadMissEntries) {
when (inflightReqs(i).valid && inflightReqs(i).block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)) {
inflightReqs(i).valid := false.B
}
}
}
when(io.dcache.req.fire()){
XSDebug("miss req: pc:0x%x roqIdx:%d lsroqIdx:%d (p)addr:0x%x vaddr:0x%x\n", io.dcache.req.bits.meta.uop.cf.pc, io.dcache.req.bits.meta.uop.roqIdx, io.dcache.req.bits.meta.uop.lsroqIdx, io.dcache.req.bits.addr, io.dcache.req.bits.meta.vaddr)
}
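// Editor's note (not part of the diff) on the inflightReqs bookkeeping above:
//   each entry records the block address of a dcache read the lsroq has issued but not yet
//   seen answered (allocated on io.dcache.req.fire(), freed on the matching io.dcache.resp.fire());
//   missRefillSelVec skips loads whose block is already inflight, so at most one block read
//   is outstanding per cache line and all loads to that line share the single refill.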
......@@ -230,9 +261,14 @@ class Lsroq extends XSModule {
}
(0 until LsroqSize).map(i => {
val addrMatch = data(i).paddr(PAddrBits - 1, 3) === io.dcache.resp.bits.meta.paddr(PAddrBits - 1, 3)
when(allocated(i) && listening(i) && addrMatch && io.dcache.resp.fire()) {
val refillData = io.dcache.resp.bits.data
val blockMatch = get_block_addr(data(i).paddr) === io.dcache.resp.bits.meta.paddr
when(allocated(i) && listening(i) && blockMatch && io.dcache.resp.fire()) {
// split them into words
val words = VecInit((0 until blockWords) map { i =>
io.dcache.resp.bits.data(DataBits * (i + 1) - 1, DataBits * i)
})
val refillData = words(get_word(data(i).paddr))
data(i).data := mergeRefillData(refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt)
valid(i) := true.B
listening(i) := false.B
......
......@@ -78,11 +78,11 @@ class MemToBackendIO extends XSBundle {
class Memend extends XSModule {
val io = IO(new Bundle{
val backend = new MemToBackendIO
val loadUnitToDcacheVec = Vec(exuParameters.LduCnt, new DCacheLoadIO)
val loadMiss = new DCacheLoadIO
val atomics = new DCacheLoadIO
val sbufferToDcache = new DCacheStoreIO
val uncache = new DCacheLoadIO
val loadUnitToDcacheVec = Vec(exuParameters.LduCnt, new DCacheWordIO)
val loadMiss = new DCacheLineIO
val atomics = new DCacheWordIO
val sbufferToDcache = new DCacheLineIO
val uncache = new DCacheWordIO
val ptw = new TlbPtwIO
})
......
......@@ -52,7 +52,7 @@ class SbufferFlushBundle extends Bundle {
class Sbuffer extends XSModule with HasSBufferConst {
val io = IO(new Bundle() {
val in = Vec(StorePipelineWidth, Flipped(Decoupled(new DCacheWordReq )))
val dcache = new DCacheStoreIO
val dcache = new DCacheLineIO
val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
val flush = new Bundle {
val valid = Input(Bool())
......@@ -490,7 +490,7 @@ class Sbuffer extends XSModule with HasSBufferConst {
class FakeSbuffer extends XSModule {
val io = IO(new Bundle() {
val in = Vec(StorePipelineWidth, Flipped(Decoupled(new DCacheWordReq)))
val dcache = new DCacheStoreIO
val dcache = new DCacheLineIO
val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
})
......