提交 1a55dd89 编写于 作者: Z Zihao Yu

noop,Cache: merge subarray into Sets to save resource

* Now we need to read 8 times to fetch a whole cache line to write back to
  memory. This seems to cause a lot of IPC degradation in emu: 0.543329 -> 0.532621.
  But it does not hurt too much on FPGA.
* Temporarily disable coherency.
上级 a011aca3
......@@ -62,10 +62,10 @@ class BPU1 extends NOOPModule {
btb.reset := reset.asBool || flushBTB
btb.io.r.req.valid := io.in.pc.valid
btb.io.r.req.bits.idx := btbAddr.getIdx(io.in.pc.bits)
btb.io.r.req.bits.setIdx := btbAddr.getIdx(io.in.pc.bits)
val btbRead = Wire(btbEntry())
btbRead := btb.io.r.resp.data(0)(0)
btbRead := btb.io.r.resp.data(0)
// since there is one cycle latency to read SyncReadMem,
// we should latch the input pc for one cycle
val pcLatch = RegEnable(io.in.pc.bits, io.in.pc.valid)
......@@ -97,7 +97,7 @@ class BPU1 extends NOOPModule {
// than read request. Again, since the pipeline will be flushed
// in the next cycle, the read request will be useless.
btb.io.w.req.valid := req.isMissPredict && req.valid
btb.io.w.req.bits.idx := btbAddr.getIdx(req.pc)
btb.io.w.req.bits.setIdx := btbAddr.getIdx(req.pc)
btb.io.w.req.bits.data := btbWrite
val cnt = RegNext(pht.read(btbAddr.getIdx(req.pc)))
......
......@@ -32,9 +32,9 @@ sealed trait HasCacheConst {
}
def CacheMetaArrayReadBus() = new SRAMReadBus(new MetaBundle, set = Sets, way = Ways)
def CacheDataArrayReadBus() = new SRAMReadBus(new DataBundle, set = Sets, way = Ways, subarray = LineBeats)
def CacheDataArrayReadBus() = new SRAMReadBus(new DataBundle, set = Sets * LineBeats, way = Ways)
def CacheMetaArrayWriteBus() = new SRAMWriteBus(new MetaBundle, set = Sets, way = Ways)
def CacheDataArrayWriteBus() = new SRAMWriteBus(new DataBundle, set = Sets, way = Ways, subarray = LineBeats)
def CacheDataArrayWriteBus() = new SRAMWriteBus(new DataBundle, set = Sets * LineBeats, way = Ways)
def isSameWord(a1: UInt, a2: UInt) = ((a1 >> 2) === (a2 >> 2))
def isSetConflict(a1: UInt, a2: UInt) = (a1.asTypeOf(addrBundle).index === a2.asTypeOf(addrBundle).index)
......@@ -79,18 +79,20 @@ sealed class CacheStage1(ro: Boolean, name: String, userBits: Int = 0) extends C
if (ro) when (io.in.fire()) { assert(!io.in.bits.isWrite()) }
// read meta array and data array
val addr = io.in.bits.addr.asTypeOf(addrBundle)
List(io.metaReadBus, io.dataReadBus).map { case x => {
x.req.valid := io.in.valid && io.out.ready
x.req.bits.idx := io.in.bits.addr.asTypeOf(addrBundle).index
x.req.valid := io.in.valid && io.out.ready && !io.s2s3Miss
}}
io.metaReadBus.req.bits.setIdx := addr.index
io.dataReadBus.req.bits.setIdx := Cat(addr.index, addr.wordIndex)
io.out.bits.req := io.in.bits
val (addr, s2addr, s3addr) = (io.in.bits.addr, io.s2Req.bits.addr, io.s3Req.bits.addr)
val (s1addr, s2addr, s3addr) = (io.in.bits.addr, io.s2Req.bits.addr, io.s3Req.bits.addr)
// a set conflict will evict the dirty line, so we should wait for
// the victim line to be up-to-date, or else we may write back stale data
val s2WriteSetConflict = io.s2Req.valid && isSetConflict(s2addr, addr) && io.s2Req.bits.isWrite()
val s3WriteSetConflict = io.s3Req.valid && isSetConflict(s3addr, addr) && io.s3Req.bits.isWrite()
val s2WriteSetConflict = io.s2Req.valid && isSetConflict(s2addr, s1addr) && io.s2Req.bits.isWrite()
val s3WriteSetConflict = io.s3Req.valid && isSetConflict(s3addr, s1addr) && io.s3Req.bits.isWrite()
val stall = s2WriteSetConflict || s3WriteSetConflict
io.out.valid := io.in.valid && !stall && !io.s2s3Miss && io.metaReadBus.req.ready && io.dataReadBus.req.ready
io.in.ready := (!io.in.valid || io.out.fire()) && io.metaReadBus.req.ready && io.dataReadBus.req.ready
......@@ -99,7 +101,7 @@ sealed class CacheStage1(ro: Boolean, name: String, userBits: Int = 0) extends C
sealed class Stage2IO(val userBits: Int = 0) extends CacheBundle {
val req = new SimpleBusReqBundle(userBits = userBits)
val metas = Vec(Ways, new MetaBundle)
val datas = Vec(Ways, Vec(LineBeats, new DataBundle))
val datas = Vec(Ways, new DataBundle)
val hit = Output(Bool())
val waymask = Output(UInt(Ways.W))
}
......@@ -110,7 +112,7 @@ sealed class CacheStage2(ro: Boolean, name: String, userBits: Int = 0) extends C
val in = Flipped(Decoupled(new Stage1IO(userBits)))
val out = Decoupled(new Stage2IO(userBits))
val metaReadResp = Flipped(Vec(Ways, new MetaBundle))
val dataReadResp = Flipped(Vec(Ways, Vec(LineBeats, new DataBundle)))
val dataReadResp = Flipped(Vec(Ways, new DataBundle))
})
val req = io.in.bits.req
......@@ -140,6 +142,7 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
val addr = Output(UInt(AddrBits.W))
val flush = Input(Bool())
val dataWriteBus = CacheDataArrayWriteBus()
val dataReadBus = CacheDataArrayReadBus()
val metaWriteBus = CacheMetaArrayWriteBus()
val mem = new SimpleBusUC
})
......@@ -151,9 +154,7 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
val miss = io.in.valid && !io.in.bits.hit
val meta = Mux1H(io.in.bits.waymask, io.in.bits.metas)
val dataBlockIdx = Wire(UInt(WordIndexBits.W))
val dataBlock = Mux1H(io.in.bits.waymask, io.in.bits.datas)
val dataRead = dataBlock(dataBlockIdx).data
val dataRead = Mux1H(io.in.bits.waymask, io.in.bits.datas).data
val wordMask = Mux(req.isWrite(), MaskExpand(req.wmask), 0.U(DataBits.W))
val dataHitWriteBus = Wire(CacheDataArrayWriteBus())
......@@ -161,13 +162,12 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
val hitWrite = hit && req.isWrite()
val dataMerge = MaskData(dataRead, req.wdata, wordMask)
dataHitWriteBus.req.valid := hitWrite
dataHitWriteBus.req.bits.idx := addr.index
dataHitWriteBus.req.bits.setIdx := Cat(addr.index, addr.wordIndex)
dataHitWriteBus.req.bits.data.data := dataMerge
dataHitWriteBus.req.bits.subarrayMask.map(_ := (1.U << addr.wordIndex))
dataHitWriteBus.req.bits.waymask.map(_ := io.in.bits.waymask)
metaHitWriteBus.req.valid := hitWrite && !meta.dirty
metaHitWriteBus.req.bits.idx := addr.index
metaHitWriteBus.req.bits.setIdx := addr.index
metaHitWriteBus.req.bits.data.valid := true.B
metaHitWriteBus.req.bits.data.tag := meta.tag
metaHitWriteBus.req.bits.waymask.map(_ := io.in.bits.waymask)
......@@ -188,9 +188,26 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
val readBeatCnt = Counter(LineBeats)
val writeBeatCnt = Counter(LineBeats)
dataBlockIdx := Mux(state === s_memWriteReq, writeBeatCnt.value, addr.wordIndex)
io.mem.req.bits.wdata := dataRead
val s2_idle :: s2_dataReadReq :: s2_dataReadWait :: s2_memWriteReq :: Nil = Enum(4)
val state2 = RegInit(s2_idle)
val dataWriteBackReadBus = Wire(CacheDataArrayReadBus())
// no dataWrite now, and it is always ready if no probe requests
dataWriteBackReadBus.req.valid := (state === s_memWriteReq) && (state2 === s2_idle)
dataWriteBackReadBus.req.bits.setIdx := Cat(addr.index, writeBeatCnt.value)
io.dataReadBus <> dataWriteBackReadBus
val dataWay = RegEnable(dataWriteBackReadBus.resp.data, state2 === s2_dataReadWait)
switch (state2) {
is (s2_idle) { when (state === s_memWriteReq) { state2 := s2_dataReadReq } }
is (s2_dataReadReq) { state2 := s2_dataReadWait }
is (s2_dataReadWait) { state2 := s2_memWriteReq }
is (s2_memWriteReq) { when (io.mem.req.fire()) { state2 := s2_idle } }
}
io.mem.req.bits.wdata := Mux1H(io.in.bits.waymask, dataWay).data
io.mem.req.bits.wmask := Fill(DataBytes, 1.U)
io.mem.req.bits.cmd := Mux(state === s_memReadReq, SimpleBusCmd.readBurst,
Mux((writeBeatCnt.value === (LineBeats - 1).U), SimpleBusCmd.writeLast, SimpleBusCmd.writeBurst))
......@@ -203,12 +220,12 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
io.mem.req.bits.addr := Mux(state === s_memReadReq, raddr, waddr)
io.mem.resp.ready := true.B
io.mem.req.valid := (state === s_memReadReq) || (state === s_memWriteReq)
io.mem.req.valid := (state === s_memReadReq) || ((state === s_memWriteReq) && (state2 === s2_memWriteReq))
val metaRefillWriteBus = Wire(CacheMetaArrayWriteBus())
val dataRefillWriteBus = Wire(CacheDataArrayWriteBus())
dataRefillWriteBus.req.bits.data.data := 0.U
dataRefillWriteBus.req.bits.subarrayMask.map(_ := 0.U)
dataRefillWriteBus.req.bits.data.data := 0.U // assigned inside the state machine
dataRefillWriteBus.req.bits.setIdx := 0.U
val afterFirstRead = RegInit(false.B)
val alreadyOutFire = RegEnable(true.B, init = false.B, io.out.fire())
val readingFirst = !afterFirstRead && io.mem.resp.fire() && (state === s_memReadResp)
......@@ -240,8 +257,7 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
} else rdata
dataRefillWriteBus.req.bits.data.data := inRdata
dataRefillWriteBus.req.bits.subarrayMask.map(_ := 1.U << readBeatCnt.value)
dataRefillWriteBus.req.bits.setIdx := Cat(addr.index, readBeatCnt.value)
readBeatCnt.inc()
when (io.mem.resp.bits.isReadLast()) { state := s_wait_resp }
}
......@@ -249,7 +265,7 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
is (s_memWriteReq) {
when (io.mem.req.fire()) { writeBeatCnt.inc() }
when (io.mem.req.bits.isWriteLast()) { state := s_memWriteResp }
when (io.mem.req.bits.isWriteLast() && io.mem.req.fire()) { state := s_memWriteResp }
}
is (s_memWriteResp) { when (io.mem.resp.fire()) { state := s_memReadReq } }
......@@ -258,7 +274,6 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
dataRefillWriteBus.req.valid := (state === s_memReadResp) && io.mem.resp.fire()
dataRefillWriteBus.req.bits.idx := addr.index
dataRefillWriteBus.req.bits.waymask.map(_ := io.in.bits.waymask)
val dataWriteArb = Module(new Arbiter(CacheDataArrayWriteBus().req.bits, 2))
......@@ -267,7 +282,7 @@ sealed class CacheStage3(ro: Boolean, name: String, userBits: Int = 0) extends C
io.dataWriteBus.req <> dataWriteArb.io.out
metaRefillWriteBus.req.valid := (state === s_memReadResp) && io.mem.resp.fire() && io.mem.resp.bits.isReadLast()
metaRefillWriteBus.req.bits.idx := addr.index
metaRefillWriteBus.req.bits.setIdx := addr.index
metaRefillWriteBus.req.bits.data.valid := true.B
metaRefillWriteBus.req.bits.data.tag := addr.tag
metaRefillWriteBus.req.bits.waymask.map(_ := io.in.bits.waymask)
......@@ -314,19 +329,20 @@ sealed class CacheProbeStage(ro: Boolean, name: String) extends CacheModule {
io.in.ready := (state === s_idle)
val req = RegEnable(io.in.bits, io.in.fire())
val addr = req.addr.asTypeOf(addrBundle)
// read meta array and data array
List(io.metaReadBus, io.dataReadBus).map { case x => {
x.req.valid := (state === s_arrayRead)
x.req.bits.idx := req.addr.asTypeOf(addrBundle).index
x.req.bits.setIdx := addr.index
}}
//io.dataReadBus.req.bits.subarrayIdx.map(_ := addr.wordIndex) // FIXME
// Latching meta and data
val meta = RegEnable(io.metaReadBus.resp.data(0)(0), state === s_arrayReadWait)
val meta = RegEnable(io.metaReadBus.resp.data(0), state === s_arrayReadWait)
val data = RegEnable(io.dataReadBus.resp.data(0), state === s_arrayReadWait)
// check
val addr = req.addr.asTypeOf(addrBundle)
val hit = meta.valid && (meta.tag === addr.tag)
// release
......@@ -359,7 +375,7 @@ sealed class CacheProbeStage(ro: Boolean, name: String) extends CacheModule {
}
io.out.valid := (state === s_check) || (state === s_release)
io.out.bits.rdata := data(idxCnt.value).data
io.out.bits.rdata := data.data // FIXME
io.out.bits.user := 0.U
io.out.bits.cmd := Mux(state === s_release, Mux(last, SimpleBusCmd.readLast, 0.U),
Mux(hit, SimpleBusCmd.probeHit, SimpleBusCmd.probeMiss))
......@@ -380,7 +396,7 @@ class Cache(ro: Boolean, name: String, userBits: Int = 0) extends CacheModule {
val s2 = Module(new CacheStage2(ro, name, userBits))
val s3 = Module(new CacheStage3(ro, name, userBits))
val metaArray = Module(new SRAMTemplate(new MetaBundle, set = Sets, way = Ways, shouldReset = true, singlePort = true))
val dataArray = Module(new SRAMTemplate(new DataBundle, set = Sets, way = Ways, subarray = LineBeats, singlePort = true))
val dataArray = Module(new SRAMTemplate(new DataBundle, set = Sets * LineBeats, way = Ways, singlePort = true))
if (name == "icache") {
// flush icache when executing fence.i
......@@ -420,15 +436,17 @@ class Cache(ro: Boolean, name: String, userBits: Int = 0) extends CacheModule {
s1.io.metaReadBus.resp := metaArray.io.r.resp
metaArray.io.w <> s3.io.metaWriteBus
val dataReadArb = Module(new Arbiter(chiselTypeOf(dataArray.io.r.req.bits), 2))
val dataReadArb = Module(new Arbiter(chiselTypeOf(dataArray.io.r.req.bits), 3))
dataReadArb.io.in(0) <> coh.io.dataReadBus.req
dataReadArb.io.in(1) <> s1.io.dataReadBus.req
dataReadArb.io.in(2) <> s3.io.dataReadBus.req
dataArray.io.r.req <> dataReadArb.io.out
coh.io.dataReadBus.resp := dataArray.io.r.resp
s1.io.dataReadBus.resp := dataArray.io.r.resp
s3.io.dataReadBus.resp := dataArray.io.r.resp
dataArray.io.w <> s3.io.dataWriteBus
s2.io.metaReadResp := metaArray.io.r.resp.data.map(_(0))
s2.io.metaReadResp := metaArray.io.r.resp.data
s2.io.dataReadResp := dataArray.io.r.resp.data
BoringUtils.addSource(s3.io.in.valid && s3.io.in.bits.hit, "perfCntCondM" + name + "Hit")
......
......@@ -7,7 +7,7 @@ import utils._
import bus.simplebus._
trait HasCoherenceConst {
val supportCoh = true
val supportCoh = false //true
}
class CoherenceInterconnect extends Module with HasCoherenceConst {
......
......@@ -4,68 +4,60 @@ import chisel3._
import chisel3.util._
class SRAMBundleA(val set: Int) extends Bundle {
val idx = Output(UInt(log2Up(set).W))
val setIdx = Output(UInt(log2Up(set).W))
}
class SRAMBundleAW[T <: Data](private val gen: T,
set: Int, val way: Int = 1, val subarray: Int = 1) extends SRAMBundleA(set) {
class SRAMBundleAW[T <: Data](private val gen: T, set: Int, val way: Int = 1) extends SRAMBundleA(set) {
val data = Output(gen)
val subarrayMask = if (subarray > 1) Some(Output(UInt(subarray.W))) else None
val waymask = if (way > 1) Some(Output(UInt(way.W))) else None
}
class SRAMBundleR[T <: Data](private val gen: T,
val way: Int = 1, val subarray: Int = 1) extends Bundle {
val data = Output(Vec(way, Vec(subarray, gen)))
class SRAMBundleR[T <: Data](private val gen: T, val way: Int = 1) extends Bundle {
val data = Output(Vec(way, gen))
}
class SRAMReadBus[T <: Data](private val gen: T,
val set: Int, val way: Int = 1, val subarray: Int = 1) extends Bundle {
class SRAMReadBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1) extends Bundle {
val req = Decoupled(new SRAMBundleA(set))
val resp = Flipped(new SRAMBundleR(gen, way, subarray))
val resp = Flipped(new SRAMBundleR(gen, way))
}
class SRAMWriteBus[T <: Data](private val gen: T,
val set: Int, val way: Int = 1, val subarray: Int = 1) extends Bundle {
val req = Decoupled(new SRAMBundleAW(gen, set, way, subarray))
class SRAMWriteBus[T <: Data](private val gen: T, val set: Int, val way: Int = 1) extends Bundle {
val req = Decoupled(new SRAMBundleAW(gen, set, way))
}
class SRAMTemplate[T <: Data](gen: T, set: Int, way: Int = 1, subarray: Int = 1,
class SRAMTemplate[T <: Data](gen: T, set: Int, way: Int = 1,
shouldReset: Boolean = false, holdRead: Boolean = false, singlePort: Boolean = false) extends Module {
val io = IO(new Bundle {
val r = Flipped(new SRAMReadBus(gen, set, way, subarray))
val w = Flipped(new SRAMWriteBus(gen, set, way, subarray))
val r = Flipped(new SRAMReadBus(gen, set, way))
val w = Flipped(new SRAMWriteBus(gen, set, way))
})
val wordType = UInt(gen.getWidth.W)
val arrays = Seq.tabulate(subarray) { i => SyncReadMem(set, Vec(way, wordType)) }
val (resetState, resetIdx) = (WireInit(false.B), WireInit(0.U))
val array = SyncReadMem(set, Vec(way, wordType))
val (resetState, resetSet) = (WireInit(false.B), WireInit(0.U))
if (shouldReset) {
val _resetState = RegInit(true.B)
val (_resetIdx, resetFinish) = Counter(_resetState, set)
val (_resetSet, resetFinish) = Counter(_resetState, set)
when (resetFinish) { _resetState := false.B }
resetState := _resetState
resetIdx := _resetIdx
resetSet := _resetSet
}
val idx = Mux(resetState, resetIdx, io.w.req.bits.idx)
val (ren, wen) = (io.r.req.valid, io.w.req.valid || resetState)
val realRen = (if (singlePort) ren && !wen else ren)
val setIdx = Mux(resetState, resetSet, io.w.req.bits.setIdx)
val wdataword = Mux(resetState, 0.U.asTypeOf(wordType), io.w.req.bits.data.asUInt)
val subarrayMask = Mux(resetState, Fill(subarray, "b1".U), io.w.req.bits.subarrayMask.getOrElse("b1".U))
val waymask = io.w.req.bits.waymask.getOrElse("b1".U)
val wdata = VecInit(Seq.fill(way)(wdataword))
when (wen) { array.write(setIdx, wdata, waymask.asBools) }
val (ren, wen) = (io.r.req.valid, io.w.req.valid || resetState)
val realRen = (if (singlePort) ren && !wen else ren)
val rdatas = for ((array, i) <- arrays.zipWithIndex) yield {
when (wen & subarrayMask(i)) { array.write(idx, wdata, waymask.asBools) }
(if (holdRead) ReadAndHold(array, io.r.req.bits.idx, realRen)
else array.read(io.r.req.bits.idx, realRen)).map(_.asTypeOf(gen))
}
val rdata = (if (holdRead) ReadAndHold(array, io.r.req.bits.setIdx, realRen)
else array.read(io.r.req.bits.setIdx, realRen)).map(_.asTypeOf(gen))
io.r.resp.data := VecInit(rdata)
io.r.resp.data := VecInit(rdatas.transpose.map(VecInit(_)))
io.r.req.ready := !resetState && (if (singlePort) !wen else true.B)
io.w.req.ready := true.B
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册