Unverified · Commit e94eb96f · Authored by W William Wang · Committed by GitHub

Merge pull request #570 from RISCVERS/mem-timing

Opt memblock timing, dcache timing ignored for now
......@@ -3,7 +3,7 @@ NANOS_HOME ?= $(AM_HOME)/../nanos-lite
SINGLETEST = ALL=min3
B ?= 0
E ?= -1
E ?= 0
V ?= OFF
#V ?= OFF
EMU_ARGS = B=$(B) E=$(E) V=$(V)
......@@ -18,7 +18,7 @@ cache:
#2>&1 | tee > loader.log
cpu:
$(MAKE) -C $(AM_HOME)/tests/cputest $(ARCH) ALL=dummy $(EMU_ARGS) run 2>&1 | tee > dummy.log
$(MAKE) -C $(AM_HOME)/tests/cputest $(ARCH) ALL=dummy $(EMU_ARGS) run 2>&1
# ------------------------------------------------------------------
# run different test sets
......
......@@ -68,4 +68,8 @@ trait HasCircularQueuePtrHelper {
def isAfter[T <: CircularQueuePtr](left: T, right: T): Bool = {
Mux(left.flag === right.flag, left.value > right.value, left.value < right.value)
}
def isBefore[T <: CircularQueuePtr](left: T, right: T): Bool = {
Mux(left.flag === right.flag, left.value < right.value, left.value > right.value)
}
}
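For reference, a minimal sketch of how these helpers behave (queue size and pointer values are hypothetical; this assumes CircularQueuePtr carries a wrap flag plus a value, as used above): pointers with equal flags compare by value, while a flag mismatch means one pointer has wrapped, so the comparison is inverted.
// Sketch: 16-entry queue, enqPtr has wrapped once (flag=1, value=2),
// deqPtr has not (flag=0, value=10). Flags differ and 2 < 10, so the
// wrapped pointer is logically newer.
val enqIsNewer = isAfter(enqPtr, deqPtr)   // expected: true.B
val deqIsOlder = isBefore(deqPtr, enqPtr)  // expected: true.B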
......@@ -17,7 +17,25 @@ object MemMap {
}
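The body of MemMap is elided in this hunk. From how entries are consumed below (i._1._1, i._1._2, i._2.get("width"), i._2.get("mode"), ...), each entry is presumably a ((base, top), attributes) pair, roughly:
// Inferred shape (sketch, not the elided original):
def MemMap(base: String, top: String, width: String, description: String, mode: String) =
  ((base, top), Map("width" -> width, "description" -> description, "mode" -> mode))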
object AddressSpace {
def MemMapList = List(
def SimpleMemMapList = List(
// Base address Top address Width Description Mode (RWXIDSAC)
MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved", ""),
MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"),
MemMap("h00_2000_0000", "h00_2FFF_FFFF", "h0", "Reserved", ""),
MemMap("h00_3000_0000", "h00_3000_FFFF", "h0", "DMA", "RW"),
MemMap("h00_3001_0000", "h00_3004_FFFF", "h0", "GPU", "RWC"),
MemMap("h00_3005_0000", "h00_3006_FFFF", "h0", "USB/SDMMC", "RW"),
MemMap("h00_3007_0000", "h00_30FF_FFFF", "h0", "Reserved", ""),
MemMap("h00_3100_0000", "h00_3111_FFFF", "h0", "MMIO", "RW"),
MemMap("h00_3112_0000", "h00_37FF_FFFF", "h0", "Reserved", ""),
MemMap("h00_3800_0000", "h00_3800_FFFF", "h0", "CLINT", "RW"),
MemMap("h00_3801_0000", "h00_3BFF_FFFF", "h0", "Reserved", ""),
MemMap("h00_3C00_0000", "h00_3FFF_FFFF", "h0", "PLIC", "RW"),
MemMap("h00_4000_0000", "h00_7FFF_FFFF", "h0", "PCIe", "RW"),
MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR", "RWXIDSA"),
)
def FullMemMapList = List(
// Base address Top address Width Description Mode (RWXIDSAC)
MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved", ""),
MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"),
......@@ -55,16 +73,42 @@ object AddressSpace {
MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR", "RWXIDSA"),
)
def MemMapList = SimpleMemMapList
def printMemmap(){
println("-------------------- memory map --------------------")
for(i <- MemMapList){
println(i._1._1 + "->" + i._1._2 + " width " + (if(i._2.get("width").get == "0") "unlimited" else i._2.get("width").get) + " " + i._2.get("description").get + " [" + i._2.get("mode").get + "]")
println("[" + i._1._1 + " -> " + i._1._2 + "] Width:" + (if(i._2.get("width").get == "h0") "unlimited" else i._2.get("width").get) + " Description:" + i._2.get("description").get + " [" + i._2.get("mode").get + "]")
}
println("----------------------------------------------------")
}
def checkMemmap(){
for(i <- MemMapList){
// pma mode check
val s = i._2.get("mode").get
if(
s.toUpperCase.indexOf("A") >= 0 &&
!(s.toUpperCase.indexOf("R") >= 0 && s.toUpperCase.indexOf("W") >= 0)
){
println("[error] pma atomicable area must be both readable and writeable")
throw new IllegalArgumentException
}
// pma area size check
if(!i._1._1.endsWith("000") || !i._1._2.endsWith("FFF")){
println("[error] pma area must be larger than 4KB")
throw new IllegalArgumentException()
}
}
}
def genMemmapMatchVec(addr: UInt): UInt = {
VecInit(MemMapList.map(i => {
// calculate addr tag and compare mask
// val mask = i._1._2.U - i._1._1.U
// (~(i._1._1.U ^ addr) | mask).andR
// pma is not on the current critical path, use a simple compare for now
i._1._1.U <= addr && addr < i._1._2.U
}).toSeq).asUInt
}
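The commented-out compare is the standard trick for naturally aligned power-of-two regions: with mask = top - base, an address matches iff every bit above the mask equals the base, one AND-reduce instead of two wide magnitude comparators. Note also that the simple compare adopted here treats the top address as exclusive (addr < top). A sketch of the masked form, valid only under the alignment assumption:
// Sketch: aligned power-of-two range check.
// Requires (top - base + 1) to be a power of two and base aligned to it.
def inAlignedRange(addr: UInt, base: UInt, top: UInt): Bool = {
  val mask = top - base            // low bits free to vary inside the region
  (~(base ^ addr) | mask).andR     // all high bits must match base
}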
......@@ -75,6 +119,30 @@ object AddressSpace {
}).toSeq))
}
// TODO: FIXME
def queryModeFast(matchVec: UInt): UInt = {
var r = WireInit(false.B)
var w = WireInit(false.B)
var x = WireInit(false.B)
var i = WireInit(false.B)
var d = WireInit(false.B)
var s = WireInit(false.B)
var a = WireInit(false.B)
var c = WireInit(false.B)
for((j, idx) <- MemMapList.zipWithIndex){
val modes = j._2.get("mode").get
if (modes.toUpperCase.indexOf("R") >= 0) r = r || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("W") >= 0) w = w || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("X") >= 0) x = x || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("I") >= 0) i = i || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("D") >= 0) d = d || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("S") >= 0) s = s || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("A") >= 0) a = a || matchVec(idx).asBool
if (modes.toUpperCase.indexOf("C") >= 0) c = c || matchVec(idx).asBool
}
VecInit(Seq(r, w, x, i, d, s, a, c)).asUInt
}
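Because the mode letters are inspected at elaboration time (Scala strings, not hardware), the eight OR-trees can equivalently be generated from a list; a compact sketch, not the committed code:
// Sketch: same per-mode OR-reduction, bit 0 = R ... bit 7 = C.
def queryModeFastAlt(matchVec: UInt): UInt = {
  val modeChars = Seq("R", "W", "X", "I", "D", "S", "A", "C")
  VecInit(modeChars.map { c =>
    VecInit(MemMapList.zipWithIndex.map { case (entry, idx) =>
      entry._2.get("mode").get.toUpperCase.contains(c).B && matchVec(idx)
    }).asUInt.orR
  }).asUInt
}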
def queryWidth(matchVec: UInt): UInt = {
Mux1H(matchVec, VecInit(MemMapList.map(i => {
i._2.get("width").get.U
......@@ -83,7 +151,11 @@ object AddressSpace {
def memmapAddrMatch(addr: UInt): (UInt, UInt) = {
val matchVec = genMemmapMatchVec(addr)
(queryMode(matchVec), queryWidth(matchVec))
// when(queryMode(matchVec) =/= queryModeFast(matchVec)){
// printf("pma fail: right %b wrong %b\n", queryMode(matchVec), queryModeFast(matchVec))
// }
assert(queryMode(matchVec) === queryModeFast(matchVec))
(queryModeFast(matchVec), queryWidth(matchVec))
}
def isDMMIO(addr: UInt): Bool = !PMAMode.dcache(memmapAddrMatch(addr)._1)
......
......@@ -377,6 +377,7 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer)
trapIO <> DontCare
println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}")
AddressSpace.checkMemmap()
AddressSpace.printMemmap()
// to fast wake up fp, mem rs
......
......@@ -239,6 +239,7 @@ class MemBlockImp
lsq.io.loadIn(i) <> loadUnits(i).io.lsq.loadIn
lsq.io.ldout(i) <> loadUnits(i).io.lsq.ldout
lsq.io.loadDataForwarded(i) <> loadUnits(i).io.lsq.loadDataForwarded
lsq.io.needReplayFromRS(i) <> loadUnits(i).io.lsq.needReplayFromRS
}
// StoreUnit
......@@ -274,8 +275,11 @@ class MemBlockImp
lsq.io.brqRedirect <> io.fromCtrlBlock.redirect
lsq.io.flush <> io.fromCtrlBlock.flush
io.toCtrlBlock.replay <> lsq.io.rollback
lsq.io.dcache <> dcache.io.lsu.lsq
lsq.io.uncache <> uncache.io.lsq
// delay dcache refill for 1 cycle for better timing
// TODO: remove RegNext after fixing refill paddr timing
// lsq.io.dcache <> dcache.io.lsu.lsq
lsq.io.dcache := RegNext(dcache.io.lsu.lsq)
// LSQ to store buffer
lsq.io.sbuffer <> sbuffer.io.in
......@@ -283,6 +287,9 @@ class MemBlockImp
// Sbuffer
sbuffer.io.dcache <> dcache.io.lsu.store
sbuffer.io.dcache.resp.valid := RegNext(dcache.io.lsu.store.resp.valid)
sbuffer.io.dcache.resp.bits := RegNext(dcache.io.lsu.store.resp.bits)
assert(sbuffer.io.dcache.resp.ready === true.B)
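Both changes above apply the same timing recipe: register a channel that has no backpressure, so valid and bits arrive one cycle later but stay mutually consistent; the assert documents the precondition (if resp.ready could ever deassert, a skid buffer would be needed instead). In pattern form, with hypothetical producer/consumer names:
// Sketch: cutting a long combinational path on a ready-free channel.
consumer.resp.valid := RegNext(producer.resp.valid, init = false.B)
consumer.resp.bits  := RegNext(producer.resp.bits)
// Only legal because the consumer never deasserts ready (asserted above).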
// flush sbuffer
val fenceFlush = io.fenceToSbuffer.flushSb
......
......@@ -300,7 +300,9 @@ class ReservationStationSelect
if (feedback) {
when (io.memfeedback.valid) {
stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay)
when (stateQueue(io.memfeedback.bits.rsIdx) === s_wait) {
stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay)
}
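// Only entries still in s_wait accept feedback: a late or duplicate
// memfeedback must not clobber an entry that has already been replayed
// or reissued.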
when (!io.memfeedback.bits.hit) {
countQueue(io.memfeedback.bits.rsIdx) := replayDelay(cntCountQueue(io.memfeedback.bits.rsIdx))
}
......
......@@ -71,6 +71,7 @@ class PtePermBundle extends TlbBundle {
class TlbPermBundle extends TlbBundle {
val pf = Bool() // NOTE: if this is true, just raise pf
// pagetable perm (software defined)
val d = Bool()
val a = Bool()
val g = Bool()
......@@ -78,13 +79,14 @@ class TlbPermBundle extends TlbBundle {
val x = Bool()
val w = Bool()
val r = Bool()
// pma perm (hardwired)
val pr = Bool() //readable
val pw = Bool() //writeable
val pe = Bool() //executable
val pa = Bool() //atom op permitted
val pi = Bool() //icacheable
val pd = Bool() //dcacheable
// pma perm check
// val at = Bool() // Access Type
// val as = Bool() // Atomic Swap
// val al = Bool() // Atomic Logical
// val aa = Bool() // Atomic Arithmetic
// TODO: add pma check
override def toPrintable: Printable = {
p"pf:${pf} d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}"
}
......@@ -172,6 +174,8 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle {
this.tag := vpn
this.level.map(_ := level(0))
this.data.ppn := ppn
// refill pagetable perm
val ptePerm = perm.asTypeOf(new PtePermBundle)
this.data.perm.pf:= pf
this.data.perm.d := ptePerm.d
......@@ -182,6 +186,15 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle {
this.data.perm.w := ptePerm.w
this.data.perm.r := ptePerm.r
// get pma perm
val (pmaMode, accessWidth) = AddressSpace.memmapAddrMatch(Cat(ppn, 0.U(12.W)))
this.data.perm.pr := PMAMode.read(pmaMode)
this.data.perm.pw := PMAMode.write(pmaMode)
this.data.perm.pe := PMAMode.execute(pmaMode)
this.data.perm.pa := PMAMode.atomic(pmaMode)
this.data.perm.pi := PMAMode.icache(pmaMode)
this.data.perm.pd := PMAMode.dcache(pmaMode)
this
}
......@@ -421,11 +434,22 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{
resp(i).bits.excp.pf.st := stPf || update
resp(i).bits.excp.pf.instr := instrPf || update
// if vmEnable, use the pre-calculated pma check result
resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !perm.pi, !perm.pd)
resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !perm.pa, !perm.pr) && TlbCmd.isRead(cmdReg)
resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !perm.pa, !perm.pw) && TlbCmd.isWrite(cmdReg)
resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !perm.pe)
// if !vmEnable, check pma
val (pmaMode, accessWidth) = AddressSpace.memmapAddrMatch(resp(i).bits.paddr)
resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode))
resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg)
resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg)
resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode))
when(!vmEnable){
resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode))
resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg)
resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg)
resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode))
}
// TODO: MMIO check
(hit, miss, pfHitVec, multiHit)
}
......
package xiangshan.mem
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import xiangshan.cache._
class MaskedSyncDataModuleTemplate[T <: Data](gen: T, numEntries: Int, numRead: Int, numWrite: Int, numMRead: Int = 0, numMWrite: Int = 0) extends XSModule with HasDCacheParameters {
val io = IO(new Bundle {
// address indexed sync read
val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
val rdata = Output(Vec(numRead, gen))
// masked sync read (1H)
val mrmask = Input(Vec(numMRead, Vec(numEntries, Bool())))
val mrdata = Output(Vec(numMRead, gen))
// address indexed write
val wen = Input(Vec(numWrite, Bool()))
val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
val wdata = Input(Vec(numWrite, gen))
// masked write
val mwmask = Input(Vec(numMWrite, Vec(numEntries, Bool())))
val mwdata = Input(Vec(numMWrite, gen))
})
val data = Reg(Vec(numEntries, gen))
// read ports
for (i <- 0 until numRead) {
io.rdata(i) := data(RegNext(io.raddr(i)))
}
// masked read ports
for (i <- 0 until numMRead) {
io.mrdata(i) := Mux1H(RegNext(io.mrmask(i)), data)
}
// write ports (with priorities)
for (i <- 0 until numWrite) {
when (io.wen(i)) {
data(io.waddr(i)) := io.wdata(i)
}
}
// masked write
for (j <- 0 until numEntries) {
val wen = VecInit((0 until numMWrite).map(i => io.mwmask(i)(j))).asUInt.orR
when (wen) {
data(j) := VecInit((0 until numMWrite).map(i => {
Mux(io.mwmask(i)(j), io.mwdata(i), 0.U).asUInt
})).reduce(_ | _)
}
}
// MaskedSyncDataModuleTemplate should not be used when there are any write conflicts
for (i <- 0 until numWrite) {
for (j <- i+1 until numWrite) {
assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
}
}
}
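A minimal instantiation sketch (payload type, depth and port counts hypothetical). Address reads are synchronous because raddr is registered, so rdata lags raddr by one cycle; a masked write can update many entries at once, e.g. on a cache-line refill:
// Sketch: 8-bit payload, 64 entries, 1 sync read, 1 write, 1 masked write.
val m = Module(new MaskedSyncDataModuleTemplate(UInt(8.W), 64, numRead = 1, numWrite = 1, numMWrite = 1))
m.io.raddr(0)  := rdIdx           // data appears on m.io.rdata(0) next cycle
m.io.wen(0)    := wrEn
m.io.waddr(0)  := wrIdx
m.io.wdata(0)  := wrByte
m.io.mwmask(0) := hitVec          // Vec(64, Bool()): entries to overwrite
m.io.mwdata(0) := refillByte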
......@@ -60,3 +60,18 @@ class LoadForwardQueryIO extends XSBundle {
// val lqIdx = Output(UInt(LoadQueueIdxWidth.W))
val sqIdx = Output(new SqPtr)
}
class MaskedLoadForwardQueryIO extends XSBundle {
val paddr = Output(UInt(PAddrBits.W))
val mask = Output(UInt(8.W))
val uop = Output(new MicroOp) // for replay
val pc = Output(UInt(VAddrBits.W)) //for debug
val valid = Output(Bool()) //for debug
val forwardMask = Input(Vec(8, Bool()))
val forwardData = Input(Vec(8, UInt(8.W)))
val sqIdx = Output(new SqPtr) // for debug
// sqIdxMask is calculated in an earlier stage for better timing
val sqIdxMask = Output(UInt(StoreQueueSize.W))
}
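sqIdxMask is presumably UIntToMask(sqIdx.value, StoreQueueSize) computed one stage early (see sqIdxMaskReg in LoadUnit further below), so the store queue receives a ready-made mask instead of decoding sqIdx on its forwarding path:
// Sketch of the precomputation; UIntToMask(n, w) sets the low n bits of a
// w-bit mask, e.g. UIntToMask(3.U, 8) = 0b00000111, marking entries below sqIdx.
val sqIdxMask = UIntToMask(uop.sqIdx.value, StoreQueueSize)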
......@@ -43,10 +43,11 @@ class LsqWrappper extends XSModule with HasDCacheParameters {
val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool()))
val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool()))
val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load
val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
val roq = Flipped(new RoqLsqIO)
val rollback = Output(Valid(new Redirect))
val dcache = Flipped(ValidIO(new Refill))
......@@ -94,6 +95,7 @@ class LsqWrappper extends XSModule with HasDCacheParameters {
loadQueue.io.loadIn <> io.loadIn
loadQueue.io.storeIn <> io.storeIn
loadQueue.io.loadDataForwarded <> io.loadDataForwarded
loadQueue.io.needReplayFromRS <> io.needReplayFromRS
loadQueue.io.ldout <> io.ldout
loadQueue.io.roq <> io.roq
loadQueue.io.rollback <> io.rollback
......
......@@ -68,8 +68,9 @@ class LoadQueue extends XSModule
val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool()))
val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool()))
val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load
val load_s1 = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
val load_s1 = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
val roq = Flipped(new RoqLsqIO)
val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store
val dcache = Flipped(ValidIO(new Refill))
......@@ -81,7 +82,7 @@ class LoadQueue extends XSModule
// val data = Reg(Vec(LoadQueueSize, new LsRoqEntry))
val dataModule = Module(new LoadQueueData(LoadQueueSize, wbNumRead = LoadPipelineWidth, wbNumWrite = LoadPipelineWidth))
dataModule.io := DontCare
val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1, numWrite = LoadPipelineWidth))
val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1, numWrite = LoadPipelineWidth))
vaddrModule.io := DontCare
val allocated = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // lq entry has been allocated
val datavalid = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // data is valid
......@@ -144,7 +145,7 @@ class LoadQueue extends XSModule
*/
for (i <- 0 until LoadPipelineWidth) {
dataModule.io.wb.wen(i) := false.B
vaddrModule.io.wen(i) := false.B
val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
when(io.loadIn(i).fire()) {
when(io.loadIn(i).bits.miss) {
XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n",
......@@ -170,8 +171,9 @@ class LoadQueue extends XSModule
io.loadIn(i).bits.forwardMask.asUInt,
io.loadIn(i).bits.mmio
)}
val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
datavalid(loadWbIndex) := (!io.loadIn(i).bits.miss || io.loadDataForwarded(i)) && !io.loadIn(i).bits.mmio
datavalid(loadWbIndex) := (!io.loadIn(i).bits.miss || io.loadDataForwarded(i)) &&
!io.loadIn(i).bits.mmio && // mmio data is not valid until the uncache access finishes
!io.needReplayFromRS(i) // do not writeback if that inst will be resent from rs
writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
val loadWbData = Wire(new LQDataEntry)
......@@ -182,18 +184,19 @@ class LoadQueue extends XSModule
dataModule.io.wbWrite(i, loadWbIndex, loadWbData)
dataModule.io.wb.wen(i) := true.B
vaddrModule.io.waddr(i) := loadWbIndex
vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr
vaddrModule.io.wen(i) := true.B
debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio
debug_paddr(loadWbIndex) := io.loadIn(i).bits.paddr
val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i)
miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i) && !io.needReplayFromRS(i)
pending(loadWbIndex) := io.loadIn(i).bits.mmio
uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime
}
// vaddrModule write is delayed, as vaddrModule will not be read right after write
vaddrModule.io.waddr(i) := RegNext(loadWbIndex)
vaddrModule.io.wdata(i) := RegNext(io.loadIn(i).bits.vaddr)
vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire())
}
when(io.dcache.valid) {
......@@ -361,11 +364,25 @@ class LoadQueue extends XSModule
* Generate match vector for store address with rangeMask(stPtr, enqPtr).
* Besides, load instructions in LoadUnit_S1 and S2 are also checked.
* Cycle 1: Redirect Generation
* There're three possible types of violations. Choose the oldest load.
* Prepare redirect request according to the detected violation.
There are three possible types of violations, yielding up to 6 possible redirect requests.
* Choose the oldest load (part 1). (4 + 2) -> (1 + 2)
* Cycle 2: Redirect Fire
* Choose the oldest load (part 2). (3 -> 1)
* Prepare redirect request according to the detected violation.
* Fire redirect request (if valid)
*/
// stage 0: lq l1 wb l1 wb lq
// | | | | | | (paddr match)
// stage 1: lq l1 wb l1 wb lq
// | | | | | |
// | |------------| |
// | | |
// stage 2: lq l1wb lq
// | | |
// --------------------
// |
// rollback req
io.load_s1 := DontCare
def detectRollback(i: Int) = {
val startIndex = io.storeIn(i).bits.uop.lqIdx.value
......@@ -413,18 +430,9 @@ class LoadQueue extends XSModule
val l1ViolationUop = getOldestInTwo(l1ViolationVec, RegNext(VecInit(io.load_s1.map(_.uop))))
XSDebug(l1Violation, p"${Binary(Cat(l1ViolationVec))}, $l1ViolationUop\n")
val rollbackValidVec = Seq(lqViolation, wbViolation, l1Violation)
val rollbackUopVec = Seq(lqViolationUop, wbViolationUop, l1ViolationUop)
val mask = getAfterMask(rollbackValidVec, rollbackUopVec)
val oneAfterZero = mask(1)(0)
val rollbackUop = Mux(oneAfterZero && mask(2)(0),
rollbackUopVec(0),
Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2)))
XSDebug(
l1Violation,
"need rollback (l4 load) pc %x roqidx %d target %x\n",
"need rollback (l1 load) pc %x roqidx %d target %x\n",
io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, l1ViolationUop.roqIdx.asUInt
)
XSDebug(
......@@ -438,15 +446,7 @@ class LoadQueue extends XSModule
io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, wbViolationUop.roqIdx.asUInt
)
(RegNext(io.storeIn(i).valid) && Cat(rollbackValidVec).orR, rollbackUop)
}
// rollback check
val rollback = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
for (i <- 0 until StorePipelineWidth) {
val detectedRollback = detectRollback(i)
rollback(i).valid := detectedRollback._1
rollback(i).bits := detectedRollback._2
((lqViolation, lqViolationUop), (wbViolation, wbViolationUop), (l1Violation, l1ViolationUop))
}
def rollbackSel(a: Valid[MicroOp], b: Valid[MicroOp]): ValidIO[MicroOp] = {
......@@ -460,33 +460,72 @@ class LoadQueue extends XSModule
b // sel b
)
}
val rollbackSelected = ParallelOperation(rollback, rollbackSel)
val lastCycleRedirect = RegNext(io.brqRedirect)
val lastlastCycleRedirect = RegNext(lastCycleRedirect)
val lastCycleFlush = RegNext(io.flush)
val lastlastCycleFlush = RegNext(lastCycleFlush)
// S2: select rollback and generate rollback request
// S2: select rollback (part1) and generate rollback request
// rollback check
// Wb/L1 rollback seq check is done in s2
val rollbackWb = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
val rollbackL1 = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
val rollbackL1Wb = Wire(Vec(StorePipelineWidth*2, Valid(new MicroOp)))
// Lq rollback seq check is done in s3 (next stage), as getting rollbackLq MicroOp is slow
val rollbackLq = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
for (i <- 0 until StorePipelineWidth) {
val detectedRollback = detectRollback(i)
rollbackLq(i).valid := detectedRollback._1._1 && RegNext(io.storeIn(i).valid)
rollbackLq(i).bits := detectedRollback._1._2
rollbackWb(i).valid := detectedRollback._2._1 && RegNext(io.storeIn(i).valid)
rollbackWb(i).bits := detectedRollback._2._2
rollbackL1(i).valid := detectedRollback._3._1 && RegNext(io.storeIn(i).valid)
rollbackL1(i).bits := detectedRollback._3._2
rollbackL1Wb(2*i) := rollbackL1(i)
rollbackL1Wb(2*i+1) := rollbackWb(i)
}
val rollbackL1WbSelected = ParallelOperation(rollbackL1Wb, rollbackSel)
val rollbackL1WbVReg = RegNext(rollbackL1WbSelected.valid)
val rollbackL1WbReg = RegEnable(rollbackL1WbSelected.bits, rollbackL1WbSelected.valid)
val rollbackLq0VReg = RegNext(rollbackLq(0).valid)
val rollbackLq0Reg = RegEnable(rollbackLq(0).bits, rollbackLq(0).valid)
val rollbackLq1VReg = RegNext(rollbackLq(1).valid)
val rollbackLq1Reg = RegEnable(rollbackLq(1).bits, rollbackLq(1).valid)
// S3: select rollback (part2), generate rollback request, then fire rollback request
// Note that we use roqIdx - 1.U to flush the load instruction itself.
// Thus, here if last cycle's roqIdx equals this cycle's roqIdx, it still triggers the redirect.
val rollbackGen = Wire(Valid(new Redirect))
val rollbackReg = Reg(Valid(new Redirect))
rollbackGen.valid := rollbackSelected.valid &&
!rollbackSelected.bits.roqIdx.needFlush(lastCycleRedirect, lastCycleFlush)
rollbackGen.bits.roqIdx := rollbackSelected.bits.roqIdx
rollbackGen.bits.ftqIdx := rollbackSelected.bits.cf.ftqPtr
rollbackGen.bits.ftqOffset := rollbackSelected.bits.cf.ftqOffset
rollbackGen.bits.level := RedirectLevel.flush
rollbackGen.bits.interrupt := DontCare
rollbackGen.bits.cfiUpdate := DontCare
rollbackGen.bits.cfiUpdate.target := rollbackSelected.bits.cf.pc
rollbackReg := rollbackGen
// S3: fire rollback request
io.rollback := rollbackReg
io.rollback.valid := rollbackReg.valid &&
!rollbackReg.bits.roqIdx.needFlush(lastCycleRedirect, lastCycleFlush)
// FIXME: this is ugly
val rollbackValidVec = Seq(rollbackL1WbVReg, rollbackLq0VReg, rollbackLq1VReg)
val rollbackUopVec = Seq(rollbackL1WbReg, rollbackLq0Reg, rollbackLq1Reg)
// select uop in parallel
val mask = getAfterMask(rollbackValidVec, rollbackUopVec)
val oneAfterZero = mask(1)(0)
val rollbackUop = Mux(oneAfterZero && mask(2)(0),
rollbackUopVec(0),
Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2)))
// check if rollback request is still valid in parallel
val rollbackValidVecChecked = Wire(Vec(3, Bool()))
for(((v, uop), idx) <- rollbackValidVec.zip(rollbackUopVec).zipWithIndex) {
rollbackValidVecChecked(idx) := v &&
(!lastCycleRedirect.valid || isBefore(uop.roqIdx, lastCycleRedirect.bits.roqIdx)) &&
(!lastlastCycleRedirect.valid || isBefore(uop.roqIdx, lastlastCycleRedirect.bits.roqIdx))
}
io.rollback.bits.roqIdx := rollbackUop.roqIdx
io.rollback.bits.ftqIdx := rollbackUop.cf.ftqPtr
io.rollback.bits.ftqOffset := rollbackUop.cf.ftqOffset
io.rollback.bits.level := RedirectLevel.flush
io.rollback.bits.interrupt := DontCare
io.rollback.bits.cfiUpdate := DontCare
io.rollback.bits.cfiUpdate.target := rollbackUop.cf.pc
// io.rollback.bits.pc := DontCare
io.rollback.valid := rollbackValidVecChecked.asUInt.orR && !lastCycleFlush && !lastlastCycleFlush
when(io.rollback.valid) {
// XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.cfi, io.rollback.bits.roqIdx.asUInt)
......@@ -503,11 +542,13 @@ class LoadQueue extends XSModule
* (5) ROB commits the instruction: same as normal instructions
*/
//(2) when they reach ROB's head, they can be sent to uncache channel
val lqTailMmioPending = WireInit(pending(deqPtr))
val lqTailAllocated = WireInit(allocated(deqPtr))
val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4)
val uncacheState = RegInit(s_idle)
switch(uncacheState) {
is(s_idle) {
when(io.roq.pendingld && pending(deqPtr) && allocated(deqPtr)) {
when(io.roq.pendingld && lqTailMmioPending && lqTailAllocated) {
uncacheState := s_req
}
}
......@@ -563,7 +604,7 @@ class LoadQueue extends XSModule
}
// Read vaddr for mem exception
vaddrModule.io.raddr(0) := deqPtr + commitCount
vaddrModule.io.raddr(0) := deqPtr + io.roq.lcommit
io.exceptionAddr.vaddr := vaddrModule.io.rdata(0)
// misprediction recovery / exception redirect
......@@ -596,6 +637,15 @@ class LoadQueue extends XSModule
allowEnqueue := validCount + enqNumber <= (LoadQueueSize - RenameWidth).U
// perf counter
XSPerf("lqRollback", io.rollback.valid, acc = true) // rollback redirect generated
XSPerf("lqFull", !allowEnqueue, acc = true)
XSPerf("lqMmioCycle", uncacheState =/= s_idle, acc = true) // lq is busy dealing with uncache req
XSPerf("lqMmioCnt", io.uncache.req.fire(), acc = true)
XSPerf("lqRefill", io.dcache.valid, acc = true)
XSPerf("lqWriteback", PopCount(VecInit(io.ldout.map(i => i.fire()))), acc = true)
XSPerf("lqWbBlocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))), acc = true)
// debug info
XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt.flag, deqPtr)
......
......@@ -106,51 +106,51 @@ class MaskModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule
}
}
class LQData8Module(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters {
val io = IO(new Bundle {
// read
val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
val rdata = Output(Vec(numRead, UInt(8.W)))
// address indexed write
val wen = Input(Vec(numWrite, Bool()))
val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
val wdata = Input(Vec(numWrite, UInt(8.W)))
// masked write
val mwmask = Input(Vec(blockWords, Vec(numEntries, Bool())))
val mwdata = Input(Vec(blockWords, UInt(8.W)))
})
val data = Reg(Vec(numEntries, UInt(8.W)))
// read ports
for (i <- 0 until numRead) {
io.rdata(i) := data(RegNext(io.raddr(i)))
}
// below is the write ports (with priorities)
for (i <- 0 until numWrite) {
when (io.wen(i)) {
data(io.waddr(i)) := io.wdata(i)
}
}
// masked write
for (j <- 0 until numEntries) {
val wen = VecInit((0 until blockWords).map(i => io.mwmask(i)(j))).asUInt.orR
when (wen) {
data(j) := VecInit((0 until blockWords).map(i => {
Mux(io.mwmask(i)(j), io.mwdata(i), 0.U)
})).reduce(_ | _)
}
}
// DataModuleTemplate should not be used when there're any write conflicts
for (i <- 0 until numWrite) {
for (j <- i+1 until numWrite) {
assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
}
}
}
// class LQData8Module(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters {
// val io = IO(new Bundle {
// // read
// val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
// val rdata = Output(Vec(numRead, UInt(8.W)))
// // address indexed write
// val wen = Input(Vec(numWrite, Bool()))
// val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
// val wdata = Input(Vec(numWrite, UInt(8.W)))
// // masked write
// val mwmask = Input(Vec(blockWords, Vec(numEntries, Bool())))
// val mwdata = Input(Vec(blockWords, UInt(8.W)))
// })
// val data = Reg(Vec(numEntries, UInt(8.W)))
// // read ports
// for (i <- 0 until numRead) {
// io.rdata(i) := data(RegNext(io.raddr(i)))
// }
// // below is the write ports (with priorities)
// for (i <- 0 until numWrite) {
// when (io.wen(i)) {
// data(io.waddr(i)) := io.wdata(i)
// }
// }
// // masked write
// for (j <- 0 until numEntries) {
// val wen = VecInit((0 until blockWords).map(i => io.mwmask(i)(j))).asUInt.orR
// when (wen) {
// data(j) := VecInit((0 until blockWords).map(i => {
// Mux(io.mwmask(i)(j), io.mwdata(i), 0.U)
// })).reduce(_ | _)
// }
// }
// // DataModuleTemplate should not be used when there're any write conflicts
// for (i <- 0 until numWrite) {
// for (j <- i+1 until numWrite) {
// assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
// }
// }
// }
class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters {
val io = IO(new Bundle {
......@@ -177,7 +177,7 @@ class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSMod
val paddrWen = Input(Vec(numWrite, Bool()))
})
val data8 = Seq.fill(8)(Module(new LQData8Module(numEntries, numRead, numWrite)))
val data8 = Seq.fill(8)(Module(new MaskedSyncDataModuleTemplate(UInt(8.W), numEntries, numRead, numWrite, numMWrite = blockWords)))
val fwdMask = Reg(Vec(numEntries, UInt(8.W)))
val wordIndex = Reg(Vec(numEntries, UInt((blockOffBits - wordOffBits).W)))
......
......@@ -38,7 +38,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
val roq = Flipped(new RoqLsqIO)
val uncache = new DCacheWordIO
// val refill = Flipped(Valid(new DCacheLineReq ))
......@@ -61,7 +61,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
dataModule.io := DontCare
val paddrModule = Module(new SQPaddrModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth))
paddrModule.io := DontCare
val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth))
val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth))
vaddrModule.io := DontCare
// state & misc
......@@ -104,7 +104,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
dataModule.io.raddr(i) := deqPtrExtNext(i).value
paddrModule.io.raddr(i) := deqPtrExtNext(i).value
}
vaddrModule.io.raddr(0) := cmtPtr + commitCount
vaddrModule.io.raddr(0) := cmtPtr + io.roq.scommit
/**
* Enqueue at dispatch
......@@ -144,9 +144,8 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
for (i <- 0 until StorePipelineWidth) {
dataModule.io.wen(i) := false.B
paddrModule.io.wen(i) := false.B
vaddrModule.io.wen(i) := false.B
val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
when (io.storeIn(i).fire()) {
val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
datavalid(stWbIndex) := !io.storeIn(i).bits.mmio
writebacked(stWbIndex) := !io.storeIn(i).bits.mmio
pending(stWbIndex) := io.storeIn(i).bits.mmio
......@@ -164,9 +163,6 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
paddrModule.io.wdata(i) := io.storeIn(i).bits.paddr
paddrModule.io.wen(i) := true.B
vaddrModule.io.waddr(i) := stWbIndex
vaddrModule.io.wdata(i) := io.storeIn(i).bits.vaddr
vaddrModule.io.wen(i) := true.B
mmio(stWbIndex) := io.storeIn(i).bits.mmio
......@@ -179,6 +175,10 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
io.storeIn(i).bits.mmio
)
}
// vaddrModule write is delayed, as vaddrModule will not be read right after write
vaddrModule.io.waddr(i) := RegNext(stWbIndex)
vaddrModule.io.wdata(i) := RegNext(io.storeIn(i).bits.vaddr)
vaddrModule.io.wen(i) := RegNext(io.storeIn(i).fire())
}
/**
......@@ -199,7 +199,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
// Forward2: Mux(same_flag, 0.U, range(0, sqIdx) )
// i.e. forward1 is the target entries with the same flag bits and forward2 otherwise
val differentFlag = deqPtrExt(0).flag =/= io.forward(i).sqIdx.flag
val forwardMask = UIntToMask(io.forward(i).sqIdx.value, StoreQueueSize)
val forwardMask = io.forward(i).sqIdxMask
val storeWritebackedVec = WireInit(VecInit(Seq.fill(StoreQueueSize)(false.B)))
for (j <- 0 until StoreQueueSize) {
storeWritebackedVec(j) := datavalid(j) && allocated(j) // all datavalid terms need to be checked
......@@ -388,6 +388,16 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
// for 1 cycle will also promise that sq is empty in that cycle
io.sqempty := RegNext(enqPtrExt(0).value === deqPtrExt(0).value && enqPtrExt(0).flag === deqPtrExt(0).flag)
// perf counter
XSPerf("sqFull", !allowEnqueue, acc = true)
XSPerf("sqMmioCycle", uncacheState =/= s_idle, acc = true) // lq is busy dealing with uncache req
XSPerf("sqMmioCnt", io.uncache.req.fire(), acc = true)
XSPerf("sqWriteback", io.mmioStout.fire(), acc = true)
XSPerf("sqWbBlocked", io.mmioStout.valid && !io.mmioStout.ready, acc = true)
XSPerf("sqValidEntryCnt", distanceBetween(enqPtrExt(0), deqPtrExt(0)))
XSPerf("sqCmtEntryCnt", distanceBetween(cmtPtrExt(0), deqPtrExt(0)))
XSPerf("sqNCmtEntryCnt", distanceBetween(enqPtrExt(0), cmtPtrExt(0)))
// debug info
XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt(0).flag, deqPtr)
......
......@@ -13,7 +13,8 @@ class LoadToLsqIO extends XSBundle {
val loadIn = ValidIO(new LsPipelineBundle)
val ldout = Flipped(DecoupledIO(new ExuOutput))
val loadDataForwarded = Output(Bool())
val forward = new LoadForwardQueryIO
val needReplayFromRS = Output(Bool())
val forward = new MaskedLoadForwardQueryIO
}
// Load Pipeline Stage 0
......@@ -28,17 +29,15 @@ class LoadUnit_S0 extends XSModule {
})
val s0_uop = io.in.bits.uop
val s0_vaddr_old = io.in.bits.src1 + SignExt(ImmUnion.I.toImm32(s0_uop.ctrl.imm), XLEN)
// val s0_vaddr = io.in.bits.src1 + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits)
// val s0_mask = genWmask(s0_vaddr, s0_uop.ctrl.fuOpType(1,0))
val imm12 = WireInit(s0_uop.ctrl.imm(11,0))
val s0_vaddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12)
val s0_vaddr_hi = Mux(imm12(11),
Mux((s0_vaddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12)),
Mux((s0_vaddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12)+1.U, io.in.bits.src1(VAddrBits-1, 12))
val s0_vaddr_hi = Mux(s0_vaddr_lo(12),
Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+1.U),
Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), io.in.bits.src1(VAddrBits-1, 12)),
)
val s0_vaddr = Cat(s0_vaddr_hi, s0_vaddr_lo(11,0))
when(io.in.fire() && s0_vaddr(VAddrBits-1,0) =/= (io.in.bits.src1 + SignExt(ImmUnion.I.toImm32(s0_uop.ctrl.imm), XLEN))(VAddrBits-1,0)){
printf("s0_vaddr %x s0_vaddr_old %x\n", s0_vaddr, s0_vaddr_old(VAddrBits-1,0))
}
val s0_mask = genWmask(s0_vaddr_lo, s0_uop.ctrl.fuOpType(1,0))
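The split adder computes the 12-bit low sum and the high-half adjustment in parallel instead of one VAddrBits-wide carry chain: sign-extending imm12 adds -1 or 0 to the high half, plus the carry out of the low sum, so s0_vaddr_hi is src1_hi + {-1, 0, +1}. The rewrite only re-nests the Muxes so the late-arriving carry s0_vaddr_lo(12) drives the outer select. A plain-Scala check of the identity (sketch):
// Sketch: src1 + signext(imm12) == (hi' << 12) | lo[11:0] for the split form.
def splitAddOk(src1: Long, imm12: Int): Boolean = {   // imm12 in [-2048, 2047]
  val lo    = (src1 & 0xfff) + (imm12 & 0xfff)        // 13-bit low sum
  val carry = (lo >> 12) & 1
  val hiInc = (if (imm12 < 0) -1 else 0) + carry      // sign extension adds -1 << 12
  val hi    = (src1 >> 12) + hiInc
  ((hi << 12) | (lo & 0xfff)) == src1 + imm12
}
// e.g. splitAddOk(0x7ffffffL, -1) and splitAddOk(0xfffL, 1) both return true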
// query DTLB
......@@ -92,7 +91,7 @@ class LoadUnit_S1 extends XSModule {
val dcachePAddr = Output(UInt(PAddrBits.W))
val dcacheKill = Output(Bool())
val sbuffer = new LoadForwardQueryIO
val lsq = new LoadForwardQueryIO
val lsq = new MaskedLoadForwardQueryIO
})
val s1_uop = io.in.bits.uop
......@@ -122,6 +121,7 @@ class LoadUnit_S1 extends XSModule {
io.lsq.paddr := s1_paddr
io.lsq.uop := s1_uop
io.lsq.sqIdx := s1_uop.sqIdx
io.lsq.sqIdxMask := DontCare // will be overwritten by sqIdxMask pre-generated in s0
io.lsq.mask := s1_mask
io.lsq.pc := s1_uop.cf.pc // FIXME: remove it
......@@ -149,6 +149,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
val lsq = new LoadForwardQueryIO
val sbuffer = new LoadForwardQueryIO
val dataForwarded = Output(Bool())
val needReplayFromRS = Output(Bool())
})
val s2_uop = io.in.bits.uop
......@@ -168,10 +169,22 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
io.tlbFeedback.valid := io.in.valid
io.tlbFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio)
io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx
io.needReplayFromRS := s2_cache_replay
// merge forward result
// lsq has higher priority than sbuffer
val forwardMask = Wire(Vec(8, Bool()))
val forwardData = Wire(Vec(8, UInt(8.W)))
val forwardMask = io.out.bits.forwardMask
val forwardData = io.out.bits.forwardData
val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U
io.lsq := DontCare
io.sbuffer := DontCare
// generate XLEN/8 Muxs
for (i <- 0 until XLEN / 8) {
forwardMask(i) := io.lsq.forwardMask(i) || io.sbuffer.forwardMask(i)
forwardData(i) := Mux(io.lsq.forwardMask(i), io.lsq.forwardData(i), io.sbuffer.forwardData(i))
}
XSDebug(io.out.fire(), "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n",
s2_uop.cf.pc,
......@@ -180,8 +193,9 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
)
// data merge
val rdata = VecInit((0 until XLEN / 8).map(j =>
Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j)))).asUInt
val rdataVec = VecInit((0 until XLEN / 8).map(j =>
Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j))))
val rdata = rdataVec.asUInt
val rdataSel = LookupTree(s2_paddr(2, 0), List(
"b000".U -> rdata(63, 0),
"b001".U -> rdata(63, 8),
......@@ -194,9 +208,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
))
val rdataPartialLoad = rdataHelper(s2_uop, rdataSel)
// TODO: ECC check
io.out.valid := io.in.valid && !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception)
io.out.valid := io.in.valid && !s2_tlb_miss
// Inst will be canceled in store queue / lsq,
// so we do not need to care about flush in load / store unit's out.valid
io.out.bits := io.in.bits
......@@ -212,28 +224,16 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
// and dcache query is no longer needed.
// Such inst will be writebacked from load queue.
io.dataForwarded := s2_cache_miss && fullForward && !s2_exception
// io.out.bits.forwardX will be sent to lq
io.out.bits.forwardMask := forwardMask
// data retrieved from dcache is also included in io.out.bits.forwardData
io.out.bits.forwardData := rdataVec
io.in.ready := io.out.ready || !io.in.valid
// merge forward result
// lsq has higher priority than sbuffer
io.lsq := DontCare
io.sbuffer := DontCare
// generate XLEN/8 Muxs
for (i <- 0 until XLEN / 8) {
when (io.sbuffer.forwardMask(i)) {
io.out.bits.forwardMask(i) := true.B
io.out.bits.forwardData(i) := io.sbuffer.forwardData(i)
}
when (io.lsq.forwardMask(i)) {
io.out.bits.forwardMask(i) := true.B
io.out.bits.forwardData(i) := io.lsq.forwardData(i)
}
}
XSDebug(io.out.fire(), "[DCACHE LOAD RESP] pc %x rdata %x <- D$ %x + fwd %x(%b)\n",
s2_uop.cf.pc, rdataPartialLoad, io.dcacheResp.bits.data,
io.out.bits.forwardData.asUInt, io.out.bits.forwardMask.asUInt
forwardData.asUInt, forwardMask.asUInt
)
}
......@@ -271,13 +271,19 @@ class LoadUnit extends XSModule with HasLoadHelper {
PipelineConnect(load_s1.io.out, load_s2.io.in, true.B, load_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
load_s2.io.tlbFeedback <> io.tlbFeedback
load_s2.io.dcacheResp <> io.dcache.resp
load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData
load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask
load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData
load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask
load_s2.io.dataForwarded <> io.lsq.loadDataForwarded
io.tlbFeedback.bits := RegNext(load_s2.io.tlbFeedback.bits)
io.tlbFeedback.valid := RegNext(load_s2.io.tlbFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
io.lsq.needReplayFromRS := load_s2.io.needReplayFromRS
// pre-calculate the sqIdx mask in s0, then send it to lsq in s1 for forwarding
val sqIdxMaskReg = RegNext(UIntToMask(load_s0.io.in.bits.uop.sqIdx.value, StoreQueueSize))
io.lsq.forward.sqIdxMask := sqIdxMaskReg
// use s2_hit_way to select data received in s1
load_s2.io.dcacheResp.bits.data := Mux1H(io.dcache.s2_hit_way, RegNext(io.dcache.s1_data))
......@@ -317,19 +323,26 @@ class LoadUnit extends XSModule with HasLoadHelper {
io.ldout.bits := Mux(intHitLoadOut.valid, intHitLoadOut.bits, io.lsq.ldout.bits)
io.ldout.valid := intHitLoadOut.valid || io.lsq.ldout.valid && !refillFpLoad
// Fp load, if hit, will be sent to the recoder at s2, then recoded & written back at s3
// Fp load, if hit, will be stored to a reg at s2, then recoded at s3 and written back at s4
val fpHitLoadOut = Wire(Valid(new ExuOutput))
fpHitLoadOut.valid := s2_wb_valid && load_s2.io.out.bits.uop.ctrl.fpWen
fpHitLoadOut.bits := intHitLoadOut.bits
val fpLoadOut = Wire(Valid(new ExuOutput))
fpLoadOut.bits := Mux(fpHitLoadOut.valid, fpHitLoadOut.bits, io.lsq.ldout.bits)
fpLoadOut.valid := fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad
val fpLoadUnRecodedReg = Reg(Valid(new ExuOutput))
fpLoadUnRecodedReg.valid := fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad
when(fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad){
fpLoadUnRecodedReg.bits := Mux(fpHitLoadOut.valid, fpHitLoadOut.bits, io.lsq.ldout.bits)
}
val fpLoadOutReg = RegNext(fpLoadOut)
io.fpout.bits := fpLoadOutReg.bits
io.fpout.bits.data := fpRdataHelper(fpLoadOutReg.bits.uop, fpLoadOutReg.bits.data) // recode
io.fpout.valid := RegNext(fpLoadOut.valid)
val fpLoadRecodedReg = Reg(Valid(new ExuOutput))
when(fpLoadUnRecodedReg.valid){
fpLoadRecodedReg := fpLoadUnRecodedReg
fpLoadRecodedReg.bits.data := fpRdataHelper(fpLoadUnRecodedReg.bits.uop, fpLoadUnRecodedReg.bits.data) // recode
}
fpLoadRecodedReg.valid := fpLoadUnRecodedReg.valid
io.fpout.bits := fpLoadRecodedReg.bits
io.fpout.valid := fpLoadRecodedReg.valid
io.lsq.ldout.ready := Mux(refillFpLoad, !fpHitLoadOut.valid, !intHitLoadOut.valid)
......
......@@ -18,17 +18,14 @@ class StoreUnit_S0 extends XSModule {
})
// send req to dtlb
val saddr_old = io.in.bits.src1 + SignExt(ImmUnion.S.toImm32(io.in.bits.uop.ctrl.imm), XLEN)
// val saddr = io.in.bits.src1 + SignExt(io.in.bits.uop.ctrl.imm(11,0), VAddrBits)
val imm12 = WireInit(io.in.bits.uop.ctrl.imm(11,0))
val saddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12)
val saddr_hi = Mux(imm12(11),
Mux((saddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12)),
Mux((saddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12)+1.U, io.in.bits.src1(VAddrBits-1, 12))
val saddr_hi = Mux(saddr_lo(12),
Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+1.U),
Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), io.in.bits.src1(VAddrBits-1, 12)),
)
val saddr = Cat(saddr_hi, saddr_lo(11,0))
when(io.in.fire() && saddr(VAddrBits-1,0) =/= (io.in.bits.src1 + SignExt(ImmUnion.S.toImm32(io.in.bits.uop.ctrl.imm), XLEN))(VAddrBits-1,0)){
printf("saddr %x saddr_old %x\n", saddr, saddr_old(VAddrBits-1,0))
}
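// Same split-add address generation as LoadUnit_S0 above: the Mux re-nesting
// moves the late low-half carry saddr_lo(12) to the outer select.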
io.dtlbReq.bits.vaddr := saddr
io.dtlbReq.valid := io.in.valid
......
......@@ -129,6 +129,9 @@ class NewSbuffer extends XSModule with HasSbufferCst {
difftestIO <> DontCare
val buffer = Mem(StoreBufferSize, new SbufferLine)
val tag = Reg(Vec(StoreBufferSize, UInt(TagWidth.W)))
val mask = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val data = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) // TODO: will be replaced by SyncDataModuleTemplate
val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(s_invalid)))
val cohCount = Reg(Vec(StoreBufferSize, UInt(countBits.W)))
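// tag/mask/data now live in separate Regs so that merge, flush and forwarding
// can index fields directly, rather than staging a whole-SbufferLine
// read-modify-write through the bufferRead/bufferUpdate vectors removed below.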
/*
......@@ -165,30 +168,13 @@ class NewSbuffer extends XSModule with HasSbufferCst {
val validCount = RegInit(0.U((log2Up(StoreBufferSize) + 1).W))
val full = invalidCount === 0.U // full = TODO: validCount(log2Up(StoreBufferSize))
val bufferRead = VecInit((0 until StoreBufferSize).map(i => buffer(i)))
val stateRead = VecInit((0 until StoreBufferSize).map(i => stateVec(i)))
val dataRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))))
val maskRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))))
val tagRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).tag))
val dataUpdate = WireInit(dataRead)
val maskUpdate = WireInit(maskRead)
val tagUpdate = WireInit(tagRead)
val stateUpdate = WireInit(stateRead)
val bufferUpdate = Wire(Vec(StoreBufferSize, new SbufferLine))
(0 until StoreBufferSize) foreach { i =>
bufferUpdate(i).tag := tagUpdate(i)
bufferUpdate(i).data := dataUpdate(i).asUInt()
bufferUpdate(i).mask := maskUpdate(i).asUInt()
}
val lru = Module(new ChooseReplace(StoreBufferSize))
val evictionIdx = lru.io.way
lru.io.mask := stateRead.map(isValid(_))
lru.io.mask := stateVec.map(isValid(_))
val tags = io.in.map(in => getTag(in.bits.addr))
val sameTag = tags(0) === tags(1)
val intags = io.in.map(in => getTag(in.bits.addr))
val sameTag = intags(0) === intags(1)
val firstWord = getWord(io.in(0).bits.addr)
val secondWord = getWord(io.in(1).bits.addr)
val sameWord = firstWord === secondWord
......@@ -201,13 +187,14 @@ class NewSbuffer extends XSModule with HasSbufferCst {
for(i <- 0 until StorePipelineWidth){
mergeMask(i) := widthMap(j =>
Mux(tags(i) === tagRead(j) && isValid(stateRead(j)), true.B, false.B))
intags(i) === tag(j) && isValid(stateVec(j))
)
}
// insert condition
// firstInsert: the first invalid entry
// if the first entry can merge, or the second entry has the same tag as the first, secondInsert equals the first invalid entry; otherwise, the second invalid entry
val invalidMask = stateRead.map(s => isInvalid(s))
val invalidMask = stateVec.map(s => isInvalid(s))
val evenInvalidMask = GetEvenBits(VecInit(invalidMask).asUInt)
val oddInvalidMask = GetOddBits(VecInit(invalidMask).asUInt)
......@@ -232,27 +219,26 @@ class NewSbuffer extends XSModule with HasSbufferCst {
Mux(~enbufferSelReg, evenCanInsert, oddCanInsert)
)
io.in(0).ready := firstCanInsert || canMerge(0)
io.in(1).ready := (secondCanInsert || canMerge(1)) && !sameWord && io.in(0).ready
io.in(0).ready := firstCanInsert
io.in(1).ready := secondCanInsert && !sameWord && io.in(0).ready
def wordReqToBufLine(req: DCacheWordReq, tag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = {
stateUpdate(insertIdx) := s_valid
tagUpdate(insertIdx) := tag
def wordReqToBufLine(req: DCacheWordReq, reqtag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = {
stateVec(insertIdx) := s_valid
cohCount(insertIdx) := 0.U
tag(insertIdx) := reqtag
when(flushMask){
for(j <- 0 until CacheLineWords){
for(i <- 0 until DataBytes){
maskUpdate(insertIdx)(j)(i) := false.B
mask(insertIdx)(j)(i) := false.B
}
}
}
for(i <- 0 until DataBytes){
when(req.mask(i)){
maskUpdate(insertIdx)(wordOffset)(i) := true.B
dataUpdate(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
mask(insertIdx)(wordOffset)(i) := true.B
data(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
}
}
}
......@@ -261,8 +247,8 @@ class NewSbuffer extends XSModule with HasSbufferCst {
cohCount(mergeIdx) := 0.U
for(i <- 0 until DataBytes){
when(req.mask(i)){
maskUpdate(mergeIdx)(wordOffset)(i) := true.B
dataUpdate(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
mask(mergeIdx)(wordOffset)(i) := true.B
data(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
}
}
}
......@@ -273,7 +259,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
mergeWordReq(io.in(0).bits, mergeIdx(0), firstWord)
XSDebug(p"merge req 0 to line [${mergeIdx(0)}]\n")
}.otherwise{
wordReqToBufLine(io.in(0).bits, tags(0), firstInsertIdx, firstWord, true.B)
wordReqToBufLine(io.in(0).bits, intags(0), firstInsertIdx, firstWord, true.B)
XSDebug(p"insert req 0 to line[$firstInsertIdx]\n")
}
}
......@@ -284,19 +270,14 @@ class NewSbuffer extends XSModule with HasSbufferCst {
mergeWordReq(io.in(1).bits, mergeIdx(1), secondWord)
XSDebug(p"merge req 1 to line [${mergeIdx(1)}]\n")
}.otherwise{
wordReqToBufLine(io.in(1).bits, tags(1), secondInsertIdx, secondWord, !sameTag)
wordReqToBufLine(io.in(1).bits, intags(1), secondInsertIdx, secondWord, !sameTag)
XSDebug(p"insert req 1 to line[$secondInsertIdx]\n")
}
}
for(i <- 0 until StoreBufferSize){
buffer.write(i.U, bufferUpdate(i))
stateVec(i) := stateUpdate(i)
}
for(i <- 0 until StoreBufferSize){
XSDebug(stateVec(i)=/=s_invalid,
p"[$i] timeout:${cohCount(i)(countBits-1)} state:${stateVec(i)} buf:${bufferRead(i)}\n"
p"[$i] timeout:${cohCount(i)(countBits-1)} state:${stateVec(i)}\n"
)
}
......@@ -320,7 +301,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
do_eviction := validCount >= 12.U
io.flush.empty := empty && io.sqempty
io.flush.empty := RegNext(empty && io.sqempty)
lru.io.flush := sbuffer_state === x_drain_sbuffer && empty
switch(sbuffer_state){
is(x_idle){
......@@ -346,11 +327,11 @@ class NewSbuffer extends XSModule with HasSbufferCst {
XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n")
def noSameBlockInflight(idx: UInt): Bool = {
val tag = tagRead(idx)
val atag = tag(idx)
!Cat(widthMap(i => {
// stateVec(idx) itself must not be s_inflight*
(isInflight(stateRead(i)) || isPrepare(stateRead(i))) &&
tag === tagRead(i)
(isInflight(stateVec(i)) || isPrepare(stateVec(i))) &&
atag === tag(i)
})).orR()
}
......@@ -384,9 +365,9 @@ class NewSbuffer extends XSModule with HasSbufferCst {
dcacheReqValid := false.B
}
when(prepareEn && (!dcacheReqValid || io.dcache.req.fire())) {
dcacheCandidate.addr := getAddr(tagRead(prepareIdx))
dcacheCandidate.data := bufferRead(prepareIdx).data
dcacheCandidate.mask := bufferRead(prepareIdx).mask
dcacheCandidate.addr := getAddr(tag(prepareIdx))
dcacheCandidate.data := data(prepareIdx).asUInt
dcacheCandidate.mask := mask(prepareIdx).asUInt
dcacheCandidate.cmd := MemoryOpConstants.M_XWR
dcacheCandidate.id := prepareIdx
stateVec(prepareIdx) := s_inflight
......@@ -411,9 +392,9 @@ class NewSbuffer extends XSModule with HasSbufferCst {
if (!env.FPGAPlatform) {
difftestIO.sbufferResp := WireInit(io.dcache.resp.fire())
difftestIO.sbufferAddr := WireInit(getAddr(tagRead(respId)))
difftestIO.sbufferData := WireInit(bufferRead(respId).data.asTypeOf(Vec(CacheLineBytes, UInt(8.W))))
difftestIO.sbufferMask := WireInit(bufferRead(respId).mask)
difftestIO.sbufferAddr := WireInit(getAddr(tag(respId)))
difftestIO.sbufferData := WireInit(data(respId).asTypeOf(Vec(CacheLineBytes, UInt(8.W))))
difftestIO.sbufferMask := WireInit(mask(respId).asUInt)
}
val needSpace = (io.in(0).fire && !canMerge(0)) +& (io.in(1).fire && !canMerge(1) && !sameTag)
......@@ -431,7 +412,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
when(isValid(stateVec(i))){
when(cohCount(i)(countBits-1)){
assert(stateVec(i) === s_valid)
stateUpdate(i) := s_prepare
stateVec(i) := s_prepare
}
cohCount(i) := cohCount(i)+1.U
}
......@@ -440,7 +421,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
// ---------------------- Load Data Forward ---------------------
for ((forward, i) <- io.forward.zipWithIndex) {
val tag_matches = widthMap(i => tagRead(i) === getTag(forward.paddr))
val tag_matches = widthMap(i => tag(i) === getTag(forward.paddr))
val valid_tag_matches = widthMap(i => tag_matches(i) && isValid(stateVec(i)))
val inflight_tag_matches = widthMap(i =>
tag_matches(i) && (isInflight(stateVec(i)) || isPrepare(stateVec(i)))
......@@ -451,13 +432,11 @@ class NewSbuffer extends XSModule with HasSbufferCst {
val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_))
val line_offset_reg = RegNext(line_offset_mask)
val selectedValidLine = Mux1H(valid_tag_match_reg, bufferRead)
val selectedValidMask = Mux1H(line_offset_reg, selectedValidLine.mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedValidData = Mux1H(line_offset_reg, selectedValidLine.data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedValidMask = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedValidData = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedInflightLine = Mux1H(inflight_tag_match_reg, bufferRead)
val selectedInflightMask = Mux1H(line_offset_reg, selectedInflightLine.mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedInflightData = Mux1H(line_offset_reg, selectedInflightLine.data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
for (j <- 0 until DataBytes) {
forward.forwardMask(j) := false.B
......
......@@ -148,7 +148,7 @@ package object xiangshan {
def configable_cache(mode: UInt) = mode(7)
def strToMode(s: String) = {
var result = 0.U << 8
var result = 0.U(8.W)
if (s.toUpperCase.indexOf("R") >= 0) result = result + R
if (s.toUpperCase.indexOf("W") >= 0) result = result + W
if (s.toUpperCase.indexOf("X") >= 0) result = result + X
......
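A note on the strToMode fix above: in Chisel, a static left shift widens the result, so 0.U << 8 is a 9-bit zero (width(x << n) = width(x) + n, and 0.U is 1 bit wide), not the intended 8-bit RWXIDSAC vector; 0.U(8.W) pins the width explicitly. A width sketch:
// Sketch: width inference for the two initializers.
val a = 0.U << 8    // a.getWidth == 9
val b = 0.U(8.W)    // b.getWidth == 8  (intended mode-vector width)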