Unverified commit e94eb96f. Author: W William Wang. Committer: GitHub.

Merge pull request #570 from RISCVERS/mem-timing

Opt memblock timing, dcache timing ignored for now
@@ -3,7 +3,7 @@ NANOS_HOME ?= $(AM_HOME)/../nanos-lite
 SINGLETEST = ALL=min3
 B ?= 0
-E ?= -1
+E ?= 0
 V ?= OFF
 #V ?= OFF
 EMU_ARGS = B=$(B) E=$(E) V=$(V)
@@ -18,7 +18,7 @@ cache:
 	#2>&1 | tee > loader.log
 cpu:
-	$(MAKE) -C $(AM_HOME)/tests/cputest $(ARCH) ALL=dummy $(EMU_ARGS) run 2>&1 | tee > dummy.log
+	$(MAKE) -C $(AM_HOME)/tests/cputest $(ARCH) ALL=dummy $(EMU_ARGS) run 2>&1
 # ------------------------------------------------------------------
 # run different test sets
......
@@ -68,4 +68,8 @@ trait HasCircularQueuePtrHelper {
   def isAfter[T <: CircularQueuePtr](left: T, right: T): Bool = {
     Mux(left.flag === right.flag, left.value > right.value, left.value < right.value)
   }
+
+  def isBefore[T <: CircularQueuePtr](left: T, right: T): Bool = {
+    Mux(left.flag === right.flag, left.value < right.value, left.value > right.value)
+  }
 }
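isAfter/isBefore order two pointers into the same circular queue: the flag bit toggles on every wrap-around, so equal flags mean both pointers are in the same lap and their values compare directly, while different flags mean the pointer with the smaller value has already wrapped and is therefore ahead. A minimal self-contained sketch of the same idea (the types below are simplified stand-ins, not the XiangShan originals):

import chisel3._
import chisel3.util._

// Simplified stand-in for CircularQueuePtr: `flag` toggles on every
// wrap-around, so equal flags mean both pointers are in the same lap.
class SimplePtr(entries: Int) extends Bundle {
  val flag  = Bool()
  val value = UInt(log2Up(entries).W)
}

class PtrOrder(entries: Int) extends Module {
  val io = IO(new Bundle {
    val left    = Input(new SimplePtr(entries))
    val right   = Input(new SimplePtr(entries))
    val isAfter = Output(Bool())
  })
  // Same lap: plain value compare. Different lap: the smaller value is
  // one lap ahead, so the comparison inverts.
  io.isAfter := Mux(io.left.flag === io.right.flag,
    io.left.value > io.right.value,
    io.left.value < io.right.value)
}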
@@ -17,7 +17,25 @@ object MemMap {
 }

 object AddressSpace {
-  def MemMapList = List(
+  def SimpleMemMapList = List(
+    //     Base address      Top address      Width  Description   Mode (RWXIDSAC)
+    MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved",   ""),
+    MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"),
+    MemMap("h00_2000_0000", "h00_2FFF_FFFF", "h0", "Reserved",   ""),
+    MemMap("h00_3000_0000", "h00_3000_FFFF", "h0", "DMA",        "RW"),
+    MemMap("h00_3001_0000", "h00_3004_FFFF", "h0", "GPU",        "RWC"),
+    MemMap("h00_3005_0000", "h00_3006_FFFF", "h0", "USB/SDMMC",  "RW"),
+    MemMap("h00_3007_0000", "h00_30FF_FFFF", "h0", "Reserved",   ""),
+    MemMap("h00_3100_0000", "h00_3111_FFFF", "h0", "MMIO",       "RW"),
+    MemMap("h00_3112_0000", "h00_37FF_FFFF", "h0", "Reserved",   ""),
+    MemMap("h00_3800_0000", "h00_3800_FFFF", "h0", "CLINT",      "RW"),
+    MemMap("h00_3801_0000", "h00_3BFF_FFFF", "h0", "Reserved",   ""),
+    MemMap("h00_3C00_0000", "h00_3FFF_FFFF", "h0", "PLIC",       "RW"),
+    MemMap("h00_4000_0000", "h00_7FFF_FFFF", "h0", "PCIe",       "RW"),
+    MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR",        "RWXIDSA"),
+  )
+
+  def FullMemMapList = List(
     //     Base address      Top address      Width  Description   Mode (RWXIDSAC)
     MemMap("h00_0000_0000", "h00_0FFF_FFFF", "h0", "Reserved",   ""),
     MemMap("h00_1000_0000", "h00_1FFF_FFFF", "h0", "QSPI_Flash", "RX"),
@@ -55,16 +73,42 @@ object AddressSpace {
     MemMap("h00_8000_0000", "h1F_FFFF_FFFF", "h0", "DDR",        "RWXIDSA"),
   )

+  def MemMapList = SimpleMemMapList
+
   def printMemmap(){
     println("-------------------- memory map --------------------")
     for(i <- MemMapList){
-      println(i._1._1 + "->" + i._1._2 + " width " + (if(i._2.get("width").get == "0") "unlimited" else i._2.get("width").get) + " " + i._2.get("description").get + " [" + i._2.get("mode").get + "]")
+      println("[" + i._1._1 + " -> " + i._1._2 + "] Width:" + (if(i._2.get("width").get == "h0") "unlimited" else i._2.get("width").get) + " Description:" + i._2.get("description").get + " [" + i._2.get("mode").get + "]")
     }
     println("----------------------------------------------------")
   }

+  def checkMemmap(){
+    for(i <- MemMapList){
+      // pma mode check
+      val s = i._2.get("mode").get
+      if(
+        s.toUpperCase.indexOf("A") >= 0 &&
+        !(s.toUpperCase.indexOf("R") >= 0 && s.toUpperCase.indexOf("W") >= 0)
+      ){
+        println("[error] pma atomicable area must be both readable and writeable")
+        throw new IllegalArgumentException
+      }
+      // pma area size check
+      if(!i._1._1.endsWith("000") || !i._1._2.endsWith("FFF")){
+        println("[error] pma area must be larger than 4KB")
+        throw new IllegalArgumentException()
+      }
+    }
+  }
+
   def genMemmapMatchVec(addr: UInt): UInt = {
     VecInit(MemMapList.map(i => {
+      // calculate addr tag and compare mask
+      // val mask = i._1._2.U - i._1._1.U
+      // (~(i._1._1.U ^ addr) | mask).andR
+
+      // pma is not current critical path, use simple compare for now
       i._1._1.U <= addr && addr < i._1._2.U
     }).toSeq).asUInt
   }
@@ -75,6 +119,30 @@ object AddressSpace {
     }).toSeq))
   }

+  // TODO: FIXME
+  def queryModeFast(matchVec: UInt): UInt = {
+    var r = WireInit(false.B)
+    var w = WireInit(false.B)
+    var x = WireInit(false.B)
+    var i = WireInit(false.B)
+    var d = WireInit(false.B)
+    var s = WireInit(false.B)
+    var a = WireInit(false.B)
+    var c = WireInit(false.B)
+    for((j, idx) <- MemMapList.zipWithIndex){
+      val modes = j._2.get("mode").get
+      if (modes.toUpperCase.indexOf("R") >= 0) r = r || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("W") >= 0) w = w || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("X") >= 0) x = x || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("I") >= 0) i = i || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("D") >= 0) d = d || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("S") >= 0) s = s || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("A") >= 0) a = a || matchVec(idx).asBool
+      if (modes.toUpperCase.indexOf("C") >= 0) c = c || matchVec(idx).asBool
+    }
+    VecInit(Seq(r, w, x, i, d, s, a, c)).asUInt
+  }
+
   def queryWidth(matchVec: UInt): UInt = {
     Mux1H(matchVec, VecInit(MemMapList.map(i => {
       i._2.get("width").get.U
@@ -83,7 +151,11 @@ object AddressSpace {
   def memmapAddrMatch(addr: UInt): (UInt, UInt) = {
     val matchVec = genMemmapMatchVec(addr)
-    (queryMode(matchVec), queryWidth(matchVec))
+    // when(queryMode(matchVec) =/= queryModeFast(matchVec)){
+    //   printf("pma fail: right %b wrong %b\n", queryMode(matchVec), queryModeFast(matchVec))
+    // }
+    assert(queryMode(matchVec) === queryModeFast(matchVec))
+    (queryModeFast(matchVec), queryWidth(matchVec))
   }

   def isDMMIO(addr: UInt): Bool = !PMAMode.dcache(memmapAddrMatch(addr)._1)
......
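queryModeFast replaces the Mux1H-style table lookup in queryMode with per-attribute OR trees built at elaboration time: for each PMA attribute it ORs only the match lines of regions that statically grant that attribute, so no per-access mode decoding remains on the critical path. A self-contained sketch of the same elaboration-time OR-reduction over a toy region list (the two-attribute table below is invented):

import chisel3._
import chisel3.util._

class FastModeQuery extends Module {
  // Elaboration-time region attributes, one mode string per region.
  val regionModes = Seq("RW", "R", "", "RW")
  val io = IO(new Bundle {
    val matchVec = Input(UInt(regionModes.size.W))
    val r = Output(Bool())
    val w = Output(Bool())
  })
  // For each attribute, OR together the match lines of every region that
  // grants it; regions that never grant it contribute nothing at all.
  io.r := regionModes.zipWithIndex.collect {
    case (m, i) if m.contains("R") => io.matchVec(i)
  }.foldLeft(false.B)(_ || _)
  io.w := regionModes.zipWithIndex.collect {
    case (m, i) if m.contains("W") => io.matchVec(i)
  }.foldLeft(false.B)(_ || _)
}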
@@ -377,6 +377,7 @@ class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer)
   trapIO <> DontCare

   println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}")
+  AddressSpace.checkMemmap()
   AddressSpace.printMemmap()

   // to fast wake up fp, mem rs
......
@@ -239,6 +239,7 @@ class MemBlockImp
     lsq.io.loadIn(i) <> loadUnits(i).io.lsq.loadIn
     lsq.io.ldout(i) <> loadUnits(i).io.lsq.ldout
     lsq.io.loadDataForwarded(i) <> loadUnits(i).io.lsq.loadDataForwarded
+    lsq.io.needReplayFromRS(i) <> loadUnits(i).io.lsq.needReplayFromRS
   }

   // StoreUnit
@@ -274,8 +275,11 @@ class MemBlockImp
   lsq.io.brqRedirect <> io.fromCtrlBlock.redirect
   lsq.io.flush <> io.fromCtrlBlock.flush
   io.toCtrlBlock.replay <> lsq.io.rollback
-  lsq.io.dcache <> dcache.io.lsu.lsq
   lsq.io.uncache <> uncache.io.lsq
+  // delay dcache refill for 1 cycle for better timing
+  // TODO: remove RegNext after fixing refill paddr timing
+  // lsq.io.dcache <> dcache.io.lsu.lsq
+  lsq.io.dcache := RegNext(dcache.io.lsu.lsq)

   // LSQ to store buffer
   lsq.io.sbuffer <> sbuffer.io.in
@@ -283,6 +287,9 @@ class MemBlockImp

   // Sbuffer
   sbuffer.io.dcache <> dcache.io.lsu.store
+  sbuffer.io.dcache.resp.valid := RegNext(dcache.io.lsu.store.resp.valid)
+  sbuffer.io.dcache.resp.bits := RegNext(dcache.io.lsu.store.resp.bits)
+  assert(sbuffer.io.dcache.resp.ready === true.B)

   // flush sbuffer
   val fenceFlush = io.fenceToSbuffer.flushSb
......
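Two of the timing fixes above use the same pattern: register a valid+bits channel for one cycle to cut a long combinational path (the dcache refill into the LSQ, and the dcache store response into the sbuffer). This is only safe on ValidIO-style channels with no ready back-pressure, which is why the sbuffer change asserts that resp.ready is always true. A minimal sketch with an invented payload bundle:

import chisel3._
import chisel3.util._

// Toy payload standing in for the dcache refill / response bundle.
class RefillLike extends Bundle {
  val addr = UInt(36.W)
  val data = UInt(64.W)
}

class DelayValidByOne extends Module {
  val io = IO(new Bundle {
    val in  = Flipped(ValidIO(new RefillLike))
    val out = ValidIO(new RefillLike)
  })
  // Registering both valid and bits adds one cycle of latency but ends the
  // producer's combinational path here. No handshake is lost because ValidIO
  // has no ready signal; a real design may want RegNext(io.in.valid, false.B)
  // to keep valid deasserted out of reset.
  io.out := RegNext(io.in)
}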
@@ -300,7 +300,9 @@ class ReservationStationSelect
     if (feedback) {
       when (io.memfeedback.valid) {
-        stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay)
+        when (stateQueue(io.memfeedback.bits.rsIdx) === s_wait) {
+          stateQueue(io.memfeedback.bits.rsIdx) := Mux(io.memfeedback.bits.hit, s_idle, s_replay)
+        }
         when (!io.memfeedback.bits.hit) {
           countQueue(io.memfeedback.bits.rsIdx) := replayDelay(cntCountQueue(io.memfeedback.bits.rsIdx))
         }
......
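The new s_wait guard makes the feedback handler state-dependent: feedback may only move an entry that is actually waiting for it, so stale or duplicated feedback for an entry that has already been re-issued or freed is ignored. A stand-alone sketch of the guard pattern (state and signal names invented):

import chisel3._
import chisel3.util._

class GuardedFsm(entries: Int) extends Module {
  val sIdle :: sWait :: sReplay :: Nil = Enum(3)
  val io = IO(new Bundle {
    val fbValid = Input(Bool())
    val fbHit   = Input(Bool())
    val fbIdx   = Input(UInt(log2Up(entries).W))
  })
  val stateQueue = RegInit(VecInit(Seq.fill(entries)(sIdle)))
  when (io.fbValid) {
    // Only entries still waiting for feedback may change state; late or
    // duplicated feedback for entries in other states is dropped.
    when (stateQueue(io.fbIdx) === sWait) {
      stateQueue(io.fbIdx) := Mux(io.fbHit, sIdle, sReplay)
    }
  }
}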
@@ -71,6 +71,7 @@ class PtePermBundle extends TlbBundle {
 class TlbPermBundle extends TlbBundle {
   val pf = Bool() // NOTE: if this is true, just raise pf
+  // pagetable perm (software defined)
   val d = Bool()
   val a = Bool()
   val g = Bool()
@@ -78,13 +79,14 @@ class TlbPermBundle extends TlbBundle {
   val x = Bool()
   val w = Bool()
   val r = Bool()
-  // pma perm check
-  // val at = Bool() // Access Type
-  // val as = Bool() // Atomic Swap
-  // val al = Bool() // Atomic Logical
-  // val aa = Bool() // Atomic Arithmetic
-  // TODO: add pma check
+  // pma perm (hardwired)
+  val pr = Bool() // readable
+  val pw = Bool() // writeable
+  val pe = Bool() // executable
+  val pa = Bool() // atomic op permitted
+  val pi = Bool() // icacheable
+  val pd = Bool() // dcacheable

   override def toPrintable: Printable = {
     p"pf:${pf} d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}"
   }
@@ -172,6 +174,8 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle {
     this.tag := vpn
     this.level.map(_ := level(0))
     this.data.ppn := ppn
+
+    // refill pagetable perm
     val ptePerm = perm.asTypeOf(new PtePermBundle)
     this.data.perm.pf:= pf
     this.data.perm.d := ptePerm.d
@@ -182,6 +186,15 @@ class TlbEntry(superpage: Boolean = false) extends TlbBundle {
     this.data.perm.w := ptePerm.w
     this.data.perm.r := ptePerm.r

+    // get pma perm
+    val (pmaMode, accessWidth) = AddressSpace.memmapAddrMatch(Cat(ppn, 0.U(12.W)))
+    this.data.perm.pr := PMAMode.read(pmaMode)
+    this.data.perm.pw := PMAMode.write(pmaMode)
+    this.data.perm.pe := PMAMode.execute(pmaMode)
+    this.data.perm.pa := PMAMode.atomic(pmaMode)
+    this.data.perm.pi := PMAMode.icache(pmaMode)
+    this.data.perm.pd := PMAMode.dcache(pmaMode)
+
     this
   }
@@ -421,11 +434,22 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{
     resp(i).bits.excp.pf.st := stPf || update
     resp(i).bits.excp.pf.instr := instrPf || update

+    // if vmEnable, use the pre-calculated pma check result
+    resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !perm.pi, !perm.pd)
+    resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !perm.pa, !perm.pr) && TlbCmd.isRead(cmdReg)
+    resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !perm.pa, !perm.pw) && TlbCmd.isWrite(cmdReg)
+    resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !perm.pe)
+
+    // if !vmEnable, check pma
     val (pmaMode, accessWidth) = AddressSpace.memmapAddrMatch(resp(i).bits.paddr)
-    resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode))
-    resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg)
-    resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg)
-    resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode))
+    when(!vmEnable){
+      resp(i).bits.mmio := Mux(TlbCmd.isExec(cmdReg), !PMAMode.icache(pmaMode), !PMAMode.dcache(pmaMode))
+      resp(i).bits.excp.af.ld := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.read(pmaMode)) && TlbCmd.isRead(cmdReg)
+      resp(i).bits.excp.af.st := Mux(TlbCmd.isAtom(cmdReg), !PMAMode.atomic(pmaMode), !PMAMode.write(pmaMode)) && TlbCmd.isWrite(cmdReg)
+      resp(i).bits.excp.af.instr := Mux(TlbCmd.isAtom(cmdReg), false.B, !PMAMode.execute(pmaMode))
+    }
+    // TODO: MMIO check

     (hit, miss, pfHitVec, multiHit)
   }
......
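The TLB change exploits the fact that PMA attributes depend only on the physical page: the memory-map match can be done once at refill time and stored as six extra permission bits (pr/pw/pe/pa/pi/pd), so the per-access path reads stored bits when vmEnable is set and falls back to a live PMA match only when translation is off. A stripped-down sketch of pre-computing a PMA bit at refill (the region table and field names below are illustrative, not the XiangShan map):

import chisel3._
import chisel3.util._

// A toy PMA table: physical page number ranges and a cacheability attribute.
object ToyPma {
  val regions = Seq(
    // (basePpn, topPpn, dcacheable)
    (0x00000L, 0x80000L, false), // below DDR: uncached / MMIO
    (0x80000L, 0x200000L, true)  // DDR: cacheable
  )
  // Combinational lookup, instantiated once per refill, not once per access.
  def dcacheable(ppn: UInt): Bool =
    regions.map { case (lo, hi, c) =>
      (ppn >= lo.U && ppn < hi.U) && c.B
    }.reduce(_ || _)
}

class ToyTlbEntry extends Bundle {
  val ppn = UInt(24.W)
  val pd  = Bool() // pre-computed "dcacheable" PMA bit
}

class RefillPma extends Module {
  val io = IO(new Bundle {
    val refillPpn = Input(UInt(24.W))
    val entry     = Output(new ToyTlbEntry)
  })
  io.entry.ppn := io.refillPpn
  // The PMA check is folded into refill: the per-access path later reads a
  // single stored bit instead of matching the whole memory map.
  io.entry.pd := ToyPma.dcacheable(io.refillPpn)
}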
package xiangshan.mem

import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import xiangshan.cache._

class MaskedSyncDataModuleTemplate[T <: Data](gen: T, numEntries: Int, numRead: Int, numWrite: Int, numMRead: Int = 0, numMWrite: Int = 0) extends XSModule with HasDCacheParameters {
  val io = IO(new Bundle {
    // address indexed sync read
    val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
    val rdata = Output(Vec(numRead, gen))
    // masked sync read (1H)
    val mrmask = Input(Vec(numMRead, Vec(numEntries, Bool())))
    val mrdata = Output(Vec(numMRead, gen))
    // address indexed write
    val wen   = Input(Vec(numWrite, Bool()))
    val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
    val wdata = Input(Vec(numWrite, gen))
    // masked write
    val mwmask = Input(Vec(numMWrite, Vec(numEntries, Bool())))
    val mwdata = Input(Vec(numMWrite, gen))
  })

  val data = Reg(Vec(numEntries, gen))

  // read ports
  for (i <- 0 until numRead) {
    io.rdata(i) := data(RegNext(io.raddr(i)))
  }

  // masked read ports
  for (i <- 0 until numMRead) {
    io.mrdata(i) := Mux1H(RegNext(io.mrmask(i)), data)
  }

  // write ports (with priorities)
  for (i <- 0 until numWrite) {
    when (io.wen(i)) {
      data(io.waddr(i)) := io.wdata(i)
    }
  }

  // masked write
  for (j <- 0 until numEntries) {
    val wen = VecInit((0 until numMWrite).map(i => io.mwmask(i)(j))).asUInt.orR
    when (wen) {
      data(j) := VecInit((0 until numMWrite).map(i => {
        Mux(io.mwmask(i)(j), io.mwdata(i), 0.U).asUInt
      })).reduce(_ | _)
    }
  }

  // DataModuleTemplate should not be used when there're any write conflicts
  for (i <- 0 until numWrite) {
    for (j <- i+1 until numWrite) {
      assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
    }
  }
}
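A hedged usage sketch for the template above: one addressed read/write port plus two masked write ports over a 64-entry byte array. The wrapper, widths, and port counts are invented for illustration (in this PR the template is actually instantiated inside CoredataModule with numMWrite = blockWords), and XSModule's configuration is assumed to be in scope:

import chisel3._
import chisel3.util._

class ByteArrayExample extends Module {
  val io = IO(new Bundle {
    val readIndex   = Input(UInt(6.W))
    val readByte    = Output(UInt(8.W))
    val writeEnable = Input(Bool())
    val writeIndex  = Input(UInt(6.W))
    val writeByte   = Input(UInt(8.W))
    val refillMask  = Input(Vec(2, Vec(64, Bool())))
    val refillByte  = Input(Vec(2, UInt(8.W)))
  })
  val byteArray = Module(new MaskedSyncDataModuleTemplate(
    UInt(8.W), numEntries = 64, numRead = 1, numWrite = 1, numMWrite = 2))
  // Addressed sync read: rdata is valid one cycle after raddr is presented.
  byteArray.io.raddr(0) := io.readIndex
  io.readByte := byteArray.io.rdata(0)
  // Addressed write takes effect at the next clock edge.
  byteArray.io.wen(0)   := io.writeEnable
  byteArray.io.waddr(0) := io.writeIndex
  byteArray.io.wdata(0) := io.writeByte
  // Masked write: each port drives a per-entry one-hot mask, so a refill
  // can update many entries in a single cycle.
  byteArray.io.mwmask := io.refillMask
  byteArray.io.mwdata := io.refillByte
}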
@@ -60,3 +60,18 @@ class LoadForwardQueryIO extends XSBundle {
   // val lqIdx = Output(UInt(LoadQueueIdxWidth.W))
   val sqIdx = Output(new SqPtr)
 }
+
+class MaskedLoadForwardQueryIO extends XSBundle {
+  val paddr = Output(UInt(PAddrBits.W))
+  val mask = Output(UInt(8.W))
+  val uop = Output(new MicroOp) // for replay
+  val pc = Output(UInt(VAddrBits.W)) // for debug
+  val valid = Output(Bool()) // for debug
+
+  val forwardMask = Input(Vec(8, Bool()))
+  val forwardData = Input(Vec(8, UInt(8.W)))
+  val sqIdx = Output(new SqPtr) // for debug
+
+  // sqIdxMask is calculated in an earlier stage for better timing
+  val sqIdxMask = Output(UInt(StoreQueueSize.W))
+}
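sqIdxMask is a bits-below-index mask of the store queue pointer, computed one stage early so the store queue's forwarding range check becomes a plain wide AND instead of comparator logic. A minimal sketch of such a mask generator; it mirrors what a helper like utils.UIntToMask provides, but the implementation here is illustrative:

import chisel3._
import chisel3.util._

object ToyUIntToMask {
  // Returns a mask with bits [ptr-1:0] set, e.g. ptr=3, width=8 -> 0b00000111.
  // Shift-and-subtract keeps this a single adder-sized structure.
  def apply(ptr: UInt, width: Int): UInt =
    ((1.U << ptr) - 1.U)(width - 1, 0)
}

class MaskExample extends Module {
  val io = IO(new Bundle {
    val sqIdx     = Input(UInt(5.W))   // index into a 32-entry store queue
    val sqIdxMask = Output(UInt(32.W))
  })
  // In the real pipeline this value is computed in s0 and registered, so the
  // consumer in s1 needs only an AND against per-entry valid bits.
  io.sqIdxMask := ToyUIntToMask(io.sqIdx, 32)
}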
@@ -43,10 +43,11 @@ class LsqWrappper extends XSModule with HasDCacheParameters {
     val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle)))
     val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
     val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool()))
+    val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool()))
     val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
     val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load
     val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
-    val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
+    val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
     val roq = Flipped(new RoqLsqIO)
     val rollback = Output(Valid(new Redirect))
     val dcache = Flipped(ValidIO(new Refill))
@@ -94,6 +95,7 @@ class LsqWrappper extends XSModule with HasDCacheParameters {
   loadQueue.io.loadIn <> io.loadIn
   loadQueue.io.storeIn <> io.storeIn
   loadQueue.io.loadDataForwarded <> io.loadDataForwarded
+  loadQueue.io.needReplayFromRS <> io.needReplayFromRS
   loadQueue.io.ldout <> io.ldout
   loadQueue.io.roq <> io.roq
   loadQueue.io.rollback <> io.rollback
......
@@ -68,8 +68,9 @@ class LoadQueue extends XSModule
     val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle)))
     val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
     val loadDataForwarded = Vec(LoadPipelineWidth, Input(Bool()))
+    val needReplayFromRS = Vec(LoadPipelineWidth, Input(Bool()))
     val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback int load
-    val load_s1 = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
+    val load_s1 = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
     val roq = Flipped(new RoqLsqIO)
     val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store
     val dcache = Flipped(ValidIO(new Refill))
@@ -81,7 +82,7 @@ class LoadQueue extends XSModule
   // val data = Reg(Vec(LoadQueueSize, new LsRoqEntry))
   val dataModule = Module(new LoadQueueData(LoadQueueSize, wbNumRead = LoadPipelineWidth, wbNumWrite = LoadPipelineWidth))
   dataModule.io := DontCare
-  val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1, numWrite = LoadPipelineWidth))
+  val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1, numWrite = LoadPipelineWidth))
   vaddrModule.io := DontCare
   val allocated = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // lq entry has been allocated
   val datavalid = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // data is valid
@@ -144,7 +145,7 @@ class LoadQueue extends XSModule
    */
   for (i <- 0 until LoadPipelineWidth) {
     dataModule.io.wb.wen(i) := false.B
-    vaddrModule.io.wen(i) := false.B
+    val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
     when(io.loadIn(i).fire()) {
       when(io.loadIn(i).bits.miss) {
         XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x\n",
@@ -170,8 +171,9 @@ class LoadQueue extends XSModule
         io.loadIn(i).bits.forwardMask.asUInt,
         io.loadIn(i).bits.mmio
       )}
-      val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
-      datavalid(loadWbIndex) := (!io.loadIn(i).bits.miss || io.loadDataForwarded(i)) && !io.loadIn(i).bits.mmio
+      datavalid(loadWbIndex) := (!io.loadIn(i).bits.miss || io.loadDataForwarded(i)) &&
+        !io.loadIn(i).bits.mmio && // mmio data is not valid until the uncache access finishes
+        !io.needReplayFromRS(i) // do not writeback if that inst will be re-sent from rs
       writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio

       val loadWbData = Wire(new LQDataEntry)
@@ -182,18 +184,19 @@ class LoadQueue extends XSModule
       dataModule.io.wbWrite(i, loadWbIndex, loadWbData)
       dataModule.io.wb.wen(i) := true.B

-      vaddrModule.io.waddr(i) := loadWbIndex
-      vaddrModule.io.wdata(i) := io.loadIn(i).bits.vaddr
-      vaddrModule.io.wen(i) := true.B

       debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio
       debug_paddr(loadWbIndex) := io.loadIn(i).bits.paddr

       val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
-      miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i)
+      miss(loadWbIndex) := dcacheMissed && !io.loadDataForwarded(i) && !io.needReplayFromRS(i)
       pending(loadWbIndex) := io.loadIn(i).bits.mmio
       uop(loadWbIndex).debugInfo.issueTime := io.loadIn(i).bits.uop.debugInfo.issueTime
     }
+    // vaddrModule write is delayed, as vaddrModule will not be read right after write
+    vaddrModule.io.waddr(i) := RegNext(loadWbIndex)
+    vaddrModule.io.wdata(i) := RegNext(io.loadIn(i).bits.vaddr)
+    vaddrModule.io.wen(i) := RegNext(io.loadIn(i).fire())
   }

   when(io.dcache.valid) {
@@ -361,11 +364,25 @@ class LoadQueue extends XSModule
    *   Generate match vector for store address with rangeMask(stPtr, enqPtr).
    *   Besides, load instructions in LoadUnit_S1 and S2 are also checked.
    * Cycle 1: Redirect Generation
-   *   There're three possible types of violations. Choose the oldest load.
-   *   Prepare redirect request according to the detected violation.
-   * Cycle 2: Redirect Fire
+   *   There're three possible types of violations, up to 6 possible redirect requests.
+   *   Choose the oldest load (part 1). (4 + 2) -> (1 + 2)
+   * Cycle 2: Redirect Fire
+   *   Choose the oldest load (part 2). (3 -> 1)
+   *   Prepare redirect request according to the detected violation.
    *   Fire redirect request (if valid)
    */
+
+  // stage 0:        lq l1 wb     l1 wb lq
+  //                 |  |  |      |  |  |  (paddr match)
+  // stage 1:        lq l1 wb     l1 wb lq
+  //                 |  |  |      |  |  |
+  //                 |  |------------|  |
+  //                 |        |       |
+  // stage 2:        lq      l1wb    lq
+  //                 |        |       |
+  //                 --------------------
+  //                          |
+  //                      rollback req
   io.load_s1 := DontCare
   def detectRollback(i: Int) = {
     val startIndex = io.storeIn(i).bits.uop.lqIdx.value
@@ -413,18 +430,9 @@ class LoadQueue extends XSModule
     val l1ViolationUop = getOldestInTwo(l1ViolationVec, RegNext(VecInit(io.load_s1.map(_.uop))))
     XSDebug(l1Violation, p"${Binary(Cat(l1ViolationVec))}, $l1ViolationUop\n")

-    val rollbackValidVec = Seq(lqViolation, wbViolation, l1Violation)
-    val rollbackUopVec = Seq(lqViolationUop, wbViolationUop, l1ViolationUop)
-    val mask = getAfterMask(rollbackValidVec, rollbackUopVec)
-    val oneAfterZero = mask(1)(0)
-    val rollbackUop = Mux(oneAfterZero && mask(2)(0),
-      rollbackUopVec(0),
-      Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2)))
-
     XSDebug(
       l1Violation,
-      "need rollback (l4 load) pc %x roqidx %d target %x\n",
+      "need rollback (l1 load) pc %x roqidx %d target %x\n",
       io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, l1ViolationUop.roqIdx.asUInt
     )
     XSDebug(
@@ -438,15 +446,7 @@ class LoadQueue extends XSModule
       io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, wbViolationUop.roqIdx.asUInt
     )

-    (RegNext(io.storeIn(i).valid) && Cat(rollbackValidVec).orR, rollbackUop)
-  }
-
-  // rollback check
-  val rollback = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
-  for (i <- 0 until StorePipelineWidth) {
-    val detectedRollback = detectRollback(i)
-    rollback(i).valid := detectedRollback._1
-    rollback(i).bits := detectedRollback._2
+    ((lqViolation, lqViolationUop), (wbViolation, wbViolationUop), (l1Violation, l1ViolationUop))
   }

   def rollbackSel(a: Valid[MicroOp], b: Valid[MicroOp]): ValidIO[MicroOp] = {
@@ -460,33 +460,72 @@ class LoadQueue extends XSModule
       b // sel b
     )
   }
-  val rollbackSelected = ParallelOperation(rollback, rollbackSel)
   val lastCycleRedirect = RegNext(io.brqRedirect)
+  val lastlastCycleRedirect = RegNext(lastCycleRedirect)
   val lastCycleFlush = RegNext(io.flush)
+  val lastlastCycleFlush = RegNext(lastCycleFlush)

-  // S2: select rollback and generate rollback request
+  // S2: select rollback (part1) and generate rollback request
+  // rollback check
+  // Wb/L1 rollback seq check is done in s2
+  val rollbackWb = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
+  val rollbackL1 = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
+  val rollbackL1Wb = Wire(Vec(StorePipelineWidth*2, Valid(new MicroOp)))
+  // Lq rollback seq check is done in s3 (next stage), as getting rollbackLq MicroOp is slow
+  val rollbackLq = Wire(Vec(StorePipelineWidth, Valid(new MicroOp)))
+  for (i <- 0 until StorePipelineWidth) {
+    val detectedRollback = detectRollback(i)
+    rollbackLq(i).valid := detectedRollback._1._1 && RegNext(io.storeIn(i).valid)
+    rollbackLq(i).bits := detectedRollback._1._2
+    rollbackWb(i).valid := detectedRollback._2._1 && RegNext(io.storeIn(i).valid)
+    rollbackWb(i).bits := detectedRollback._2._2
+    rollbackL1(i).valid := detectedRollback._3._1 && RegNext(io.storeIn(i).valid)
+    rollbackL1(i).bits := detectedRollback._3._2
+    rollbackL1Wb(2*i) := rollbackL1(i)
+    rollbackL1Wb(2*i+1) := rollbackWb(i)
+  }
+
+  val rollbackL1WbSelected = ParallelOperation(rollbackL1Wb, rollbackSel)
+  val rollbackL1WbVReg = RegNext(rollbackL1WbSelected.valid)
+  val rollbackL1WbReg = RegEnable(rollbackL1WbSelected.bits, rollbackL1WbSelected.valid)
+  val rollbackLq0VReg = RegNext(rollbackLq(0).valid)
+  val rollbackLq0Reg = RegEnable(rollbackLq(0).bits, rollbackLq(0).valid)
+  val rollbackLq1VReg = RegNext(rollbackLq(1).valid)
+  val rollbackLq1Reg = RegEnable(rollbackLq(1).bits, rollbackLq(1).valid)
+
+  // S3: select rollback (part2), generate rollback request, then fire rollback request
   // Note that we use roqIdx - 1.U to flush the load instruction itself.
   // Thus, here if last cycle's roqIdx equals to this cycle's roqIdx, it still triggers the redirect.
-  val rollbackGen = Wire(Valid(new Redirect))
-  val rollbackReg = Reg(Valid(new Redirect)) // FIXME: this is ugly
-  rollbackGen.valid := rollbackSelected.valid &&
-    !rollbackSelected.bits.roqIdx.needFlush(lastCycleRedirect, lastCycleFlush)
-  rollbackGen.bits.roqIdx := rollbackSelected.bits.roqIdx
-  rollbackGen.bits.ftqIdx := rollbackSelected.bits.cf.ftqPtr
-  rollbackGen.bits.ftqOffset := rollbackSelected.bits.cf.ftqOffset
-  rollbackGen.bits.level := RedirectLevel.flush
-  rollbackGen.bits.interrupt := DontCare
-  rollbackGen.bits.cfiUpdate := DontCare
-  rollbackGen.bits.cfiUpdate.target := rollbackSelected.bits.cf.pc
-
-  rollbackReg := rollbackGen
-
-  // S3: fire rollback request
-  io.rollback := rollbackReg
-  io.rollback.valid := rollbackReg.valid &&
-    !rollbackReg.bits.roqIdx.needFlush(lastCycleRedirect, lastCycleFlush)
+
+  val rollbackValidVec = Seq(rollbackL1WbVReg, rollbackLq0VReg, rollbackLq1VReg)
+  val rollbackUopVec = Seq(rollbackL1WbReg, rollbackLq0Reg, rollbackLq1Reg)
+
+  // select uop in parallel
+  val mask = getAfterMask(rollbackValidVec, rollbackUopVec)
+  val oneAfterZero = mask(1)(0)
+  val rollbackUop = Mux(oneAfterZero && mask(2)(0),
+    rollbackUopVec(0),
+    Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2)))
+
+  // check if rollback request is still valid in parallel
+  val rollbackValidVecChecked = Wire(Vec(3, Bool()))
+  for(((v, uop), idx) <- rollbackValidVec.zip(rollbackUopVec).zipWithIndex) {
+    rollbackValidVecChecked(idx) := v &&
+      (!lastCycleRedirect.valid || isBefore(uop.roqIdx, lastCycleRedirect.bits.roqIdx)) &&
+      (!lastlastCycleRedirect.valid || isBefore(uop.roqIdx, lastlastCycleRedirect.bits.roqIdx))
+  }
+
+  io.rollback.bits.roqIdx := rollbackUop.roqIdx
+  io.rollback.bits.ftqIdx := rollbackUop.cf.ftqPtr
+  io.rollback.bits.ftqOffset := rollbackUop.cf.ftqOffset
+  io.rollback.bits.level := RedirectLevel.flush
+  io.rollback.bits.interrupt := DontCare
+  io.rollback.bits.cfiUpdate := DontCare
+  io.rollback.bits.cfiUpdate.target := rollbackUop.cf.pc
+  // io.rollback.bits.pc := DontCare
+
+  io.rollback.valid := rollbackValidVecChecked.asUInt.orR && !lastCycleFlush && !lastlastCycleFlush

   when(io.rollback.valid) {
     // XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.cfi, io.rollback.bits.roqIdx.asUInt)
@@ -503,11 +542,13 @@ class LoadQueue extends XSModule
    * (5) ROB commits the instruction: same as normal instructions
    */
   //(2) when they reach ROB's head, they can be sent to uncache channel
+  val lqTailMmioPending = WireInit(pending(deqPtr))
+  val lqTailAllocated = WireInit(allocated(deqPtr))
   val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4)
   val uncacheState = RegInit(s_idle)
   switch(uncacheState) {
     is(s_idle) {
-      when(io.roq.pendingld && pending(deqPtr) && allocated(deqPtr)) {
+      when(io.roq.pendingld && lqTailMmioPending && lqTailAllocated) {
         uncacheState := s_req
       }
     }
@@ -563,7 +604,7 @@ class LoadQueue extends XSModule
   }

   // Read vaddr for mem exception
-  vaddrModule.io.raddr(0) := deqPtr + commitCount
+  vaddrModule.io.raddr(0) := deqPtr + io.roq.lcommit
   io.exceptionAddr.vaddr := vaddrModule.io.rdata(0)

   // misprediction recovery / exception redirect
@@ -596,6 +637,15 @@ class LoadQueue extends XSModule
   allowEnqueue := validCount + enqNumber <= (LoadQueueSize - RenameWidth).U

+  // perf counter
+  XSPerf("lqRollback", io.rollback.valid, acc = true) // rollback redirect generated
+  XSPerf("lqFull", !allowEnqueue, acc = true)
+  XSPerf("lqMmioCycle", uncacheState =/= s_idle, acc = true) // lq is busy dealing with uncache req
+  XSPerf("lqMmioCnt", io.uncache.req.fire(), acc = true)
+  XSPerf("lqRefill", io.dcache.valid, acc = true)
+  XSPerf("lqWriteback", PopCount(VecInit(io.ldout.map(i => i.fire()))), acc = true)
+  XSPerf("lqWbBlocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready))), acc = true)
+
   // debug info
   XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt.flag, deqPtr)
......
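The restructured rollback logic splits an oldest-of-six choice across two cycles: S2 reduces the four l1/wb candidates through ParallelOperation with rollbackSel, and S3 age-compares the survivor against the two lq candidates while re-checking validity against the last two cycles' redirects. A toy sketch of the balanced tree-select pattern this relies on (a simplified uop and a plain integer age compare stand in for the circular roqIdx compare):

import chisel3._
import chisel3.util._

class ToyUop extends Bundle {
  val roqIdx = UInt(8.W) // simplified: smaller index == older instruction
}

object SelectOldest {
  // Binary select: prefer the valid side; if both are valid, take the older.
  def sel(a: Valid[ToyUop], b: Valid[ToyUop]): Valid[ToyUop] = {
    val res = Wire(Valid(new ToyUop))
    res.valid := a.valid || b.valid
    res.bits := Mux(a.valid && b.valid,
      Mux(a.bits.roqIdx < b.bits.roqIdx, a.bits, b.bits),
      Mux(a.valid, a.bits, b.bits))
    res
  }
  // Balanced tree reduction: log2(n) select depth instead of a linear chain,
  // which is what makes splitting the pick across two stages cheap.
  def apply(xs: Seq[Valid[ToyUop]]): Valid[ToyUop] = xs match {
    case Seq(x) => x
    case _ =>
      val (l, r) = xs.splitAt(xs.size / 2)
      sel(apply(l), apply(r))
  }
}

class OldestOfSix extends Module {
  val io = IO(new Bundle {
    val in  = Input(Vec(6, Valid(new ToyUop)))
    val out = Output(Valid(new ToyUop))
  })
  io.out := SelectOldest(io.in)
}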
@@ -106,51 +106,51 @@ class MaskModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule
   }
 }

-class LQData8Module(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters {
-  val io = IO(new Bundle {
-    // read
-    val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
-    val rdata = Output(Vec(numRead, UInt(8.W)))
-    // address indexed write
-    val wen = Input(Vec(numWrite, Bool()))
-    val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
-    val wdata = Input(Vec(numWrite, UInt(8.W)))
-    // masked write
-    val mwmask = Input(Vec(blockWords, Vec(numEntries, Bool())))
-    val mwdata = Input(Vec(blockWords, UInt(8.W)))
-  })
-
-  val data = Reg(Vec(numEntries, UInt(8.W)))
-
-  // read ports
-  for (i <- 0 until numRead) {
-    io.rdata(i) := data(RegNext(io.raddr(i)))
-  }
-
-  // below is the write ports (with priorities)
-  for (i <- 0 until numWrite) {
-    when (io.wen(i)) {
-      data(io.waddr(i)) := io.wdata(i)
-    }
-  }
-
-  // masked write
-  for (j <- 0 until numEntries) {
-    val wen = VecInit((0 until blockWords).map(i => io.mwmask(i)(j))).asUInt.orR
-    when (wen) {
-      data(j) := VecInit((0 until blockWords).map(i => {
-        Mux(io.mwmask(i)(j), io.mwdata(i), 0.U)
-      })).reduce(_ | _)
-    }
-  }
-
-  // DataModuleTemplate should not be used when there're any write conflicts
-  for (i <- 0 until numWrite) {
-    for (j <- i+1 until numWrite) {
-      assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
-    }
-  }
-}
+// class LQData8Module(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters {
+//   val io = IO(new Bundle {
+//     // read
+//     val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
+//     val rdata = Output(Vec(numRead, UInt(8.W)))
+//     // address indexed write
+//     val wen = Input(Vec(numWrite, Bool()))
+//     val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
+//     val wdata = Input(Vec(numWrite, UInt(8.W)))
+//     // masked write
+//     val mwmask = Input(Vec(blockWords, Vec(numEntries, Bool())))
+//     val mwdata = Input(Vec(blockWords, UInt(8.W)))
+//   })
+//
+//   val data = Reg(Vec(numEntries, UInt(8.W)))
+//
+//   // read ports
+//   for (i <- 0 until numRead) {
+//     io.rdata(i) := data(RegNext(io.raddr(i)))
+//   }
+//
+//   // below is the write ports (with priorities)
+//   for (i <- 0 until numWrite) {
+//     when (io.wen(i)) {
+//       data(io.waddr(i)) := io.wdata(i)
+//     }
+//   }
+//
+//   // masked write
+//   for (j <- 0 until numEntries) {
+//     val wen = VecInit((0 until blockWords).map(i => io.mwmask(i)(j))).asUInt.orR
+//     when (wen) {
+//       data(j) := VecInit((0 until blockWords).map(i => {
+//         Mux(io.mwmask(i)(j), io.mwdata(i), 0.U)
+//       })).reduce(_ | _)
+//     }
+//   }
+//
+//   // DataModuleTemplate should not be used when there're any write conflicts
+//   for (i <- 0 until numWrite) {
+//     for (j <- i+1 until numWrite) {
+//       assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
+//     }
+//   }
+// }

 class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSModule with HasDCacheParameters {
   val io = IO(new Bundle {
@@ -177,7 +177,7 @@ class CoredataModule(numEntries: Int, numRead: Int, numWrite: Int) extends XSMod
     val paddrWen = Input(Vec(numWrite, Bool()))
   })

-  val data8 = Seq.fill(8)(Module(new LQData8Module(numEntries, numRead, numWrite)))
+  val data8 = Seq.fill(8)(Module(new MaskedSyncDataModuleTemplate(UInt(8.W), numEntries, numRead, numWrite, numMWrite = blockWords)))

   val fwdMask = Reg(Vec(numEntries, UInt(8.W)))
   val wordIndex = Reg(Vec(numEntries, UInt((blockOffBits - wordOffBits).W)))
......
@@ -38,7 +38,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
     val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
     val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
     val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
-    val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
+    val forward = Vec(LoadPipelineWidth, Flipped(new MaskedLoadForwardQueryIO))
     val roq = Flipped(new RoqLsqIO)
     val uncache = new DCacheWordIO
     // val refill = Flipped(Valid(new DCacheLineReq ))
@@ -61,7 +61,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
   dataModule.io := DontCare
   val paddrModule = Module(new SQPaddrModule(StoreQueueSize, numRead = StorePipelineWidth, numWrite = StorePipelineWidth, numForward = StorePipelineWidth))
   paddrModule.io := DontCare
-  val vaddrModule = Module(new AsyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth))
+  val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), StoreQueueSize, numRead = 1, numWrite = StorePipelineWidth))
   vaddrModule.io := DontCare

   // state & misc
@@ -104,7 +104,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
     dataModule.io.raddr(i) := deqPtrExtNext(i).value
     paddrModule.io.raddr(i) := deqPtrExtNext(i).value
   }
-  vaddrModule.io.raddr(0) := cmtPtr + commitCount
+  vaddrModule.io.raddr(0) := cmtPtr + io.roq.scommit

   /**
    * Enqueue at dispatch
@@ -144,9 +144,8 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
   for (i <- 0 until StorePipelineWidth) {
     dataModule.io.wen(i) := false.B
     paddrModule.io.wen(i) := false.B
-    vaddrModule.io.wen(i) := false.B
+    val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
     when (io.storeIn(i).fire()) {
-      val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
       datavalid(stWbIndex) := !io.storeIn(i).bits.mmio
       writebacked(stWbIndex) := !io.storeIn(i).bits.mmio
       pending(stWbIndex) := io.storeIn(i).bits.mmio
@@ -164,9 +163,6 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
       paddrModule.io.wdata(i) := io.storeIn(i).bits.paddr
       paddrModule.io.wen(i) := true.B

-      vaddrModule.io.waddr(i) := stWbIndex
-      vaddrModule.io.wdata(i) := io.storeIn(i).bits.vaddr
-      vaddrModule.io.wen(i) := true.B

       mmio(stWbIndex) := io.storeIn(i).bits.mmio
@@ -179,6 +175,10 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
         io.storeIn(i).bits.mmio
       )
     }
+    // vaddrModule write is delayed, as vaddrModule will not be read right after write
+    vaddrModule.io.waddr(i) := RegNext(stWbIndex)
+    vaddrModule.io.wdata(i) := RegNext(io.storeIn(i).bits.vaddr)
+    vaddrModule.io.wen(i) := RegNext(io.storeIn(i).fire())
   }

   /**
@@ -199,7 +199,7 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
     // Forward2: Mux(same_flag, 0.U, range(0, sqIdx) )
     // i.e. forward1 is the target entries with the same flag bits and forward2 otherwise
     val differentFlag = deqPtrExt(0).flag =/= io.forward(i).sqIdx.flag
-    val forwardMask = UIntToMask(io.forward(i).sqIdx.value, StoreQueueSize)
+    val forwardMask = io.forward(i).sqIdxMask
     val storeWritebackedVec = WireInit(VecInit(Seq.fill(StoreQueueSize)(false.B)))
     for (j <- 0 until StoreQueueSize) {
       storeWritebackedVec(j) := datavalid(j) && allocated(j) // all datavalid terms need to be checked
@@ -388,6 +388,16 @@ class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueue
   // for 1 cycle will also promise that sq is empty in that cycle
   io.sqempty := RegNext(enqPtrExt(0).value === deqPtrExt(0).value && enqPtrExt(0).flag === deqPtrExt(0).flag)

+  // perf counter
+  XSPerf("sqFull", !allowEnqueue, acc = true)
+  XSPerf("sqMmioCycle", uncacheState =/= s_idle, acc = true) // sq is busy dealing with uncache req
+  XSPerf("sqMmioCnt", io.uncache.req.fire(), acc = true)
+  XSPerf("sqWriteback", io.mmioStout.fire(), acc = true)
+  XSPerf("sqWbBlocked", io.mmioStout.valid && !io.mmioStout.ready, acc = true)
+  XSPerf("sqValidEntryCnt", distanceBetween(enqPtrExt(0), deqPtrExt(0)))
+  XSPerf("sqCmtEntryCnt", distanceBetween(cmtPtrExt(0), deqPtrExt(0)))
+  XSPerf("sqNCmtEntryCnt", distanceBetween(enqPtrExt(0), cmtPtrExt(0)))
+
   // debug info
   XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt(0).flag, deqPtr)
......
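Both queues now delay the vaddrModule write port by one cycle through RegNext. This is safe because a vaddr entry is only read much later, at commit time for exception reporting, never in the cycle right after writeback, and it moves the data-module write off the writeback critical path. A generic sketch of the pattern (module and signal names invented, using a plain SyncReadMem as the storage):

import chisel3._
import chisel3.util._

class DelayedWriteExample extends Module {
  val io = IO(new Bundle {
    val fire  = Input(Bool())
    val idx   = Input(UInt(5.W))
    val vaddr = Input(UInt(39.W))
    val raddr = Input(UInt(5.W))
    val rdata = Output(UInt(39.W))
  })
  val mem = SyncReadMem(32, UInt(39.W))
  // Register the whole write port: the producer's critical path now ends at
  // these flops. Safe only if no reader needs the value in the very next cycle.
  val wen   = RegNext(io.fire, false.B)
  val waddr = RegNext(io.idx)
  val wdata = RegNext(io.vaddr)
  when (wen) {
    mem.write(waddr, wdata)
  }
  // Sync read, e.g. for exception address reporting at commit.
  io.rdata := mem.read(io.raddr)
}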
...@@ -13,7 +13,8 @@ class LoadToLsqIO extends XSBundle { ...@@ -13,7 +13,8 @@ class LoadToLsqIO extends XSBundle {
val loadIn = ValidIO(new LsPipelineBundle) val loadIn = ValidIO(new LsPipelineBundle)
val ldout = Flipped(DecoupledIO(new ExuOutput)) val ldout = Flipped(DecoupledIO(new ExuOutput))
val loadDataForwarded = Output(Bool()) val loadDataForwarded = Output(Bool())
val forward = new LoadForwardQueryIO val needReplayFromRS = Output(Bool())
val forward = new MaskedLoadForwardQueryIO
} }
// Load Pipeline Stage 0 // Load Pipeline Stage 0
...@@ -28,17 +29,15 @@ class LoadUnit_S0 extends XSModule { ...@@ -28,17 +29,15 @@ class LoadUnit_S0 extends XSModule {
}) })
val s0_uop = io.in.bits.uop val s0_uop = io.in.bits.uop
val s0_vaddr_old = io.in.bits.src1 + SignExt(ImmUnion.I.toImm32(s0_uop.ctrl.imm), XLEN) // val s0_vaddr = io.in.bits.src1 + SignExt(s0_uop.ctrl.imm(11,0), VAddrBits)
// val s0_mask = genWmask(s0_vaddr, s0_uop.ctrl.fuOpType(1,0))
val imm12 = WireInit(s0_uop.ctrl.imm(11,0)) val imm12 = WireInit(s0_uop.ctrl.imm(11,0))
val s0_vaddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12) val s0_vaddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12)
val s0_vaddr_hi = Mux(imm12(11), val s0_vaddr_hi = Mux(s0_vaddr_lo(12),
Mux((s0_vaddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12)), Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+1.U),
Mux((s0_vaddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12)+1.U, io.in.bits.src1(VAddrBits-1, 12)) Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), io.in.bits.src1(VAddrBits-1, 12)),
) )
val s0_vaddr = Cat(s0_vaddr_hi, s0_vaddr_lo(11,0)) val s0_vaddr = Cat(s0_vaddr_hi, s0_vaddr_lo(11,0))
when(io.in.fire() && s0_vaddr(VAddrBits-1,0) =/= (io.in.bits.src1 + SignExt(ImmUnion.I.toImm32(s0_uop.ctrl.imm), XLEN))(VAddrBits-1,0)){
printf("s0_vaddr %x s0_vaddr_old %x\n", s0_vaddr, s0_vaddr_old(VAddrBits-1,0))
}
val s0_mask = genWmask(s0_vaddr_lo, s0_uop.ctrl.fuOpType(1,0)) val s0_mask = genWmask(s0_vaddr_lo, s0_uop.ctrl.fuOpType(1,0))
// query DTLB // query DTLB
...@@ -92,7 +91,7 @@ class LoadUnit_S1 extends XSModule { ...@@ -92,7 +91,7 @@ class LoadUnit_S1 extends XSModule {
val dcachePAddr = Output(UInt(PAddrBits.W)) val dcachePAddr = Output(UInt(PAddrBits.W))
val dcacheKill = Output(Bool()) val dcacheKill = Output(Bool())
val sbuffer = new LoadForwardQueryIO val sbuffer = new LoadForwardQueryIO
val lsq = new LoadForwardQueryIO val lsq = new MaskedLoadForwardQueryIO
}) })
val s1_uop = io.in.bits.uop val s1_uop = io.in.bits.uop
...@@ -122,6 +121,7 @@ class LoadUnit_S1 extends XSModule { ...@@ -122,6 +121,7 @@ class LoadUnit_S1 extends XSModule {
io.lsq.paddr := s1_paddr io.lsq.paddr := s1_paddr
io.lsq.uop := s1_uop io.lsq.uop := s1_uop
io.lsq.sqIdx := s1_uop.sqIdx io.lsq.sqIdx := s1_uop.sqIdx
io.lsq.sqIdxMask := DontCare // will be overwritten by sqIdxMask pre-generated in s0
io.lsq.mask := s1_mask io.lsq.mask := s1_mask
io.lsq.pc := s1_uop.cf.pc // FIXME: remove it io.lsq.pc := s1_uop.cf.pc // FIXME: remove it
...@@ -149,6 +149,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { ...@@ -149,6 +149,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
val lsq = new LoadForwardQueryIO val lsq = new LoadForwardQueryIO
val sbuffer = new LoadForwardQueryIO val sbuffer = new LoadForwardQueryIO
val dataForwarded = Output(Bool()) val dataForwarded = Output(Bool())
val needReplayFromRS = Output(Bool())
}) })
val s2_uop = io.in.bits.uop val s2_uop = io.in.bits.uop
...@@ -168,10 +169,22 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { ...@@ -168,10 +169,22 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
io.tlbFeedback.valid := io.in.valid io.tlbFeedback.valid := io.in.valid
io.tlbFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio) io.tlbFeedback.bits.hit := !s2_tlb_miss && (!s2_cache_replay || s2_mmio)
io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx io.tlbFeedback.bits.rsIdx := io.in.bits.rsIdx
io.needReplayFromRS := s2_cache_replay
// merge forward result
// lsq has higher priority than sbuffer
val forwardMask = Wire(Vec(8, Bool()))
val forwardData = Wire(Vec(8, UInt(8.W)))
val forwardMask = io.out.bits.forwardMask
val forwardData = io.out.bits.forwardData
val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U
io.lsq := DontCare
io.sbuffer := DontCare
// generate XLEN/8 Muxs
for (i <- 0 until XLEN / 8) {
forwardMask(i) := io.lsq.forwardMask(i) || io.sbuffer.forwardMask(i)
forwardData(i) := Mux(io.lsq.forwardMask(i), io.lsq.forwardData(i), io.sbuffer.forwardData(i))
}
XSDebug(io.out.fire(), "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n", XSDebug(io.out.fire(), "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n",
s2_uop.cf.pc, s2_uop.cf.pc,
...@@ -180,8 +193,9 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { ...@@ -180,8 +193,9 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
) )
// data merge // data merge
val rdata = VecInit((0 until XLEN / 8).map(j => val rdataVec = VecInit((0 until XLEN / 8).map(j =>
Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j)))).asUInt Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j))))
val rdata = rdataVec.asUInt
val rdataSel = LookupTree(s2_paddr(2, 0), List( val rdataSel = LookupTree(s2_paddr(2, 0), List(
"b000".U -> rdata(63, 0), "b000".U -> rdata(63, 0),
"b001".U -> rdata(63, 8), "b001".U -> rdata(63, 8),
...@@ -194,9 +208,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { ...@@ -194,9 +208,7 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
)) ))
val rdataPartialLoad = rdataHelper(s2_uop, rdataSel) val rdataPartialLoad = rdataHelper(s2_uop, rdataSel)
// TODO: ECC check io.out.valid := io.in.valid && !s2_tlb_miss
io.out.valid := io.in.valid && !s2_tlb_miss && (!s2_cache_replay || s2_mmio || s2_exception)
// Inst will be canceled in store queue / lsq, // Inst will be canceled in store queue / lsq,
// so we do not need to care about flush in load / store unit's out.valid // so we do not need to care about flush in load / store unit's out.valid
io.out.bits := io.in.bits io.out.bits := io.in.bits
...@@ -212,28 +224,16 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper { ...@@ -212,28 +224,16 @@ class LoadUnit_S2 extends XSModule with HasLoadHelper {
// and dcache query is no longer needed. // and dcache query is no longer needed.
// Such inst will be writebacked from load queue. // Such inst will be writebacked from load queue.
io.dataForwarded := s2_cache_miss && fullForward && !s2_exception io.dataForwarded := s2_cache_miss && fullForward && !s2_exception
// io.out.bits.forwardX will be send to lq
io.out.bits.forwardMask := forwardMask
// data retbrived from dcache is also included in io.out.bits.forwardData
io.out.bits.forwardData := rdataVec
io.in.ready := io.out.ready || !io.in.valid io.in.ready := io.out.ready || !io.in.valid
// merge forward result
// lsq has higher priority than sbuffer
io.lsq := DontCare
io.sbuffer := DontCare
// generate XLEN/8 Muxs
for (i <- 0 until XLEN / 8) {
when (io.sbuffer.forwardMask(i)) {
io.out.bits.forwardMask(i) := true.B
io.out.bits.forwardData(i) := io.sbuffer.forwardData(i)
}
when (io.lsq.forwardMask(i)) {
io.out.bits.forwardMask(i) := true.B
io.out.bits.forwardData(i) := io.lsq.forwardData(i)
}
}
XSDebug(io.out.fire(), "[DCACHE LOAD RESP] pc %x rdata %x <- D$ %x + fwd %x(%b)\n", XSDebug(io.out.fire(), "[DCACHE LOAD RESP] pc %x rdata %x <- D$ %x + fwd %x(%b)\n",
s2_uop.cf.pc, rdataPartialLoad, io.dcacheResp.bits.data, s2_uop.cf.pc, rdataPartialLoad, io.dcacheResp.bits.data,
io.out.bits.forwardData.asUInt, io.out.bits.forwardMask.asUInt forwardData.asUInt, forwardMask.asUInt
) )
} }
...@@ -271,13 +271,19 @@ class LoadUnit extends XSModule with HasLoadHelper { ...@@ -271,13 +271,19 @@ class LoadUnit extends XSModule with HasLoadHelper {
PipelineConnect(load_s1.io.out, load_s2.io.in, true.B, load_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush)) PipelineConnect(load_s1.io.out, load_s2.io.in, true.B, load_s1.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
load_s2.io.tlbFeedback <> io.tlbFeedback
load_s2.io.dcacheResp <> io.dcache.resp load_s2.io.dcacheResp <> io.dcache.resp
load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData
load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask
load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData load_s2.io.sbuffer.forwardData <> io.sbuffer.forwardData
load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask load_s2.io.sbuffer.forwardMask <> io.sbuffer.forwardMask
load_s2.io.dataForwarded <> io.lsq.loadDataForwarded load_s2.io.dataForwarded <> io.lsq.loadDataForwarded
io.tlbFeedback.bits := RegNext(load_s2.io.tlbFeedback.bits)
io.tlbFeedback.valid := RegNext(load_s2.io.tlbFeedback.valid && !load_s2.io.out.bits.uop.roqIdx.needFlush(io.redirect, io.flush))
io.lsq.needReplayFromRS := load_s2.io.needReplayFromRS
// pre-calcuate sqIdx mask in s0, then send it to lsq in s1 for forwarding
val sqIdxMaskReg = RegNext(UIntToMask(load_s0.io.in.bits.uop.sqIdx.value, StoreQueueSize))
io.lsq.forward.sqIdxMask := sqIdxMaskReg
// use s2_hit_way to select data received in s1 // use s2_hit_way to select data received in s1
load_s2.io.dcacheResp.bits.data := Mux1H(io.dcache.s2_hit_way, RegNext(io.dcache.s1_data)) load_s2.io.dcacheResp.bits.data := Mux1H(io.dcache.s2_hit_way, RegNext(io.dcache.s1_data))
...@@ -317,19 +323,26 @@ class LoadUnit extends XSModule with HasLoadHelper { ...@@ -317,19 +323,26 @@ class LoadUnit extends XSModule with HasLoadHelper {
io.ldout.bits := Mux(intHitLoadOut.valid, intHitLoadOut.bits, io.lsq.ldout.bits) io.ldout.bits := Mux(intHitLoadOut.valid, intHitLoadOut.bits, io.lsq.ldout.bits)
io.ldout.valid := intHitLoadOut.valid || io.lsq.ldout.valid && !refillFpLoad io.ldout.valid := intHitLoadOut.valid || io.lsq.ldout.valid && !refillFpLoad
// Fp load, if hit, will be send to recoder at s2, then it will be recoded & writebacked at s3 // Fp load, if hit, will be stored to reg at s2, then it will be recoded at s3, writebacked at s4
val fpHitLoadOut = Wire(Valid(new ExuOutput)) val fpHitLoadOut = Wire(Valid(new ExuOutput))
fpHitLoadOut.valid := s2_wb_valid && load_s2.io.out.bits.uop.ctrl.fpWen fpHitLoadOut.valid := s2_wb_valid && load_s2.io.out.bits.uop.ctrl.fpWen
fpHitLoadOut.bits := intHitLoadOut.bits fpHitLoadOut.bits := intHitLoadOut.bits
val fpLoadOut = Wire(Valid(new ExuOutput)) val fpLoadUnRecodedReg = Reg(Valid(new ExuOutput))
fpLoadOut.bits := Mux(fpHitLoadOut.valid, fpHitLoadOut.bits, io.lsq.ldout.bits) fpLoadUnRecodedReg.valid := fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad
fpLoadOut.valid := fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad when(fpHitLoadOut.valid || io.lsq.ldout.valid && refillFpLoad){
fpLoadUnRecodedReg.bits := Mux(fpHitLoadOut.valid, fpHitLoadOut.bits, io.lsq.ldout.bits)
}
val fpLoadOutReg = RegNext(fpLoadOut) val fpLoadRecodedReg = Reg(Valid(new ExuOutput))
io.fpout.bits := fpLoadOutReg.bits when(fpLoadUnRecodedReg.valid){
io.fpout.bits.data := fpRdataHelper(fpLoadOutReg.bits.uop, fpLoadOutReg.bits.data) // recode fpLoadRecodedReg := fpLoadUnRecodedReg
io.fpout.valid := RegNext(fpLoadOut.valid) fpLoadRecodedReg.bits.data := fpRdataHelper(fpLoadUnRecodedReg.bits.uop, fpLoadUnRecodedReg.bits.data) // recode
}
fpLoadRecodedReg.valid := fpLoadUnRecodedReg.valid
io.fpout.bits := fpLoadRecodedReg.bits
io.fpout.valid := fpLoadRecodedReg.valid
io.lsq.ldout.ready := Mux(refillFpLoad, !fpHitLoadOut.valid, !intHitLoadOut.valid) io.lsq.ldout.ready := Mux(refillFpLoad, !fpHitLoadOut.valid, !intHitLoadOut.valid)
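The rewritten fp path replaces the single registered recode stage with two explicit registers: the raw hit/refill result is selected into one register, recoded into a second the following cycle, and written back from there, so the hardfloat recode never stacks on top of the writeback select. A minimal sketch of the two-register pattern, with a placeholder standing in for the real fpRdataHelper recode:

```scala
import chisel3._
import chisel3.util._

// Sketch of the two-register fp-load recode pipeline: select the raw result
// into one register, recode into a second, then write back.
class FpRecodePipeSketch(w: Int) extends Module {
  val io = IO(new Bundle {
    val in  = Input(Valid(UInt(w.W)))  // selected hit/refill result
    val out = Output(Valid(UInt(w.W))) // recoded writeback
  })
  def recode(x: UInt): UInt = ~x // placeholder, NOT the real recode function

  val unRecoded = Reg(Valid(UInt(w.W)))
  unRecoded.valid := io.in.valid
  when(io.in.valid) { unRecoded.bits := io.in.bits } // capture raw data only

  val recoded = Reg(Valid(UInt(w.W)))
  recoded.valid := unRecoded.valid
  when(unRecoded.valid) { recoded.bits := recode(unRecoded.bits) } // off the select path

  io.out := recoded
}
```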
......
...@@ -18,17 +18,14 @@ class StoreUnit_S0 extends XSModule { ...@@ -18,17 +18,14 @@ class StoreUnit_S0 extends XSModule {
}) })
// send req to dtlb // send req to dtlb
val saddr_old = io.in.bits.src1 + SignExt(ImmUnion.S.toImm32(io.in.bits.uop.ctrl.imm), XLEN) // val saddr = io.in.bits.src1 + SignExt(io.in.bits.uop.ctrl.imm(11,0), VAddrBits)
val imm12 = WireInit(io.in.bits.uop.ctrl.imm(11,0)) val imm12 = WireInit(io.in.bits.uop.ctrl.imm(11,0))
val saddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12) val saddr_lo = io.in.bits.src1(11,0) + Cat(0.U(1.W), imm12)
val saddr_hi = Mux(imm12(11), val saddr_hi = Mux(saddr_lo(12),
Mux((saddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12)), Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12), io.in.bits.src1(VAddrBits-1, 12)+1.U),
Mux((saddr_lo(12)), io.in.bits.src1(VAddrBits-1, 12)+1.U, io.in.bits.src1(VAddrBits-1, 12)) Mux(imm12(11), io.in.bits.src1(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), io.in.bits.src1(VAddrBits-1, 12)),
) )
val saddr = Cat(saddr_hi, saddr_lo(11,0)) val saddr = Cat(saddr_hi, saddr_lo(11,0))
when(io.in.fire() && saddr(VAddrBits-1,0) =/= (io.in.bits.src1 + SignExt(ImmUnion.S.toImm32(io.in.bits.uop.ctrl.imm), XLEN))(VAddrBits-1,0)){
printf("saddr %x saddr_old %x\n", saddr, saddr_old(VAddrBits-1,0))
}
io.dtlbReq.bits.vaddr := saddr io.dtlbReq.bits.vaddr := saddr
io.dtlbReq.valid := io.in.valid io.dtlbReq.valid := io.in.valid
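The restructured saddr logic is a carry-select adder in miniature: imm12(11) is known as soon as the uop arrives, so both candidate high parts (for a positive and a negative immediate) are computed in parallel with the 13-bit low add, and the late carry bit saddr_lo(12) only has to drive the final mux. A sketch of the split add, assuming illustrative widths:

```scala
import chisel3._
import chisel3.util._

// Sketch of the split-add trick for saddr = src1 + sext(imm12). The 13-bit
// low add produces its carry late; imm12(11) is known early, so both
// candidate high parts are precomputed and the carry drives one final mux.
class SplitAddSketch(vaddrBits: Int) extends Module {
  val io = IO(new Bundle {
    val src1  = Input(UInt(vaddrBits.W))
    val imm12 = Input(UInt(12.W))
    val saddr = Output(UInt(vaddrBits.W))
  })
  val saddrLo = Cat(0.U(1.W), io.src1(11, 0)) + Cat(0.U(1.W), io.imm12) // 13 bits
  val hi      = io.src1(vaddrBits - 1, 12)
  // Both inner muxes depend only on imm12(11), computed in parallel with the add.
  val saddrHi = Mux(saddrLo(12),
    Mux(io.imm12(11), hi, hi + 1.U), // carry out of the low add
    Mux(io.imm12(11), hi - 1.U, hi)  // borrow from a negative immediate
  )
  io.saddr := Cat(saddrHi, saddrLo(11, 0))
}
```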
......
...@@ -129,6 +129,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -129,6 +129,9 @@ class NewSbuffer extends XSModule with HasSbufferCst {
difftestIO <> DontCare difftestIO <> DontCare
val buffer = Mem(StoreBufferSize, new SbufferLine) val buffer = Mem(StoreBufferSize, new SbufferLine)
val tag = Reg(Vec(StoreBufferSize, UInt(TagWidth.W)))
val mask = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val data = Reg(Vec(StoreBufferSize, Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))) // TODO: will be replaced by SyncDataModuleTemplate
val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(s_invalid))) val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(s_invalid)))
val cohCount = Reg(Vec(StoreBufferSize, UInt(countBits.W))) val cohCount = Reg(Vec(StoreBufferSize, UInt(countBits.W)))
/* /*
...@@ -165,30 +168,13 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -165,30 +168,13 @@ class NewSbuffer extends XSModule with HasSbufferCst {
val validCount = RegInit(0.U((log2Up(StoreBufferSize) + 1).W)) val validCount = RegInit(0.U((log2Up(StoreBufferSize) + 1).W))
val full = invalidCount === 0.U // full = TODO: validCount(log2Up(StoreBufferSize)) val full = invalidCount === 0.U // full = TODO: validCount(log2Up(StoreBufferSize))
val bufferRead = VecInit((0 until StoreBufferSize).map(i => buffer(i)))
val stateRead = VecInit((0 until StoreBufferSize).map(i => stateVec(i)))
val dataRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W))))))
val maskRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))))
val tagRead = VecInit((0 until StoreBufferSize).map(i => bufferRead(i).tag))
val dataUpdate = WireInit(dataRead)
val maskUpdate = WireInit(maskRead)
val tagUpdate = WireInit(tagRead)
val stateUpdate = WireInit(stateRead)
val bufferUpdate = Wire(Vec(StoreBufferSize, new SbufferLine))
(0 until StoreBufferSize) foreach { i =>
bufferUpdate(i).tag := tagUpdate(i)
bufferUpdate(i).data := dataUpdate(i).asUInt()
bufferUpdate(i).mask := maskUpdate(i).asUInt()
}
val lru = Module(new ChooseReplace(StoreBufferSize)) val lru = Module(new ChooseReplace(StoreBufferSize))
val evictionIdx = lru.io.way val evictionIdx = lru.io.way
lru.io.mask := stateRead.map(isValid(_)) lru.io.mask := stateVec.map(isValid(_))
val tags = io.in.map(in => getTag(in.bits.addr)) val intags = io.in.map(in => getTag(in.bits.addr))
val sameTag = tags(0) === tags(1) val sameTag = intags(0) === intags(1)
val firstWord = getWord(io.in(0).bits.addr) val firstWord = getWord(io.in(0).bits.addr)
val secondWord = getWord(io.in(1).bits.addr) val secondWord = getWord(io.in(1).bits.addr)
val sameWord = firstWord === secondWord val sameWord = firstWord === secondWord
...@@ -201,13 +187,14 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -201,13 +187,14 @@ class NewSbuffer extends XSModule with HasSbufferCst {
for(i <- 0 until StorePipelineWidth){ for(i <- 0 until StorePipelineWidth){
mergeMask(i) := widthMap(j => mergeMask(i) := widthMap(j =>
Mux(tags(i) === tagRead(j) && isValid(stateRead(j)), true.B, false.B)) intags(i) === tag(j) && isValid(stateVec(j))
)
} }
// insert condition // insert condition
// firstInsert: the first invalid entry // firstInsert: the first invalid entry
// if the first entry can merge, or the second entry has the same tag as the first, secondInsert equals the first invalid entry; otherwise, the second invalid entry // if the first entry can merge, or the second entry has the same tag as the first, secondInsert equals the first invalid entry; otherwise, the second invalid entry
val invalidMask = stateRead.map(s => isInvalid(s)) val invalidMask = stateVec.map(s => isInvalid(s))
val evenInvalidMask = GetEvenBits(VecInit(invalidMask).asUInt) val evenInvalidMask = GetEvenBits(VecInit(invalidMask).asUInt)
val oddInvalidMask = GetOddBits(VecInit(invalidMask).asUInt) val oddInvalidMask = GetOddBits(VecInit(invalidMask).asUInt)
...@@ -232,27 +219,26 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -232,27 +219,26 @@ class NewSbuffer extends XSModule with HasSbufferCst {
Mux(~enbufferSelReg, evenCanInsert, oddCanInsert) Mux(~enbufferSelReg, evenCanInsert, oddCanInsert)
) )
io.in(0).ready := firstCanInsert || canMerge(0) io.in(0).ready := firstCanInsert
io.in(1).ready := (secondCanInsert || canMerge(1)) && !sameWord && io.in(0).ready io.in(1).ready := secondCanInsert && !sameWord && io.in(0).ready
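The ready logic above leans on the even/odd partitioning of sbuffer entries: one request allocates from even-indexed entries and the other from odd-indexed ones (the real design alternates the classes via enbufferSelReg), so the two priority selections can never collide. A simplified sketch of the parity split, assuming entry 2k maps to even bit k:

```scala
import chisel3._
import chisel3.util._

// Sketch of even/odd allocation: request 0 picks from one parity class and
// request 1 from the other, so the two picks are collision-free by design.
class EvenOddAllocSketch(entries: Int) extends Module {
  require(entries % 2 == 0)
  val io = IO(new Bundle {
    val invalidMask = Input(UInt(entries.W))            // one bit per entry
    val firstIdx    = Output(UInt(log2Ceil(entries).W)) // even-class pick
    val secondIdx   = Output(UInt(log2Ceil(entries).W)) // odd-class pick
  })
  val bits     = io.invalidMask.asBools
  val evenMask = VecInit(bits.zipWithIndex.collect { case (b, i) if i % 2 == 0 => b }).asUInt
  val oddMask  = VecInit(bits.zipWithIndex.collect { case (b, i) if i % 2 == 1 => b }).asUInt
  // Entry 2k maps to even bit k, entry 2k+1 to odd bit k.
  io.firstIdx  := Cat(PriorityEncoder(evenMask), 0.U(1.W))
  io.secondIdx := Cat(PriorityEncoder(oddMask), 1.U(1.W))
}
```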
def wordReqToBufLine(req: DCacheWordReq, tag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = { def wordReqToBufLine(req: DCacheWordReq, reqtag: UInt, insertIdx: UInt, wordOffset: UInt, flushMask: Bool): Unit = {
stateUpdate(insertIdx) := s_valid stateVec(insertIdx) := s_valid
tagUpdate(insertIdx) := tag
cohCount(insertIdx) := 0.U cohCount(insertIdx) := 0.U
tag(insertIdx) := reqtag
when(flushMask){ when(flushMask){
for(j <- 0 until CacheLineWords){ for(j <- 0 until CacheLineWords){
for(i <- 0 until DataBytes){ for(i <- 0 until DataBytes){
maskUpdate(insertIdx)(j)(i) := false.B mask(insertIdx)(j)(i) := false.B
} }
} }
} }
for(i <- 0 until DataBytes){ for(i <- 0 until DataBytes){
when(req.mask(i)){ when(req.mask(i)){
maskUpdate(insertIdx)(wordOffset)(i) := true.B mask(insertIdx)(wordOffset)(i) := true.B
dataUpdate(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8) data(insertIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
} }
} }
} }
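With the SbufferLine Mem replaced by per-field register files, wordReqToBufLine can write the tag and just the touched bytes in place; there is no longer a whole-line bufferUpdate shadow copy to read, modify, and write back. A reduced sketch of the split storage and its insert path, with illustrative sizes:

```scala
import chisel3._
import chisel3.util._

// Sketch: store-buffer storage split into per-field register files
// (tag / per-byte mask / per-byte data) so an insert or merge writes the
// touched bytes directly, with no whole-line read-modify-write copy.
class SbufferStoreSketch(entries: Int, words: Int, bytes: Int, tagW: Int) extends Module {
  val io = IO(new Bundle {
    val wen     = Input(Bool())
    val idx     = Input(UInt(log2Ceil(entries).W))
    val word    = Input(UInt(log2Ceil(words).W))
    val reqTag  = Input(UInt(tagW.W))
    val reqMask = Input(UInt(bytes.W))
    val reqData = Input(UInt((bytes * 8).W))
  })
  val tag  = Reg(Vec(entries, UInt(tagW.W)))
  val mask = Reg(Vec(entries, Vec(words, Vec(bytes, Bool()))))
  val data = Reg(Vec(entries, Vec(words, Vec(bytes, UInt(8.W)))))

  when(io.wen) {
    tag(io.idx) := io.reqTag
    for (i <- 0 until bytes) {
      when(io.reqMask(i)) { // touch only the bytes the store actually writes
        mask(io.idx)(io.word)(i) := true.B
        data(io.idx)(io.word)(i) := io.reqData(i * 8 + 7, i * 8)
      }
    }
  }
}
```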
...@@ -261,8 +247,8 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -261,8 +247,8 @@ class NewSbuffer extends XSModule with HasSbufferCst {
cohCount(mergeIdx) := 0.U cohCount(mergeIdx) := 0.U
for(i <- 0 until DataBytes){ for(i <- 0 until DataBytes){
when(req.mask(i)){ when(req.mask(i)){
maskUpdate(mergeIdx)(wordOffset)(i) := true.B mask(mergeIdx)(wordOffset)(i) := true.B
dataUpdate(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8) data(mergeIdx)(wordOffset)(i) := req.data(i*8+7, i*8)
} }
} }
} }
...@@ -273,7 +259,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -273,7 +259,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
mergeWordReq(io.in(0).bits, mergeIdx(0), firstWord) mergeWordReq(io.in(0).bits, mergeIdx(0), firstWord)
XSDebug(p"merge req 0 to line [${mergeIdx(0)}]\n") XSDebug(p"merge req 0 to line [${mergeIdx(0)}]\n")
}.otherwise{ }.otherwise{
wordReqToBufLine(io.in(0).bits, tags(0), firstInsertIdx, firstWord, true.B) wordReqToBufLine(io.in(0).bits, intags(0), firstInsertIdx, firstWord, true.B)
XSDebug(p"insert req 0 to line[$firstInsertIdx]\n") XSDebug(p"insert req 0 to line[$firstInsertIdx]\n")
} }
} }
...@@ -284,19 +270,14 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -284,19 +270,14 @@ class NewSbuffer extends XSModule with HasSbufferCst {
mergeWordReq(io.in(1).bits, mergeIdx(1), secondWord) mergeWordReq(io.in(1).bits, mergeIdx(1), secondWord)
XSDebug(p"merge req 1 to line [${mergeIdx(1)}]\n") XSDebug(p"merge req 1 to line [${mergeIdx(1)}]\n")
}.otherwise{ }.otherwise{
wordReqToBufLine(io.in(1).bits, tags(1), secondInsertIdx, secondWord, !sameTag) wordReqToBufLine(io.in(1).bits, intags(1), secondInsertIdx, secondWord, !sameTag)
XSDebug(p"insert req 1 to line[$secondInsertIdx]\n") XSDebug(p"insert req 1 to line[$secondInsertIdx]\n")
} }
} }
for(i <- 0 until StoreBufferSize){
buffer.write(i.U, bufferUpdate(i))
stateVec(i) := stateUpdate(i)
}
for(i <- 0 until StoreBufferSize){ for(i <- 0 until StoreBufferSize){
XSDebug(stateVec(i)=/=s_invalid, XSDebug(stateVec(i)=/=s_invalid,
p"[$i] timeout:${cohCount(i)(countBits-1)} state:${stateVec(i)} buf:${bufferRead(i)}\n" p"[$i] timeout:${cohCount(i)(countBits-1)} state:${stateVec(i)}\n"
) )
} }
...@@ -320,7 +301,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -320,7 +301,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
do_eviction := validCount >= 12.U do_eviction := validCount >= 12.U
io.flush.empty := empty && io.sqempty io.flush.empty := RegNext(empty && io.sqempty)
lru.io.flush := sbuffer_state === x_drain_sbuffer && empty lru.io.flush := sbuffer_state === x_drain_sbuffer && empty
switch(sbuffer_state){ switch(sbuffer_state){
is(x_idle){ is(x_idle){
...@@ -346,11 +327,11 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -346,11 +327,11 @@ class NewSbuffer extends XSModule with HasSbufferCst {
XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n") XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n")
def noSameBlockInflight(idx: UInt): Bool = { def noSameBlockInflight(idx: UInt): Bool = {
val tag = tagRead(idx) val atag = tag(idx)
!Cat(widthMap(i => { !Cat(widthMap(i => {
// stateVec(idx) itself must not be s_inflight* // stateVec(idx) itself must not be s_inflight*
(isInflight(stateRead(i)) || isPrepare(stateRead(i))) && (isInflight(stateVec(i)) || isPrepare(stateVec(i))) &&
tag === tagRead(i) atag === tag(i)
})).orR() })).orR()
} }
...@@ -384,9 +365,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -384,9 +365,9 @@ class NewSbuffer extends XSModule with HasSbufferCst {
dcacheReqValid := false.B dcacheReqValid := false.B
} }
when(prepareEn && (!dcacheReqValid || io.dcache.req.fire())) { when(prepareEn && (!dcacheReqValid || io.dcache.req.fire())) {
dcacheCandidate.addr := getAddr(tagRead(prepareIdx)) dcacheCandidate.addr := getAddr(tag(prepareIdx))
dcacheCandidate.data := bufferRead(prepareIdx).data dcacheCandidate.data := data(prepareIdx).asUInt
dcacheCandidate.mask := bufferRead(prepareIdx).mask dcacheCandidate.mask := mask(prepareIdx).asUInt
dcacheCandidate.cmd := MemoryOpConstants.M_XWR dcacheCandidate.cmd := MemoryOpConstants.M_XWR
dcacheCandidate.id := prepareIdx dcacheCandidate.id := prepareIdx
stateVec(prepareIdx) := s_inflight stateVec(prepareIdx) := s_inflight
...@@ -411,9 +392,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -411,9 +392,9 @@ class NewSbuffer extends XSModule with HasSbufferCst {
if (!env.FPGAPlatform) { if (!env.FPGAPlatform) {
difftestIO.sbufferResp := WireInit(io.dcache.resp.fire()) difftestIO.sbufferResp := WireInit(io.dcache.resp.fire())
difftestIO.sbufferAddr := WireInit(getAddr(tagRead(respId))) difftestIO.sbufferAddr := WireInit(getAddr(tag(respId)))
difftestIO.sbufferData := WireInit(bufferRead(respId).data.asTypeOf(Vec(CacheLineBytes, UInt(8.W)))) difftestIO.sbufferData := WireInit(data(respId).asTypeOf(Vec(CacheLineBytes, UInt(8.W))))
difftestIO.sbufferMask := WireInit(bufferRead(respId).mask) difftestIO.sbufferMask := WireInit(mask(respId).asUInt)
} }
val needSpace = (io.in(0).fire && !canMerge(0)) +& (io.in(1).fire && !canMerge(1) && !sameTag) val needSpace = (io.in(0).fire && !canMerge(0)) +& (io.in(1).fire && !canMerge(1) && !sameTag)
...@@ -431,7 +412,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -431,7 +412,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
when(isValid(stateVec(i))){ when(isValid(stateVec(i))){
when(cohCount(i)(countBits-1)){ when(cohCount(i)(countBits-1)){
assert(stateVec(i) === s_valid) assert(stateVec(i) === s_valid)
stateUpdate(i) := s_prepare stateVec(i) := s_prepare
} }
cohCount(i) := cohCount(i)+1.U cohCount(i) := cohCount(i)+1.U
} }
...@@ -440,7 +421,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -440,7 +421,7 @@ class NewSbuffer extends XSModule with HasSbufferCst {
// ---------------------- Load Data Forward --------------------- // ---------------------- Load Data Forward ---------------------
for ((forward, i) <- io.forward.zipWithIndex) { for ((forward, i) <- io.forward.zipWithIndex) {
val tag_matches = widthMap(i => tagRead(i) === getTag(forward.paddr)) val tag_matches = widthMap(i => tag(i) === getTag(forward.paddr))
val valid_tag_matches = widthMap(i => tag_matches(i) && isValid(stateVec(i))) val valid_tag_matches = widthMap(i => tag_matches(i) && isValid(stateVec(i)))
val inflight_tag_matches = widthMap(i => val inflight_tag_matches = widthMap(i =>
tag_matches(i) && (isInflight(stateVec(i)) || isPrepare(stateVec(i))) tag_matches(i) && (isInflight(stateVec(i)) || isPrepare(stateVec(i)))
...@@ -451,13 +432,11 @@ class NewSbuffer extends XSModule with HasSbufferCst { ...@@ -451,13 +432,11 @@ class NewSbuffer extends XSModule with HasSbufferCst {
val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_)) val inflight_tag_match_reg = inflight_tag_matches.map(RegNext(_))
val line_offset_reg = RegNext(line_offset_mask) val line_offset_reg = RegNext(line_offset_mask)
val selectedValidLine = Mux1H(valid_tag_match_reg, bufferRead) val selectedValidMask = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedValidMask = Mux1H(line_offset_reg, selectedValidLine.mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) val selectedValidData = Mux1H(line_offset_reg, Mux1H(valid_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedValidData = Mux1H(line_offset_reg, selectedValidLine.data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedInflightLine = Mux1H(inflight_tag_match_reg, bufferRead) val selectedInflightMask = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, mask).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool()))))
val selectedInflightMask = Mux1H(line_offset_reg, selectedInflightLine.mask.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, Bool())))) val selectedInflightData = Mux1H(line_offset_reg, Mux1H(inflight_tag_match_reg, data).asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
val selectedInflightData = Mux1H(line_offset_reg, selectedInflightLine.data.asTypeOf(Vec(CacheLineWords, Vec(DataBytes, UInt(8.W)))))
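The forward path is now split over two cycles: the tag CAM match happens in the first cycle and its one-hot result is registered, so the wide Mux1H over per-byte mask and data runs in the second cycle from registered selects. A sketch of the pattern (state qualification omitted; the storage is assumed to sit in registers, as the sbuffer's now does):

```scala
import chisel3._
import chisel3.util._

// Sketch of the two-cycle forward: cycle 0 matches tags and registers the
// one-hot hit vector; cycle 1 does the wide Mux1H select. `datas` is assumed
// register-resident, so the cycle-1 read is consistent with the match.
class TwoCycleForwardSketch(entries: Int, tagW: Int, dataW: Int) extends Module {
  val io = IO(new Bundle {
    val lookupTag = Input(UInt(tagW.W))
    val tags      = Input(Vec(entries, UInt(tagW.W)))
    val datas     = Input(Vec(entries, UInt(dataW.W)))
    val hitData   = Output(UInt(dataW.W))
  })
  val hitVecReg = io.tags.map(t => RegNext(t === io.lookupTag)) // cycle 0: CAM match, then cut
  io.hitData := Mux1H(hitVecReg, io.datas)                      // cycle 1: wide select
}
```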
for (j <- 0 until DataBytes) { for (j <- 0 until DataBytes) {
forward.forwardMask(j) := false.B forward.forwardMask(j) := false.B
......
...@@ -148,7 +148,7 @@ package object xiangshan { ...@@ -148,7 +148,7 @@ package object xiangshan {
def configable_cache(mode: UInt) = mode(7) def configable_cache(mode: UInt) = mode(7)
def strToMode(s: String) = { def strToMode(s: String) = {
var result = 0.U << 8 var result = 0.U(8.W)
if (s.toUpperCase.indexOf("R") >= 0) result = result + R if (s.toUpperCase.indexOf("R") >= 0) result = result + R
if (s.toUpperCase.indexOf("W") >= 0) result = result + W if (s.toUpperCase.indexOf("W") >= 0) result = result + W
if (s.toUpperCase.indexOf("X") >= 0) result = result + X if (s.toUpperCase.indexOf("X") >= 0) result = result + X
......