Commit e8235ed3 authored by good-circle

Memblock: Add basic Vector load and store logic

Vector load/store queue, exception handling and writeback will be improved later

---------
Co-authored-by: lulu0521 <majianlu_0521@163.com>
Parent: cdbff57c
@@ -64,8 +64,8 @@ class MinimalConfig(n: Int = 1) extends Config(
       IssQueSize = 8,
       NRPhyRegs = 64,
       VirtualLoadQueueSize = 16,
       LoadQueueRARSize = 16,
       LoadQueueRAWSize = 12,
       LoadQueueReplaySize = 8,
       LoadUncacheBufferSize = 8,
       LoadQueueNWriteBanks = 4, // NOTE: make sure that LoadQueue{RAR, RAW, Replay}Size is divided by LoadQueueNWriteBanks.
@@ -78,13 +78,24 @@ class MinimalConfig(n: Int = 1) extends Config(
       IBufSize = 16,
       StoreBufferSize = 4,
       StoreBufferThreshold = 3,
+      // TODO: VLSU, enqueue logic should be optimized for minimal config (Now vlsu queue size is same as default config)
+      // However, VirtualLoadQueueSize is 16 for minimal config
+      // So in fact Vector load/store instructions cannot be executed at minimalconfig now
+      // New vector load/store queue will be implemented soon
+      UsQueueSize = 8,
+      VlFlowSize = 32,
+      VlUopSize = 32,
+      VsFlowSize = 32,
+      VsUopSize = 32,
       dpParams = DispatchParameters(
         IntDqSize = 12,
         FpDqSize = 12,
         LsDqSize = 12,
+        VlsDqSize = 12,
         IntDqDeqWidth = 4,
         FpDqDeqWidth = 4,
-        LsDqDeqWidth = 4
+        LsDqDeqWidth = 4,
+        VlsDqDeqWidth = 4
       ),
       exuParameters = ExuParameters(
         JmpCnt = 1,
@@ -95,7 +106,9 @@ class MinimalConfig(n: Int = 1) extends Config(
         FmiscCnt = 1,
         FmiscDivSqrtCnt = 0,
         LduCnt = 2,
-        StuCnt = 2
+        StuCnt = 2,
+        VlCnt = 2,
+        VsCnt = 2
       ),
       icacheParameters = ICacheParameters(
         nSets = 64, // 16KB ICache
...
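The TODO block above notes that the VLSU queue depths are copied from the default config even though MinimalConfig shrinks the scalar load queue. A small standalone sketch of that mismatch (values taken from this config; reading it as the blocking constraint is an assumption, "one reason" at most):

```scala
object MinimalVlsuSizes extends App {
  val VirtualLoadQueueSize = 16 // MinimalConfig value
  val VlFlowSize           = 32 // unchanged default-config value
  // The vector flow queue alone can hold more in-flight loads than the virtual
  // load queue can track, which is one reason vector memory instructions cannot
  // run under MinimalConfig yet (per the TODO above).
  println(s"VlFlowSize ($VlFlowSize) > VirtualLoadQueueSize ($VirtualLoadQueueSize)")
}
```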
@@ -154,6 +154,19 @@ class FPUCtrlSignals(implicit p: Parameters) extends XSBundle {
   val rm = UInt(3.W)
 }
+
+class VType(implicit p: Parameters) extends XSBundle {
+  val vma = Bool()
+  val vta = Bool()
+  val vsew = UInt(3.W)
+  val vlmul = UInt(3.W)
+}
+
+class VConfig(implicit p: Parameters) extends XSBundle {
+  val vl = UInt(8.W)
+  val vstart = UInt(8.W)
+  val vtype = new VType
+}
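The new VType bundle mirrors the low bits of the RVV vtype CSR. A minimal Chisel sketch of how those fields could be unpacked, assuming it sits next to the bundle definitions above; the helper itself is hypothetical and not part of this commit, but the field positions follow the V-spec 1.0 layout:

```scala
// Hypothetical helper (not in this commit): unpack vtype CSR bits into VType.
def decodeVType(vtypeCsr: UInt)(implicit p: Parameters): VType = {
  val vt = Wire(new VType)
  vt.vma   := vtypeCsr(7)    // mask-agnostic bit
  vt.vta   := vtypeCsr(6)    // tail-agnostic bit
  vt.vsew  := vtypeCsr(5, 3) // element width = 8 << vsew bits
  vt.vlmul := vtypeCsr(2, 0) // register-group multiplier (signed encoding)
  vt
}
```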
 // Decode DecodeWidth insts at Decode Stage
 class CtrlSignals(implicit p: Parameters) extends XSBundle {
   val debug_globalID = UInt(XLEN.W)
@@ -172,6 +185,11 @@ class CtrlSignals(implicit p: Parameters) extends XSBundle {
   val imm = UInt(ImmUnion.maxLen.W)
   val commitType = CommitType()
   val fpu = new FPUCtrlSignals
+  val uopIdx = UInt(log2Up(MaxUopSize).W)
+  val total_num = UInt(log2Up(MaxUopSize).W)
+  val firstUop = Bool()
+  val lastUop = Bool()
+  val vconfig = new VConfig
   val isMove = Bool()
   val singleStep = Bool()
   // This inst will flush all the pipe when it is the oldest inst in ROB,
@@ -335,12 +353,16 @@ class DebugBundle(implicit p: Parameters) extends XSBundle {
   // val levelTlbHit = UInt(2.W)
 }
-class ExuInput(implicit p: Parameters) extends XSBundleWithMicroOp {
-  val src = Vec(3, UInt(XLEN.W))
+class ExuInput(isVpu: Boolean = false)(implicit p: Parameters) extends XSBundleWithMicroOp {
+  val dataWidth = if (isVpu) VLEN else XLEN
+  val src = Vec(4, UInt(dataWidth.W))
 }
-class ExuOutput(implicit p: Parameters) extends XSBundleWithMicroOp {
-  val data = UInt(XLEN.W)
+class ExuOutput(isVpu: Boolean = false)(implicit p: Parameters) extends XSBundleWithMicroOp {
+  val dataWidth = if (isVpu) VLEN else XLEN
+  val data = UInt(dataWidth.W)
   val fflags = UInt(5.W)
   val redirectValid = Bool()
   val redirect = new Redirect
...
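The isVpu flag selects the datapath width: scalar ExuInput/ExuOutput stay XLEN wide while vector ones use VLEN, and both now carry four sources. A quick usage sketch (inside module scope with an implicit Parameters; widths assume the usual XLEN = 64, VLEN = 128):

```scala
val scalarIn = Wire(new ExuInput())              // src: Vec(4, UInt(64.W))
val vectorIn = Wire(new ExuInput(isVpu = true))  // src: Vec(4, UInt(128.W))
val vectorWb = Wire(new ExuOutput(isVpu = true)) // data: UInt(128.W)
```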
@@ -128,6 +128,7 @@ case class XSCoreParameters
   DecodeWidth: Int = 6,
   RenameWidth: Int = 6,
   CommitWidth: Int = 6,
+  MaxUopSize: Int = 37,
   EnableRenameSnapshot: Boolean = true,
   RenameSnapshotNum: Int = 4,
   FtqSize: Int = 64,
@@ -144,15 +145,21 @@ case class XSCoreParameters
   StoreQueueSize: Int = 64,
   StoreQueueNWriteBanks: Int = 8, // NOTE: make sure that StoreQueueSize is divided by StoreQueueNWriteBanks
   StoreQueueForwardWithMask: Boolean = true,
-  VlsQueueSize: Int = 8,
+  UsQueueSize: Int = 8,
+  VlFlowSize: Int = 32,
+  VlUopSize: Int = 32,
+  VsFlowSize: Int = 32,
+  VsUopSize: Int = 32,
   RobSize: Int = 256,
   dpParams: DispatchParameters = DispatchParameters(
     IntDqSize = 16,
     FpDqSize = 16,
     LsDqSize = 16,
+    VlsDqSize = 16,
    IntDqDeqWidth = 4,
     FpDqDeqWidth = 4,
-    LsDqDeqWidth = 4
+    LsDqDeqWidth = 4,
+    VlsDqDeqWidth = 4,
   ),
   exuParameters: ExuParameters = ExuParameters(
     JmpCnt = 1,
@@ -163,14 +170,15 @@ case class XSCoreParameters
     FmiscCnt = 2,
     FmiscDivSqrtCnt = 0,
     LduCnt = 2,
-    StuCnt = 2
+    StuCnt = 2,
+    VlCnt = 2,
+    VsCnt = 2
   ),
   prefetcher: Option[PrefetcherParams] = Some(SMSParams()),
   LoadPipelineWidth: Int = 2,
   StorePipelineWidth: Int = 2,
-  VecMemSrcInWidth: Int = 2,
-  VecMemInstWbWidth: Int = 1,
-  VecMemDispatchWidth: Int = 1,
+  VecLoadPipelineWidth: Int = 2,
+  VecStorePipelineWidth: Int = 2,
   StoreBufferSize: Int = 16,
   StoreBufferThreshold: Int = 7,
   EnsbufferWidth: Int = 2,
@@ -287,7 +295,16 @@ case class XSCoreParameters
     Seq.fill(exuParameters.FmacCnt)(FmacExeUnitCfg) ++
     Seq.fill(exuParameters.FmiscCnt)(FmiscExeUnitCfg)
-  val exuConfigs: Seq[ExuConfig] = intExuConfigs ++ fpExuConfigs ++ loadExuConfigs ++ storeExuConfigs
+  // TODO: Backend for VLSU, fix Vector exuconfigs and writebackconfigs
+  val vecloadExuConfigs = Seq.fill(exuParameters.VlCnt)(vecLdExeUnitCfg)
+  val vecstoreExuConfigs = Seq.fill(exuParameters.VsCnt)(vecStaExeUnitCfg) ++ Seq.fill(exuParameters.VsCnt)(vecStdExeUnitCfg)
+  val vecstoreWritebackConfigs = Seq.fill(exuParameters.VsCnt)(vecStaExeUnitCfg)
+  val vlsuExuConfigs = vecloadExuConfigs ++ vecstoreExuConfigs
+  val vlsuWritebackConfigs = vecloadExuConfigs ++ vecstoreWritebackConfigs
+
+  val exuConfigs: Seq[ExuConfig] = intExuConfigs ++ fpExuConfigs ++ loadExuConfigs ++ storeExuConfigs ++ vlsuExuConfigs
+  val exuWritebackConfigs: Seq[ExuConfig] = intExuConfigs ++ fpExuConfigs ++ loadExuConfigs ++ storeExuConfigs ++ vlsuWritebackConfigs
 }
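Execution-side and writeback-side config lists now diverge: each vector store occupies a vecSta and a vecStd unit for execution but merges into a single writeback. A worked count with the default VlCnt = VsCnt = 2 (plain Scala, for illustration only):

```scala
object VlsuConfigCounts extends App {
  val (vlCnt, vsCnt) = (2, 2)
  val vlsuExu = vlCnt + 2 * vsCnt // vecLd + vecSta + vecStd = 6 execution configs
  val vlsuWb  = vlCnt + vsCnt     // each vecSta/vecStd pair shares one port = 4 writebacks
  println(s"execution $vlsuExu vs writeback $vlsuWb (slack = ${vlsuExu - vlsuWb} = VsCnt)")
}
```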
 case object DebugOptionsKey extends Field[DebugOptions]
@@ -401,6 +418,7 @@ trait HasXSParameter {
   val DecodeWidth = coreParams.DecodeWidth
   val RenameWidth = coreParams.RenameWidth
   val CommitWidth = coreParams.CommitWidth
+  val MaxUopSize = coreParams.MaxUopSize
   val EnableRenameSnapshot = coreParams.EnableRenameSnapshot
   val RenameSnapshotNum = coreParams.RenameSnapshotNum
   val FtqSize = coreParams.FtqSize
@@ -420,19 +438,22 @@ trait HasXSParameter {
   val StoreQueueSize = coreParams.StoreQueueSize
   val StoreQueueNWriteBanks = coreParams.StoreQueueNWriteBanks
   val StoreQueueForwardWithMask = coreParams.StoreQueueForwardWithMask
-  val VlsQueueSize = coreParams.VlsQueueSize
+  val UsQueueSize = coreParams.UsQueueSize
+  val VlFlowSize = coreParams.VlFlowSize
+  val VlUopSize = coreParams.VlUopSize
+  val VsFlowSize = coreParams.VsFlowSize
+  val VsUopSize = coreParams.VsUopSize
   val dpParams = coreParams.dpParams
   val exuParameters = coreParams.exuParameters
   val NRMemReadPorts = exuParameters.LduCnt + 2 * exuParameters.StuCnt
   val NRIntReadPorts = 2 * exuParameters.AluCnt + NRMemReadPorts
   val NRIntWritePorts = exuParameters.AluCnt + exuParameters.MduCnt + exuParameters.LduCnt
   val NRFpReadPorts = 3 * exuParameters.FmacCnt + exuParameters.StuCnt
-  val NRFpWritePorts = exuParameters.FpExuCnt + exuParameters.LduCnt
+  val NRFpWritePorts = exuParameters.FpExuCnt + exuParameters.LduCnt + exuParameters.VlCnt
   val LoadPipelineWidth = coreParams.LoadPipelineWidth
   val StorePipelineWidth = coreParams.StorePipelineWidth
-  val VecMemSrcInWidth = coreParams.VecMemSrcInWidth
-  val VecMemInstWbWidth = coreParams.VecMemInstWbWidth
-  val VecMemDispatchWidth = coreParams.VecMemDispatchWidth
+  val VecLoadPipelineWidth = coreParams.VecLoadPipelineWidth
+  val VecStorePipelineWidth = coreParams.VecStorePipelineWidth
   val StoreBufferSize = coreParams.StoreBufferSize
   val StoreBufferThreshold = coreParams.StoreBufferThreshold
   val EnsbufferWidth = coreParams.EnsbufferWidth
@@ -459,7 +480,8 @@ trait HasXSParameter {
   val NumRs = (exuParameters.JmpCnt+1)/2 + (exuParameters.AluCnt+1)/2 + (exuParameters.MulCnt+1)/2 +
              (exuParameters.MduCnt+1)/2 + (exuParameters.FmacCnt+1)/2 + + (exuParameters.FmiscCnt+1)/2 +
              (exuParameters.FmiscDivSqrtCnt+1)/2 + (exuParameters.LduCnt+1)/2 +
-             (exuParameters.StuCnt+1)/2 + (exuParameters.StuCnt+1)/2
+             (exuParameters.StuCnt+1)/2 + (exuParameters.StuCnt+1)/2 +
+             (exuParameters.VlCnt+1)/2 + (exuParameters.VsCnt+1)/2 + (exuParameters.VsCnt+1)/2
   val instBytes = if (HasCExtension) 2 else 4
   val instOffsetBits = log2Ceil(instBytes)
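The NumRs formula pairs execution units two per reservation station. A worked check of the vector term added above, with the default VlCnt = VsCnt = 2 (not repository code):

```scala
def rsFor(cnt: Int) = (cnt + 1) / 2        // one RS serves up to two units
val vecRs = rsFor(2) + rsFor(2) + rsFor(2) // vecLd + vecSta + vecStd
// => 3 additional reservation stations on top of the scalar ones
```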
@@ -498,6 +520,10 @@ trait HasXSParameter {
   val fpExuConfigs = coreParams.fpExuConfigs
   val exuConfigs = coreParams.exuConfigs
+  val exuWritebackConfigs = coreParams.exuWritebackConfigs
+
+  val vlsuExuConfigs = coreParams.vlsuExuConfigs
+  val vlsuWritebackConfigs = coreParams.vlsuWritebackConfigs
   val PCntIncrStep: Int = 6
   val numPCntHc: Int = 25
...
@@ -146,7 +146,7 @@ abstract class XSCoreBase()(implicit p: config.Parameters) extends LazyModule
     ptw_to_l2_buffer.node := ptw.node
   }
-  val wbArbiter = LazyModule(new WbArbiterWrapper(exuConfigs, NRIntWritePorts, NRFpWritePorts))
+  val wbArbiter = LazyModule(new WbArbiterWrapper(exuWritebackConfigs, NRIntWritePorts, NRFpWritePorts))
   val intWbPorts = wbArbiter.intWbPorts
   val fpWbPorts = wbArbiter.fpWbPorts
@@ -157,6 +157,7 @@ abstract class XSCoreBase()(implicit p: config.Parameters) extends LazyModule
   require(exuParameters.FmiscCnt <= exuParameters.FmacCnt && exuParameters.FmiscCnt > 0)
   require(exuParameters.LduCnt == exuParameters.StuCnt) // TODO: remove this limitation
+  // TODO: Backend for VLSU, fix schedule and dispatch
   // one RS every 2 MDUs
   val schedulePorts = Seq(
     // exuCfg, numDeq, intFastWakeupTarget, fpFastWakeupTarget
@@ -170,7 +171,10 @@ abstract class XSCoreBase()(implicit p: config.Parameters) extends LazyModule
     ),
     Seq(
       (FmacExeUnitCfg, exuParameters.FmacCnt, Seq(), Seq(FmacExeUnitCfg, FmiscExeUnitCfg)),
-      (FmiscExeUnitCfg, exuParameters.FmiscCnt, Seq(), Seq())
+      (FmiscExeUnitCfg, exuParameters.FmiscCnt, Seq(), Seq()),
+      (vecLdExeUnitCfg, exuParameters.VlCnt, Seq(AluExeUnitCfg, vecLdExeUnitCfg), Seq()),
+      (vecStaExeUnitCfg, exuParameters.VsCnt, Seq(), Seq()),
+      (vecStdExeUnitCfg, exuParameters.VsCnt, Seq(), Seq())
     )
   )
@@ -207,7 +211,11 @@ abstract class XSCoreBase()(implicit p: config.Parameters) extends LazyModule
     else Seq((0, i))
   })
-  val dispatchPorts = Seq(intDpPorts ++ lsDpPorts, fpDpPorts)
+  val vlsDpPorts = (0 until exuParameters.VlCnt).map(i => Seq((2, i))) ++
+                   (0 until exuParameters.VsCnt).map(i => Seq((3, i))) ++
+                   (0 until exuParameters.VsCnt).map(i => Seq((4, i)))
+  val dispatchPorts = Seq(intDpPorts ++ lsDpPorts, fpDpPorts ++ vlsDpPorts)
   val outIntRfReadPorts = Seq(0, 0)
   val outFpRfReadPorts = Seq(0, StorePipelineWidth)
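The indices in vlsDpPorts refer to positions inside the fp scheduler's exu list above (0 = Fmac, 1 = Fmisc, 2 = vecLd, 3 = vecSta, 4 = vecStd). Evaluated with the default VlCnt = VsCnt = 2, the mapping unrolls to (illustration only):

```scala
val vlsDpPorts =
  (0 until 2).map(i => Seq((2, i))) ++ // vector loads -> exu group 2
  (0 until 2).map(i => Seq((3, i))) ++ // vector sta   -> exu group 3
  (0 until 2).map(i => Seq((4, i)))    // vector std   -> exu group 4
// => Seq(Seq((2,0)), Seq((2,1)), Seq((3,0)), Seq((3,1)), Seq((4,0)), Seq((4,1)))
```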
@@ -224,7 +232,7 @@ abstract class XSCoreBase()(implicit p: config.Parameters) extends LazyModule
       )
     })))
-  val wb2Ctrl = LazyModule(new Wb2Ctrl(exuConfigs))
+  val wb2Ctrl = LazyModule(new Wb2Ctrl(exuWritebackConfigs))
   wb2Ctrl.addWritebackSink(exuBlocks :+ memBlock)
   val dpExuConfigs = exuBlocks.flatMap(_.scheduler.dispatch2.map(_.configs))
   val ctrlBlock = LazyModule(new CtrlBlock(dpExuConfigs))
@@ -271,8 +279,10 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
   io.cpu_halt := ctrlBlock.io.cpu_halt
   outer.wbArbiter.module.io.redirect <> ctrlBlock.io.redirect
-  val allWriteback = exuBlocks.flatMap(_.io.fuWriteback) ++ memBlock.io.writeback
-  require(exuConfigs.length == allWriteback.length, s"${exuConfigs.length} != ${allWriteback.length}")
+  val allWriteback = exuBlocks.flatMap(_.io.fuWriteback) ++ memBlock.io.writeback ++ memBlock.io.vecWriteback
+  // TODO: Backend for VLSU, fix it
+  // Now vector store insts have vecsta and vecstd, but will writeback vecsta and vecstd together
+  require(exuConfigs.length == allWriteback.length + exuParameters.VsCnt, s"${exuConfigs.length} != ${allWriteback.length}")
   outer.wbArbiter.module.io.in <> allWriteback
   val rfWriteback = outer.wbArbiter.module.io.out
@@ -312,7 +322,8 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
   outer.ctrlBlock.generateWritebackIO()
   val allFastUop = exuBlocks.flatMap(b => b.io.fastUopOut.dropRight(b.numOutFu)) ++ memBlock.io.otherFastWakeup
-  require(allFastUop.length == exuConfigs.length, s"${allFastUop.length} != ${exuConfigs.length}")
+  // TODO: Backend for VLSU, Fix fastuop logic
+  require(allFastUop.length == exuConfigs.length - exuParameters.VlCnt - 2 * exuParameters.VsCnt, s"${allFastUop.length} != ${exuConfigs.length}")
   val intFastUop = allFastUop.zip(exuConfigs).filter(_._2.writeIntRf).map(_._1)
   val fpFastUop = allFastUop.zip(exuConfigs).filter(_._2.writeFpRf).map(_._1)
   val intFastUop1 = outer.wbArbiter.intConnections.map(c => intFastUop(c.head))
@@ -426,6 +437,9 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
   memBlock.io.lsTopdownInfo <> ctrlBlock.io.robio.lsTopdownInfo
   memBlock.io.l2_hint.valid := io.l2_hint.valid
   memBlock.io.l2_hint.bits.sourceId := io.l2_hint.bits.sourceId
+  // TODO: Backend for VLSU, implement vector load & store in
+  memBlock.io.VecloadRegIn := DontCare
+  memBlock.io.vecStoreIn := DontCare
   val itlbRepeater1 = PTWFilter(itlbParams.fenceDelay,frontend.io.ptw, fenceio.sfence, csrioIn.tlb, l2tlbParams.ifilterSize)
   val itlbRepeater2 = PTWRepeaterNB(passReady = false, itlbParams.fenceDelay, itlbRepeater1.io.ptw, ptw.io.tlb(0), fenceio.sfence, csrioIn.tlb)
...
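The two relaxed require() checks above encode the same bookkeeping: vector stores execute on two units but write back once, and no vector unit produces a fast wakeup yet. A standalone recount under the default unit counts (assumed values, for illustration):

```scala
object VlsuPortBookkeeping extends App {
  val (vlCnt, vsCnt) = (2, 2)
  val vecExuCfgs   = vlCnt + 2 * vsCnt // vecLd + vecSta + vecStd = 6
  val vecWriteback = vlCnt + vsCnt     // merged sta/std writeback = 4
  val vecFastUop   = 0                 // no vector fast wakeup yet
  println(s"writeback slack = ${vecExuCfgs - vecWriteback} (== VsCnt)")
  println(s"fastUop slack   = ${vecExuCfgs - vecFastUop} (== VlCnt + 2*VsCnt)")
}
```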
@@ -268,7 +268,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
   val frontend = Flipped(new FrontendToCtrlIO)
   // to exu blocks
   val allocPregs = Vec(RenameWidth, Output(new ResetPregStateReq))
-  val dispatch = Vec(3*dpParams.IntDqDeqWidth, DecoupledIO(new MicroOp))
+  val dispatch = Vec(4*dpParams.IntDqDeqWidth, DecoupledIO(new MicroOp))
   val rsReady = Vec(outer.dispatch2.map(_.module.io.out.length).sum, Input(Bool()))
   val enqLsq = Flipped(new LsqEnqIO)
   val lqCancelCnt = Input(UInt(log2Up(VirtualLoadQueueSize + 1).W))
@@ -336,6 +336,8 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
   val intDq = Module(new DispatchQueue(dpParams.IntDqSize, RenameWidth, dpParams.IntDqDeqWidth))
   val fpDq = Module(new DispatchQueue(dpParams.FpDqSize, RenameWidth, dpParams.FpDqDeqWidth))
   val lsDq = Module(new DispatchQueue(dpParams.LsDqSize, RenameWidth, dpParams.LsDqDeqWidth))
+  // TODO: Backend for VLSU, fix vlsdq and other logic
+  val vlsDq = Module(new DispatchQueue(dpParams.VlsDqSize, RenameWidth, dpParams.VlsDqDeqWidth))
   val redirectGen = Module(new RedirectGenerator)
   val rob = outer.rob.module
@@ -574,6 +576,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
   dispatch.io.toIntDq <> intDq.io.enq
   dispatch.io.toFpDq <> fpDq.io.enq
   dispatch.io.toLsDq <> lsDq.io.enq
+  dispatch.io.toVlsDq <> vlsDq.io.enq
   dispatch.io.allocPregs <> io.allocPregs
   dispatch.io.robHead := rob.io.debugRobHead
   dispatch.io.stallReason <> rename.io.stallReason.out
@@ -586,8 +589,9 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
   intDq.io.redirect <> redirectForExu
   fpDq.io.redirect <> redirectForExu
   lsDq.io.redirect <> redirectForExu
+  vlsDq.io.redirect <> redirectForExu

-  val dpqOut = intDq.io.deq ++ lsDq.io.deq ++ fpDq.io.deq
+  val dpqOut = intDq.io.deq ++ lsDq.io.deq ++ fpDq.io.deq ++ vlsDq.io.deq
   io.dispatch <> dpqOut
   for (dp2 <- outer.dispatch2.map(_.module.io)) {
@@ -676,7 +680,7 @@ class CtrlBlockImp(outer: CtrlBlock)(implicit p: Parameters) extends LazyModuleI
   val perfinfo = IO(new Bundle(){
     val perfEventsRs = Input(Vec(NumRs, new PerfEvent))
     val perfEventsEu0 = Input(Vec(6, new PerfEvent))
-    val perfEventsEu1 = Input(Vec(6, new PerfEvent))
+    val perfEventsEu1 = Input(Vec(10, new PerfEvent))
   })
   val allPerfEvents = Seq(decode, rename, dispatch, intDq, fpDq, lsDq, rob).flatMap(_.getPerf)
...
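io.dispatch grows from 3 to 4 groups of IntDqDeqWidth ports because vlsDq.io.deq is appended to dpqOut; the expression implicitly assumes all four queues share the same deq width (true in both configs here). A quick count under that assumption:

```scala
val intDqDeqWidth = 4                     // same width assumed for fp, ls, vls
val dispatchPortCount = 4 * intDqDeqWidth // int + ls + fp + vls deq ports = 16
```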
@@ -54,7 +54,8 @@ class MemBlock()(implicit p: Parameters) extends LazyModule
   override val writebackSourceParams: Seq[WritebackSourceParams] = {
     val params = new WritebackSourceParams
-    params.exuConfigs = (loadExuConfigs ++ storeExuConfigs).map(cfg => Seq(cfg))
+    // TODO: Backend for VLSU, implement vlsu configs
+    params.exuConfigs = (loadExuConfigs ++ storeExuConfigs ++ vlsuWritebackConfigs).map(cfg => Seq(cfg))
     Seq(params)
   }
   override lazy val writebackSourceImp: HasWritebackSourceImp = module
@@ -72,20 +73,20 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   val redirect = Flipped(ValidIO(new Redirect))
   // in
   val issue = Vec(exuParameters.LsExuCnt + exuParameters.StuCnt, Flipped(DecoupledIO(new ExuInput)))
+  val VecloadRegIn = Vec(exuParameters.VlCnt, Flipped(Decoupled(new ExuInput(isVpu = true))))
+  val vecStoreIn = Vec(exuParameters.VsCnt, Flipped(DecoupledIO(new ExuInput(isVpu = true))))
   val loadFastMatch = Vec(exuParameters.LduCnt, Input(UInt(exuParameters.LduCnt.W)))
   val loadFastImm = Vec(exuParameters.LduCnt, Input(UInt(12.W)))
   val rsfeedback = Vec(exuParameters.LsExuCnt, new MemRSFeedbackIO)
   val loadPc = Vec(exuParameters.LduCnt, Input(UInt(VAddrBits.W))) // for hw prefetch
   val stIssuePtr = Output(new SqPtr())
-  val int2vlsu = Flipped(new Int2VLSUIO)
-  val vec2vlsu = Flipped(new Vec2VLSUIO)
   // out
   val writeback = Vec(exuParameters.LsExuCnt + exuParameters.StuCnt, DecoupledIO(new ExuOutput))
+  // TODO: VLSU, implement writeback and feedback
+  val vecWriteback = Vec(exuParameters.VlCnt + exuParameters.VsCnt, DecoupledIO(new ExuOutput(isVpu = true)))
+  val vecfeedback = Vec(exuParameters.VlCnt + exuParameters.VsCnt, ValidIO(Bool()))
   val s3_delayed_load_error = Vec(exuParameters.LduCnt, Output(Bool()))
   val otherFastWakeup = Vec(exuParameters.LduCnt + 2 * exuParameters.StuCnt, ValidIO(new MicroOp))
-  val vlsu2vec = new VLSU2VecIO
-  val vlsu2int = new VLSU2IntIO
-  val vlsu2ctrl = new VLSU2CtrlIO
   // prefetch to l1 req
   val prefetch_req = Flipped(DecoupledIO(new L1PrefetchReq))
   // misc
@@ -121,7 +122,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     val l2_hint = Input(Valid(new L2ToL1Hint()))
   })
-  override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.writeback))
+  override def writebackSource1: Option[Seq[Seq[DecoupledIO[ExuOutput]]]] = Some(Seq(io.writeback ++ io.vecWriteback))
   val redirect = RegNextWithEnable(io.redirect)
@@ -174,6 +175,13 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   loadUnits.zipWithIndex.map(x => x._1.suggestName("LoadUnit_"+x._2))
   storeUnits.zipWithIndex.map(x => x._1.suggestName("StoreUnit_"+x._2))
   val atomicsUnit = Module(new AtomicsUnit)
+  val vectorLoadWrapperModule = Module(new VectorLoadWrapper)
+  val vsFlowQueue = Module(new VsFlowQueue)
+  // TODO: VLSU, implement it
+  vsFlowQueue.io.issuePtrExt := DontCare
+  vsFlowQueue.io.flowPtrExt := DontCare
+  val vsUopQueue = Module(new VsUopQueue)
   // Atom inst comes from sta / std, then its result
   // will be writebacked using load writeback port
@@ -195,6 +203,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   io.writeback <> ldExeWbReqs ++ VecInit(storeUnits.map(_.io.stout)) ++ VecInit(stdExeUnits.map(_.io.out))
   io.otherFastWakeup := DontCare
   io.otherFastWakeup.take(2).zip(loadUnits.map(_.io.fast_uop)).foreach{case(a,b)=> a := b}
+  // TODO: VLSU, implement it
+  val vectorFaskWakeup = loadUnits.map(_.io.vec_fast_uop)
   val stOut = io.writeback.drop(exuParameters.LduCnt).dropRight(exuParameters.StuCnt)
   // prefetch to l1 req
@@ -227,7 +237,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   // TODO: fast load wakeup
   val lsq = Module(new LsqWrapper)
-  val vlsq = Module(new DummyVectorLsq)
   val sbuffer = Module(new Sbuffer)
   // if you wants to stress test dcache store, use FakeSbuffer
   // val sbuffer = Module(new FakeSbuffer) // out of date now
@@ -370,7 +379,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   val fastReplaySel = loadUnits.zipWithIndex.map { case (ldu, i) => {
     val wrapper = Wire(Valid(new BalanceEntry))
     wrapper.valid := ldu.io.fast_rep_out.valid
     wrapper.bits.req := ldu.io.fast_rep_out.bits
     wrapper.bits.balance := ldu.io.fast_rep_out.bits.rep_info.bank_conflict
     wrapper.bits.port := i.U
@@ -381,15 +390,19 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   for (i <- 0 until exuParameters.LduCnt) {
     loadUnits(i).io.redirect <> redirect
     loadUnits(i).io.isFirstIssue := true.B
     // get input form dispatch
     loadUnits(i).io.ldin <> io.issue(i)
+    // vector input from vector load queue
+    loadUnits(i).io.vecldin <> vectorLoadWrapperModule.io.loadPipeOut(i)
     loadUnits(i).io.feedback_slow <> io.rsfeedback(i).feedbackSlow
     loadUnits(i).io.feedback_fast <> io.rsfeedback(i).feedbackFast
     loadUnits(i).io.rsIdx := io.rsfeedback(i).rsIdx
     // fast replay
     loadUnits(i).io.fast_rep_in.valid := balanceFastReplaySel(i).valid
     loadUnits(i).io.fast_rep_in.bits := balanceFastReplaySel(i).bits.req
     loadUnits(i).io.fast_rep_out.ready := false.B
@@ -398,7 +411,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
       loadUnits(i).io.fast_rep_out.ready := loadUnits(j).io.fast_rep_in.ready
     }
   }
     // get input form dispatch
     loadUnits(i).io.ldin <> io.issue(i)
     // dcache access
@@ -418,7 +431,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     loadUnits(i).io.tlb <> dtlb_reqs.take(exuParameters.LduCnt)(i)
     // pmp
     loadUnits(i).io.pmp <> pmp_check(i).resp
     // st-ld violation query
     for (s <- 0 until StorePipelineWidth) {
       loadUnits(i).io.stld_nuke_query(s) := storeUnits(s).io.stld_nuke_query
     }
@@ -466,6 +479,9 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     // update mem dependency predictor
     // io.memPredUpdate(i) := DontCare
+    // vector load resp
+    vectorLoadWrapperModule.io.loadPipleIn(i) <> loadUnits(i).io.vecldout
     // --------------------------------
     // Load Triggers
     // --------------------------------
@@ -502,6 +518,16 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     dtlb_reqs(PrefetcherDTLBPortIndex).resp.ready := true.B
   }
+  // vector store
+  vsUopQueue.io.Redirect <> redirect
+  vsFlowQueue.io.Redirect <> redirect
+  for (i <- 0 until VecStorePipelineWidth) {
+    // TODO: VLSU, implement it
+    vsUopQueue.io.storeIn(i) := DontCare
+    vsUopQueue.io.vstart(i) := DontCare
+    vsFlowQueue.io.uopIn(i) <> vsUopQueue.io.uop2Flow(i)
+  }
   // StoreUnit
   for (i <- 0 until exuParameters.StuCnt) {
     val stu = storeUnits(i)
@@ -517,7 +543,12 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
     // NOTE: just for dtlb's perf cnt
     stu.io.isFirstIssue <> io.rsfeedback(exuParameters.LduCnt + i).isFirstIssue
     stu.io.stin <> io.issue(exuParameters.LduCnt + i)
-    stu.io.lsq <> lsq.io.sta.storeAddrIn(i)
+    // vector store
+    stu.io.vecstin <> vsFlowQueue.io.storePipeOut(i)
+    stu.io.vec_isFirstIssue <> vsFlowQueue.io.isFirstIssue(i)
+    stu.io.vec_feedback_slow <> vsFlowQueue.io.vsfqFeedback(i)
+    stu.io.lsq <> lsq.io.sta.storeAddrIn(i)
     stu.io.lsq_replenish <> lsq.io.sta.storeAddrInRe(i)
     // dtlb
     stu.io.tlb <> dtlb_reqs.drop(exuParameters.LduCnt)(i)
@@ -615,6 +646,17 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   lsq.io.sbuffer <> sbuffer.io.in
   lsq.io.sqEmpty <> sbuffer.io.sqempty
+  // vector loadqueue wrapper
+  vectorLoadWrapperModule.io.loadRegIn := DontCare
+  vectorLoadWrapperModule.io.Redirect := DontCare
+  vectorLoadWrapperModule.io.vecLoadWriteback := DontCare
+  vectorLoadWrapperModule.io.vecFeedback := DontCare
+  for (i <- 0 until 2) {
+    dontTouch(vectorLoadWrapperModule.io.loadPipleIn(i))
+    vectorLoadWrapperModule.io.loadPipleIn(i) <> loadUnits(i).io.vecldout
+  }
   // Sbuffer
   sbuffer.io.csrCtrl <> csrCtrl
   sbuffer.io.dcache <> dcache.io.lsu.store
@@ -631,13 +673,6 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
   sbuffer.io.flush.valid := RegNext(fenceFlush || atomicsFlush)
   uncache.io.flush.valid := sbuffer.io.flush.valid
-  // Vector Load/Store Queue
-  vlsq.io.int2vlsu <> io.int2vlsu
-  vlsq.io.vec2vlsu <> io.vec2vlsu
-  vlsq.io.vlsu2vec <> io.vlsu2vec
-  vlsq.io.vlsu2int <> io.vlsu2int
-  vlsq.io.vlsu2ctrl <> io.vlsu2ctrl
   // AtomicsUnit: AtomicsUnit will override other control signials,
   // as atomics insts (LR/SC/AMO) will block the pipeline
   val s_normal +: s_atomics = Enum(exuParameters.StuCnt + 1)
...
@@ -400,19 +400,19 @@ object SvinvalDecode extends DecodeConstants {
   * must assure it is the ONLY instrucion executing in backend.
   */
   SINVAL_VMA ->List(SrcType.reg, SrcType.reg, SrcType.X, FuType.fence, FenceOpType.sfence, N, N, N, N, N, N, SelImm.X),
   /* sfecne.w.inval is the begin instrucion of a TLB flush which set *noSpecExec* and *blockBackward* signals
   * so when it comes to dispatch , it will block all instruction after itself until all instrucions ahead of it in rob commit
   * then dispatch and issue this instrucion to flush sbuffer to dcache
   * after this instrucion commits , issue following sinval_vma instructions (out of order) to flush TLB
   */
   SFENCE_W_INVAL ->List(SrcType.DC, SrcType.DC, SrcType.X, FuType.fence, FenceOpType.nofence, N, N, N, Y, Y, N, SelImm.X),
   /* sfecne.inval.ir is the end instrucion of a TLB flush which set *noSpecExec* *blockBackward* and *flushPipe* signals
   * so when it comes to dispatch , it will wait until all sinval_vma ahead of it in rob commit
   * then dispatch and issue this instrucion
   * when it commit at the head of rob , flush the pipeline since some instrucions have been fetched to ibuffer using old TLB map
   */
   SFENCE_INVAL_IR ->List(SrcType.DC, SrcType.DC, SrcType.X, FuType.fence, FenceOpType.nofence, N, N, N, Y, Y, Y, SelImm.X)
   /* what is Svinval extension ?
   *                    -----> sfecne.w.inval
   * sfence.vma vpn1   -----> sinval_vma vpn1
   * sfence.vma vpn2   -----> sinval_vma vpn2
@@ -598,6 +598,13 @@ class DecodeUnit(implicit p: Parameters) extends XSModule with DecodeUnitConstan
   cs.replayInst := false.B
   cs.debug_globalID := DontCare
+  // TODO: Backend for VLSU, implement it
+  cs.vconfig := DontCare
+  cs.firstUop := DontCare
+  cs.lastUop := DontCare
+  cs.total_num := DontCare
+  cs.uopIdx := DontCare
   val fpDecoder = Module(new FPDecoder)
   fpDecoder.io.instr := ctrl_flow.instr
   cs.fpu := fpDecoder.io.fpCtrl
...
@@ -33,9 +33,11 @@ case class DispatchParameters
   IntDqSize: Int,
   FpDqSize: Int,
   LsDqSize: Int,
+  VlsDqSize: Int,
   IntDqDeqWidth: Int,
   FpDqDeqWidth: Int,
-  LsDqDeqWidth: Int
+  LsDqDeqWidth: Int,
+  VlsDqDeqWidth: Int
 )
 // read rob and enqueue
@@ -65,6 +67,11 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
     val needAlloc = Vec(RenameWidth, Output(Bool()))
     val req = Vec(RenameWidth, ValidIO(new MicroOp))
   }
+  val toVlsDq = new Bundle {
+    val canAccept = Input(Bool())
+    val needAlloc = Vec(RenameWidth, Output(Bool()))
+    val req = Vec(RenameWidth, ValidIO(new MicroOp))
+  }
   val redirect = Flipped(ValidIO(new Redirect))
   // singleStep
   val singleStep = Input(Bool())
@@ -243,6 +250,9 @@ class Dispatch(implicit p: Parameters) extends XSModule with HasPerfEvents {
       canEnterDpq && io.toIntDq.canAccept && io.toFpDq.canAccept
     io.toLsDq.req(i).bits := updatedUop(i)
+    // TODO: Backend for VLSU, implement it
+    io.toVlsDq := DontCare
     XSDebug(io.toIntDq.req(i).valid, p"pc 0x${Hexadecimal(io.toIntDq.req(i).bits.cf.pc)} int index $i\n")
     XSDebug(io.toFpDq.req(i).valid , p"pc 0x${Hexadecimal(io.toFpDq.req(i).bits.cf.pc )} fp index $i\n")
     XSDebug(io.toLsDq.req(i).valid , p"pc 0x${Hexadecimal(io.toLsDq.req(i).bits.cf.pc )} ls index $i\n")
...
@@ -35,7 +35,9 @@ case class ExuParameters
   FmiscCnt: Int,
   FmiscDivSqrtCnt: Int,
   LduCnt: Int,
-  StuCnt: Int
+  StuCnt: Int,
+  VlCnt: Int,
+  VsCnt: Int,
 ) {
   assert(JmpCnt == 1, "Only support 1 JmpUnit now!")
@@ -45,6 +47,8 @@ case class ExuParameters
   def LsExuCnt = LduCnt + StuCnt
+  def VlsExuCnt = VlCnt + VsCnt
+
   def ExuCnt = IntExuCnt + FpExuCnt + LduCnt + StuCnt
   def CriticalExuCnt = AluCnt + FmacCnt + LsExuCnt
...
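Note that ExuCnt above still counts only scalar units; VlsExuCnt is kept separate in this commit. A trivial worked evaluation with the default counts (illustration only):

```scala
val (vlCnt, vsCnt) = (2, 2)
val vlsExuCnt = vlCnt + vsCnt // 4 vector mem units, not yet folded into ExuCnt
```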
@@ -68,7 +68,7 @@ class FuOutput(val len: Int)(implicit p: Parameters) extends XSBundle {
 }
 class FunctionUnitInput(val len: Int)(implicit p: Parameters) extends XSBundle {
-  val src = Vec(3, UInt(len.W))
+  val src = Vec(4, UInt(len.W))
   val uop = new MicroOp
 }
...
@@ -52,7 +52,8 @@ abstract class FPUSubModule(implicit p: Parameters) extends FunctionUnit
   val fflags = IO(Output(UInt(5.W)))
   val dataModule: FPUDataModule
   def connectDataModule = {
-    dataModule.io.in.src <> io.in.bits.src
+    // TODO: Backend for VLSU, fix it
+    dataModule.io.in.src <> io.in.bits.src.take(dataModule.io.in.src.length)
    dataModule.io.in.fpCtrl <> io.in.bits.uop.ctrl.fpu
     dataModule.io.in.rm <> rm
     io.out.bits.data := dataModule.io.out.data
...
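The .take fix is needed because FunctionUnitInput now carries four sources while the scalar FPU data modules still declare three; connecting width-mismatched Vecs would fail elaboration. The same idea in plain Scala:

```scala
val exuSrcs = Seq("src0", "src1", "src2", "src3") // new 4-source input
val fpuSrcs = exuSrcs.take(3)                     // scalar FPU consumes only 3
```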
@@ -49,9 +49,12 @@ case class RSParams
   var isJump: Boolean = false,
   var isAlu: Boolean = false,
   var isStore: Boolean = false,
+  var isvecStore: Boolean = false,
   var isMul: Boolean = false,
   var isLoad: Boolean = false,
+  var isvecLoad: Boolean = false,
   var isStoreData: Boolean = false,
+  var isvecStoreData: Boolean = false,
   var exuCfg: Option[ExuConfig] = None
 ){
   def allWakeup: Int = numFastWakeup + numWakeup
@@ -59,7 +62,7 @@ case class RSParams
   // oldestFirst: (Enable_or_not, Need_balance, Victim_index)
   def oldestFirst: (Boolean, Boolean, Int) = (true, false, 0)
   def hasMidState: Boolean = exuCfg.get == FmacExeUnitCfg
-  def delayedFpRf: Boolean = exuCfg.get == StdExeUnitCfg
+  def delayedFpRf: Boolean = exuCfg.get == StdExeUnitCfg || exuCfg.get == vecStdExeUnitCfg
   def delayedSrc: Boolean = delayedFpRf
   def needScheduledBit: Boolean = hasFeedback || delayedSrc || hasMidState
   def needBalance: Boolean = exuCfg.get.needLoadBalance && exuCfg.get != LdExeUnitCfg
@@ -90,13 +93,16 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with
     case StdExeUnitCfg => params.isStoreData = true
     case MulDivExeUnitCfg => params.isMul = true
     case LdExeUnitCfg => params.isLoad = true
+    case vecLdExeUnitCfg => params.isvecLoad = true
+    case vecStaExeUnitCfg => params.isvecStore = true
+    case vecStdExeUnitCfg => params.isvecStoreData = true
     case _ =>
   }
   // TODO: why jump needs two sources?
   if (cfg == JumpCSRExeUnitCfg) {
     params.numSrc = 2
   }
-  if (cfg == StaExeUnitCfg || cfg == LdExeUnitCfg) {
+  if (cfg == StaExeUnitCfg || cfg == LdExeUnitCfg || cfg == vecLdExeUnitCfg) {
     params.lsqFeedback = true
     params.hasFeedback = true
     params.checkWaitBit = false
@@ -160,6 +166,9 @@ class ReservationStationWrapper(implicit p: Parameters) extends LazyModule with
   if (params.isStoreData) rs.zipWithIndex.foreach { case (rs, index) => rs.suggestName(s"stdRS_${index}") }
   if (params.isMul) rs.zipWithIndex.foreach { case (rs, index) => rs.suggestName(s"mulRS_${index}") }
   if (params.isLoad) rs.zipWithIndex.foreach { case (rs, index) => rs.suggestName(s"loadRS_${index}") }
+  if (params.isvecLoad) rs.zipWithIndex.foreach { case (rs, index) => rs.suggestName(s"vecloadRS_${index}") }
+  if (params.isvecStore) rs.zipWithIndex.foreach { case (rs, index) => rs.suggestName(s"vecstaRS_${index}") }
+  if (params.isvecStoreData) rs.zipWithIndex.foreach { case (rs, index) => rs.suggestName(s"vecstdRS_${index}") }
   val updatedP = p.alter((site, here, up) => {
     case XSCoreParamsKey => up(XSCoreParamsKey).copy(
...
@@ -285,7 +285,7 @@ class ExceptionGen(implicit p: Parameters) extends XSModule with HasCircularQueu
   val redirect = Input(Valid(new Redirect))
   val flush = Input(Bool())
   val enq = Vec(RenameWidth, Flipped(ValidIO(new RobExceptionInfo)))
-  val wb = Vec(1 + LoadPipelineWidth + StorePipelineWidth, Flipped(ValidIO(new RobExceptionInfo)))
+  val wb = Vec(1 + LoadPipelineWidth + StorePipelineWidth + VecLoadPipelineWidth + VecStorePipelineWidth, Flipped(ValidIO(new RobExceptionInfo)))
   val out = ValidIO(new RobExceptionInfo)
   val state = ValidIO(new RobExceptionInfo)
 })
...
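With the default pipeline widths, this widens the exception writeback vector from 5 to 9 ports (a worked count, assuming the widths listed in Parameters above):

```scala
val (ld, st, vld, vst) = (2, 2, 2, 2) // default pipeline widths
val wbPorts = 1 + ld + st + vld + vst // 9 exception writeback ports (was 5)
```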
@@ -335,6 +335,7 @@ class DCacheWordReq(implicit p: Parameters) extends DCacheBundle
   val instrtype = UInt(sourceTypeWidth.W)
   val isFirstIssue = Bool()
   val replayCarry = new ReplayCarry
+  val is128bit = Bool()
   val debug_robIdx = UInt(log2Ceil(RobSize).W)
   def dump() = {
...
@@ -95,6 +95,20 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundleWithMicroOp with
   val isHWPrefetch = Bool()
   def isSWPrefetch = isPrefetch && !isHWPrefetch
+  // Vector instruction
+  val isvec = Bool()
+  val is128bit = Bool()
+  val exp = Bool()
+  val is_first_ele = Bool()
+  val flow_index = UInt(8.W)
+  val uop_unit_stride_fof = Bool()
+  val rob_idx_valid = Vec(2, Bool())
+  val inner_idx = Vec(2, UInt(3.W))
+  val rob_idx = Vec(2, new RobPtr)
+  val reg_offset = Vec(2, UInt(4.W))
+  val offset = Vec(2, UInt(4.W))
+  val fqIdx = UInt(log2Ceil(VsFlowSize).W)
   // For debug usage
   val isFirstIssue = Bool()
   val hasROBEntry = Bool()
@@ -140,6 +154,23 @@ class LdPrefetchTrainBundle(implicit p: Parameters) extends LsPipelineBundle {
     forwardData := input.forwardData
     isPrefetch := input.isPrefetch
     isHWPrefetch := input.isHWPrefetch
+    // VLSU
+    isvec := input.isvec
+    is128bit := input.is128bit
+    exp := input.exp
+    flow_index := input.flow_index
+    is_first_ele := input.is_first_ele
+    uop_unit_stride_fof := input.uop_unit_stride_fof
+    rob_idx_valid := input.rob_idx_valid
+    rob_idx := input.rob_idx
+    inner_idx := input.inner_idx
+    reg_offset := input.reg_offset
+    offset := input.offset
+    fqIdx := input.fqIdx
+    isFirstIssue := input.isFirstIssue
+    dcacheRequireReplay := input.dcacheRequireReplay
     isFirstIssue := input.isFirstIssue
     hasROBEntry := input.hasROBEntry
     dcacheRequireReplay := input.dcacheRequireReplay
@@ -187,6 +218,19 @@ class LqWriteBundle(implicit p: Parameters) extends LsPipelineBundle {
     forwardData := input.forwardData
     isPrefetch := input.isPrefetch
     isHWPrefetch := input.isHWPrefetch
+    // VLSU
+    isvec := input.isvec
+    is128bit := input.is128bit
+    exp := input.exp
+    uop_unit_stride_fof := input.uop_unit_stride_fof
+    rob_idx_valid := input.rob_idx_valid
+    rob_idx := input.rob_idx
+    inner_idx := input.inner_idx
+    reg_offset := input.reg_offset
+    offset := input.offset
+    fqIdx := input.fqIdx
     isFirstIssue := input.isFirstIssue
     hasROBEntry := input.hasROBEntry
     isLoadReplay := input.isLoadReplay
...
...@@ -103,6 +103,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -103,6 +103,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val rsIdx = Input(UInt()) val rsIdx = Input(UInt())
val isFirstIssue = Input(Bool()) val isFirstIssue = Input(Bool())
// vec issue path
val vecldin = Flipped(Decoupled(new VecLoadPipeBundle))
val vecldout = Decoupled(new VecExuOutput)
// data path // data path
val tlb = new TlbRequestIO(2) val tlb = new TlbRequestIO(2)
val pmp = Flipped(new PMPRespBundle()) // arrive same to tlb now val pmp = Flipped(new PMPRespBundle()) // arrive same to tlb now
...@@ -116,6 +120,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -116,6 +120,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule
// fast wakeup // fast wakeup
val fast_uop = ValidIO(new MicroOp) // early wakeup signal generated in load_s1, send to RS in load_s2 val fast_uop = ValidIO(new MicroOp) // early wakeup signal generated in load_s1, send to RS in load_s2
// TODO: VLSU, implement vector fast wakeup
val vec_fast_uop = ValidIO(new MicroOp)
// trigger // trigger
val trigger = Vec(3, new LoadUnitTriggerIO) val trigger = Vec(3, new LoadUnitTriggerIO)
...@@ -131,6 +137,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -131,6 +137,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val ld_fast_imm = Input(UInt(12.W)) val ld_fast_imm = Input(UInt(12.W))
// rs feedback // rs feedback
// TODO: VLSU, implement vector load feedback
val feedback_fast = ValidIO(new RSFeedback) // stage 2 val feedback_fast = ValidIO(new RSFeedback) // stage 2
val feedback_slow = ValidIO(new RSFeedback) // stage 3 val feedback_slow = ValidIO(new RSFeedback) // stage 3
...@@ -183,6 +190,18 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -183,6 +190,18 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s0_fire = s0_valid && s0_can_go val s0_fire = s0_valid && s0_can_go
val s0_out = Wire(new LqWriteBundle) val s0_out = Wire(new LqWriteBundle)
// vector related ctrl signal
val s0_isvec = WireInit(false.B)
val s0_is128bit = WireInit(false.B)
val s0_uop_unit_stride_fof = WireInit(false.B)
val s0_rob_idx_valid = WireInit(VecInit(Seq.fill(2)(false.B)))
val s0_inner_idx = WireInit(VecInit(Seq.fill(2)(0.U(3.W))))
val s0_rob_idx = WireInit(VecInit(Seq.fill(2)(0.U.asTypeOf(new RobPtr))))
val s0_reg_offset = WireInit(VecInit(Seq.fill(2)(0.U(4.W))))
val s0_offset = WireInit(VecInit(Seq.fill(2)(0.U(4.W))))
val s0_exp = WireInit(true.B)
val s0_is_first_ele = WireInit(false.B)
// load flow select/gen // load flow select/gen
// src0: super load replayed by LSQ (cache miss replay) (io.replay) // src0: super load replayed by LSQ (cache miss replay) (io.replay)
// src1: fast load replay (io.fast_rep_in) // src1: fast load replay (io.fast_rep_in)
...@@ -199,7 +218,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -199,7 +218,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s0_ld_rep_valid = io.replay.valid && !io.replay.bits.forward_tlDchannel && !s0_rep_stall val s0_ld_rep_valid = io.replay.valid && !io.replay.bits.forward_tlDchannel && !s0_rep_stall
val s0_high_conf_prf_valid = io.prefetch_req.valid && io.prefetch_req.bits.confidence > 0.U val s0_high_conf_prf_valid = io.prefetch_req.valid && io.prefetch_req.bits.confidence > 0.U
val s0_int_iss_valid = io.ldin.valid // int flow first issue or software prefetch val s0_int_iss_valid = io.ldin.valid // int flow first issue or software prefetch
val s0_vec_iss_valid = WireInit(false.B) // TODO val s0_vec_iss_valid = io.vecldin.valid
val s0_l2l_fwd_valid = io.l2l_fwd_in.valid val s0_l2l_fwd_valid = io.l2l_fwd_in.valid
val s0_low_conf_prf_valid = io.prefetch_req.valid && io.prefetch_req.bits.confidence === 0.U val s0_low_conf_prf_valid = io.prefetch_req.valid && io.prefetch_req.bits.confidence === 0.U
dontTouch(s0_super_ld_rep_valid) dontTouch(s0_super_ld_rep_valid)
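The eight s0 sources are prioritized top-down, and the vector issue slot now contributes a real valid instead of a stub. The corresponding ready/select terms are elided from this hunk, but presumably follow the same idiom the store unit uses later in this commit: a source is ready only while every higher-priority source is idle. A reduced three-source sketch of that idiom:

import chisel3._

// Sketch: priority arbitration among replay, int issue and vec issue (high to low).
class S0SelectSketch extends Module {
  val io = IO(new Bundle {
    val repValid, intValid, vecValid = Input(Bool())
    val repSel, intSel, vecSel       = Output(Bool())
  })
  val repReady = true.B                         // highest priority, always ready
  val intReady = !io.repValid
  val vecReady = !io.repValid && !io.intValid   // vector issue sits below int issue
  io.repSel := repReady && io.repValid
  io.intSel := intReady && io.intValid
  io.vecSel := vecReady && io.vecValid          // at most one select is high per cycle
}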
...@@ -301,7 +320,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -301,7 +320,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
TlbCmd.read TlbCmd.read
) )
io.tlb.req.bits.vaddr := Mux(s0_hw_prf_select, io.prefetch_req.bits.paddr, s0_vaddr) io.tlb.req.bits.vaddr := Mux(s0_hw_prf_select, io.prefetch_req.bits.paddr, s0_vaddr)
io.tlb.req.bits.size := LSUOpType.size(s0_uop.ctrl.fuOpType) io.tlb.req.bits.size := Mux(s0_isvec, io.vecldin.bits.alignedType, LSUOpType.size(s0_uop.ctrl.fuOpType))
io.tlb.req.bits.kill := s0_kill io.tlb.req.bits.kill := s0_kill
io.tlb.req.bits.memidx.is_ld := true.B io.tlb.req.bits.memidx.is_ld := true.B
io.tlb.req.bits.memidx.is_st := false.B io.tlb.req.bits.memidx.is_st := false.B
...@@ -324,6 +343,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -324,6 +343,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
io.dcache.req.bits.instrtype := Mux(s0_prf, DCACHE_PREFETCH_SOURCE.U, LOAD_SOURCE.U) io.dcache.req.bits.instrtype := Mux(s0_prf, DCACHE_PREFETCH_SOURCE.U, LOAD_SOURCE.U)
io.dcache.req.bits.debug_robIdx := s0_uop.robIdx.value io.dcache.req.bits.debug_robIdx := s0_uop.robIdx.value
io.dcache.req.bits.replayCarry := s0_rep_carry io.dcache.req.bits.replayCarry := s0_rep_carry
io.dcache.req.bits.is128bit := s0_is128bit
io.dcache.req.bits.id := DontCare // TODO: update cache meta io.dcache.req.bits.id := DontCare // TODO: update cache meta
// load flow priority mux // load flow priority mux
...@@ -427,17 +447,20 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -427,17 +447,20 @@ class LoadUnit(implicit p: Parameters) extends XSModule
s0_sched_idx := 0.U s0_sched_idx := 0.U
} }
def fromVecIssueSource() = { def fromVecIssueSource(src: VecLoadPipeBundle) = {
s0_vaddr := 0.U s0_vaddr := src.vaddr
s0_mask := 0.U s0_mask := src.mask
s0_uop := 0.U.asTypeOf(new MicroOp) s0_uop := src.uop
s0_try_l2l := false.B s0_try_l2l := false.B
s0_has_rob_entry := false.B s0_has_rob_entry := false.B
s0_sqIdx := 0.U.asTypeOf(new SqPtr) s0_sqIdx := src.uop.sqIdx
// TODO: VLSU, implement vector feedback
s0_rsIdx := 0.U s0_rsIdx := 0.U
// TODO: VLSU, implement replay carry
s0_rep_carry := 0.U.asTypeOf(s0_rep_carry.cloneType) s0_rep_carry := 0.U.asTypeOf(s0_rep_carry.cloneType)
s0_mshrid := 0.U s0_mshrid := 0.U
s0_isFirstIssue := false.B // TODO: VLSU, implement first issue
s0_isFirstIssue := true.B
s0_fast_rep := false.B s0_fast_rep := false.B
s0_ld_rep := false.B s0_ld_rep := false.B
s0_l2l_fwd := false.B s0_l2l_fwd := false.B
...@@ -445,6 +468,18 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -445,6 +468,18 @@ class LoadUnit(implicit p: Parameters) extends XSModule
s0_prf_rd := false.B s0_prf_rd := false.B
s0_prf_wr := false.B s0_prf_wr := false.B
s0_sched_idx := 0.U s0_sched_idx := 0.U
// Vector load interface
s0_isvec := true.B
// TODO: VLSU, whether the req is 128 bits should be determined separately
s0_is128bit := true.B
s0_uop_unit_stride_fof := src.uop_unit_stride_fof
s0_rob_idx_valid := src.rob_idx_valid
s0_inner_idx := src.inner_idx
s0_rob_idx := src.rob_idx
s0_reg_offset := src.reg_offset
s0_offset := src.offset
s0_exp := src.exp
s0_is_first_ele := src.is_first_ele
} }
def fromLoadToLoadSource(src: LoadToLoadIO) = { def fromLoadToLoadSource(src: LoadToLoadIO) = {
...@@ -478,7 +513,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -478,7 +513,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
.elsewhen (s0_ld_rep_select) { fromNormalReplaySource(io.replay.bits) } .elsewhen (s0_ld_rep_select) { fromNormalReplaySource(io.replay.bits) }
.elsewhen (s0_hw_prf_select) { fromPrefetchSource(io.prefetch_req.bits) } .elsewhen (s0_hw_prf_select) { fromPrefetchSource(io.prefetch_req.bits) }
.elsewhen (s0_int_iss_select) { fromIntIssueSource(io.ldin.bits) } .elsewhen (s0_int_iss_select) { fromIntIssueSource(io.ldin.bits) }
.elsewhen (s0_vec_iss_select) { fromVecIssueSource() } .elsewhen (s0_vec_iss_select) { fromVecIssueSource(io.vecldin.bits) }
.otherwise { .otherwise {
if (EnableLoadToLoadForward) { if (EnableLoadToLoadForward) {
fromLoadToLoadSource(io.l2l_fwd_in) fromLoadToLoadSource(io.l2l_fwd_in)
...@@ -488,7 +523,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -488,7 +523,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
} }
// address align check // address align check
val s0_addr_aligned = LookupTree(s0_uop.ctrl.fuOpType(1, 0), List( val s0_addr_aligned = LookupTree(Mux(s0_isvec, io.vecldin.bits.alignedType, s0_uop.ctrl.fuOpType(1, 0)), List(
"b00".U -> true.B, //b "b00".U -> true.B, //b
"b01".U -> (s0_vaddr(0) === 0.U), //h "b01".U -> (s0_vaddr(0) === 0.U), //h
"b10".U -> (s0_vaddr(1, 0) === 0.U), //w "b10".U -> (s0_vaddr(1, 0) === 0.U), //w
...@@ -497,20 +532,30 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -497,20 +532,30 @@ class LoadUnit(implicit p: Parameters) extends XSModule
// accept load flow if dcache ready (tlb is always ready) // accept load flow if dcache ready (tlb is always ready)
// TODO: prefetch needs to write back to loadQueueFlag // TODO: prefetch needs to write back to loadQueueFlag
s0_out := DontCare s0_out := DontCare
s0_out.rsIdx := s0_rsIdx s0_out.rsIdx := s0_rsIdx
s0_out.vaddr := s0_vaddr s0_out.vaddr := s0_vaddr
s0_out.mask := s0_mask s0_out.mask := s0_mask
s0_out.uop := s0_uop s0_out.uop := s0_uop
s0_out.isFirstIssue := s0_isFirstIssue s0_out.isFirstIssue := s0_isFirstIssue
s0_out.hasROBEntry := s0_has_rob_entry s0_out.hasROBEntry := s0_has_rob_entry
s0_out.isPrefetch := s0_prf s0_out.isPrefetch := s0_prf
s0_out.isHWPrefetch := s0_hw_prf s0_out.isHWPrefetch := s0_hw_prf
s0_out.isFastReplay := s0_fast_rep s0_out.isFastReplay := s0_fast_rep
s0_out.isLoadReplay := s0_ld_rep s0_out.isLoadReplay := s0_ld_rep
s0_out.isFastPath := s0_l2l_fwd s0_out.isFastPath := s0_l2l_fwd
s0_out.mshrid := s0_mshrid s0_out.mshrid := s0_mshrid
s0_out.uop.cf.exceptionVec(loadAddrMisaligned) := !s0_addr_aligned s0_out.isvec := s0_isvec
s0_out.is128bit := s0_is128bit
s0_out.uop_unit_stride_fof := s0_uop_unit_stride_fof
s0_out.rob_idx_valid := s0_rob_idx_valid
s0_out.inner_idx := s0_inner_idx
s0_out.rob_idx := s0_rob_idx
s0_out.reg_offset := s0_reg_offset
s0_out.offset := s0_offset
s0_out.exp := s0_exp
s0_out.is_first_ele := s0_is_first_ele
s0_out.uop.cf.exceptionVec(loadAddrMisaligned) := !s0_addr_aligned && s0_exp
s0_out.forward_tlDchannel := s0_super_ld_rep_select s0_out.forward_tlDchannel := s0_super_ld_rep_select
when(io.tlb.req.valid && s0_isFirstIssue) { when(io.tlb.req.valid && s0_isFirstIssue) {
s0_out.uop.debugInfo.tlbFirstReqTime := GTimer() s0_out.uop.debugInfo.tlbFirstReqTime := GTimer()
...@@ -532,6 +577,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -532,6 +577,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
// 2) there is no fast replayed load // 2) there is no fast replayed load
// 3) there is no high confidence prefetch request // 3) there is no high confidence prefetch request
io.ldin.ready := (s0_can_go && io.dcache.req.ready && s0_int_iss_ready) io.ldin.ready := (s0_can_go && io.dcache.req.ready && s0_int_iss_ready)
io.vecldin.ready := (s0_can_go && io.dcache.req.ready && s0_vec_iss_ready)
// for hw prefetch load flow feedback, to be added later // for hw prefetch load flow feedback, to be added later
// io.prefetch_in.ready := s0_hw_prf_select // io.prefetch_in.ready := s0_hw_prf_select
...@@ -558,6 +604,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -558,6 +604,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s1_kill = Wire(Bool()) val s1_kill = Wire(Bool())
val s1_can_go = s2_ready val s1_can_go = s2_ready
val s1_fire = s1_valid && !s1_kill && s1_can_go val s1_fire = s1_valid && !s1_kill && s1_can_go
val s1_exp = RegEnable(s0_out.exp, true.B, s0_fire)
val s1_isvec = RegEnable(s0_out.isvec, false.B, s0_fire)
s1_ready := !s1_valid || s1_kill || s2_ready s1_ready := !s1_valid || s1_kill || s2_ready
when (s0_fire) { s1_valid := true.B } when (s0_fire) { s1_valid := true.B }
...@@ -576,7 +624,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -576,7 +624,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s1_vaddr = Wire(UInt()) val s1_vaddr = Wire(UInt())
val s1_paddr_dup_lsu = Wire(UInt()) val s1_paddr_dup_lsu = Wire(UInt())
val s1_paddr_dup_dcache = Wire(UInt()) val s1_paddr_dup_dcache = Wire(UInt())
val s1_exception = ExceptionNO.selectByFu(s1_out.uop.cf.exceptionVec, lduCfg).asUInt.orR // af & pf exceptions are modified below. val s1_exception = ExceptionNO.selectByFu(s1_out.uop.cf.exceptionVec, lduCfg).asUInt.orR && s1_exp // af & pf exceptions are modified below.
val s1_tlb_miss = io.tlb.resp.bits.miss val s1_tlb_miss = io.tlb.resp.bits.miss
val s1_prf = s1_in.isPrefetch val s1_prf = s1_in.isPrefetch
val s1_hw_prf = s1_in.isHWPrefetch val s1_hw_prf = s1_in.isHWPrefetch
...@@ -620,11 +668,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -620,11 +668,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule
io.lsq.forward.pc := s1_in.uop.cf.pc // FIXME: remove it io.lsq.forward.pc := s1_in.uop.cf.pc // FIXME: remove it
// st-ld violation query // st-ld violation query
val s1_nuke_paddr_match = VecInit((0 until StorePipelineWidth).map(w => {Mux(s1_isvec && s1_in.is128bit,
s1_paddr_dup_lsu(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4),
s1_paddr_dup_lsu(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}))
val s1_nuke = VecInit((0 until StorePipelineWidth).map(w => { val s1_nuke = VecInit((0 until StorePipelineWidth).map(w => {
io.stld_nuke_query(w).valid && // query valid io.stld_nuke_query(w).valid && // query valid
isAfter(s1_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store isAfter(s1_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store
// TODO: Fix me when vector instruction s1_nuke_paddr_match(w) && // paddr match
(s1_paddr_dup_lsu(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3)) && // paddr match
(s1_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain (s1_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain
})).asUInt.orR && !s1_tlb_miss })).asUInt.orR && !s1_tlb_miss
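The widened comparison is the substantive change here: a 128-bit vector flow occupies a full 16-byte region, so the st-ld violation check must treat any store to the same 16-byte granule (with an overlapping byte mask) as a conflict, while scalar loads keep the original 8-byte granule. A standalone sketch of the match, with PAddrBits = 36 assumed for illustration:

import chisel3._

object NukePaddrMatchSketch {
  // 16-byte granule for 128-bit vector flows, 8-byte granule otherwise.
  def apply(ldPaddr: UInt, stPaddr: UInt, is128bit: Bool): Bool =
    Mux(is128bit,
      ldPaddr(35, 4) === stPaddr(35, 4),
      ldPaddr(35, 3) === stPaddr(35, 3))
}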
// Generate forwardMaskFast to wake up insts earlier // Generate forwardMaskFast to wake up insts earlier
...@@ -644,11 +694,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -644,11 +694,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule
when (!s1_fast_rep_kill) { when (!s1_fast_rep_kill) {
// current ori test can produce the case of ldest == 0; this will be modified in the future. // current ori test can produce the case of ldest == 0; this will be modified in the future.
// af & pf exception were modified // af & pf exception were modified
s1_out.uop.cf.exceptionVec(loadPageFault) := io.tlb.resp.bits.excp(0).pf.ld s1_out.uop.cf.exceptionVec(loadPageFault) := io.tlb.resp.bits.excp(0).pf.ld && s1_exp
s1_out.uop.cf.exceptionVec(loadAccessFault) := io.tlb.resp.bits.excp(0).af.ld s1_out.uop.cf.exceptionVec(loadAccessFault) := io.tlb.resp.bits.excp(0).af.ld && s1_exp
} .otherwise { } .otherwise {
s1_out.uop.cf.exceptionVec(loadAddrMisaligned) := false.B s1_out.uop.cf.exceptionVec(loadAddrMisaligned) := false.B && s1_exp
s1_out.uop.cf.exceptionVec(loadAccessFault) := s1_fast_rep_kill s1_out.uop.cf.exceptionVec(loadAccessFault) := s1_fast_rep_kill && s1_exp
} }
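A pattern worth naming: every exception bit produced along this pipeline is now ANDed with the stage's exp flag, so a flow whose element should stay architecturally silent (exp = false, presumably a masked-off element) can never raise a fault. The same gating can be written once as a helper; a sketch:

import chisel3._

object GateExceptionsSketch {
  // Force all exception bits low for flows whose element must stay silent.
  def apply(exceptionVec: Vec[Bool], exp: Bool): Vec[Bool] =
    VecInit(exceptionVec.map(_ && exp))
}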
// pointer chasing // pointer chasing
...@@ -729,6 +779,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -729,6 +779,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s2_kill = Wire(Bool()) val s2_kill = Wire(Bool())
val s2_can_go = s3_ready val s2_can_go = s3_ready
val s2_fire = s2_valid && !s2_kill && s2_can_go val s2_fire = s2_valid && !s2_kill && s2_can_go
val s2_exp = RegEnable(s1_out.exp, true.B, s1_fire)
val s2_isvec = RegEnable(s1_out.isvec, false.B, s1_fire)
s2_kill := s2_in.uop.robIdx.needFlush(io.redirect) s2_kill := s2_in.uop.robIdx.needFlush(io.redirect)
s2_ready := !s2_valid || s2_kill || s3_ready s2_ready := !s2_valid || s2_kill || s3_ready
...@@ -753,13 +805,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -753,13 +805,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule
// will be force-written back to rob // will be force-written back to rob
val s2_exception_vec = WireInit(s2_in.uop.cf.exceptionVec) val s2_exception_vec = WireInit(s2_in.uop.cf.exceptionVec)
when (!s2_in.lateKill) { when (!s2_in.lateKill) {
s2_exception_vec(loadAccessFault) := s2_in.uop.cf.exceptionVec(loadAccessFault) || s2_pmp.ld s2_exception_vec(loadAccessFault) := (s2_in.uop.cf.exceptionVec(loadAccessFault) || s2_pmp.ld) && s2_exp
// soft prefetch will not trigger any exception (but ecc error interrupt may be triggered) // soft prefetch will not trigger any exception (but ecc error interrupt may be triggered)
when (s2_prf || s2_in.tlbMiss) { when (s2_prf || s2_in.tlbMiss) {
s2_exception_vec := 0.U.asTypeOf(s2_exception_vec.cloneType) s2_exception_vec := 0.U.asTypeOf(s2_exception_vec.cloneType)
} }
} }
val s2_exception = ExceptionNO.selectByFu(s2_exception_vec, lduCfg).asUInt.orR val s2_exception = ExceptionNO.selectByFu(s2_exception_vec, lduCfg).asUInt.orR && s2_exp
val (s2_fwd_frm_d_chan, s2_fwd_data_frm_d_chan) = io.tl_d_channel.forward(s1_valid && s1_out.forward_tlDchannel, s1_out.mshrid, s1_out.paddr) val (s2_fwd_frm_d_chan, s2_fwd_data_frm_d_chan) = io.tl_d_channel.forward(s1_valid && s1_out.forward_tlDchannel, s1_out.mshrid, s1_out.paddr)
val (s2_fwd_data_valid, s2_fwd_frm_mshr, s2_fwd_data_frm_mshr) = io.forward_mshr.forward() val (s2_fwd_data_valid, s2_fwd_frm_mshr, s2_fwd_data_frm_mshr) = io.forward_mshr.forward()
...@@ -795,11 +847,14 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -795,11 +847,14 @@ class LoadUnit(implicit p: Parameters) extends XSModule
// 2. Load instruction is younger than requestors(store instructions). // 2. Load instruction is younger than requestors(store instructions).
// 3. Physical address match. // 3. Physical address match.
// 4. Data contains. // 4. Data contains.
val s2_nuke_paddr_match = VecInit((0 until StorePipelineWidth).map(w => { Mux(s2_isvec && s2_in.is128bit,
s2_in.paddr(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4),
s2_in.paddr(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}))
val s2_nuke = VecInit((0 until StorePipelineWidth).map(w => { val s2_nuke = VecInit((0 until StorePipelineWidth).map(w => {
io.stld_nuke_query(w).valid && // query valid io.stld_nuke_query(w).valid && // query valid
isAfter(s2_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store isAfter(s2_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store
// TODO: Fix me when vector instruction s2_nuke_paddr_match(w) &&
(s2_in.paddr(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3)) && // paddr match
(s2_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain (s2_in.mask & io.stld_nuke_query(w).bits.mask).orR // data mask contain
})).asUInt.orR || s2_in.rep_info.nuke })).asUInt.orR || s2_in.rep_info.nuke
...@@ -912,9 +967,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -912,9 +967,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule
!s1_fast_rep_kill && !s1_fast_rep_kill &&
!io.tlb.resp.bits.fast_miss && !io.tlb.resp.bits.fast_miss &&
!io.lsq.forward.dataInvalidFast !io.lsq.forward.dataInvalidFast
) && (s2_valid && !io.feedback_fast.valid && !s2_out.rep_info.need_rep && !s2_mmio) ) && (s2_valid && !io.feedback_fast.valid && !s2_out.rep_info.need_rep && !s2_mmio) && !s2_isvec
io.fast_uop.bits := RegNext(s1_out.uop) io.fast_uop.bits := RegNext(s1_out.uop)
// TODO: VLSU, vector fast wakeup
io.vec_fast_uop.valid := false.B
io.vec_fast_uop.bits := DontCare
// //
io.s2_ptr_chasing := RegEnable(s1_try_ptr_chasing && !s1_cancel_ptr_chasing, s1_fire) io.s2_ptr_chasing := RegEnable(s1_try_ptr_chasing && !s1_cancel_ptr_chasing, s1_fire)
io.prefetch_train.valid := s2_valid && !s2_in.mmio && !s2_in.tlbMiss io.prefetch_train.valid := s2_valid && !s2_in.mmio && !s2_in.tlbMiss
...@@ -943,10 +1002,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -943,10 +1002,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule
val s3_valid = RegNext(s2_valid && !s2_out.isHWPrefetch && !s2_out.uop.robIdx.needFlush(io.redirect)) val s3_valid = RegNext(s2_valid && !s2_out.isHWPrefetch && !s2_out.uop.robIdx.needFlush(io.redirect))
val s3_in = RegEnable(s2_out, s2_fire) val s3_in = RegEnable(s2_out, s2_fire)
val s3_out = Wire(Valid(new ExuOutput)) val s3_out = Wire(Valid(new ExuOutput))
val s3_vecout = Wire(new OnlyVecExuOutput)
val s3_cache_rep = RegEnable(s2_cache_rep && s2_troublem, s2_fire) val s3_cache_rep = RegEnable(s2_cache_rep && s2_troublem, s2_fire)
val s3_ld_valid_dup = RegEnable(s2_ld_valid_dup, s2_fire) val s3_ld_valid_dup = RegEnable(s2_ld_valid_dup, s2_fire)
val s3_fast_rep = Wire(Bool()) val s3_fast_rep = Wire(Bool())
val s3_kill = s3_in.uop.robIdx.needFlush(io.redirect) val s3_kill = s3_in.uop.robIdx.needFlush(io.redirect)
val s3_exp = RegEnable(s2_out.exp, true.B, s2_fire)
val s3_isvec = RegEnable(s2_out.isvec, false.B, s2_fire)
s3_ready := !s3_valid || s3_kill || io.ldout.ready s3_ready := !s3_valid || s3_kill || io.ldout.ready
// s3 load fast replay // s3 load fast replay
...@@ -983,7 +1045,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -983,7 +1045,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
s3_sel_rep_cause(LoadReplayCauses.C_TM) || s3_sel_rep_cause(LoadReplayCauses.C_TM) ||
s3_sel_rep_cause(LoadReplayCauses.C_NK) s3_sel_rep_cause(LoadReplayCauses.C_NK)
val s3_exception = ExceptionNO.selectByFu(s3_in.uop.cf.exceptionVec, lduCfg).asUInt.orR val s3_exception = ExceptionNO.selectByFu(s3_in.uop.cf.exceptionVec, lduCfg).asUInt.orR && s3_exp
when ((s3_exception || s3_dly_ld_err || s3_rep_frm_fetch) && !s3_force_rep) { when ((s3_exception || s3_dly_ld_err || s3_rep_frm_fetch) && !s3_force_rep) {
io.lsq.ldin.bits.rep_info.cause := 0.U.asTypeOf(s3_rep_info.cause.cloneType) io.lsq.ldin.bits.rep_info.cause := 0.U.asTypeOf(s3_rep_info.cause.cloneType)
} .otherwise { } .otherwise {
...@@ -993,7 +1055,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -993,7 +1055,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule
// Int load, if hit, will be writebacked at s2 // Int load, if hit, will be writebacked at s2
s3_out.valid := s3_valid && !io.lsq.ldin.bits.rep_info.need_rep && !s3_in.mmio && !s3_in.lateKill s3_out.valid := s3_valid && !io.lsq.ldin.bits.rep_info.need_rep && !s3_in.mmio && !s3_in.lateKill
s3_out.bits.uop := s3_in.uop s3_out.bits.uop := s3_in.uop
s3_out.bits.uop.cf.exceptionVec(loadAccessFault) := s3_dly_ld_err || s3_in.uop.cf.exceptionVec(loadAccessFault) s3_out.bits.uop.cf.exceptionVec(loadAccessFault) := (s3_dly_ld_err || s3_in.uop.cf.exceptionVec(loadAccessFault)) && s3_exp
s3_out.bits.uop.ctrl.replayInst := s3_rep_frm_fetch s3_out.bits.uop.ctrl.replayInst := s3_rep_frm_fetch
s3_out.bits.data := s3_in.data s3_out.bits.data := s3_in.data
s3_out.bits.redirectValid := false.B s3_out.bits.redirectValid := false.B
...@@ -1003,6 +1065,19 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -1003,6 +1065,19 @@ class LoadUnit(implicit p: Parameters) extends XSModule
s3_out.bits.debug.paddr := s3_in.paddr s3_out.bits.debug.paddr := s3_in.paddr
s3_out.bits.debug.vaddr := s3_in.vaddr s3_out.bits.debug.vaddr := s3_in.vaddr
s3_out.bits.fflags := DontCare s3_out.bits.fflags := DontCare
// Vector load
s3_vecout.isvec := s3_isvec
s3_vecout.vecdata := 0.U // Data will be assigned later
s3_vecout.mask := s3_in.mask
s3_vecout.rob_idx_valid := s3_in.rob_idx_valid
s3_vecout.inner_idx := s3_in.inner_idx
s3_vecout.rob_idx := s3_in.rob_idx
s3_vecout.offset := s3_in.offset
s3_vecout.reg_offset := s3_in.reg_offset
s3_vecout.exp := s3_exp
s3_vecout.is_first_ele := s3_in.is_first_ele
// TODO: VLSU, fix it!
s3_vecout.exp_ele_index := 0.U
when (s3_force_rep) { when (s3_force_rep) {
s3_out.bits.uop.cf.exceptionVec := 0.U.asTypeOf(s3_in.uop.cf.exceptionVec.cloneType) s3_out.bits.uop.cf.exceptionVec := 0.U.asTypeOf(s3_in.uop.cf.exceptionVec.cloneType)
...@@ -1088,9 +1163,22 @@ class LoadUnit(implicit p: Parameters) extends XSModule ...@@ -1088,9 +1163,22 @@ class LoadUnit(implicit p: Parameters) extends XSModule
io.lsq.uncache.ready := !s3_out.valid io.lsq.uncache.ready := !s3_out.valid
io.ldout.bits := s3_ld_wb_meta io.ldout.bits := s3_ld_wb_meta
io.ldout.bits.data := Mux(s3_out.valid, s3_ld_data_frm_cache, s3_ld_data_frm_uncache) io.ldout.bits.data := Mux(s3_out.valid, s3_ld_data_frm_cache, s3_ld_data_frm_uncache)
io.ldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) || io.ldout.valid := !s3_vecout.isvec &&
io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid (s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) ||
io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid)
io.vecldout.bits.vec := s3_vecout
// TODO: VLSU, uncache data logic
io.vecldout.bits.vec.vecdata := s3_merged_data_frm_cache
io.vecldout.bits.data := 0.U
io.vecldout.bits.fflags := s3_out.bits.fflags
io.vecldout.bits.redirectValid := s3_out.bits.redirectValid
io.vecldout.bits.redirect := s3_out.bits.redirect
io.vecldout.bits.debug := s3_out.bits.debug
io.vecldout.bits.uop := s3_out.bits.uop
io.vecldout.valid := s3_vecout.isvec &&
(s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) ||
io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid)
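Scalar and vector writeback share the same underlying valid expression but are made mutually exclusive by the !s3_vecout.isvec / s3_vecout.isvec gates on io.ldout and io.vecldout. If desired, that invariant can be pinned down with a one-line check (a suggestion, not part of the patch):

// Suggested sanity check: a flow writes back on exactly one of the two ports.
assert(!(io.ldout.valid && io.vecldout.valid),
  "scalar and vector writeback must be mutually exclusive")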
// fast load to load forward // fast load to load forward
io.l2l_fwd_out.valid := s3_out.valid && !s3_in.lateKill // for debug only io.l2l_fwd_out.valid := s3_out.valid && !s3_in.lateKill // for debug only
......
...@@ -24,26 +24,29 @@ import utility._ ...@@ -24,26 +24,29 @@ import utility._
import xiangshan.ExceptionNO._ import xiangshan.ExceptionNO._
import xiangshan._ import xiangshan._
import xiangshan.backend.fu.PMPRespBundle import xiangshan.backend.fu.PMPRespBundle
import xiangshan.backend.rob.DebugLsInfoBundle import xiangshan.backend.rob.{DebugLsInfoBundle, RobPtr}
import xiangshan.cache.mmu.{TlbCmd, TlbReq, TlbRequestIO, TlbResp} import xiangshan.cache.mmu.{TlbCmd, TlbReq, TlbRequestIO, TlbResp}
class StoreUnit(implicit p: Parameters) extends XSModule { class StoreUnit(implicit p: Parameters) extends XSModule {
val io = IO(new Bundle() { val io = IO(new Bundle() {
val redirect = Flipped(ValidIO(new Redirect)) val redirect = Flipped(ValidIO(new Redirect))
val stin = Flipped(Decoupled(new ExuInput)) val stin = Flipped(Decoupled(new ExuInput))
val issue = Valid(new ExuInput) val vecstin = Flipped(Decoupled(new VecStorePipeBundle()))
val tlb = new TlbRequestIO() val issue = Valid(new ExuInput)
val pmp = Flipped(new PMPRespBundle()) val tlb = new TlbRequestIO()
val rsIdx = Input(UInt(log2Up(IssQueSize).W)) val pmp = Flipped(new PMPRespBundle())
val isFirstIssue = Input(Bool()) val rsIdx = Input(UInt(log2Up(IssQueSize).W))
val lsq = ValidIO(new LsPipelineBundle) val isFirstIssue = Input(Bool())
val lsq_replenish = Output(new LsPipelineBundle()) val vec_isFirstIssue = Input(Bool())
val feedback_slow = ValidIO(new RSFeedback) val lsq = ValidIO(new LsPipelineBundle)
val stld_nuke_query = Valid(new StoreNukeQueryIO) val lsq_replenish = Output(new LsPipelineBundle())
val stout = DecoupledIO(new ExuOutput) // writeback store val feedback_slow = ValidIO(new RSFeedback)
val vec_feedback_slow = ValidIO(new VSFQFeedback)
val stld_nuke_query = Valid(new StoreNukeQueryIO)
val stout = DecoupledIO(new ExuOutput) // writeback store
// store mask, send to sq in store_s0 // store mask, send to sq in store_s0
val st_mask_out = Valid(new StoreMaskBundle) val st_mask_out = Valid(new StoreMaskBundle)
val debug_ls = Output(new DebugLsInfoBundle) val debug_ls = Output(new DebugLsInfoBundle)
}) })
val s1_ready, s2_ready, s3_ready = WireInit(false.B) val s1_ready, s2_ready, s3_ready = WireInit(false.B)
...@@ -53,58 +56,117 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -53,58 +56,117 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
// stage 0 // stage 0
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
// generate addr, use addr to query DCache and DTLB // generate addr, use addr to query DCache and DTLB
val s0_valid = io.stin.valid
val s0_in = io.stin.bits
val s0_isFirstIssue = io.isFirstIssue
val s0_rsIdx = io.rsIdx
val s0_out = Wire(new LsPipelineBundle) val s0_out = Wire(new LsPipelineBundle)
val s0_kill = s0_in.uop.robIdx.needFlush(io.redirect) val s0_valid = Wire(Bool())
val s0_vaddr = Wire(UInt(VAddrBits.W))
// TODO: VLSU, check it
val s0_isFirstIssue = io.isFirstIssue || io.vec_isFirstIssue
val s0_rsIdx = Wire(UInt(log2Up(IssQueSize).W))
val s0_kill = s0_out.uop.robIdx.needFlush(io.redirect)
val s0_can_go = s1_ready val s0_can_go = s1_ready
val s0_fire = s0_valid && !s0_kill && s0_can_go val s0_fire = s0_valid && !s0_kill && s0_can_go
val s0_sqIdx = WireInit(0.U.asTypeOf(new SqPtr))
val s0_uop = WireInit(0.U.asTypeOf(new MicroOp))
val s0_mask = WireInit(0.U((VLEN/8).W))
// vector related ctrl signal
val s0_isvec = WireInit(false.B)
val s0_is128bit = WireInit(false.B)
val s0_exp = WireInit(true.B)
val s0_fqidx = WireInit(0.U(log2Ceil(VsFlowSize).W))
// generate addr val s0_int_iss_valid = io.stin.valid
val s0_vec_iss_valid = io.vecstin.valid
dontTouch(s0_int_iss_valid)
dontTouch(s0_vec_iss_valid)
val s0_int_iss_ready = WireInit(true.B)
val s0_vec_iss_ready = !s0_int_iss_valid
val s0_int_iss_select = s0_int_iss_ready && s0_int_iss_valid
val s0_vec_iss_select = s0_vec_iss_ready && s0_vec_iss_valid
s0_valid := io.stin.valid || io.vecstin.valid
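Unlike the load pipe, the store-side arbitration is fully visible here: the int pipe always wins (s0_int_iss_ready is constant true), a vector store is accepted only in cycles without a scalar store, and s0_valid is the OR of both requests. The resulting behavior when both inputs assert in the same cycle:

stin.valid  vecstin.valid | int select  vec select
     0            0       |     0           0
     0            1       |     0           1
     1            0       |     1           0
     1            1       |     1           0   // int wins; vecstin stalls via ready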
// generate addr for int store
// val saddr = s0_in.bits.src(0) + SignExt(s0_in.bits.uop.ctrl.imm(11,0), VAddrBits) // val saddr = s0_in.bits.src(0) + SignExt(s0_in.bits.uop.ctrl.imm(11,0), VAddrBits)
val imm12 = WireInit(s0_in.uop.ctrl.imm(11,0)) val imm12 = WireInit(io.stin.bits.uop.ctrl.imm(11,0))
val saddr_lo = s0_in.src(0)(11,0) + Cat(0.U(1.W), imm12) val saddr_lo = io.stin.bits.src(0)(11,0) + Cat(0.U(1.W), imm12)
val saddr_hi = Mux(saddr_lo(12), val saddr_hi = Mux(saddr_lo(12),
Mux(imm12(11), s0_in.src(0)(VAddrBits-1, 12), s0_in.src(0)(VAddrBits-1, 12)+1.U), Mux(imm12(11), io.stin.bits.src(0)(VAddrBits-1, 12), io.stin.bits.src(0)(VAddrBits-1, 12) + 1.U),
Mux(imm12(11), s0_in.src(0)(VAddrBits-1, 12)+SignExt(1.U, VAddrBits-12), s0_in.src(0)(VAddrBits-1, 12)), Mux(imm12(11), io.stin.bits.src(0)(VAddrBits-1, 12) + SignExt(1.U, VAddrBits-12), io.stin.bits.src(0)(VAddrBits-1, 12)),
) )
val s0_saddr = Cat(saddr_hi, saddr_lo(11,0)) val s0_saddr = Cat(saddr_hi, saddr_lo(11,0))
io.tlb.req.valid := s0_valid def fromNullSource() = {
io.tlb.req.bits.vaddr := s0_saddr s0_vaddr := 0.U
io.tlb.req.bits.cmd := TlbCmd.write s0_mask := 0.U
io.tlb.req.bits.size := LSUOpType.size(s0_in.uop.ctrl.fuOpType) s0_uop := 0.U.asTypeOf(new MicroOp)
io.tlb.req.bits.kill := DontCare s0_sqIdx := 0.U.asTypeOf(new SqPtr)
io.tlb.req.bits.memidx.is_ld := false.B s0_rsIdx := 0.U
io.tlb.req.bits.memidx.is_st := true.B }
io.tlb.req.bits.memidx.idx := s0_in.uop.sqIdx.value
io.tlb.req.bits.debug.robIdx := s0_in.uop.robIdx def fromIntIssueSource(src: ExuInput) = {
io.tlb.req.bits.no_translate := false.B s0_vaddr := s0_saddr
io.tlb.req.bits.debug.pc := s0_in.uop.cf.pc s0_mask := genVWmask(s0_vaddr, src.uop.ctrl.fuOpType(1,0))
s0_uop := src.uop
s0_sqIdx := src.uop.sqIdx
s0_rsIdx := io.rsIdx
}
def fromVecIssueSource(src: VecStorePipeBundle) = {
s0_vaddr := src.vaddr
s0_mask := src.mask
s0_uop := src.uop
s0_sqIdx := src.uop.sqIdx
// TODO: VLSU, implement vector feedback
s0_rsIdx := 0.U
// Vector store interface
s0_isvec := true.B
// TODO: VLSU, stores do not use 128-bit accesses for now?
s0_is128bit := false.B
s0_exp := src.exp
s0_fqidx := src.fqIdx
}
s0_uop := DontCare
when (s0_int_iss_select) { fromIntIssueSource(io.stin.bits) }
.elsewhen (s0_vec_iss_select) { fromVecIssueSource(io.vecstin.bits) }
.otherwise { fromNullSource() }
io.tlb.req.valid := s0_valid
io.tlb.req.bits.vaddr := s0_vaddr
io.tlb.req.bits.cmd := TlbCmd.write
io.tlb.req.bits.size := Mux(s0_isvec, io.vecstin.bits.alignedType, LSUOpType.size(s0_uop.ctrl.fuOpType))
io.tlb.req.bits.kill := DontCare
io.tlb.req.bits.memidx.is_ld := false.B
io.tlb.req.bits.memidx.is_st := true.B
io.tlb.req.bits.memidx.idx := s0_uop.sqIdx.value
io.tlb.req.bits.debug.robIdx := s0_uop.robIdx
io.tlb.req.bits.no_translate := false.B
io.tlb.req.bits.debug.pc := s0_uop.cf.pc
io.tlb.req.bits.debug.isFirstIssue := s0_isFirstIssue io.tlb.req.bits.debug.isFirstIssue := s0_isFirstIssue
io.tlb.req_kill := false.B io.tlb.req_kill := false.B
s0_out := DontCare s0_out := DontCare
s0_out.vaddr := s0_saddr s0_out.vaddr := s0_vaddr
// Now data use its own io s0_out.uop := s0_uop
// s1_out.data := genWdata(s1_in.src(1), s1_in.uop.ctrl.fuOpType(1,0))
s0_out.data := s0_in.src(1) // FIXME: remove data from pipeline
s0_out.uop := s0_in.uop
s0_out.miss := DontCare s0_out.miss := DontCare
s0_out.rsIdx := s0_rsIdx s0_out.rsIdx := s0_rsIdx
s0_out.mask := genVWmask(s0_saddr, s0_in.uop.ctrl.fuOpType(1,0)) s0_out.mask := s0_mask
s0_out.isFirstIssue := s0_isFirstIssue s0_out.isFirstIssue := s0_isFirstIssue
s0_out.isHWPrefetch := false.B // TODO s0_out.isHWPrefetch := false.B // TODO
s0_out.wlineflag := s0_in.uop.ctrl.fuOpType === LSUOpType.cbo_zero s0_out.wlineflag := s0_uop.ctrl.fuOpType === LSUOpType.cbo_zero
s0_out.isvec := s0_isvec
s0_out.is128bit := s0_is128bit
s0_out.exp := s0_exp
s0_out.fqIdx := s0_fqidx
when(s0_valid && s0_isFirstIssue) { when(s0_valid && s0_isFirstIssue) {
s0_out.uop.debugInfo.tlbFirstReqTime := GTimer() s0_out.uop.debugInfo.tlbFirstReqTime := GTimer()
} }
// exception check // exception check
val s0_addr_aligned = LookupTree(s0_in.uop.ctrl.fuOpType(1,0), List( val s0_addr_aligned = LookupTree(Mux(s0_isvec, io.vecstin.bits.alignedType, s0_uop.ctrl.fuOpType(1, 0)), List(
"b00".U -> true.B, //b "b00".U -> true.B, //b
"b01".U -> (s0_out.vaddr(0) === 0.U), //h "b01".U -> (s0_out.vaddr(0) === 0.U), //h
"b10".U -> (s0_out.vaddr(1,0) === 0.U), //w "b10".U -> (s0_out.vaddr(1,0) === 0.U), //w
"b11".U -> (s0_out.vaddr(2,0) === 0.U) //d "b11".U -> (s0_out.vaddr(2,0) === 0.U) //d
...@@ -115,7 +177,8 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -115,7 +177,8 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
io.st_mask_out.bits.mask := s0_out.mask io.st_mask_out.bits.mask := s0_out.mask
io.st_mask_out.bits.sqIdx := s0_out.uop.sqIdx io.st_mask_out.bits.sqIdx := s0_out.uop.sqIdx
io.stin.ready := s1_ready io.stin.ready := s0_can_go && s0_int_iss_ready
io.vecstin.ready := s0_can_go && s0_vec_iss_ready
// Pipeline // Pipeline
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
...@@ -128,6 +191,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -128,6 +191,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
val s1_kill = Wire(Bool()) val s1_kill = Wire(Bool())
val s1_can_go = s2_ready val s1_can_go = s2_ready
val s1_fire = s1_valid && !s1_kill && s1_can_go val s1_fire = s1_valid && !s1_kill && s1_can_go
val s1_exp = RegEnable(s0_out.exp, true.B, s0_fire)
// mmio cbo decoder // mmio cbo decoder
val s1_mmio_cbo = s1_in.uop.ctrl.fuOpType === LSUOpType.cbo_clean || val s1_mmio_cbo = s1_in.uop.ctrl.fuOpType === LSUOpType.cbo_clean ||
...@@ -137,6 +201,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -137,6 +201,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
val s1_tlb_miss = io.tlb.resp.bits.miss val s1_tlb_miss = io.tlb.resp.bits.miss
val s1_mmio = s1_mmio_cbo val s1_mmio = s1_mmio_cbo
val s1_exception = ExceptionNO.selectByFu(s1_out.uop.cf.exceptionVec, staCfg).asUInt.orR val s1_exception = ExceptionNO.selectByFu(s1_out.uop.cf.exceptionVec, staCfg).asUInt.orR
val s1_isvec = RegEnable(s0_out.isvec, false.B, s0_fire)
s1_kill := s1_in.uop.robIdx.needFlush(io.redirect) || s1_tlb_miss s1_kill := s1_in.uop.robIdx.needFlush(io.redirect) || s1_tlb_miss
s1_ready := !s1_valid || s1_kill || s2_ready s1_ready := !s1_valid || s1_kill || s2_ready
...@@ -153,7 +218,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -153,7 +218,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
// Send TLB feedback to store issue queue // Send TLB feedback to store issue queue
// Store feedback is generated in store_s1, sent to RS in store_s2 // Store feedback is generated in store_s1, sent to RS in store_s2
io.feedback_slow.valid := s1_fire io.feedback_slow.valid := s1_fire && !s1_isvec
io.feedback_slow.bits.hit := !s1_tlb_miss io.feedback_slow.bits.hit := !s1_tlb_miss
io.feedback_slow.bits.flushState := io.tlb.resp.bits.ptwBack io.feedback_slow.bits.flushState := io.tlb.resp.bits.ptwBack
io.feedback_slow.bits.rsIdx := s1_in.rsIdx io.feedback_slow.bits.rsIdx := s1_in.rsIdx
...@@ -166,12 +231,12 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -166,12 +231,12 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
io.feedback_slow.bits.dataInvalidSqIdx := DontCare io.feedback_slow.bits.dataInvalidSqIdx := DontCare
// issue // issue
io.issue.valid := s1_valid && !s1_tlb_miss io.issue.valid := s1_valid && !s1_tlb_miss && !s1_isvec
io.issue.bits := RegEnable(s0_in, s0_valid) io.issue.bits := RegEnable(io.stin.bits, s0_valid)
// rs feedback // rs feedback
val s1_feedback = Wire(Valid(new RSFeedback)) val s1_feedback = Wire(Valid(new RSFeedback))
s1_feedback.valid := s1_valid & !s1_in.isHWPrefetch s1_feedback.valid := s1_valid && !s1_in.isHWPrefetch && !s1_isvec
s1_feedback.bits.hit := !s1_tlb_miss s1_feedback.bits.hit := !s1_tlb_miss
s1_feedback.bits.flushState := io.tlb.resp.bits.ptwBack s1_feedback.bits.flushState := io.tlb.resp.bits.ptwBack
s1_feedback.bits.rsIdx := s1_out.rsIdx s1_feedback.bits.rsIdx := s1_out.rsIdx
...@@ -183,6 +248,18 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -183,6 +248,18 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
s1_feedback.bits.rsIdx s1_feedback.bits.rsIdx
) )
// TODO: VLSU, implement vector feedback
val s1_vec_feedback = Wire(Valid(new VSFQFeedback))
s1_vec_feedback.valid := s1_valid && !s1_in.isHWPrefetch && s1_isvec
s1_vec_feedback.bits.fqIdx := s1_out.fqIdx
s1_vec_feedback.bits.hit := !s1_tlb_miss
s1_vec_feedback.bits.sourceType := RSFeedbackType.tlbMiss
XSDebug(s1_vec_feedback.valid,
"Vector S1 Store: tlbHit: %d fqIdx: %d\n",
s1_vec_feedback.bits.hit,
s1_vec_feedback.bits.fqIdx
)
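On the receiving side, the vector store flow queue (not part of this commit) would presumably use fqIdx to mark the offending flow for replay whenever hit is false. A sketch under that assumption, with VsFlowSize = 32 taken from the config:

import chisel3._
import chisel3.util._

// Hypothetical consumer of VSFQFeedback inside the vector store flow queue.
class VsfqFeedbackSinkSketch extends Module {
  val io = IO(new Bundle {
    val fbValid = Input(Bool())
    val fbFqIdx = Input(UInt(log2Ceil(32).W))
    val fbHit   = Input(Bool())
  })
  val needReplay = RegInit(VecInit(Seq.fill(32)(false.B)))
  when (io.fbValid && !io.fbHit) {
    needReplay(io.fbFqIdx) := true.B   // tlb miss: reschedule this flow later
  }
}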
// get paddr from dtlb, check if rollback is needed // get paddr from dtlb, check if rollback is needed
// writeback store inst to lsq // writeback store inst to lsq
s1_out := s1_in s1_out := s1_in
...@@ -190,10 +267,11 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -190,10 +267,11 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
s1_out.miss := false.B s1_out.miss := false.B
s1_out.mmio := s1_mmio s1_out.mmio := s1_mmio
s1_out.atomic := s1_mmio s1_out.atomic := s1_mmio
s1_out.uop.cf.exceptionVec(storePageFault) := io.tlb.resp.bits.excp(0).pf.st s1_out.uop.cf.exceptionVec(storePageFault) := io.tlb.resp.bits.excp(0).pf.st && s1_exp
s1_out.uop.cf.exceptionVec(storeAccessFault) := io.tlb.resp.bits.excp(0).af.st s1_out.uop.cf.exceptionVec(storeAccessFault) := io.tlb.resp.bits.excp(0).af.st && s1_exp
io.lsq.valid := s1_valid // TODO: VLSU, implement vector store queue
io.lsq.valid := s1_valid && !s1_isvec
io.lsq.bits := s1_out io.lsq.bits := s1_out
io.lsq.bits.miss := s1_tlb_miss io.lsq.bits.miss := s1_tlb_miss
...@@ -215,6 +293,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -215,6 +293,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
val s2_kill = Wire(Bool()) val s2_kill = Wire(Bool())
val s2_can_go = s3_ready val s2_can_go = s3_ready
val s2_fire = s2_valid && !s2_kill && s2_can_go val s2_fire = s2_valid && !s2_kill && s2_can_go
val s2_exp = RegEnable(s1_out.exp, true.B, s1_fire)
s2_ready := !s2_valid || s2_kill || s3_ready s2_ready := !s2_valid || s2_kill || s3_ready
when (s1_fire) { s2_valid := true.B } when (s1_fire) { s2_valid := true.B }
...@@ -237,12 +316,16 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -237,12 +316,16 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
s2_out := s2_in s2_out := s2_in
s2_out.mmio := s2_mmio && !s2_exception s2_out.mmio := s2_mmio && !s2_exception
s2_out.atomic := s2_in.atomic || s2_pmp.atomic s2_out.atomic := s2_in.atomic || s2_pmp.atomic
s2_out.uop.cf.exceptionVec(storeAccessFault) := s2_in.uop.cf.exceptionVec(storeAccessFault) || s2_pmp.st s2_out.uop.cf.exceptionVec(storeAccessFault) := (s2_in.uop.cf.exceptionVec(storeAccessFault) || s2_pmp.st) && s2_exp
// feedback tlb miss to RS in store_s2 // feedback tlb miss to RS in store_s2
io.feedback_slow.valid := RegNext(s1_feedback.valid && !s1_out.uop.robIdx.needFlush(io.redirect)) io.feedback_slow.valid := RegNext(s1_feedback.valid && !s1_out.uop.robIdx.needFlush(io.redirect))
io.feedback_slow.bits := RegNext(s1_feedback.bits) io.feedback_slow.bits := RegNext(s1_feedback.bits)
// TODO: VLSU, implement vector feedback
io.vec_feedback_slow.valid := RegNext(s1_vec_feedback.valid && !s1_out.uop.robIdx.needFlush(io.redirect))
io.vec_feedback_slow.bits := RegNext(s1_vec_feedback.bits)
// mmio and exception // mmio and exception
io.lsq_replenish := s2_out io.lsq_replenish := s2_out
...@@ -336,10 +419,11 @@ class StoreUnit(implicit p: Parameters) extends XSModule { ...@@ -336,10 +419,11 @@ class StoreUnit(implicit p: Parameters) extends XSModule {
XSPerfAccumulate("s0_in_valid", s0_valid) XSPerfAccumulate("s0_in_valid", s0_valid)
XSPerfAccumulate("s0_in_fire", s0_fire) XSPerfAccumulate("s0_in_fire", s0_fire)
XSPerfAccumulate("s0_in_fire_first_issue", s0_fire && s0_isFirstIssue) XSPerfAccumulate("s0_in_fire_first_issue", s0_fire && s0_isFirstIssue)
XSPerfAccumulate("s0_addr_spec_success", s0_fire && s0_saddr(VAddrBits-1, 12) === s0_in.src(0)(VAddrBits-1, 12)) // TODO: VLSU, implement vector addr spec
XSPerfAccumulate("s0_addr_spec_failed", s0_fire && s0_saddr(VAddrBits-1, 12) =/= s0_in.src(0)(VAddrBits-1, 12)) XSPerfAccumulate("s0_addr_spec_success", s0_fire && !s0_isvec && s0_saddr(VAddrBits-1, 12) === io.stin.bits.src(0)(VAddrBits-1, 12))
XSPerfAccumulate("s0_addr_spec_success_once", s0_fire && s0_saddr(VAddrBits-1, 12) === s0_in.src(0)(VAddrBits-1, 12) && s0_isFirstIssue) XSPerfAccumulate("s0_addr_spec_failed", s0_fire && !s0_isvec && s0_saddr(VAddrBits-1, 12) =/= io.stin.bits.src(0)(VAddrBits-1, 12))
XSPerfAccumulate("s0_addr_spec_failed_once", s0_fire && s0_saddr(VAddrBits-1, 12) =/= s0_in.src(0)(VAddrBits-1, 12) && s0_isFirstIssue) XSPerfAccumulate("s0_addr_spec_success_once", s0_fire && !s0_isvec && s0_saddr(VAddrBits-1, 12) === io.stin.bits.src(0)(VAddrBits-1, 12) && s0_isFirstIssue)
XSPerfAccumulate("s0_addr_spec_failed_once", s0_fire && !s0_isvec && s0_saddr(VAddrBits-1, 12) =/= io.stin.bits.src(0)(VAddrBits-1, 12) && s0_isFirstIssue)
XSPerfAccumulate("s1_in_valid", s1_valid) XSPerfAccumulate("s1_in_valid", s1_valid)
XSPerfAccumulate("s1_in_fire", s1_fire) XSPerfAccumulate("s1_in_fire", s1_fire)
......
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
class DummyVectorLsq(implicit p: Parameters) extends BaseVectorLsq {
io := DontCare
}
\ No newline at end of file
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
class FlowFreeList (size: Int, freeWidth: Int, maxIdxNum: Int, moduleName: String = "")(implicit p: Parameters)
extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle(){
val allocReq = Flipped(Decoupled(UInt(log2Up(maxIdxNum + 1).W)))
val idxValue = Output(Vec(maxIdxNum,UInt(log2Ceil(size).W)))
val free = Input(UInt(size.W))
})
def getRemBits(input: UInt)(rem: Int): UInt = {
VecInit((0 until size / freeWidth).map(i => {
input(freeWidth * i + rem)
})).asUInt
}
class UopFreeListPtr extends CircularQueuePtr[UopFreeListPtr](size)
object UopFreeListPtr {
def apply(f: Boolean, v: Int): UopFreeListPtr = {
val ptr = Wire(new UopFreeListPtr)
ptr.flag := f.B
ptr.value := v.U
ptr
}
}
// flowFreeList ptr
val headPtr = RegInit(UopFreeListPtr(false, 0))
val tailPtr = RegInit(UopFreeListPtr(true, 0))
val freeList = RegInit(0.U(size.W))
val freeSelMaskVec = Wire(Vec(freeWidth, UInt(size.W)))
val freeSelMask = Wire(UInt(size.W))
val freeListBankBool = Wire(Vec(freeWidth, Bool()))
val IdxValueVec = Wire(Vec(freeWidth, UInt(log2Ceil(size).W)))
val flowFreeList = RegInit(VecInit(Seq.tabulate(size)(i => i.U(log2Up(size).W))))
io.allocReq.ready := distanceBetween(tailPtr,headPtr) >= 16.U
io.idxValue := DontCare
when (io.allocReq.fire) {
for (i <- 0 until maxIdxNum) {
when (i.U < io.allocReq.bits) {
val deqPtr = headPtr + i.U
io.idxValue(i) := flowFreeList(deqPtr.value)
}
}
}
when (io.allocReq.fire) {
headPtr := headPtr + io.allocReq.bits
}
freeSelMask := freeSelMaskVec.reduce(_|_)
freeList := (io.free | freeList) & ~freeSelMask
val freeListBank = VecInit(Seq.tabulate(freeWidth)(i => getRemBits(freeList & ~freeSelMask)(i)))
val freeIdxValueVec = VecInit(Seq.tabulate(freeWidth)(i => {
val value = PriorityEncoder(freeListBank(i))
Cat(value,i.U(log2Up(freeWidth).W))
}))
for (i <- 0 until freeWidth) {
freeListBankBool(i) := RegNext(freeListBank(i).orR)
IdxValueVec(i) := RegNext(freeIdxValueVec(i))
freeSelMaskVec(i) := Mux(freeListBankBool(i),UIntToOH(IdxValueVec(i)),0.U)
val enqPtr = tailPtr + PopCount(freeListBankBool.take(i))
flowFreeList(enqPtr.value) := IdxValueVec(i)
}
tailPtr := tailPtr + PopCount(freeListBankBool)
}
\ No newline at end of file
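Usage sketch for FlowFreeList: a requester asks for up to maxIdxNum indices in one shot through the Decoupled allocReq port (ready guarantees enough headroom for a whole request), reads the granted indices from idxValue in the same cycle, and later returns entries by pulsing the corresponding bits of the free bitmask. The instantiation parameters below are assumptions for illustration:

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._

class FlowFreeListUserSketch(implicit p: Parameters) extends Module {
  val freeList = Module(new FlowFreeList(size = 32, freeWidth = 4, maxIdxNum = 16))
  val wantFlows = 4.U                       // flows needed by the current uop
  freeList.io.allocReq.valid := true.B
  freeList.io.allocReq.bits  := wantFlows
  val granted  = freeList.io.allocReq.fire  // whole request granted at once
  val firstIdx = freeList.io.idxValue(0)    // entries 0 until wantFlows are meaningful
  freeList.io.free := 0.U                   // no entries returned in this sketch
}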
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.cache._
class VlsqPtr(implicit p: Parameters) extends CircularQueuePtr[VlsqPtr](
p => p(XSCoreParamsKey).VlsQueueSize
){
}
object VlsqPtr {
def apply(f: Bool, v: UInt)(implicit p: Parameters): VlsqPtr = {
val ptr = Wire(new VlsqPtr)
ptr.flag := f
ptr.value := v
ptr
}
}
// Intblock to VLSU IO
class Int2VLSUIO(implicit p: Parameters) extends XSBundle {
// base addr and stride from int block
val in = Vec(LoadPipelineWidth, Decoupled(new ExuInput)) // base addr and stride from int block
// For now, the load RS only supports 1 src operand,
// so only VL* unit-stride instructions are supported
}
// Vecblock to VLSU IO
class Vec2VLSUIO(implicit p: Parameters) extends XSBundle {
// mask, address offsets, store data from vec block
val in = Vec(VecMemSrcInWidth, Decoupled(new VecMemOperand))
}
// VLSU to Vecblock IO
class VLSU2VecIO(implicit p: Parameters) extends XSBundle {
val out = Vec(LoadPipelineWidth, Decoupled(new VecLoadResult))
}
// VLSU to Intblock IO
class VLSU2IntIO(implicit p: Parameters) extends XSBundle {
// commit vector load and store
// data field in ExuOutput is not used here
val out = Vec(VecMemInstWbWidth, Decoupled(new ExuOutput))
}
// VLSU to Ctrlblock IO
class VLSU2CtrlIO(implicit p: Parameters) extends XSBundle {
// provide vlsqidx for ctrl block
val enq = new VlsqEnqIO
}
// Vector load/store source operand input
class VecMemOperand(implicit p: Parameters) extends XSBundle {
val mask = UInt(128.W)
val vs2 = UInt(128.W) // address offsets
val vs3 = UInt(128.W) // store data
val avd = UInt(5.W) // architectural vector register destination
// CHANGEME: update physical vector register destination width
val pvd = UInt(8.W) // physical vector register destination
val vlsqidx = new VlsqPtr
}
// Vector load/store ctrl info
class VecMemCtrl(implicit p: Parameters) extends XSBundle {
val vm = Bool()
val vwidth = UInt(3.W) // width field in inst
val mew = Bool()
val mop = UInt(2.W)
val nf = UInt(3.W) // nf field is 3 bits: inst(31, 29)
val xumop = UInt(5.W) // lumop or sumop
def Inst2VecMemCtrl(inst: UInt): VecMemCtrl = {
val ctrl = Wire(new VecMemCtrl)
ctrl.nf := inst(31, 29)
ctrl.mew := inst(28)
ctrl.mop := inst(27, 26)
ctrl.vm := inst(25)
ctrl.xumop := inst(24, 20)
ctrl.vwidth := inst(14, 12)
ctrl
}
def fromInst(inst: UInt) = {
nf := inst(31, 29)
mew := inst(28)
mop := inst(27, 26)
vm := inst(25)
xumop := inst(24, 20)
vwidth := inst(14, 12)
}
}
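As a concrete decode: for an unmasked unit-stride vle32.v, the standard RVV encoding has nf = 000, mew = 0, mop = 00, vm = 1, lumop = 00000 and width = 110 (32-bit elements), so fromInst recovers exactly those values. A snippet, assuming it runs inside an XSModule with a hypothetical 32-bit instruction word inst in scope:

val ctrl = Wire(new VecMemCtrl)
ctrl.fromInst(inst)
// For unit-stride vle32.v this yields:
//   nf = 0.U, mew = false.B, mop = 0.U, vm = true.B, xumop = 0.U, vwidth = "b110".U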
// Extended micro-op for vector load/store
class VecMicroOp(implicit p: Parameters) extends MicroOp {
val vecmemCtrl = new VecMemCtrl
}
// Vector load result
class VecLoadResult(implicit p: Parameters) extends XSBundle {
val data = UInt(128.W) // hardcoded for now
val debug = new DebugBundle
}
// Vector load store queue enqueue IO
class VlsqEnqIO(implicit p: Parameters) extends XSBundle {
val canAccept = Output(Bool())
val needAlloc = Vec(exuParameters.LsExuCnt, Input(Bool()))
val req = Vec(VecMemDispatchWidth, Flipped(ValidIO(new VecMicroOp)))
val resp = Vec(VecMemDispatchWidth, Output(new VlsqPtr))
}
class BaseVectorLsq(implicit p: Parameters) extends XSModule {
val io = IO(new Bundle() {
val int2vlsu = Flipped(new Int2VLSUIO)
val vec2vlsu = Flipped(new Vec2VLSUIO)
val vlsu2vec = new VLSU2VecIO
val vlsu2int = new VLSU2IntIO
val vlsu2ctrl = new VLSU2CtrlIO
})
}
\ No newline at end of file
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
class VecOperand(implicit p: Parameters) extends XSBundleWithMicroOp {
val vmask = UInt(VLEN.W) // the mask of the inst, read from the vector register
val vecData = UInt(VLEN.W)
val baseaddr = UInt(VAddrBits.W) // base address from rs1
val stride = UInt(XLEN.W) // stride from rs2
val index = UInt(VLEN.W) // index from vs2
val pvd = UInt(5.W) // physical vector register destination
val lmul = UInt(3.W)
val sew = UInt(2.W)
val vma = Bool()
val vta = Bool()
val inner_idx = UInt(3.W) // the index of this uop among the (up to) 8 uops of one inst
val vl = UInt(8.W)
// TODO: How will OOO calculate vector register numbers?
// (EEW / SEW) * LMUL or (vl * EEW) / VLEN ?
// So OOO will always use eew ?
// val eew = UInt(3.W)
val total_num = UInt(3.W) // how many uops this inst splits into
}
class VecDecode(implicit p: Parameters) extends XSBundle {
val uop_segment_num = UInt(3.W)
val uop_type = UInt(2.W)
val mask_en = Bool()
val uop_unit_stride_whole_reg = Bool()
val uop_unit_stride_mask = Bool()
val uop_unit_stride_fof = Bool()
val uop_eew = UInt(3.W) // this is also the index width when the inst is an indexed load
def apply(inst: UInt) = {
this.uop_segment_num := inst(31, 29)
this.uop_type := inst(27, 26)
this.mask_en := inst(25)
this.uop_unit_stride_whole_reg := (inst(24,20) === "b01000".U)
this.uop_unit_stride_mask := (inst(24,20) === "b01011".U)
this.uop_unit_stride_fof := (inst(24,20) === "b10000".U)
this.uop_eew := inst(14, 12)
this
}
}
class OnlyVecExuOutput(implicit p: Parameters) extends XSBundle {
val isvec = Bool()
val vecdata = UInt(VLEN.W)
val mask = UInt((VLEN/8).W)
val rob_idx_valid = Vec(2, Bool())
val inner_idx = Vec(2, UInt(3.W))
val rob_idx = Vec(2, new RobPtr)
val offset = Vec(2, UInt(4.W))
val reg_offset = Vec(2, UInt(4.W))
val exp = Bool()
val is_first_ele = Bool()
val exp_ele_index = UInt(8.W)
}
class VecExuOutput(implicit p: Parameters) extends ExuOutput {
val vec = new OnlyVecExuOutput
}
class Uop2Flow(implicit p: Parameters) extends ExuInput(isVpu = true){
val vstart = UInt(8.W)
val mask = UInt(16.W)
val eew = UInt(3.W)
val emul = UInt(3.W)
val instType = UInt(3.W)
val uop_unit_stride_fof = Bool()
val uop_unit_whole_reg = Bool()
val alignedType = UInt(2.W)
val uop_segment_num = UInt(3.W)
}
object MulNum {
def apply (mul: UInt): UInt = { //mul means emul or lmul
(LookupTree(mul,List(
"b101".U -> 1.U , // 1/8
"b110".U -> 1.U , // 1/4
"b111".U -> 1.U , // 1/2
"b000".U -> 1.U , // 1
"b001".U -> 2.U , // 2
"b010".U -> 4.U , // 4
"b011".U -> 8.U // 8
)))}
}
/**
* when emul is greater than or equal to 1, the entire register is written;
* otherwise, only the specified number of bytes are written */
object MulDataSize {
def apply (mul: UInt): UInt = { //mul means emul or lmul
(LookupTree(mul,List(
"b101".U -> 2.U , // 1/8
"b110".U -> 4.U , // 1/4
"b111".U -> 8.U , // 1/2
"b000".U -> 16.U , // 1
"b001".U -> 16.U , // 2
"b010".U -> 16.U , // 4
"b011".U -> 16.U // 8
)))}
}
object OneRegNum {
def apply (eew: UInt): UInt = { // eew: element width; returns elements per vector register
(LookupTree(eew,List(
"b000".U -> 16.U , // 1
"b101".U -> 8.U , // 2
"b110".U -> 4.U , // 4
"b111".U -> 2.U // 8
)))}
}
// bytes read per element by indexed insts (determined by sew)
object SewDataSize {
def apply (sew: UInt): UInt = {
(LookupTree(sew,List(
"b000".U -> 1.U , // 1
"b001".U -> 2.U , // 2
"b010".U -> 4.U , // 4
"b011".U -> 8.U // 8
)))}
}
// bytes read per element by strided insts (determined by eew)
object EewDataSize {
def apply (eew: UInt): UInt = {
(LookupTree(eew,List(
"b000".U -> 1.U , // 1
"b101".U -> 2.U , // 2
"b110".U -> 4.U , // 4
"b111".U -> 8.U // 8
)))}
}
object loadDataSize {
def apply (instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
(LookupTree(instType,List(
"b000".U -> MulDataSize(emul), // unit-stride
"b010".U -> EewDataSize(eew) , // strided
"b001".U -> SewDataSize(sew) , // indexed-unordered
"b011".U -> SewDataSize(sew) , // indexed-ordered
"b100".U -> EewDataSize(eew) , // segment unit-stride
"b110".U -> EewDataSize(eew) , // segment strided
"b101".U -> SewDataSize(sew) , // segment indexed-unordered
"b111".U -> SewDataSize(sew) // segment indexed-ordered
)))}
}
object GenVecLoadMask {
def apply (instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
val mask = Wire(UInt(16.W))
mask := UIntToOH(loadDataSize(instType = instType, emul = emul, eew = eew, sew = sew)) - 1.U
mask
}
}
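Worked numbers for the two helpers above, with VLEN = 128: a unit-stride flow with emul >= 1 moves MulDataSize = 16 bytes, so GenVecLoadMask returns UIntToOH(16) - 1 = 0xffff; a strided flow with eew = 110 (32-bit elements) moves EewDataSize = 4 bytes, giving 0x000f. The arithmetic, checked in plain Scala:

// Pure-Scala model of UIntToOH(n) - 1, i.e. a mask with the n low bits set.
def genMask(bytes: Int): Int = (1 << bytes) - 1
assert(genMask(16) == 0xffff)  // unit-stride, full 128-bit register
assert(genMask(4)  == 0x000f)  // strided, 32-bit element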
object storeDataSize {
def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
(LookupTree(instType,List(
"b000".U -> EewDataSize(eew) , // unit-stride, do not use
"b010".U -> EewDataSize(eew) , // strided
"b001".U -> SewDataSize(sew) , // indexed-unordered
"b011".U -> SewDataSize(sew) , // indexed-ordered
"b100".U -> EewDataSize(eew) , // segment unit-stride
"b110".U -> EewDataSize(eew) , // segment strided
"b101".U -> SewDataSize(sew) , // segment indexed-unordered
"b111".U -> SewDataSize(sew) // segment indexed-ordered
)))}
}
object GenVecStoreMask {
def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
val mask = Wire(UInt(16.W))
mask := UIntToOH(storeDataSize(instType = instType, eew = eew, sew = sew)) - 1.U
mask
}
}
/**
* these are used to extract the per-element address offset for indexed instructions */
object EewEq8 {
def apply(index:UInt, flow_inner_idx: UInt): UInt = {
(LookupTree(flow_inner_idx,List(
0.U -> index(7 ,0 ),
1.U -> index(15,8 ),
2.U -> index(23,16 ),
3.U -> index(31,24 ),
4.U -> index(39,32 ),
5.U -> index(47,40 ),
6.U -> index(55,48 ),
7.U -> index(63,56 ),
8.U -> index(71,64 ),
9.U -> index(79,72 ),
10.U -> index(87,80 ),
11.U -> index(95,88 ),
12.U -> index(103,96 ),
13.U -> index(111,104),
14.U -> index(119,112),
15.U -> index(127,120)
)))}
}
object EewEq16 {
def apply(index: UInt, flow_inner_idx: UInt): UInt = {
(LookupTree(flow_inner_idx, List(
0.U -> index(15, 0),
1.U -> index(31, 16),
2.U -> index(47, 32),
3.U -> index(63, 48),
4.U -> index(79, 64),
5.U -> index(95, 80),
6.U -> index(111, 96),
7.U -> index(127, 112)
)))}
}
object EewEq32 {
def apply(index: UInt, flow_inner_idx: UInt): UInt = {
(LookupTree(flow_inner_idx, List(
0.U -> index(31, 0),
1.U -> index(63, 32),
2.U -> index(95, 64),
3.U -> index(127, 96)
)))}
}
object EewEq64 {
def apply (index: UInt, flow_inner_idx: UInt): UInt = {
(LookupTree(flow_inner_idx, List(
0.U -> index(63, 0),
1.U -> index(127, 64)
)))}
}
object IndexAddr {
def apply (index: UInt, flow_inner_idx: UInt, eew: UInt): UInt = {
(LookupTree(eew,List(
"b000".U -> EewEq8 (index = index, flow_inner_idx = flow_inner_idx ), // Imm is 1 Byte // TODO: index maybe cross register
"b101".U -> EewEq16(index = index, flow_inner_idx = flow_inner_idx ), // Imm is 2 Byte
"b110".U -> EewEq32(index = index, flow_inner_idx = flow_inner_idx ), // Imm is 4 Byte
"b111".U -> EewEq64(index = index, flow_inner_idx = flow_inner_idx ) // Imm is 8 Byte
)))}
}
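// Worked example: eew = "b101" selects 16-bit index elements, so flow_inner_idx = 2
// returns index(47, 32), the third 16-bit offset held in the index register.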
/*
object RegFLowCnt {
def apply (emul: UInt, lmul:UInt, eew: UInt, uopIdx: UInt, flowIdx: UInt): UInt = {
(LookupTree(Cat(emul,lmul),List(
"b001000".U -> ((uopIdx(0 ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 2,lmul = 1
"b010000".U -> ((uopIdx(1,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 4,lmul = 1
"b011000".U -> ((uopIdx(2,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 8,lmul = 1
"b010001".U -> ((uopIdx(0 ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 4,lmul = 2
"b011001".U -> ((uopIdx(1,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 8,lmul = 2
"b011010".U -> ((uopIdx(0 ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx) //emul = 8,lmul = 4
)))}
}
object AddrFLowCnt {
def apply (emul: UInt, lmul:UInt, sew:UInt, uopIdx: UInt, flowIdx: UInt):UInt = {
(LookupTree(Cat(lmul,emul),List(
"b001000".U -> ((uopIdx(0 ) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 2, emul = 1
"b010000".U -> ((uopIdx(1,0) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 4, emul = 1
"b011000".U -> ((uopIdx(2,0) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 8, emul = 1
"b010001".U -> ((uopIdx(0 ) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 4, emul = 2
"b011001".U -> ((uopIdx(1,0) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 8, emul = 2
"b011011".U -> ((uopIdx(0 ) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx) //lmul = 8, emul = 4
)))}
}
*/
object RegFLowCnt {
def apply (emulNum: UInt, lmulNum:UInt, eew: UInt, uopIdx: UInt, flowIdx: UInt):UInt = {
(LookupTree(emulNum/lmulNum,List(
//"d1".U -> flowIdx,
"d2".U -> ((uopIdx(0 ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),
"d4".U -> ((uopIdx(1,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),
"d8".U -> ((uopIdx(2,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx)
)))}
}
object AddrFLowCnt {
def apply (emulNum: UInt, lmulNum:UInt, sew:UInt, uopIdx: UInt, flowIdx: UInt):UInt = {
(LookupTree(lmulNum/emulNum,List(
"d1".U -> flowIdx,
"d2".U -> ((uopIdx(0 ) << Log2Num((16.U >> sew(1,0)).asUInt)).asUInt + flowIdx),
"d4".U -> ((uopIdx(1,0) << Log2Num((16.U >> sew(1,0)).asUInt)).asUInt + flowIdx),
"d8".U -> ((uopIdx(2,0) << Log2Num((16.U >> sew(1,0)).asUInt)).asUInt + flowIdx)
)))}
}
object Log2Num {
def apply (num: UInt): UInt = {
(LookupTree(num,List(
16.U -> 4.U,
8.U -> 3.U,
4.U -> 2.U,
2.U -> 1.U,
1.U -> 0.U
)))}
}
/**
  * When emul <= 1, the segment-field index (nf) equals uopIdx;
  * when emul = 2 it equals uopIdx(2,1), and so on. */
object GenSegNfIdx {
def apply (mul: UInt, uopIdx: UInt):UInt = { // mul means lmul or emul
(LookupTree(mul,List(
"b101".U -> uopIdx , // 1/8
"b110".U -> uopIdx , // 1/4
"b111".U -> uopIdx , // 1/2
"b000".U -> uopIdx , // 1
"b001".U -> uopIdx(2,1), // 2
"b010".U -> uopIdx(2) , // 4
"b011".U -> 0.U //8
)))}
}
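// Worked example: with mul = "b001" (LMUL/EMUL = 2) each segment field spans two
// registers, so uopIdx = "b101" maps to field index uopIdx(2,1) = "b10" (field 2).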
object GenSegNfIdxMul {
def apply (emul: UInt, lmul: UInt, uopIdx: UInt):UInt = {
(LookupTree(Cat(emul,lmul),List(
"b001000".U -> uopIdx(5,1), //emul = 2,lmul = 1
"b010000".U -> uopIdx(5,2), //emul = 4,lmul = 1
"b011000".U -> uopIdx(5,3), //emul = 8,lmul = 1
"b010001".U -> uopIdx(5,3), //emul = 4,lmul = 2
"b011001".U -> uopIdx(5,4), //emul = 8,lmul = 2
"b011010".U -> uopIdx(5,5) //emul = 8,lmul = 4
)))}
}
/**
  * When emul <= 1 there is only one register per field, so segMulIdx is 0.U;
  * when emul = 2 it equals uopIdx(0), and so on. */
object GenSegMulIdx {
def apply (mul: UInt, uopIdx: UInt): UInt = { //mul means emul or lmul
(LookupTree(mul,List(
"b101".U -> 0.U , // 1/8
"b110".U -> 0.U , // 1/4
"b111".U -> 0.U , // 1/2
"b000".U -> 0.U , // 1
"b001".U -> uopIdx(0) , // 2
"b010".U -> uopIdx(1,0), // 4
"b011".U -> uopIdx(2,0) //8
)))}
}
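// Worked example: with mul = "b010" (4 registers per field), uopIdx = "b110"
// selects register uopIdx(1,0) = "b10", i.e. the third register of its field.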
//eew decode
object EewLog2 {
def apply (eew: UInt): UInt = {
(LookupTree(eew,List(
"b000".U -> "b000".U , // 1
"b101".U -> "b001".U , // 2
"b110".U -> "b010".U , // 4
"b111".U -> "b011".U // 8
)))}
}
/**
  * Unit-stride instructions do not use this method. Other instructions derive
  * realFlowNum as MulDataSize >> eew(1,0), where MulDataSize is the number of
  * bytes to be written to the register group and eew(1,0) encodes the number
  * of bytes written per flow. */
object GenRealFlowNum {
def apply (instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
(LookupTree(instType,List(
"b000".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // store use, load do not use
"b010".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // strided
"b001".U -> Mux(!emul(2) && !lmul(2) && emul > lmul,(MulDataSize(emul) >> eew(1,0)).asUInt,(MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-unordered
"b011".U -> Mux(!emul(2) && !lmul(2) && emul > lmul,(MulDataSize(emul) >> eew(1,0)).asUInt,(MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-ordered
"b100".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // segment unit-stride
"b110".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // segment strided
"b101".U -> Mux(!emul(2) && !lmul(2) && emul > lmul,(MulDataSize(emul) >> eew(1,0)).asUInt,(MulDataSize(lmul) >> sew(1,0)).asUInt), // segment indexed-unordered
"b111".U -> Mux(!emul(2) && !lmul(2) && emul > lmul,(MulDataSize(emul) >> eew(1,0)).asUInt,(MulDataSize(lmul) >> sew(1,0)).asUInt) // segment indexed-ordered
)))}
}
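// A minimal software sketch of the flow counting above (not the RTL; it assumes
// VLEN = 128 and that MulDataSize maps a non-fractional mul encoding to
// 16 << log2Mul bytes):
object RealFlowNumModel {
  def flows(log2Emul: Int, log2ElemBytes: Int): Int =
    (16 << log2Emul) >> log2ElemBytes
  // e.g. EMUL = 2 with 4-byte elements: flows(1, 2) = 8 flows
}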
object GenEleIdx {
def apply (instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt, uopIdx:UInt, flowIdx: UInt):UInt = {
val eleIdx = Wire(UInt(7.W))
    when (instType(1,0) === "b00".U || instType(1,0) === "b10".U || (!emul(2) && !lmul(2) && emul > lmul)) {
eleIdx := (uopIdx << Log2Num((MulDataSize(emul) >> eew(1,0)).asUInt)).asUInt + flowIdx
}.otherwise {
eleIdx := (uopIdx << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx
}
eleIdx
}
}
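// Worked example: for a strided uop with 8 flows per uop, Log2Num(8.U) = 3.U, so
// eleIdx = (uopIdx << 3) + flowIdx; uopIdx = 2 and flowIdx = 5 give element 21.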
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
class VectorLoadWrapperIOBundle(implicit p: Parameters) extends XSBundle {
val loadRegIn = Vec(VecLoadPipelineWidth, Flipped(Decoupled(new ExuInput(isVpu = true))))
val loadPipleIn = Vec(VecLoadPipelineWidth, Flipped(Decoupled(new VecExuOutput())))
val Redirect = Flipped(ValidIO(new Redirect))
val loadPipeOut = Vec(VecLoadPipelineWidth, Decoupled(new VecLoadPipeBundle()))
val vecFeedback = Vec(VecLoadPipelineWidth, ValidIO(Bool()))
val vecLoadWriteback = Vec(VecLoadPipelineWidth, Decoupled(new ExuOutput(isVpu = true)))
}
class VectorLoadWrapper(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new VectorLoadWrapperIOBundle())
val loadInstDec = Wire(Vec(VecLoadPipelineWidth,new VecDecode()))
val eew = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
val sew = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
val lmul = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
val emul = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
val isSegment = Wire(Vec(VecLoadPipelineWidth, Bool()))
val instType = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
val uop_unit_stride_fof = Wire(Vec(VecLoadPipelineWidth, Bool()))
val uop_unit_whole_reg = Wire(Vec(VecLoadPipelineWidth, Bool()))
  val uop_segment_num = Wire(Vec(VecLoadPipelineWidth, UInt(3.W))) // nf field is 3 bits; Bool() would truncate it
val realFlowNum = Wire(Vec(VecLoadPipelineWidth, UInt(5.W)))
for (i <- 0 until VecLoadPipelineWidth) {
loadInstDec(i).apply(io.loadRegIn(i).bits.uop.cf.instr)
eew(i) := loadInstDec(i).uop_eew
sew(i) := io.loadRegIn(i).bits.uop.ctrl.vconfig.vtype.vsew
lmul(i) := io.loadRegIn(i).bits.uop.ctrl.vconfig.vtype.vlmul
emul(i) := EewLog2(eew(i)) - sew(i) + lmul(i)
isSegment(i) := loadInstDec(i).uop_segment_num =/= "b000".U && !loadInstDec(i).uop_unit_stride_whole_reg
instType(i) := Cat(isSegment(i), loadInstDec(i).uop_type)
uop_unit_stride_fof(i) := loadInstDec(i).uop_unit_stride_fof
uop_unit_whole_reg(i) := loadInstDec(i).uop_unit_stride_whole_reg
uop_segment_num(i) := loadInstDec(i).uop_segment_num
realFlowNum(i) := GenRealFlowNum(instType = instType(i), emul = emul(i), lmul = lmul(i), eew = eew(i), sew = sew(i))
}
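  // emul is computed in the log2 domain: log2(EMUL) = log2(eewBytes) - log2(sewBytes) + log2(LMUL).
  // E.g. 32-bit eew (EewLog2 = 2), 16-bit sew (1) and LMUL = 2 (vlmul = 1) give emul = 2, i.e. EMUL = 4;
  // fractional LMUL values are carried in 3-bit two's complement.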
val vlFlowQueue = Module(new VlFlowQueue())
val vlUopQueue = Module(new VlUopQueue())
vlUopQueue.io.Redirect <> io.Redirect
vlFlowQueue.io.Redirect <> io.Redirect
for (i <- 0 until VecLoadPipelineWidth) {
io.loadRegIn(i).ready := vlUopQueue.io.loadRegIn(i).ready && vlFlowQueue.io.loadRegIn(i).ready
io.vecFeedback(i).valid := vlUopQueue.io.uopVecFeedback(i).valid && vlFlowQueue.io.flowFeedback(i).valid
io.vecFeedback(i).bits := vlUopQueue.io.uopVecFeedback(i).bits && vlFlowQueue.io.flowFeedback(i).bits
vlUopQueue.io.loadRegIn(i).valid := io.loadRegIn(i).valid && (vlFlowQueue.io.loadRegIn(i).ready || vlFlowQueue.io.flowFeedback(i).bits)
vlUopQueue.io.loadRegIn(i).bits := io.loadRegIn(i).bits
vlFlowQueue.io.loadRegIn(i).valid := io.loadRegIn(i).valid && (vlUopQueue.io.loadRegIn(i).ready || vlUopQueue.io.uopVecFeedback(i).bits)
vlFlowQueue.io.loadRegIn(i).bits := io.loadRegIn(i).bits
}
vlUopQueue.io.instType := instType
vlUopQueue.io.emul := emul
vlUopQueue.io.realFlowNum := realFlowNum
vlUopQueue.io.loadPipeIn <> io.loadPipleIn
vlUopQueue.io.vecLoadWriteback <> io.vecLoadWriteback
vlUopQueue.io.fof := uop_unit_stride_fof
vlUopQueue.io.whole_reg := uop_unit_whole_reg
vlFlowQueue.io.eew := eew
vlFlowQueue.io.sew := sew
vlFlowQueue.io.emul := emul
vlFlowQueue.io.instType := instType
vlFlowQueue.io.uop_unit_stride_fof := uop_unit_stride_fof
vlFlowQueue.io.whole_reg := uop_unit_whole_reg
vlFlowQueue.io.uop_segment_num := uop_segment_num
vlFlowQueue.io.realFlowNum := realFlowNum
vlFlowQueue.io.loadPipeOut <> io.loadPipeOut
}
\ No newline at end of file
(This diff has been collapsed.)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3.{util, _}
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
class VlUopFreeList(size: Int, allocWidth: Int, maxIdxNum: Int, freeWidth: Int, moduleName: String = "")(implicit p: Parameters)
extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle(){
val accllReq = Vec(allocWidth,Flipped(Decoupled(UInt(log2Up(maxIdxNum + 1).W))))
val idxValue = Output(Vec(allocWidth,Vec(maxIdxNum,UInt(log2Ceil(size).W))))
val free = Input(UInt(size.W))
})
def getRemBits(input: UInt)(rem: Int): UInt = {
VecInit((0 until size / freeWidth).map(i => {input(freeWidth * i + rem)})).asUInt
}
class UopFreeListPtr extends CircularQueuePtr[UopFreeListPtr](size)
object UopFreeListPtr {
def apply(f: Boolean, v: Int): UopFreeListPtr = {
val ptr = Wire(new UopFreeListPtr)
ptr.flag := f.B
ptr.value := v.U
ptr
}
}
// uopFreeList ptr
val headPtr = RegInit(UopFreeListPtr(false,0))
val tailPtr = RegInit(UopFreeListPtr(true,0))
val freeMask = RegInit(0.U(size.W))
val freeSelMaskVec = Wire(Vec(freeWidth,UInt(size.W)))
val freeSelMask = Wire(UInt(size.W))
val freeListBankBool = Wire(Vec(freeWidth,Bool()))
val IdxValueVec = Wire(Vec(freeWidth,UInt(log2Ceil(size).W)))
//FreeList initialize
val uopFreeList = RegInit(VecInit(Seq.tabulate(size)(i => i.U(log2Up(size).W))))
for (i <- 0 until allocWidth) {
    io.accllReq(i).ready := distanceBetween(tailPtr, headPtr) >= 16.U // FIXME: conservative threshold; may be optimized
}
//idxValue dequeue
  io.idxValue := DontCare
  for (i <- 0 until allocWidth) {
when (io.accllReq(i).fire) {
for (j <- 0 until maxIdxNum) {
when (j.U < io.accllReq(i).bits) {
val deqPtr = Wire(new UopFreeListPtr)
        // Each port must skip past the entries allocated by port 0 in the same cycle
        if (i == 0) {
          deqPtr := headPtr + j.U
        } else {
          when (io.accllReq(0).fire) {
            deqPtr := headPtr + io.accllReq(0).bits + j.U
          }.otherwise {
            deqPtr := headPtr + j.U
          }
        }
io.idxValue(i)(j) := uopFreeList(deqPtr.value)
}
}
}
}
when (io.accllReq(0).fire && io.accllReq(1).fire) {
headPtr := headPtr + io.accllReq(0).bits + io.accllReq(1).bits
}.otherwise {
for (i <- 0 until allocWidth) {
when (io.accllReq(i).fire) {
headPtr := headPtr + io.accllReq(i).bits
}
}
}
//idxValue enqueue
freeSelMask := freeSelMaskVec.reduce(_|_)
freeMask := (io.free | freeMask) & ~freeSelMask
  val freeListBank = VecInit(Seq.tabulate(freeWidth)(i => getRemBits(freeMask & ~freeSelMask)(i)))
val freeIdxValueVec = VecInit(Seq.tabulate(freeWidth)(i => {
    val value = PriorityEncoder(freeListBank(i))
Cat(value,i.U(log2Up(freeWidth).W))
}))
for (i <- 0 until freeWidth) {
    freeListBankBool(i) := RegNext(freeListBank(i).orR)
IdxValueVec(i) := RegNext(freeIdxValueVec(i))
freeSelMaskVec(i) := Mux(freeListBankBool(i),UIntToOH(IdxValueVec(i)),0.U)
val enqPtr = tailPtr + PopCount(freeListBankBool.take(i))
uopFreeList(enqPtr.value) := IdxValueVec(i)
}
tailPtr := tailPtr + PopCount(freeListBankBool)
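  // Recycling is two-stage: freed indices accumulate in freeMask, the mask is
  // split into freeWidth banks, and at most one index per bank re-enters the
  // free list each cycle; freeSelMask clears the re-enqueued bits so an index
  // cannot be selected twice.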
}
\ No newline at end of file
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3.{util, _}
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
class VluopPtr(implicit p: Parameters) extends CircularQueuePtr[VluopPtr](
p => p(XSCoreParamsKey).VlUopSize
)
object VluopPtr {
def apply(f: Bool, v: UInt)(implicit p: Parameters): VluopPtr = {
val ptr = Wire(new VluopPtr)
ptr.flag := f
ptr.value := v
ptr
}
}
object VecGenMask {
def apply(rob_idx_valid: Vec[Bool], reg_offset: Vec[UInt], offset: Vec[UInt], mask: Vec[UInt]):Vec[UInt] = {
val vMask = VecInit(Seq.fill(2)(0.U(16.W)))
for (i <- 0 until 2){
when (rob_idx_valid(i)) {
when (offset(i) <= reg_offset(i)) {
vMask(i) := mask(i) << (reg_offset(i) - offset(i))
}.otherwise {
vMask(i) := mask(i) >> (offset(i) - reg_offset(i))
}
}
}
vMask
}
}
object VecGenData {
def apply (rob_idx_valid: Vec[Bool], reg_offset: Vec[UInt], offset: Vec[UInt], data:UInt):Vec[UInt] = {
val vData = VecInit(Seq.fill(2)(0.U(128.W)))
for (i <- 0 until 2){
when (rob_idx_valid(i)) {
when (offset(i) <= reg_offset(i)) {
vData(i) := data << ((reg_offset(i) - offset(i)) << 3.U)
}.otherwise {
vData(i) := data >> ((offset(i) - reg_offset(i)) << 3.U)
}
}
}
vData
}
}
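// A minimal software sketch of the byte-alignment rule above (not the RTL):
// data shifts left when the destination byte offset leads the flow offset,
// and right otherwise; masks shift by bytes, data by 8x as many bits.
object VecAlignModel {
  def alignData(data: BigInt, regOffset: Int, offset: Int): BigInt =
    if (offset <= regOffset) data << ((regOffset - offset) * 8)
    else data >> ((offset - regOffset) * 8)
  // e.g. regOffset = 4, offset = 1 shifts the data left by 24 bits
}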
class VluopBundle(implicit p: Parameters) extends XSBundle {
val uop = new MicroOp
val dataVMask = Vec(VLEN/8,Bool())
val data = Vec(VLEN/8,UInt(8.W))
val fof = Bool()
val excp_eew_index = UInt(8.W)
val exceptionVec = ExceptionVec()
def apply (uop: MicroOp, fof: Bool) = {
this.uop := uop
this.fof := fof
this
}
}
class VlUopQueueIOBundle(implicit p: Parameters) extends XSBundle {
val loadRegIn = Vec(VecLoadPipelineWidth, Flipped(DecoupledIO(new ExuInput(isVpu = true))))
val Redirect = Flipped(ValidIO(new Redirect))
val instType = Vec(VecLoadPipelineWidth, Input(UInt(3.W)))
val fof = Vec(VecLoadPipelineWidth, Input(Bool()))
val whole_reg = Vec(VecLoadPipelineWidth, Input(Bool()))
val emul = Vec(VecLoadPipelineWidth, Input(UInt(3.W)))
val realFlowNum = Vec(VecLoadPipelineWidth, Input(UInt(5.W)))
val loadPipeIn = Vec(VecLoadPipelineWidth, Flipped(DecoupledIO(new VecExuOutput)))
val uopVecFeedback = Vec(VecLoadPipelineWidth,ValidIO(Bool()))
val vecLoadWriteback = Vec(VecLoadPipelineWidth,DecoupledIO(new ExuOutput(isVpu = true)))
}
class VlUopQueue(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper
{
val io = IO(new VlUopQueueIOBundle())
println("LoadUopQueue: size:" + VlUopSize)
val VluopEntry = Reg(Vec(VlUopSize, new VluopBundle))
  // For example, one instruction may split into 4 uops;
  // when the first uop arrives, all 4 entries become valid and pre_allocated
val valid = RegInit(VecInit(Seq.fill(VlUopSize)(false.B)))
val pre_allocated = RegInit(VecInit(Seq.fill(VlUopSize)(false.B)))
  // When a uop actually arrives, its entry becomes allocated
val allocated = RegInit(VecInit(Seq.fill(VlUopSize)(false.B)))
  // When both data and pdest are ready, this entry is finished
val finished = RegInit(VecInit(Seq.fill(VlUopSize)(false.B)))
  val counter = RegInit(VecInit(Seq.fill(VlUopSize)(0.U(5.W)))) // wide enough to hold realFlowNum (up to 16)
val realFlowNum = Wire(Vec(VecLoadPipelineWidth, UInt(5.W)))
val vend = Wire(Vec(VecLoadPipelineWidth, UInt(5.W)))
val already_in = WireInit(VecInit(Seq.fill(VecLoadPipelineWidth)(false.B)))
val already_in_vec = WireInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(VlUopSize)(false.B)))))
val enq_valid = WireInit(VecInit(Seq.fill(VecLoadPipelineWidth)(false.B)))
val instType = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
val mul = Wire(Vec(VecLoadPipelineWidth, UInt(3.W)))
  val loadRegInValid = WireInit(VecInit(Seq.fill(VecLoadPipelineWidth)(false.B)))
val needFlush = WireInit(VecInit(Seq.fill(VlUopSize)(false.B)))
val uopNum = Wire(Vec(VecLoadPipelineWidth, UInt(4.W)))
  val free = WireInit(VecInit(Seq.fill(VecLoadPipelineWidth)(0.U(VlUopSize.W))))
//First-level buffer
val buffer_valid_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(false.B)))
val data_buffer_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(0.U(VLEN.W))))
val mask_buffer_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(2)(0.U((VLEN/8).W))))))
val rob_idx_valid_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(2)(false.B)))))
val inner_idx_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(2)(0.U(3.W))))))
val rob_idx_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(2)(0.U.asTypeOf(new RobPtr))))))
val reg_offset_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(2)(0.U(4.W))))))
val offset_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(VecInit(Seq.fill(2)(0.U(4.W))))))
val uop_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(0.U.asTypeOf(new MicroOp))))
val excp_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(false.B)))
val is_first_ele_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(false.B)))
val excep_ele_index_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(0.U(8.W))))
val exceptionVec_s0 = RegInit(VecInit(Seq.fill(VecLoadPipelineWidth)(0.U.asTypeOf(ExceptionVec()))))
val vlUopFreeList = Module(new VlUopFreeList(size=VlUopSize,
allocWidth = VecLoadPipelineWidth,
maxIdxNum = 8,
freeWidth = 4,
moduleName = "vlUopFreeList"))
def getRemBits(input: UInt)(rem: Int): UInt = {
VecInit((0 until VlUopSize / VecLoadPipelineWidth).map(i => {
input(VecLoadPipelineWidth * i + rem)
})).asUInt
}
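  // getRemBits views the entry vector as VecLoadPipelineWidth interleaved banks:
  // bank rem holds entries rem, rem + W, rem + 2W, ..., so each writeback port
  // arbitrates only within its own bank.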
for (i <- 0 until VecLoadPipelineWidth) {
io.loadRegIn(i).ready := vlUopFreeList.io.accllReq(i).ready
io.loadPipeIn(i).ready := true.B
}
/**
  * Check whether the unit-stride uop has already been enqueued */
  for (i <- 0 until VecLoadPipelineWidth) {
    // debug_hit must live outside the entry loop so it can observe all entries at once
    val debug_hit = WireInit(VecInit(Seq.fill(VlUopSize)(false.B))) // for debug
    for (entry <- 0 until VlUopSize) {
      already_in_vec(i)(entry) := VluopEntry(entry).uop.robIdx.value === io.loadRegIn(i).bits.uop.robIdx.value &&
                                  VluopEntry(entry).uop.ctrl.uopIdx === io.loadRegIn(i).bits.uop.ctrl.uopIdx &&
                                  pre_allocated(entry)
      when (already_in_vec(i)(entry) && io.loadRegIn(i).valid) {
        VluopEntry(entry).apply(uop = io.loadRegIn(i).bits.uop, fof = io.fof(i))
        allocated(entry) := true.B
        debug_hit(entry) := true.B
      }
    }
    assert(PopCount(debug_hit) <= 1.U, "VlUopQueue Multi-Hit!")
  }
for (i <- 0 until VecLoadPipelineWidth) {
vend(i) := DontCare
already_in(i) := already_in_vec(i).asUInt.orR
instType(i) := io.instType(i)
loadRegInValid(i) := !io.loadRegIn(i).bits.uop.robIdx.needFlush(io.Redirect) && io.loadRegIn(i).fire
enq_valid(i) := !already_in(i) && loadRegInValid(i)
    mul(i) := Mux(instType(i)(1,0) === "b00".U || instType(i)(1,0) === "b10".U, io.emul(i), io.loadRegIn(i).bits.uop.ctrl.vconfig.vtype.vlmul)
    when (instType(i) === "b000".U) {
vend(i) := io.loadRegIn(i).bits.src(0)(3,0) + MulDataSize(mul=io.emul(i))
realFlowNum(i) := vend(i)(4) +& (vend(i)(3,0) =/= 0.U).asUInt
      uopNum(i) := io.loadRegIn(i).bits.uop.ctrl.total_num // TODO: if the inst splits into 4 uops, total_num = 4
}.otherwise {
realFlowNum(i) := io.realFlowNum(i)
uopNum(i) := 1.U
}
}
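  // For unit-stride, vend adds the start offset within a 16-byte block to the
  // access size (assuming MulDataSize gives the byte count), so the flow count is
  // the number of 16-byte blocks touched: vend(4) full blocks plus one more when
  // vend(3,0) is nonzero. E.g. offset 0xc with 16 bytes gives vend = 0x1c -> 2 flows.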
/**
* Only unit-stride instructions use vecFeedback
*/
for (i <- 0 until VecLoadPipelineWidth) {
io.uopVecFeedback(i).valid := io.loadRegIn(i).valid
io.uopVecFeedback(i).bits := already_in(i)
}
//uop enqueue
dontTouch(io.loadRegIn)
for (i <- 0 until VecLoadPipelineWidth) {
vlUopFreeList.io.accllReq(i) := DontCare
when (enq_valid(i)) {
vlUopFreeList.io.accllReq(i).valid := true.B
vlUopFreeList.io.accllReq(i).bits := uopNum(i)
for (j <- 0 until 8) {
when (j.U < uopNum(i)) {
val enqPtr = vlUopFreeList.io.idxValue(i)(j)
val inUop = WireInit(io.loadRegIn(i).bits.uop)
          inUop.ctrl.uopIdx := Mux(instType(i) === "b000".U, j.U, io.loadRegIn(i).bits.uop.ctrl.uopIdx) // TODO: if flows don't write the load queue, ldIdx need not be calculated
VluopEntry(enqPtr).apply(uop = inUop, fof = io.fof(i))
valid(enqPtr) := true.B
pre_allocated(enqPtr) := true.B
counter(enqPtr) := realFlowNum(i)
          when ((instType(i) === "b000".U && j.U === io.loadRegIn(i).bits.uop.ctrl.uopIdx) || instType(i) =/= "b000".U) {
allocated(enqPtr) := true.B
}
}
}
}
}
// write data from loadpipe to first_level buffer
for (i <- 0 until VecLoadPipelineWidth) {
when (io.loadPipeIn(i).fire) {
buffer_valid_s0(i) := true.B
data_buffer_s0(i) := io.loadPipeIn(i).bits.vec.vecdata
rob_idx_valid_s0(i) := io.loadPipeIn(i).bits.vec.rob_idx_valid
rob_idx_s0(i) := io.loadPipeIn(i).bits.vec.rob_idx
inner_idx_s0(i) := io.loadPipeIn(i).bits.vec.inner_idx
reg_offset_s0(i) := io.loadPipeIn(i).bits.vec.reg_offset
offset_s0(i) := io.loadPipeIn(i).bits.vec.offset
uop_s0(i) := io.loadPipeIn(i).bits.uop
excp_s0(i) := io.loadPipeIn(i).bits.vec.exp
is_first_ele_s0(i) := io.loadPipeIn(i).bits.vec.is_first_ele
excep_ele_index_s0(i) := io.loadPipeIn(i).bits.vec.exp_ele_index
exceptionVec_s0(i) := io.loadPipeIn(i).bits.uop.cf.exceptionVec
for (j <- 0 until 2) {
mask_buffer_s0(i)(j) := io.loadPipeIn(i).bits.vec.mask << io.loadPipeIn(i).bits.vec.offset(j)
}
}.otherwise {
buffer_valid_s0(i) := false.B
rob_idx_valid_s0(i) := VecInit(Seq.fill(2)(false.B))
}
}
  // write data from the first-level buffer to VluopEntry
for (i <- 0 until VecLoadPipelineWidth) {
val mask_buffer = VecGenMask(rob_idx_valid = rob_idx_valid_s0(i),
reg_offset = reg_offset_s0(i),
offset = offset_s0(i),
mask = mask_buffer_s0(i))
val data_buffer = VecGenData(rob_idx_valid = rob_idx_valid_s0(i),
reg_offset = reg_offset_s0(i),
offset = offset_s0(i),
data = data_buffer_s0(i))
for (j <- 0 until 2) {
when(buffer_valid_s0(i) && rob_idx_valid_s0(i)(j)) {
for (entry <- 0 until VlUopSize) {
when(rob_idx_s0(i)(j).value === VluopEntry(entry).uop.robIdx.value &&
inner_idx_s0(i)(j) === VluopEntry(entry).uop.ctrl.uopIdx) {
counter(entry) := counter(entry) - 1.U
for (k <- 0 until VLEN / 8) {
when(mask_buffer(j)(k)) {
VluopEntry(entry).data(k) := data_buffer(j)(k * 8 + 7, k * 8)
VluopEntry(entry).dataVMask(k) := mask_buffer(j)(k)
}
}
when(excp_s0(i)) {
when(VluopEntry(entry).fof) {
            when(VluopEntry(entry).uop.robIdx.value === 0.U && is_first_ele_s0(i)) {
VluopEntry(entry).excp_eew_index := excep_ele_index_s0(i)
VluopEntry(entry).exceptionVec := exceptionVec_s0(i)
}
}.otherwise {
when(VluopEntry(entry).excp_eew_index < excep_ele_index_s0(i)) {
VluopEntry(entry).excp_eew_index := excep_ele_index_s0(i)
VluopEntry(entry).exceptionVec := exceptionVec_s0(i)
}
}
}
}
}
}
}
}
  // finished means the entry's data is complete and it can write back
for (entry <- 0 until VlUopSize) {
finished(entry) := valid(entry) && allocated(entry) && counter(entry) === 0.U
}
/**
  * Dequeue logic */
val vlUopQueueBank = VecInit(Seq.tabulate(VecLoadPipelineWidth)(i => getRemBits(finished.asUInt)(i)))
val deqPtr = VecInit(Seq.tabulate(VecLoadPipelineWidth)(i => {
val value = PriorityEncoder(vlUopQueueBank(i))
Cat(value,i.U(log2Up(VecLoadPipelineWidth).W))
}))
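  // The dequeue pointer reverses the banked split: Cat(position, bank) equals
  // position * VecLoadPipelineWidth + bank, matching the getRemBits layout.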
for (i <- 0 until VecLoadPipelineWidth) {
io.vecLoadWriteback(i).bits := DontCare
io.vecLoadWriteback(i).valid := valid(deqPtr(i)) && finished(deqPtr(i))
io.vecLoadWriteback(i).bits.uop := VluopEntry(deqPtr(i)).uop
io.vecLoadWriteback(i).bits.data := VluopEntry(deqPtr(i)).data.asUInt
when (io.vecLoadWriteback(i).fire) { //TODO:need optimization?
valid(deqPtr(i)) := false.B
allocated(deqPtr(i)) := false.B
pre_allocated(deqPtr(i)) := false.B
finished(deqPtr(i)) := false.B
VluopEntry(deqPtr(i)).dataVMask := VecInit(Seq.fill(VLEN / 8)(false.B))
free(i) := UIntToOH(deqPtr(i))
}
}
/**
  * On redirect, flush matching entries */
for (entry <- 0 until VlUopSize) {
needFlush(entry) := VluopEntry(entry).uop.robIdx.needFlush(io.Redirect) && valid(entry)
when (needFlush(entry)) {
valid(entry) := false.B
allocated(entry) := false.B
pre_allocated(entry) := false.B
finished(entry) := false.B
VluopEntry(entry).dataVMask := VecInit(Seq.fill(VLEN / 8)(false.B))
}
}
val lastRedirect = RegNext(io.Redirect)
when (lastRedirect.valid) {
vlUopFreeList.io.free := RegNext(needFlush.asUInt)
}.otherwise {
vlUopFreeList.io.free := free.reduce(_|_)
}
}
\ No newline at end of file
(This diff has been collapsed.)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3.{util, _}
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
class VsUopFreeList(size: Int, allocWidth: Int, freeWidth: Int, moduleName: String = "")(implicit p: Parameters)
extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new Bundle(){
val accllReq = Vec(allocWidth,Flipped(Decoupled(Bool())))
val idxValue = Output(Vec(allocWidth,UInt(log2Ceil(size).W)))
val free = Input(UInt(size.W))
})
def getRemBits(input: UInt)(rem: Int): UInt = {
VecInit((0 until size / freeWidth).map(i => {input(freeWidth * i + rem)})).asUInt
}
class UopFreeListPtr extends CircularQueuePtr[UopFreeListPtr](size)
object UopFreeListPtr {
def apply(f: Boolean, v: Int): UopFreeListPtr = {
val ptr = Wire(new UopFreeListPtr)
ptr.flag := f.B
ptr.value := v.U
ptr
}
}
// uopFreeList ptr
val headPtr = RegInit(UopFreeListPtr(false,0))
val tailPtr = RegInit(UopFreeListPtr(true,0))
val freeMask = RegInit(0.U(size.W))
val freeSelMaskVec = Wire(Vec(freeWidth,UInt(size.W)))
val freeSelMask = Wire(UInt(size.W))
val freeListBankBool = Wire(Vec(freeWidth,Bool()))
val IdxValueVec = Wire(Vec(freeWidth,UInt(log2Ceil(size).W)))
//FreeList initialize
val uopFreeList = RegInit(VecInit(Seq.tabulate(size)(i => i.U(log2Up(size).W))))
for (i <- 0 until allocWidth) {
io.accllReq(i).ready := distanceBetween(tailPtr,headPtr) >= 2.U
}
//idxValue dequeue
for (i <- 0 until allocWidth) {
val deqPtr = headPtr + PopCount(io.accllReq.map(_.fire).take(i))
io.idxValue(i) := uopFreeList(deqPtr.value)
}
headPtr := headPtr + PopCount(io.accllReq.map(_.fire))
//idxValue enqueue
freeSelMask := freeSelMaskVec.reduce(_|_)
freeMask := (io.free | freeMask) & ~freeSelMask
  val freeListBank = VecInit(Seq.tabulate(freeWidth)(i => getRemBits(freeMask & ~freeSelMask)(i)))
val freeIdxValueVec = VecInit(Seq.tabulate(freeWidth)(i => {
    val value = PriorityEncoder(freeListBank(i))
Cat(value,i.U(log2Up(freeWidth).W))
}))
for (i <- 0 until freeWidth) {
    freeListBankBool(i) := RegNext(freeListBank(i).orR)
IdxValueVec(i) := RegNext(freeIdxValueVec(i))
freeSelMaskVec(i) := Mux(freeListBankBool(i),UIntToOH(IdxValueVec(i)),0.U)
val enqPtr = tailPtr + PopCount(freeListBankBool.take(i))
uopFreeList(enqPtr.value) := IdxValueVec(i)
}
tailPtr := tailPtr + PopCount(freeListBankBool)
}
\ No newline at end of file
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import chipsalliance.rocketchip.config.Parameters
import chisel3.{util, _}
import chisel3.util._
import utils._
import utility._
import xiangshan._
class VsUopPtr(implicit p: Parameters) extends CircularQueuePtr[VsUopPtr](
p => p(XSCoreParamsKey).VsUopSize
)
object VsUopPtr {
def apply(f: Bool, v: UInt)(implicit p: Parameters): VsUopPtr = {
val ptr = Wire(new VsUopPtr)
ptr.flag := f
ptr.value := v
ptr
}
}
class VsUopQueueIOBundle (implicit p: Parameters) extends XSBundle {
val storeIn = Vec(VecStorePipelineWidth,Flipped(Decoupled(new ExuInput(isVpu = true))))
val vstart = Vec(VecStorePipelineWidth,Input(UInt(8.W)))
val Redirect = Flipped(ValidIO(new Redirect))
val uop2Flow = Vec(VecStorePipelineWidth,Decoupled(new Uop2Flow()))
}
class VsUopQueue(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper {
val io = IO(new VsUopQueueIOBundle())
println("StoreUopQueue: size:" + VsUopSize)
val valid = RegInit(VecInit(Seq.fill(VsUopSize)(false.B)))
val vsUopEntry = Reg(Vec(VsUopSize,new Uop2Flow()))
val loadInstDec = Wire(Vec(VecStorePipelineWidth,new VecDecode()))
val eew = Wire(Vec(VecStorePipelineWidth, UInt(3.W)))
val sew = Wire(Vec(VecStorePipelineWidth, UInt(3.W)))
val lmul = Wire(Vec(VecStorePipelineWidth, UInt(3.W)))
val emul = Wire(Vec(VecStorePipelineWidth, UInt(3.W)))
val isSegment = Wire(Vec(VecStorePipelineWidth, Bool()))
val instType = Wire(Vec(VecStorePipelineWidth, UInt(3.W)))
val storeInValid = WireInit(VecInit(Seq.fill(VecStorePipelineWidth)(false.B)))
val needFlush = WireInit(VecInit(Seq.fill(VsUopSize)(false.B)))
val free = WireInit(VecInit(Seq.fill(VecStorePipelineWidth)(0.U(VsUopSize.W))))
def getRemBits(input: UInt)(rem: Int): UInt = {
VecInit((0 until VsUopSize / VecStorePipelineWidth).map(i => {input(VecStorePipelineWidth * i + rem)})).asUInt
}
val uopFreeList = Module(new VsUopFreeList(
size = VsUopSize,
allocWidth = VecStorePipelineWidth,
freeWidth = 4,
moduleName = "vsUopFreeList"))
for (i <- 0 until VecStorePipelineWidth) {
io.storeIn(i).ready := uopFreeList.io.accllReq(i).ready
}
for (i <- 0 until VecStorePipelineWidth) {
storeInValid(i) := !io.storeIn(i).bits.uop.robIdx.needFlush(io.Redirect) && io.storeIn(i).fire
}
/**
  * On redirect, flush VsUopQueue */
for (entry <- 0 until VsUopSize) {
needFlush(entry) := vsUopEntry(entry).uop.robIdx.needFlush(io.Redirect) && valid(entry)
when(needFlush(entry)) {
valid(entry) := false.B
}
}
  val lastRedirect = RegNext(io.Redirect)
  when (lastRedirect.valid) {
uopFreeList.io.free := RegNext(needFlush.asUInt)
}.otherwise {
uopFreeList.io.free := free.reduce(_|_)
}
for (i <- 0 until VecStorePipelineWidth) {
loadInstDec(i).apply(io.storeIn(i).bits.uop.cf.instr)
eew(i) := loadInstDec(i).uop_eew
sew(i) := io.storeIn(i).bits.uop.ctrl.vconfig.vtype.vsew
lmul(i) := io.storeIn(i).bits.uop.ctrl.vconfig.vtype.vlmul
emul(i) := EewLog2(eew(i)) - sew(i) + lmul(i)
isSegment(i) := loadInstDec(i).uop_segment_num =/= "b000".U && !loadInstDec(i).uop_unit_stride_whole_reg
instType(i) := Cat(isSegment(i),loadInstDec(i).uop_type)
}
//enqueue
for (i <- 0 until VecStorePipelineWidth) {
uopFreeList.io.accllReq(i) := DontCare
when (storeInValid(i)) {
uopFreeList.io.accllReq(i).valid := true.B
val enqPtr = uopFreeList.io.idxValue(i)
vsUopEntry(enqPtr) := DontCare
valid (enqPtr) := true.B
vsUopEntry(enqPtr).src := io.storeIn(i).bits.src
vsUopEntry(enqPtr).uop := io.storeIn(i).bits.uop
vsUopEntry(enqPtr).vstart := io.vstart(i) //FIXME
vsUopEntry(enqPtr).mask := GenVecStoreMask(instType = instType(i), eew = eew(i), sew = sew(i))
vsUopEntry(enqPtr).eew := eew(i)
vsUopEntry(enqPtr).emul := emul(i)
vsUopEntry(enqPtr).instType := instType(i)
vsUopEntry(enqPtr).uop_unit_stride_fof := loadInstDec(i).uop_unit_stride_fof
vsUopEntry(enqPtr).uop_unit_whole_reg := loadInstDec(i).uop_unit_stride_whole_reg
vsUopEntry(enqPtr).uop_segment_num := loadInstDec(i).uop_segment_num
}
}
//dequeue
val UopQueueBank = VecInit(Seq.tabulate(VecStorePipelineWidth)(i => getRemBits(valid.asUInt)(i)))
val deqPtr = VecInit(Seq.tabulate(VecStorePipelineWidth)(i => {
val value = PriorityEncoder(UopQueueBank(i))
Cat(value,i.U(log2Up(VecStorePipelineWidth).W))
}))
for (i <- 0 until VecStorePipelineWidth) {
io.uop2Flow(i).bits := DontCare
    io.uop2Flow(i).valid := valid(deqPtr(i)) // FIXME: performance; one port may use an incorrect valid
io.uop2Flow(i).bits := vsUopEntry(deqPtr(i))
when (io.uop2Flow(i).fire) {
valid(deqPtr(i)) := false.B
free(i) := UIntToOH(deqPtr(i))
}
}
}
@@ -774,6 +774,34 @@ package object xiangshan {
    latency = UncertainLatency()
  )
val veclduCfg = FuConfig(
"vecldu",
null, // DontCare
(uop: MicroOp) => FuType.loadCanAccept(uop.ctrl.fuType),
FuType.ldu, 1, 0, writeIntRf = false, writeFpRf = false,
latency = UncertainLatency(),
exceptionOut = Seq(loadAddrMisaligned, loadAccessFault, loadPageFault),
flushPipe = true,
replayInst = true,
hasLoadError = true
)
val vecstaCfg = FuConfig(
"vecsta",
null,
(uop: MicroOp) => FuType.storeCanAccept(uop.ctrl.fuType),
FuType.stu, 1, 0, writeIntRf = false, writeFpRf = false,
latency = UncertainLatency(),
exceptionOut = Seq(storeAddrMisaligned, storeAccessFault, storePageFault)
)
val vecstdCfg = FuConfig(
"vecstd",
fuGen = stdGen, fuSel = (uop: MicroOp) => FuType.storeCanAccept(uop.ctrl.fuType), FuType.stu, 1, 1,
writeIntRf = false, writeFpRf = false, latency = CertainLatency(1)
)
  val JumpExeUnitCfg = ExuConfig("JmpExeUnit", "Int", Seq(jmpCfg, i2fCfg), 2, Int.MaxValue)
  val AluExeUnitCfg = ExuConfig("AluExeUnit", "Int", Seq(aluCfg), 0, Int.MaxValue)
  val JumpCSRExeUnitCfg = ExuConfig("JmpCSRExeUnit", "Int", Seq(jmpCfg, csrCfg, fenceCfg, i2fCfg), 2, Int.MaxValue)
@@ -788,6 +816,10 @@ package object xiangshan {
  val LdExeUnitCfg = ExuConfig("LoadExu", "Mem", Seq(lduCfg), wbIntPriority = 0, wbFpPriority = 0, extendsExu = false)
  val StaExeUnitCfg = ExuConfig("StaExu", "Mem", Seq(staCfg, mouCfg), wbIntPriority = Int.MaxValue, wbFpPriority = Int.MaxValue, extendsExu = false)
  val StdExeUnitCfg = ExuConfig("StdExu", "Mem", Seq(stdCfg, mouDataCfg), wbIntPriority = Int.MaxValue, wbFpPriority = Int.MaxValue, extendsExu = false)
// TODO: Backend for VLSU, fix vlsu exeunit config
val vecLdExeUnitCfg = ExuConfig("vecLoadExu", "Mem", Seq(veclduCfg), wbIntPriority = 0, wbFpPriority = 0, extendsExu = false)
val vecStaExeUnitCfg = ExuConfig("vecStaExu", "Mem", Seq(vecstaCfg, mouCfg), wbIntPriority = Int.MaxValue, wbFpPriority = Int.MaxValue, extendsExu = false)
val vecStdExeUnitCfg = ExuConfig("vecStdExu", "Mem", Seq(vecstdCfg, mouDataCfg), wbIntPriority = Int.MaxValue, wbFpPriority = Int.MaxValue, extendsExu = false)
  // indicates where the memory access request comes from
  // a duplicate of this is in HuanCun.common and CoupledL2.common
......