// LoadUnit.scala — load pipeline unit of the XiangShan memory subsystem
package xiangshan.mem

import chisel3._
import chisel3.util._
import utils._
import xiangshan._
import xiangshan.cache.{DCacheWordIO, TlbRequestIO, TlbCmd, MemoryOpConstants}
import xiangshan.backend.LSUOpType
import xiangshan.backend.fu.fpu.boxF32ToF64

/** IO bundle connecting the load pipeline to the load/store reorder queue (lsroq). */
class LoadToLsroqIO extends XSBundle {
  // Load result written into the lsroq at writeback (stage 5)
  val loadIn = ValidIO(new LsPipelineBundle)
  // Writeback path from the lsroq (e.g. previously missed loads) — arbitrated with hit loads onto io.ldout
  val ldout = Flipped(DecoupledIO(new ExuOutput))
  // Store-to-load forwarding query/response against older stores in the lsroq
  val forward = new LoadForwardQueryIO
}

/**
 * Load pipeline unit.
 *
 * Pipeline stages implemented here (numbering follows the original comments):
 *  - Stage 2: compute the virtual address, query the DTLB, and issue the DCache request.
 *  - Stage 3: DCache tag compare / data read happen inside the DCache; this stage sends
 *             TLB feedback and can kill the in-flight DCache request on redirect.
 *  - Stage 4: receive the DCache response and query the store buffer / lsroq for
 *             store-to-load forwarding; forwarded bytes override DCache bytes.
 *  - Stage 5: merge data, select/sign-extend the loaded value, write back to the lsroq
 *             and (for non-miss, non-MMIO loads) to the CDB via an arbiter.
 */
class LoadUnit extends XSModule {
  val io = IO(new Bundle() {
    val ldin = Flipped(Decoupled(new ExuInput))    // load uop issued into the pipeline
    val ldout = Decoupled(new ExuOutput)           // writeback to the common data bus
    val redirect = Flipped(ValidIO(new Redirect))  // flush/redirect from the backend
    val tlbFeedback = ValidIO(new TlbFeedback)     // TLB hit/miss feedback (for replay)
    val dcache = new DCacheWordIO
    val dtlb = new TlbRequestIO()
    val sbuffer = new LoadForwardQueryIO           // forwarding query into the store buffer
    val lsroq = new LoadToLsroqIO
  })

  when(io.ldin.valid){
    XSDebug("load enpipe %x iw %x fw %x\n", io.ldin.bits.uop.cf.pc, io.ldin.bits.uop.ctrl.rfWen, io.ldin.bits.uop.ctrl.fpWen)
  }

  //-------------------------------------------------------
  // Load Pipeline
  //-------------------------------------------------------

  val l2_out = Wire(Decoupled(new LsPipelineBundle))
  val l4_out = Wire(Decoupled(new LsPipelineBundle))
  val l5_in  = Wire(Flipped(Decoupled(new LsPipelineBundle)))

  //-------------------------------------------------------
  // LD Pipeline Stage 2
  // Generate addr, use addr to query DCache Tag and DTLB
  //-------------------------------------------------------

  val l2_dtlb_hit  = Wire(new Bool())
  val l2_dtlb_miss = Wire(new Bool())
  val l2_dcache = Wire(new Bool())
  val l2_mmio = Wire(new Bool())
  val isMMIOReq = Wire(new Bool())

  // send req to dtlb
  io.dtlb.req.valid := l2_out.valid
  io.dtlb.req.bits.vaddr := l2_out.bits.vaddr
  io.dtlb.req.bits.cmd := TlbCmd.read
  io.dtlb.req.bits.roqIdx := l2_out.bits.uop.roqIdx
  io.dtlb.req.bits.debug.pc := l2_out.bits.uop.cf.pc
  io.dtlb.req.bits.debug.lsroqIdx := l2_out.bits.uop.lsroqIdx // FIXME: need update

  l2_dtlb_hit  := io.dtlb.resp.valid && !io.dtlb.resp.bits.miss
  l2_dtlb_miss := io.dtlb.resp.valid && io.dtlb.resp.bits.miss
  isMMIOReq := AddressSpace.isMMIO(io.dtlb.resp.bits.paddr)
  // a TLB hit goes either to the DCache or down the MMIO path, never both
  l2_dcache := l2_dtlb_hit && !isMMIOReq
  l2_mmio   := l2_dtlb_hit && isMMIOReq

  // l2_out is used to generate dcache req
  l2_out.bits := DontCare
  l2_out.bits.vaddr := io.ldin.bits.src1 + io.ldin.bits.uop.ctrl.imm
  l2_out.bits.paddr := io.dtlb.resp.bits.paddr
  l2_out.bits.mask  := genWmask(l2_out.bits.vaddr, io.ldin.bits.uop.ctrl.fuOpType(1,0))
  l2_out.bits.uop   := io.ldin.bits.uop
  l2_out.bits.miss  := false.B
  l2_out.bits.mmio  := l2_mmio
  l2_out.valid := io.ldin.valid && !io.ldin.bits.uop.roqIdx.needFlush(io.redirect)
  // when we are sure it's a MMIO req, we do not need to wait for cache ready
  l2_out.ready := (l2_dcache && io.dcache.req.ready) || l2_mmio || l2_dtlb_miss
  io.ldin.ready := l2_out.ready

  // exception check: alignment depends on access size encoded in fuOpType(1,0)
  val addrAligned = LookupTree(io.ldin.bits.uop.ctrl.fuOpType(1,0), List(
    "b00".U   -> true.B,              //b
    "b01".U   -> (l2_out.bits.vaddr(0) === 0.U),   //h
    "b10".U   -> (l2_out.bits.vaddr(1,0) === 0.U), //w
    "b11".U   -> (l2_out.bits.vaddr(2,0) === 0.U)  //d
  ))
  l2_out.bits.uop.cf.exceptionVec(loadAddrMisaligned) := !addrAligned
  l2_out.bits.uop.cf.exceptionVec(loadPageFault) := io.dtlb.resp.bits.excp.pf.ld

  // send result to dcache
  // never send tlb missed or MMIO reqs to dcache
  io.dcache.req.valid     := l2_dcache

  io.dcache.req.bits.cmd  := MemoryOpConstants.M_XRD
  // TODO: vaddr
  io.dcache.req.bits.addr := io.dtlb.resp.bits.paddr
  io.dcache.req.bits.data := DontCare
  io.dcache.req.bits.mask := l2_out.bits.mask

  io.dcache.req.bits.meta.id       := DontCare
  io.dcache.req.bits.meta.vaddr    := l2_out.bits.vaddr
  io.dcache.req.bits.meta.paddr    := io.dtlb.resp.bits.paddr
  io.dcache.req.bits.meta.uop      := l2_out.bits.uop
  io.dcache.req.bits.meta.mmio     := isMMIOReq
  io.dcache.req.bits.meta.tlb_miss := io.dtlb.resp.bits.miss
  io.dcache.req.bits.meta.mask     := l2_out.bits.mask
  io.dcache.req.bits.meta.replay   := false.B

  val l2_tlbFeedback = Wire(new TlbFeedback)
  l2_tlbFeedback.hit := !io.dtlb.resp.bits.miss
  l2_tlbFeedback.roqIdx := l2_out.bits.uop.roqIdx

  // dump l2
  XSDebug(l2_out.valid, "L2: pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x dltb_miss %b dcache %b mmio %b\n",
    l2_out.bits.uop.cf.pc, l2_out.bits.vaddr, l2_out.bits.paddr,
    l2_out.bits.uop.ctrl.fuOpType, l2_out.bits.data, l2_out.bits.mask,
    l2_dtlb_miss, l2_dcache, l2_mmio)

  XSDebug(l2_out.fire(), "load req: pc 0x%x addr 0x%x -> 0x%x op %b\n",
    l2_out.bits.uop.cf.pc, l2_out.bits.vaddr, l2_out.bits.paddr, l2_out.bits.uop.ctrl.fuOpType)

  XSDebug(io.dcache.req.valid, p"dcache req(${io.dcache.req.valid} ${io.dcache.req.ready}): pc:0x${Hexadecimal(io.dcache.req.bits.meta.uop.cf.pc)} roqIdx:${io.dcache.req.bits.meta.uop.roqIdx} lsroqIdx:${io.dcache.req.bits.meta.uop.lsroqIdx} addr:0x${Hexadecimal(io.dcache.req.bits.addr)} vaddr:0x${Hexadecimal(io.dcache.req.bits.meta.vaddr)} paddr:0x${Hexadecimal(io.dcache.req.bits.meta.paddr)} mmio:${io.dcache.req.bits.meta.mmio} tlb_miss:${io.dcache.req.bits.meta.tlb_miss} mask:${io.dcache.req.bits.meta.mask}\n")

  //-------------------------------------------------------
  // LD Pipeline Stage 3
  // Compare tag, use addr to query DCache Data
  //-------------------------------------------------------

  val l3_valid = RegNext(l2_out.fire(), false.B)
  val l3_dtlb_miss = RegEnable(next = l2_dtlb_miss, enable = l2_out.fire(), init = false.B)
  val l3_dcache = RegEnable(next = l2_dcache, enable = l2_out.fire(), init = false.B)
  val l3_tlbFeedback = RegEnable(next = l2_tlbFeedback, enable = l2_out.fire())
  val l3_bundle = RegEnable(next = l2_out.bits, enable = l2_out.fire())
  val l3_uop = l3_bundle.uop
  // dltb miss reqs ends here
  val l3_passdown = l3_valid && !l3_dtlb_miss && !l3_uop.roqIdx.needFlush(io.redirect)

  io.tlbFeedback.valid := l3_valid
  io.tlbFeedback.bits := l3_tlbFeedback
  // kill the in-flight dcache access when the uop is flushed by a redirect
  io.dcache.s1_kill := l3_valid && l3_dcache && l3_uop.roqIdx.needFlush(io.redirect)

  // dump l3
  XSDebug(l3_valid, "l3: pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x dltb_miss %b dcache %b mmio %b\n",
    l3_bundle.uop.cf.pc, l3_bundle.vaddr, l3_bundle.paddr,
    l3_bundle.uop.ctrl.fuOpType, l3_bundle.data, l3_bundle.mask,
    l3_dtlb_miss, l3_dcache, l3_bundle.mmio)

  XSDebug(io.tlbFeedback.valid, "tlbFeedback: hit %b roqIdx %d\n",
    io.tlbFeedback.bits.hit, io.tlbFeedback.bits.roqIdx.asUInt)

  XSDebug(io.dcache.s1_kill, "l3: dcache s1_kill\n")

  // Done in Dcache

  //-------------------------------------------------------
  // LD Pipeline Stage 4
  // Dcache return result, do tag ecc check and forward check
  //-------------------------------------------------------

  val l4_valid = RegNext(l3_passdown, false.B)
  val l4_dcache = RegNext(l3_dcache, false.B)
  val l4_bundle = RegNext(l3_bundle)

  val fullForward = Wire(Bool())

  assert(!(io.dcache.resp.ready && !io.dcache.resp.valid), "DCache response got lost")
  io.dcache.resp.ready := l4_valid && l4_dcache
  when (io.dcache.resp.fire()) {
    l4_out.bits := DontCare
    l4_out.bits.data  := io.dcache.resp.bits.data
    l4_out.bits.paddr := io.dcache.resp.bits.meta.paddr
    l4_out.bits.uop   := io.dcache.resp.bits.meta.uop
    l4_out.bits.mmio  := io.dcache.resp.bits.meta.mmio
    l4_out.bits.mask  := io.dcache.resp.bits.meta.mask
    // when we can get the data completely from forward
    // we no longer need to access dcache
    // treat nack as miss
    l4_out.bits.miss  := Mux(fullForward, false.B,
      io.dcache.resp.bits.miss || io.dcache.resp.bits.nack)
    XSDebug(io.dcache.resp.fire(), p"DcacheResp(l4): data:0x${Hexadecimal(io.dcache.resp.bits.data)} paddr:0x${Hexadecimal(io.dcache.resp.bits.meta.paddr)} pc:0x${Hexadecimal(io.dcache.resp.bits.meta.uop.cf.pc)} roqIdx:${io.dcache.resp.bits.meta.uop.roqIdx} lsroqIdx:${io.dcache.resp.bits.meta.uop.lsroqIdx} miss:${io.dcache.resp.bits.miss}\n")
  } .otherwise {
    // no dcache response this cycle (e.g. MMIO / tlb-missed path): carry the pipelined bundle
    l4_out.bits := l4_bundle
  }
  l4_out.valid := l4_valid && !l4_out.bits.uop.roqIdx.needFlush(io.redirect)

  // Store addr forward match
  // If match, get data / fmask from store queue / store buffer

  // io.lsroq.forward := DontCare
  io.lsroq.forward.paddr := l4_out.bits.paddr
  io.lsroq.forward.mask := io.dcache.resp.bits.meta.mask
  io.lsroq.forward.lsroqIdx := l4_out.bits.uop.lsroqIdx
  io.lsroq.forward.sqIdx := l4_out.bits.uop.sqIdx
  io.lsroq.forward.uop := l4_out.bits.uop
  io.lsroq.forward.pc := l4_out.bits.uop.cf.pc
  io.lsroq.forward.valid := io.dcache.resp.valid //TODO: opt timing

  io.sbuffer.paddr := l4_out.bits.paddr
  io.sbuffer.mask := io.dcache.resp.bits.meta.mask
  io.sbuffer.lsroqIdx := l4_out.bits.uop.lsroqIdx
  io.sbuffer.sqIdx := l4_out.bits.uop.sqIdx
  io.sbuffer.uop := DontCare
  io.sbuffer.pc := l4_out.bits.uop.cf.pc
  io.sbuffer.valid := l4_out.valid

  // lsroq forwarding results take priority over sbuffer results (applied second below)
  val forwardVec = WireInit(io.sbuffer.forwardData)
  val forwardMask = WireInit(io.sbuffer.forwardMask)
  // generate XLEN/8 Muxs
  (0 until XLEN/8).map(j => {
    when(io.lsroq.forward.forwardMask(j)) {
      forwardMask(j) := true.B
      forwardVec(j) := io.lsroq.forward.forwardData(j)
    }
  })
  l4_out.bits.forwardMask := forwardMask
  l4_out.bits.forwardData := forwardVec
  // all bytes required by the access mask are covered by forwarding
  fullForward := (~l4_out.bits.forwardMask.asUInt & l4_out.bits.mask) === 0.U

  PipelineConnect(l4_out, l5_in, io.ldout.fire() || (l5_in.bits.miss || l5_in.bits.mmio) && l5_in.valid, false.B)

  XSDebug(l4_valid, "l4: out.valid:%d pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x forwardData: 0x%x forwardMask: %x dcache %b mmio %b miss:%d\n",
    l4_out.valid, l4_out.bits.uop.cf.pc, l4_out.bits.vaddr, l4_out.bits.paddr,
    l4_out.bits.uop.ctrl.fuOpType, l4_out.bits.data, l4_out.bits.mask,
    l4_out.bits.forwardData.asUInt, l4_out.bits.forwardMask.asUInt, l4_dcache, l4_out.bits.mmio, l4_out.bits.miss)

  XSDebug(l5_in.valid, "L5(%d %d): pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x forwardData: 0x%x forwardMask: %x\n",
    l5_in.valid, l5_in.ready, l5_in.bits.uop.cf.pc,  l5_in.bits.vaddr, l5_in.bits.paddr,
    l5_in.bits.uop.ctrl.fuOpType , l5_in.bits.data,  l5_in.bits.mask,
    l5_in.bits.forwardData.asUInt, l5_in.bits.forwardMask.asUInt)

  XSDebug(l4_valid, "l4: sbuffer forwardData: 0x%x forwardMask: %x\n",
    io.sbuffer.forwardData.asUInt, io.sbuffer.forwardMask.asUInt)

  XSDebug(l4_valid, "l4: lsroq forwardData: 0x%x forwardMask: %x\n",
    io.lsroq.forward.forwardData.asUInt, io.lsroq.forward.forwardMask.asUInt)

  XSDebug(io.redirect.valid,
    p"Redirect: excp:${io.redirect.bits.isException} flushPipe:${io.redirect.bits.isFlushPipe} misp:${io.redirect.bits.isMisPred} " +
    p"replay:${io.redirect.bits.isReplay} pc:0x${Hexadecimal(io.redirect.bits.pc)} target:0x${Hexadecimal(io.redirect.bits.target)} " +
    p"brTag:${io.redirect.bits.brTag} l2:${io.ldin.bits.uop.roqIdx.needFlush(io.redirect)} l3:${l3_uop.roqIdx.needFlush(io.redirect)} " +
    p"l4:${l4_out.bits.uop.roqIdx.needFlush(io.redirect)}\n"
  )

  //-------------------------------------------------------
  // LD Pipeline Stage 5
  // Do data ecc check, merge result and write back to LS ROQ
  // If cache hit, return writeback result to CDB
  //-------------------------------------------------------

  val loadWriteBack = l5_in.fire()

  // data merge: per byte, prefer forwarded store data over dcache data
  val rdata = VecInit((0 until 8).map(j => {
    Mux(l5_in.bits.forwardMask(j),
      l5_in.bits.forwardData(j),
      l5_in.bits.data(8*(j+1)-1, 8*j)
    )
  })).asUInt
  val func = l5_in.bits.uop.ctrl.fuOpType
  val raddr = l5_in.bits.paddr
  // shift the 64-bit word so the addressed byte is at bit 0
  val rdataSel = LookupTree(raddr(2, 0), List(
    "b000".U -> rdata(63, 0),
    "b001".U -> rdata(63, 8),
    "b010".U -> rdata(63, 16),
    "b011".U -> rdata(63, 24),
    "b100".U -> rdata(63, 32),
    "b101".U -> rdata(63, 40),
    "b110".U -> rdata(63, 48),
    "b111".U -> rdata(63, 56)
  ))
  // size/sign selection per load opcode
  val rdataPartialLoad = LookupTree(func, List(
      LSUOpType.lb   -> SignExt(rdataSel(7, 0) , XLEN),
      LSUOpType.lh   -> SignExt(rdataSel(15, 0), XLEN),
      LSUOpType.lw   -> SignExt(rdataSel(31, 0), XLEN),
      LSUOpType.ld   -> SignExt(rdataSel(63, 0), XLEN),
      LSUOpType.lbu  -> ZeroExt(rdataSel(7, 0) , XLEN),
      LSUOpType.lhu  -> ZeroExt(rdataSel(15, 0), XLEN),
      LSUOpType.lwu  -> ZeroExt(rdataSel(31, 0), XLEN),
      LSUOpType.flw  -> boxF32ToF64(rdataSel(31, 0))
  ))

  // ecc check
  // TODO

  // if hit, writeback result to CDB
  // val ldout = Vec(2, Decoupled(new ExuOutput))
  // when io.loadIn(i).fire() && !io.io.loadIn(i).miss, commit load to cdb
  val hitLoadOut = Wire(Decoupled(new ExuOutput))
  hitLoadOut.bits.uop := l5_in.bits.uop
  hitLoadOut.bits.data := rdataPartialLoad
  hitLoadOut.bits.fflags := DontCare
  hitLoadOut.bits.redirectValid := false.B
  hitLoadOut.bits.redirect := DontCare
  hitLoadOut.bits.brUpdate := DontCare
  hitLoadOut.bits.debug.isMMIO := l5_in.bits.mmio
  hitLoadOut.valid := l5_in.valid && !l5_in.bits.mmio && !l5_in.bits.miss // MMIO will be done in lsroq

  XSDebug(hitLoadOut.fire(), "load writeback: pc %x data %x (%x + %x(%b))\n",
    hitLoadOut.bits.uop.cf.pc, rdataPartialLoad, l5_in.bits.data,
    l5_in.bits.forwardData.asUInt, l5_in.bits.forwardMask.asUInt
  )

  // writeback to LSROQ
  // Current dcache use MSHR

  io.lsroq.loadIn.bits := l5_in.bits
  io.lsroq.loadIn.bits.data := rdataPartialLoad // for debug
  io.lsroq.loadIn.valid := loadWriteBack

  // pipeline control
  l5_in.ready := io.ldout.ready

  // arbitrate CDB writeback between hit loads (port 0) and lsroq (missed/MMIO) loads (port 1)
  val cdbArb = Module(new Arbiter(new ExuOutput, 2))
  io.ldout <> cdbArb.io.out
  hitLoadOut <> cdbArb.io.in(0)
  io.lsroq.ldout <> cdbArb.io.in(1) // missLoadOut

  when(io.ldout.fire()){
    XSDebug("ldout %x iw %x fw %x\n", io.ldout.bits.uop.cf.pc, io.ldout.bits.uop.ctrl.rfWen, io.ldout.bits.uop.ctrl.fpWen)
  }
}