package xiangshan.cache

import chisel3._
import chisel3.util._
import chisel3.ExcitingUtils._

import freechips.rocketchip.tilelink.{TLEdgeOut, TLBundleA, TLBundleD, TLBundleE, TLPermissions, TLArbiter, ClientMetadata}

class MissReq extends DCacheBundle
{
  val source = UInt(sourceTypeWidth.W)
  val cmd    = UInt(M_SZ.W)
  // must be aligned to block
  val addr   = UInt(PAddrBits.W)

  // store
  val store_data   = UInt((cfg.blockBytes * 8).W)
  val store_mask   = UInt(cfg.blockBytes.W)

  // which word does amo work on?
  val word_idx = UInt(log2Up(blockWords).W)
  val amo_data = UInt(DataBits.W)
  val amo_mask = UInt((DataBits/8).W)

  // coherence state
  val coh = new ClientMetadata
  val id  = UInt(reqIdWidth.W)

  def dump() = {
    XSDebug("MissReq source: %d cmd: %d addr: %x store_data: %x store_mask: %x word_idx: %d amo_data: %x amo_mask: %x coh: %d id: %d\n",
      source, cmd, addr, store_data, store_mask, word_idx, amo_data, amo_mask, coh.state, id)
  }
}

// One miss entry deals with one missed block
class MissEntry(edge: TLEdgeOut) extends DCacheModule
{
  val io = IO(new Bundle {
    // MSHR ID
    val id = Input(UInt())

    // client requests
    val req_valid = Input(Bool())
    // this entry is free and can be allocated to new reqs
    val primary_ready = Output(Bool())
    // this entry is busy, but it can merge the new req
    val secondary_ready = Output(Bool())
    // this entry is busy and it cannot merge the new req
    val secondary_reject = Output(Bool())
    val req    = Input((new MissReq))
    val refill = ValidIO(new Refill)

    // bus
    val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
    val mem_grant   = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
    val mem_finish  = DecoupledIO(new TLBundleE(edge.bundle))

    val pipe_req  = DecoupledIO(new MainPipeReq)
    val pipe_resp = Flipped(ValidIO(new MainPipeResp))
  })

  // MSHR:
  // 1. receive req
  // 2. send acquire req
  // 3. receive grant resp
  // 4. let main pipe do refill and replace
  // 5. wait for resp
  // 6. send finish to end the tilelink transaction
  //    We only send finish after data is written into cache.
  //    This prevents L2 from probing the block down.
  //    See Tilelink spec 1.8.1 page 69
  //    A slave should not issue a Probe if there is a pending GrantAck on the block. Once the Probe is
  //    issued, the slave should not issue further Probes on that block until it receives a ProbeAck.
  val s_invalid :: s_refill_req :: s_refill_resp :: s_main_pipe_req :: s_main_pipe_resp :: s_mem_finish :: Nil = Enum(6)

  val state = RegInit(s_invalid)

  // --------------------------------------------
  // internal registers
  val req = Reg(new MissReq)

  // param of grant
  val grant_param = Reg(UInt(TLPermissions.bdWidth.W))

  // record the source/sink info from Grant
  // so that we can use it in the GrantAck
  val grantack = Reg(Valid(new TLBundleE(edge.bundle)))

  // should we refill the data to load queue to wake up any missed load?
  val should_refill_data  = Reg(Bool())


  // --------------------------------------------
  // merge reqs
  // see whether we can merge incoming requests with this entry
  // s_invalid is not counted in,
  // since we cannot merge requests in that state
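  // acquire_not_sent: we are still trying to send the Acquire (it cannot fire this cycle),
  // so a new read/write to the same block may still piggyback on this entry.
  // data_not_refilled: the Grant(Data) has not fully come back yet,
  // so a new read to the same block can still be served by the upcoming refill.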
  val acquire_not_sent = state === s_refill_req && !io.mem_acquire.ready
  val data_not_refilled = state === s_refill_req || state === s_refill_resp

  def can_merge(new_req: MissReq): Bool = {
    // caution: never merge with AMO requests
    // MissQueue has no AMOALU, so it cannot compute the AMO result,
    // which would be needed to merge an AMO into this entry

    // before read acquire is fired, we can merge read or write
    val before_read_sent = acquire_not_sent && req.source === LOAD_SOURCE.U && (new_req.source === LOAD_SOURCE.U || new_req.source === STORE_SOURCE.U)
    // before read/write refills data to LoadQueue, we can merge any read
    val before_data_refill = data_not_refilled && (req.source === LOAD_SOURCE.U || req.source === STORE_SOURCE.U) && new_req.source === LOAD_SOURCE.U

    before_read_sent || before_data_refill
  }
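  // merge rules at a glance (derived from can_merge above):
  //   existing LOAD  + new LOAD/STORE, Acquire not yet sent  -> merge
  //   existing LOAD/STORE + new LOAD,  data not yet refilled -> merge
  //   anything involving an AMO request                      -> never merge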

  def should_merge(new_req: MissReq): Bool = {
    val block_match = req.addr === new_req.addr
    block_match && can_merge(new_req)
  }

  def should_reject(new_req: MissReq): Bool = {
    val block_match = req.addr === new_req.addr
    // do not reject any req when we are in s_invalid
    block_match && !can_merge(new_req) && state =/= s_invalid
  }

  io.primary_ready    := state === s_invalid
  io.secondary_ready  := should_merge(io.req)
  io.secondary_reject := should_reject(io.req)

  // should not allocate, merge or reject at the same time
  // one at a time
  OneHot.checkOneHot(Seq(io.primary_ready, io.secondary_ready, io.secondary_reject))


  // --------------------------------------------
  // assign default values to output signals
  io.refill.valid := false.B
  io.refill.bits  := DontCare

  io.mem_acquire.valid   := false.B
  io.mem_acquire.bits    := DontCare
  io.mem_grant.ready     := false.B
  io.mem_finish.valid    := false.B
  io.mem_finish.bits     := DontCare

  io.pipe_req.valid := false.B
  io.pipe_req.bits  := DontCare

  when (state =/= s_invalid) {
    XSDebug("entry: %d state: %d\n", io.id, state)
    req.dump()
  }


  // --------------------------------------------
  // State Machine

  // --------------------------------------------
  // receive requests
  // primary request: allocate for a new request
  when (io.req_valid && io.primary_ready) {
    assert (state === s_invalid)

    // re-init some fields
    req := io.req
    grantack.valid := false.B
    // only miss req from load needs a refill to LoadQueue
    should_refill_data := io.req.source === LOAD_SOURCE.U

    state := s_refill_req
  }

  // secondary request: merge with existing request
  when (io.req_valid && io.secondary_ready) {
    // The merged req should never carry a higher permission than ours,
    // which would mean the cache silently upgraded the permission of our block
    // without merging with this miss queue request!
    // Either our req came in with stale meta, or the req that upgraded the permission failed to merge with this one.
    // Both cases are DCache bugs.
    //
    // DCache may silently drop permissions (e.g. when probed or evicted),
    // but it should never silently upgrade permissions.
    //
    // TODO: please check TileLink Metadata.scala
    // and make sure that lower permissions are encoded as smaller numbers
    assert (io.req.coh.state <= req.coh.state)
    // use the most up-to-date meta
    req.coh := io.req.coh

    // when merging with a store,
    // we must record its info into our req,
    // or we will not be able to replay the store
    when (io.req.source === STORE_SOURCE.U) {
      req := io.req
    }

    should_refill_data := should_refill_data || io.req.source === LOAD_SOURCE.U
  }


  // --------------------------------------------
  // refill

  // for full overwrite, we can use AcquirePerm to save memory bandwidth
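  // store_mask.andR means every byte of the block will be written,
  // so the old data is not needed and L2 only has to grant permission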
  val full_overwrite = req.source === STORE_SOURCE.U && req.store_mask.andR
  when (state === s_refill_req) {

    val grow_param = req.coh.onAccess(req.cmd)._2
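    // grow_param above is the TileLink growPermissions argument returned by
    // ClientMetadata.onAccess (e.g. NtoB for a load, NtoT/BtoT for a store or AMO)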
    val acquireBlock = edge.AcquireBlock(
      fromSource      = io.id,
      toAddress       = req.addr,
      lgSize          = (log2Up(cfg.blockBytes)).U,
      growPermissions = grow_param)._2
    val acquirePerm = edge.AcquirePerm(
      fromSource      = io.id,
      toAddress       = req.addr,
      lgSize          = (log2Up(cfg.blockBytes)).U,
      growPermissions = grow_param)._2

    io.mem_acquire.valid := true.B
    io.mem_acquire.bits := Mux(full_overwrite, acquirePerm, acquireBlock)

    when (io.mem_acquire.fire()) {
      state := s_refill_resp
    }
  }

  val (_, _, refill_done, refill_count) = edge.count(io.mem_grant)

  // raw data
  val refill_data = Reg(Vec(blockRows, UInt(rowBits.W)))
  val new_data    = Wire(Vec(blockRows, UInt(rowBits.W)))
  val new_mask    = Wire(Vec(blockRows, UInt(rowBytes.W)))

  for (i <- 0 until blockRows) {
    new_data(i) := req.store_data(rowBits * (i + 1) - 1, rowBits * i)
    // we only need to merge data for Store
    new_mask(i) := Mux(req.source === STORE_SOURCE.U,
      req.store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U(rowBytes.W))
  }

  def mergePutData(old_data: UInt, new_data: UInt, wmask: UInt): UInt = {
    val full_wmask = FillInterleaved(8, wmask)
    ((~full_wmask & old_data) | (full_wmask & new_data))
  }
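  // example: with wmask = "b01".U (two byte lanes), full_wmask = 0x00ff,
  // so the low byte comes from new_data and the high byte keeps old_data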

  when (state === s_refill_resp) {
    io.mem_grant.ready := true.B
    when (io.mem_grant.fire()) {
      when (edge.hasData(io.mem_grant.bits)) {
        // GrantData
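        // each Grant beat carries beatRows rows of the block;
        // merge the incoming rows with any pending store data at the matching row index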
        for (i <- 0 until beatRows) {
          val idx = (refill_count << log2Floor(beatRows)) + i.U
          refill_data(idx) := mergePutData(io.mem_grant.bits.data(rowBits * (i + 1) - 1, rowBits * i), new_data(idx), new_mask(idx))
        }
      } .otherwise {
        // Grant

        // since we do not sync between MissQueue and WritebackQueue,
        // for an AcquireBlock BtoT we cannot protect our block from being replaced by another miss and written back by WritebackQueue.
        // So for AcquireBlock BtoT, we need L2 to give us GrantData, not Grant,
        // so that whether our block is replaced or not, we can always refill the block with valid data.
        // Therefore, if we get here,
        // this must be an AcquirePerm, not an AcquireBlock!
        assert (full_overwrite)
        // when we only acquire perm, not data
        // use Store's data
        for (i <- 0 until blockRows) {
          refill_data(i) := new_data(i)
        }
      }
    }

    when (refill_done) {
      grantack.valid := edge.isRequest(io.mem_grant.bits)
      grantack.bits := edge.GrantAck(io.mem_grant.bits)
      grant_param := io.mem_grant.bits.param

      state := s_main_pipe_req
    }
  }

  // keep should_refill_data out of the RegNext
  // so that when a load miss is merged at refill_done,
  // we can still refill data back to it
  io.refill.valid := RegNext(state === s_refill_resp && refill_done) && should_refill_data
  io.refill.bits.addr := req.addr
  io.refill.bits.data := refill_data.asUInt

  when (state === s_main_pipe_req) {
    io.pipe_req.valid := true.B
    val pipe_req = io.pipe_req.bits
    pipe_req.miss := true.B
    pipe_req.miss_id := io.id
    pipe_req.miss_param := grant_param

    pipe_req.probe := false.B
    pipe_req.probe_param := DontCare

    pipe_req.source := req.source
    pipe_req.cmd    := req.cmd
    pipe_req.addr   := req.addr
    pipe_req.store_data := refill_data.asUInt
    // full overwrite
    pipe_req.store_mask := Fill(cfg.blockBytes, "b1".U)
    pipe_req.word_idx := req.word_idx
    pipe_req.amo_data   := req.amo_data
    pipe_req.amo_mask   := req.amo_mask
    pipe_req.id     := req.id

    when (io.pipe_req.fire()) {
      state := s_main_pipe_resp
    }
  }

  when (state === s_main_pipe_resp) {
    when (io.pipe_resp.fire()) {
      state := s_mem_finish
    }
  }

  when (state === s_mem_finish) {
    io.mem_finish.valid := grantack.valid
    io.mem_finish.bits  := grantack.bits

    when (io.mem_finish.fire()) {
      grantack.valid := false.B
      state := s_invalid
    }
  }
}


class MissQueue(edge: TLEdgeOut) extends DCacheModule with HasTLDump
{
  val io = IO(new Bundle {
    val req    = Flipped(DecoupledIO(new MissReq))
    val refill = ValidIO(new Refill)

    val mem_acquire = Decoupled(new TLBundleA(edge.bundle))
    val mem_grant   = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
    val mem_finish  = Decoupled(new TLBundleE(edge.bundle))

    val pipe_req  = DecoupledIO(new MainPipeReq)
    val pipe_resp = Flipped(ValidIO(new MainPipeResp))
  })

  val pipe_req_arb = Module(new RRArbiter(new MainPipeReq, cfg.nMissEntries))
  val refill_arb   = Module(new RRArbiter(new Refill, cfg.nMissEntries))

  // dispatch req to MSHR
  val primary_ready  = Wire(Vec(cfg.nMissEntries, Bool()))
  val secondary_ready  = Wire(Vec(cfg.nMissEntries, Bool()))
  val secondary_reject  = Wire(Vec(cfg.nMissEntries, Bool()))

  // try merging with existing reqs
  val merge = secondary_ready.asUInt.orR
  val merge_idx = PriorityEncoder(secondary_ready)
  // some entry says the request cannot be merged
  val reject = secondary_reject.asUInt.orR
  // allocate a new entry for this req
  val allocate = !reject && !merge && primary_ready.asUInt.orR
  val alloc_idx = PriorityEncoder(primary_ready)

  // will this req be accepted
  val accept = (merge || allocate) && !reject
  // if it's accepted, which entry will it enter
  val entry_idx = Mux(allocate, alloc_idx, merge_idx)
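  // dispatch outcome per cycle (derived from the signals above):
  //   reject (or no merge and no free entry) -> req is blocked, io.req.ready stays low
  //   merge && !reject                       -> req enters the matching entry (merge_idx)
  //   allocate (no merge, no reject)         -> req enters a free entry (alloc_idx)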

  // for one block, there should be only one MSHR
  // one block should not stay in multiple MSHRs
  // if a req cannot merge with the existing req for the same block,
  // block it!
  OneHot.checkOneHot(secondary_ready)
  OneHot.checkOneHot(secondary_reject)
  // should not merge and reject at the same time
  OneHot.checkOneHot(Seq(merge, reject))

  io.req.ready := accept
  io.mem_grant.ready := false.B

  val entries = (0 until cfg.nMissEntries) map { i =>
    val entry = Module(new MissEntry(edge))

    entry.io.id := i.U(log2Up(cfg.nMissEntries).W)

    // entry req
    entry.io.req_valid  := (i.U === entry_idx) && accept && io.req.valid
    primary_ready(i)    := entry.io.primary_ready
    secondary_ready(i)  := entry.io.secondary_ready
    secondary_reject(i) := entry.io.secondary_reject
    entry.io.req        := io.req.bits

    // entry refill
    refill_arb.io.in(i).valid := entry.io.refill.valid
    refill_arb.io.in(i).bits  := entry.io.refill.bits

    // pipe_req
    pipe_req_arb.io.in(i)     <> entry.io.pipe_req

    // pipe_resp
    entry.io.pipe_resp.valid  := false.B
    entry.io.pipe_resp.bits   := DontCare
    when (io.pipe_resp.bits.id === i.U) {
      entry.io.pipe_resp <> io.pipe_resp
    }

    entry.io.mem_grant.valid := false.B
    entry.io.mem_grant.bits  := DontCare
    when (io.mem_grant.bits.source === i.U) {
      entry.io.mem_grant <> io.mem_grant
    }
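    // Grant beats are routed by the TileLink source field,
    // which equals the MSHR id used as fromSource in this entry's Acquire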

    /*
    XSPerf(
      "perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10),
      BoolStopWatch(
        start = entry.io.req.fire(), 
        stop = entry.io.resp.fire(),
        startHighPriority = true)
    )
    */

    entry
  }

  io.refill.valid := refill_arb.io.out.valid
  io.refill.bits  := refill_arb.io.out.bits
  refill_arb.io.out.ready := true.B

  // one refill at a time
  OneHot.checkOneHot(refill_arb.io.in.map(r => r.valid))

  TLArbiter.robin(edge, io.mem_acquire, entries.map(_.io.mem_acquire):_*)
  TLArbiter.robin(edge, io.mem_finish,  entries.map(_.io.mem_finish):_*)
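  // round-robin arbitration of the A (Acquire) and E (GrantAck) channels across all MSHRs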

  io.pipe_req <> pipe_req_arb.io.out


  // print all input/output requests for debug purpose

  when (io.req.fire()) {
    io.req.bits.dump()
    // sanity check
    val source = io.req.bits.source
    val cmd = io.req.bits.cmd
    when (source === LOAD_SOURCE.U) {
      assert (cmd === M_XRD)
    }
    when (source === STORE_SOURCE.U) {
      assert (cmd === M_XWR)
    }

    when (source === AMO_SOURCE.U) {
      assert (
        cmd === M_XA_SWAP ||
        cmd === M_XLR     ||
        cmd === M_XSC     ||
        cmd === M_XA_ADD  ||
        cmd === M_XA_XOR  ||
        cmd === M_XA_OR   ||
        cmd === M_XA_AND  ||
        cmd === M_XA_MIN  ||
        cmd === M_XA_MAX  ||
        cmd === M_XA_MINU ||
        cmd === M_XA_MAXU)
    }
    // req addr must be aligned to block boundary
    assert (io.req.bits.addr(blockOffBits - 1, 0) === 0.U)
  }

  when (io.refill.fire()) {
    io.refill.bits.dump()
  }

  when (io.mem_acquire.fire()) {
    XSDebug("mem_acquire ")
    io.mem_acquire.bits.dump
  }

  when (io.mem_grant.fire()) {
    XSDebug("mem_grant ")
    io.mem_grant.bits.dump
  }

  when (io.mem_finish.fire()) {
    XSDebug("mem_finish ")
    io.mem_finish.bits.dump
  }

  XSPerf("dcache_miss", io.req.fire())
}