FMA.scala 8.8 KB
Newer Older
L
Lemover 已提交
1 2
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

17 18
package xiangshan.backend.fu.fpu

X
Xuan Hu 已提交
19
import _root_.utils._
20
import chipsalliance.rocketchip.config.Parameters
21
import chisel3._
22
import chisel3.util._
X
Xuan Hu 已提交
23
import fudian._
X
Xuan Hu 已提交
24
import fudian.utils.Multiplier
25
import utility._
X
Xuan Hu 已提交
26
import xiangshan._
X
Xuan Hu 已提交
27
import xiangshan.backend.rob.RobPtr
X
Xuan Hu 已提交
28
import xiangshan.backend.fu.FuConfig
29

L
LinJiawei 已提交
30

X
Xuan Hu 已提交
31
/** Bundle carrying everything the multiplier pipeline hands to the adder pipeline for a fused
  * multiply-add.
  *
  * @param ftypes floating-point formats to provide a product slot for; `mul_out` gets one
  *               element per entry, in the same order.
  */
class MulToAddIO(val ftypes: Seq[FPU.FType])(implicit p: Parameters) extends XSBundle {
  // One product record per supported format, each sized by that format's expWidth/precision.
  val mul_out = MixedVec(ftypes.map(t => new FMULToFADD(t.expWidth, t.precision)))
  // Third operand, as wide as the widest format in `ftypes`.
  val addend = UInt(ftypes.map(_.len).max.W)
  // Control and bookkeeping that travel with the product.
  val fpCtrl = new FPUCtrlSignals
  val robIdx = new RobPtr
  val pdest = UInt(PhyRegIdxWidth.W)
  val fpWen = Bool()

  // First / last product slot. NOTE(review): the names assume `ftypes` is ordered with the
  // single-precision format first and the double-precision format last - confirm against FPU.ftypes.
  def getFloat = mul_out.head
  def getDouble = mul_out.last
}

X
Xuan Hu 已提交
42 43
/** Floating-point multiplier pipeline.
  *
  * For every format in `FPU.ftypes` it instantiates an FMUL_s1 -> FMUL_s2 -> FMUL_s3 chain, while a
  * single `Multiplier` instance is shared between the formats. Besides the regular `io.out`
  * result it exposes `toAdd`, which forwards the product and the third source operand so an
  * adder pipeline can complete a fused multiply-add.
  *
  * @param cfg    function-unit configuration passed on to [[FPUPipelineModule]].
  * @param mulLat value reported as this pipeline's `latency`.
  *               NOTE(review): the datapath below uses S1Reg/S2Reg explicitly, so values other
  *               than the default 2 look unsupported - confirm before overriding.
  */
class FMUL_pipe(cfg: FuConfig, val mulLat: Int = 2)(implicit p: Parameters)
  extends FPUPipelineModule(cfg)
{
  override def latency: Int = mulLat
  override val dataModule: FPUDataModule = null

  // Rounding mode supplied from outside.
  private val rm = io.frm.get

  // Side channel towards the adder pipeline (product, addend and bookkeeping).
  val toAdd = IO(Output(new MulToAddIO(FPU.ftypes)))

  val robIdx = robIdxVec(0)
  // Control bundle passed through DataHoldBypass, keyed on io.in.fire
  // (presumably held stable after the input handshake - see DataHoldBypass).
  val fpCtrl = DataHoldBypass(io.in.bits.ctrl.fpu.get, io.in.fire)
  val typeTagIn = fpCtrl.typeTagIn

  // One bit per format: set when typeTagIn equals that format's index in FPU.ftypes.
  val typeSel = VecInit(FPU.ftypes.zipWithIndex.map(_._2.U === typeTagIn))

  val src1 = FPU.unbox(io.in.bits.data.src(0), typeTagIn)
  val src2 = FPU.unbox(io.in.bits.data.src(1), typeTagIn)

  // Shared significand multiplier, sized from the last format in FPU.ftypes.
  val multiplier = Module(new Multiplier(FPU.ftypes.last.precision+1, pipeAt = Seq(1)))

  val stages = FPU.ftypes.map{ t =>
    // s1 -> s2 -> s3
    val s1 = Module(new FMUL_s1(t.expWidth, t.precision))
    val s2 = Module(new FMUL_s2(t.expWidth, t.precision))
    val s3 = Module(new FMUL_s3(t.expWidth, t.precision))

    val in1 = src1
    // fmaCmd(1) selects the sign-inverted second operand.
    val in2 = Mux(fpCtrl.fmaCmd(1), invert_sign(src2, t.len), src2)
    s1.io.a := in1
    s1.io.b := in2
    s1.io.rm := rm

    s2.io.in := S1Reg(s1.io.out)
    // Every format's second stage reads the same shared multiplier result.
    s2.io.prod := multiplier.io.result
    s3.io.in := S2Reg(s2.io.out)
    (s1, s2, s3)
  }

  val (s1, s2, s3) = stages.unzip3
  // Per format: (select condition -> significand) pairs for the shared multiplier inputs.
  val (mul_a_sel, mul_b_sel) = s1.zipWithIndex.map{
    case (s, i) =>
      val raw_a = RawFloat.fromUInt(s.io.a, s.expWidth, s.precision)
      val raw_b = RawFloat.fromUInt(s.io.b, s.expWidth, s.precision)
      (
        (typeTagIn === i.U) -> raw_a.sig,
        (typeTagIn === i.U) -> raw_b.sig
      )
  }.unzip
  multiplier.io.a := Mux1H(mul_a_sel)
  multiplier.io.b := Mux1H(mul_b_sel)
  multiplier.io.regEnables(0) := regEnable(1)

  // Format select delayed by two stages so it lines up with the s3 outputs.
  val outSel = S2Reg(S1Reg(typeSel))

  // Everything forwarded to the adder is delayed by the same two stages.
  toAdd.addend := S2Reg(S1Reg(io.in.bits.data.src(2)))
  toAdd.mul_out.zip(s3.map(_.io.to_fadd)).foreach(x => x._1 := x._2)
  toAdd.fpCtrl := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get))
  toAdd.robIdx := robIdxVec(latency)
  toAdd.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest))
  toAdd.fpWen := S2Reg(S1Reg(io.in.bits.ctrl.fpWen.get))
  // Regular multiply result: pick the active format's s3 output and box it.
  io.out.bits.res.data := Mux1H(outSel, s3.zip(FPU.ftypes).map{
    case (mod, t) => FPU.box(mod.io.result, t)
  })
  io.out.bits.res.fflags.get := Mux1H(outSel, s3.map(_.io.fflags))
  io.out.bits.ctrl.robIdx := robIdxVec(latency)
  io.out.bits.ctrl.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest))
  io.out.bits.ctrl.fpu.get := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get))
}

X
Xuan Hu 已提交
112
/** Floating-point adder pipeline that also finishes fused multiply-adds.
  *
  * For every format in `FPU.ftypes` it instantiates an FCMA_ADD_s1 -> FCMA_ADD_s2 chain. When
  * `isFMA` is asserted the first operand is the product delivered on `mulToAdd` and the second
  * operand is `mulToAdd.addend`; otherwise both operands come from `io.in`.
  *
  * @param cfg    function-unit configuration passed on to [[FPUPipelineModule]].
  * @param addLat value reported as this pipeline's `latency`.
  *               NOTE(review): the datapath below uses S1Reg/S2Reg explicitly, so values other
  *               than the default 2 look unsupported - confirm before overriding.
  */
class FADD_pipe(cfg: FuConfig, val addLat: Int = 2)(implicit p: Parameters) extends FPUPipelineModule(cfg) {
  override val dataModule: FPUDataModule = null
  override def latency: Int = addLat

  // Rounding mode supplied from outside.
  private val rm = io.frm.get

  // Product/addend bundle from the multiplier pipeline and the flag that selects it.
  val mulToAdd = IO(Input(new MulToAddIO(FPU.ftypes)))
  val isFMA = IO(Input(Bool()))

  // Operands are unboxed with the type tag from io.in and registered into stage 1.
  val src1 = S1Reg(FPU.unbox(io.in.bits.data.src(0), io.in.bits.ctrl.fpu.get.typeTagIn))
  val src2 = S1Reg(FPU.unbox(
    Mux(isFMA, mulToAdd.addend, io.in.bits.data.src(1)), io.in.bits.ctrl.fpu.get.typeTagIn
  ))

  // Stage-1 copy of the control bundle, taken from the multiplier side for an FMA.
  val fpCtrl = S1Reg(Mux(isFMA, mulToAdd.fpCtrl, io.in.bits.ctrl.fpu.get))
  val typeTagIn = fpCtrl.typeTagIn

  val fma = S1Reg(isFMA)
  val mulProd = S1Reg(mulToAdd.mul_out)

  val stages = FPU.ftypes.zipWithIndex.map{
    case (t, i) =>
      // The adders are instantiated with a 2*precision-wide input significand.
      val s1 = Module(new FCMA_ADD_s1(t.expWidth, 2*t.precision, t.precision))
      val s2 = Module(new FCMA_ADD_s2(t.expWidth, 2*t.precision, t.precision))
      // FMA: use the product for this format; plain add: src1 with `precision` zero bits
      // appended below it.
      val in1 = Mux(fma,
        mulProd(i).fp_prod.asUInt,
        Cat(src1(t.len - 1, 0), 0.U(t.precision.W))
      )
      // fmaCmd(0) selects the sign-inverted second operand; zero-extended the same way as in1.
      val in2 = Cat(
        Mux(fpCtrl.fmaCmd(0), invert_sign(src2, t.len), src2(t.len - 1, 0)),
        0.U(t.precision.W)
      )
      s1.io.a := in1
      s1.io.b := in2
      s1.io.b_inter_valid := fma
      // The multiplier's intermediate flags are only forwarded for an FMA; otherwise all zero.
      s1.io.b_inter_flags := Mux(fma,
        mulProd(i).inter_flags,
        0.U.asTypeOf(s1.io.b_inter_flags)
      )
      s1.io.rm := S1Reg(rm)
      s2.io.in := S2Reg(s1.io.out)
      (s1, s2)
  }

  val (s1, s2) = stages.unzip

  // Format select (computed from the stage-1 type tag) registered to line up with the s2 outputs.
  val outSel = S2Reg(VecInit(FPU.ftypes.zipWithIndex.map(_._2.U === typeTagIn)))
  // Pick the active format's result and box it.
  io.out.bits.res.data := Mux1H(outSel, s2.zip(FPU.ftypes).map{
    case (mod, t) => FPU.box(mod.io.result, t)
  })
  io.out.bits.res.fflags.get := Mux1H(outSel, s2.map(_.io.fflags))
  io.out.bits.ctrl.robIdx := robIdxVec(latency)
  io.out.bits.ctrl.pdest := S2Reg(S1Reg(io.in.bits.ctrl.pdest))
  io.out.bits.ctrl.fpu.get := S2Reg(S1Reg(io.in.bits.ctrl.fpu.get))
}
L
LinJiawei 已提交
167

X
Xuan Hu 已提交
168 169
/** Fused multiply-add unit built from an [[FMUL_pipe]] followed by an [[FADD_pipe]].
  *
  * Incoming uops with `isAddSub` set go straight to the adder; all others enter the multiplier.
  * A multiplier output with `ren3` set is treated as an FMA and is fed into the adder instead of
  * being written back; any other multiplier output is written back directly, with lower
  * priority than the adder's output.
  *
  * @param cfg function-unit configuration passed on to both pipelines and [[FPUSubModule]].
  */
class FMA(cfg: FuConfig)(implicit p: Parameters) extends FPUSubModule(cfg) {
  private val rm = io.frm.get
  override val dataModule = null
  val mul_pipe = Module(new FMUL_pipe(cfg))
  val add_pipe = Module(new FADD_pipe(cfg))


  // Both pipelines share the flush request and the rounding mode.
  mul_pipe.io.flush := io.flush
  mul_pipe.io.frm.get := rm

  add_pipe.io.flush := io.flush
  add_pipe.io.frm.get := rm

  val fpCtrl = io.in.bits.ctrl.fpu.get
  // Bulk-connect first, then override valid below (and io.in.ready further down);
  // the statement order matters because the later connection wins.
  mul_pipe.io.in <> io.in
  mul_pipe.io.in.valid := io.in.valid && !fpCtrl.isAddSub

  // For better timing, we let out.valid be true even if it's flushed.
  val isFMA = mul_pipe.io.out.valid && mul_pipe.io.out.bits.ctrl.fpu.get.ren3
  // However, when sending instructions to add_pipe, we need to determine whether it's flushed.
  val mulFlushed = mul_pipe.io.out.bits.ctrl.robIdx.needFlush(io.flush)
  // Combinational despite the name: an FMA at the multiplier output that is not being flushed.
  val isFMAReg = isFMA && !mulFlushed

  add_pipe.mulToAdd <> mul_pipe.toAdd

  // For FADD, it accepts instructions from io.in and FMUL.
  // When FMUL gives an FMA, FADD accepts this instead of io.in.
  // Since FADD gets FMUL data from add_pipe.mulToAdd, only uop needs Mux.
  add_pipe.io.in.valid := io.in.valid && fpCtrl.isAddSub || isFMAReg
  // Default every input field to zero, then override the fields that are actually used.
  add_pipe.io.in.bits := 0.U.asTypeOf(add_pipe.io.in.bits)
  add_pipe.io.in.bits.data.src := io.in.bits.data.src
  add_pipe.io.in.bits.ctrl.robIdx := Mux(isFMAReg, add_pipe.mulToAdd.robIdx, io.in.bits.ctrl.robIdx)
  add_pipe.io.in.bits.ctrl.pdest := Mux(isFMAReg, add_pipe.mulToAdd.pdest, io.in.bits.ctrl.pdest)
  add_pipe.io.in.bits.ctrl.fpu.get := Mux(isFMAReg, add_pipe.mulToAdd.fpCtrl, io.in.bits.ctrl.fpu.get)
  add_pipe.io.in.bits.ctrl.fpWen.get := Mux(isFMAReg, add_pipe.mulToAdd.fpWen, io.in.bits.ctrl.fpWen.get)
  add_pipe.isFMA := isFMAReg

  // When the in uop is Add/Sub, we check FADD, otherwise fmul is checked.
  // An Add/Sub is also held back while an FMA from FMUL is entering FADD in the same cycle.
  io.in.ready := Mux(fpCtrl.isAddSub,
    !isFMAReg && add_pipe.io.in.ready,
    mul_pipe.io.in.ready
  )

  // For FMUL's output:
  // (1) An FMA leaving FMUL must always be accepted by FADD (it is never blocked);
  //     the XSError below checks this.
  // (2) A plain FMUL result has lower writeback arbitration priority than FADD and may be
  //     blocked while FADD.out.valid.
  XSError(isFMA && !add_pipe.io.in.ready, "FMA should not be blocked\n")
  mul_pipe.io.out.ready := isFMA || (io.out.ready && !add_pipe.io.out.valid)
  add_pipe.io.out.ready := io.out.ready

  // Writeback: every output field comes from FADD when it has a valid result, else from FMUL.
  io.out.bits.ctrl.robIdx := Mux(add_pipe.io.out.valid,
    add_pipe.io.out.bits.ctrl.robIdx,
    mul_pipe.io.out.bits.ctrl.robIdx
  )
  io.out.bits.ctrl.fpu.get := Mux(add_pipe.io.out.valid,
    add_pipe.io.out.bits.ctrl.fpu.get,
    mul_pipe.io.out.bits.ctrl.fpu.get
  )
  io.out.bits.ctrl.pdest := Mux(add_pipe.io.out.valid,
    add_pipe.io.out.bits.ctrl.pdest,
    mul_pipe.io.out.bits.ctrl.pdest
  )
  io.out.bits.res.data := Mux(add_pipe.io.out.valid,
    add_pipe.io.out.bits.res.data,
    mul_pipe.io.out.bits.res.data
  )
  io.out.bits.res.fflags.get := Mux(add_pipe.io.out.valid,
    add_pipe.io.out.bits.res.fflags.get,
    mul_pipe.io.out.bits.res.fflags.get
  )
  // An FMA at the FMUL output is not a writeback; it continues into FADD instead.
  io.out.valid := add_pipe.io.out.valid || (mul_pipe.io.out.valid && !isFMA)
  io.out.bits.ctrl.fpWen.get := Mux(add_pipe.io.out.valid,
    add_pipe.io.out.bits.ctrl.fpWen.get,
    mul_pipe.io.out.bits.ctrl.fpWen.get
  )
}