未验证 提交 88825c5c 编写于 作者: Y Yinan Xu 提交者: GitHub

backend: support instruction fusion cases (#1011)

This commit adds some simple instruction fusion cases in decode stage.
Currently we only implement instruction pairs that can be fused into
RV64GCB instructions.

Instruction fusions are detected in the decode stage by FusionDecoder.
The decoder checks every two instructions and marks the first
instruction fused if they can be fused into one instruction. The second
instruction is removed by setting the valid field to false.

Simple fusion cases include sh1add, sh2add, sh3add, sexth, zexth, etc.

Currently, ftq in frontend needs every instruction to commit. However,
the second instruction is removed from the pipeline and will not commit.
To solve this issue, we temporarily add more bits to isFused to indicate
the offset diff of the two fused instructions. There are four
possibilities now. This feature may be removed later.

This commit also adds more instruction fusion cases that need changes
in both the decode stage and the function units. In this commit, we add
some opcode to the function units and fuse the new instruction pairs
into these new internal uops.

The list of opcodes we add in this commit is shown below:
- szewl1: `slli r1, r0, 32` + `srli r1, r1, 31`
- szewl2: `slli r1, r0, 32` + `srli r1, r1, 30`
- byte2: `srli r1, r0, 8` + `andi r1, r1, 255`
- sh4add: `slli r1, r0, 4` + `add r1, r1, r2`
- sr30add: `srli r1, r0, 30` + `add r1, r1, r2`
- sr31add: `srli r1, r0, 31` + `add r1, r1, r2`
- sr32add: `srli r1, r0, 32` + `add r1, r1, r2`
- oddadd: `andi r1, r0, 1` + `add r1, r1, r2`
- oddaddw: `andi r1, r0, 1` + `addw r1, r1, r2`
- orh48: mask off the lowest 8 bits and or with another operand
         (`andi r1, r0, -256` + `or r1, r1, r2`)

Furthermore, this commit adds some complex instruction fusion cases to
the decode stage and function units. The complex instruction fusion cases
are detected after the instructions are decoded into uop and their
CtrlSignals are used for instruction fusion detection.

We add the following complex instruction fusion cases:
- addwbyte: addw and mask it with 0xff (extract the first byte)
- addwbit: addw and mask it with 0x1 (extract the first bit)
- logiclsb: logic operation and mask it with 0x1 (extract the first bit)
- mulw7: andi 127 and mulw instructions.
        Input to mul is AND with 0x7f if mulw7 bit is set to true.
上级 fa086d5e
Subproject commit a85c72dbd7f8988933d0827508dacfe9580f6c83
Subproject commit b27a21e576f826839233d7a43a645b3e383a02f7
......@@ -175,4 +175,5 @@ class MinimalSimConfig(n: Int = 1) extends Config(
useFakeL3Cache = true
)
})
)
\ No newline at end of file
)
......@@ -35,6 +35,7 @@ import utils._
import scala.math.max
import Chisel.experimental.chiselName
import chipsalliance.rocketchip.config.Parameters
import chisel3.util.BitPat.bitPatToUInt
import xiangshan.frontend.Ftq_Redirect_SRAMEntry
class ValidUndirectioned[T <: Data](gen: T) extends Bundle {
......@@ -152,16 +153,22 @@ class CtrlSignals(implicit p: Parameters) extends XSBundle {
val fpu = new FPUCtrlSignals
val isMove = Bool()
val singleStep = Bool()
val isFused = UInt(3.W)
def decode(inst: UInt, table: Iterable[(BitPat, List[BitPat])]) = {
private def allSignals = srcType ++ Seq(fuType, fuOpType, rfWen, fpWen,
isXSTrap, noSpecExec, blockBackward, flushPipe, isRVF, selImm)
def decode(inst: UInt, table: Iterable[(BitPat, List[BitPat])]): CtrlSignals = {
val decoder = freechips.rocketchip.rocket.DecodeLogic(inst, XDecode.decodeDefault, table)
val signals =
Seq(srcType(0), srcType(1), srcType(2), fuType, fuOpType, rfWen, fpWen,
isXSTrap, noSpecExec, blockBackward, flushPipe, isRVF, selImm)
signals zip decoder map { case (s, d) => s := d }
allSignals zip decoder foreach { case (s, d) => s := d }
commitType := DontCare
this
}
def decode(bit: List[BitPat]): CtrlSignals = {
allSignals.zip(bit.map(bitPatToUInt(_))).foreach{ case (s, d) => s := d }
this
}
}
class CfCtrl(implicit p: Parameters) extends XSBundle {
......@@ -300,6 +307,7 @@ class RoqCommitInfo(implicit p: Parameters) extends XSBundle {
val old_pdest = UInt(PhyRegIdxWidth.W)
val ftqIdx = new FtqPtr
val ftqOffset = UInt(log2Up(PredictWidth).W)
val isFused = UInt(3.W)
// these should be optimized for synthesis verilog
val pc = UInt(VAddrBits.W)
......
......@@ -57,6 +57,7 @@ class DecodeStage(implicit p: Parameters) extends XSModule {
io.out(i).bits := decoders(i).io.deq.cf_ctrl
io.in(i).ready := io.out(i).ready
}
for (i <- 0 until StorePipelineWidth) {
waittable.io.update(i) <> RegNext(io.memPredUpdate(i))
}
......@@ -64,6 +65,38 @@ class DecodeStage(implicit p: Parameters) extends XSModule {
ssit.io.update <> RegNext(io.memPredUpdate(0))
ssit.io.csrCtrl <> io.csrCtrl
// instruction fusion
val fusionDecoder = Module(new FusionDecoder())
fusionDecoder.io.in.zip(io.in).foreach{ case (d, in) =>
// TODO: instructions with exceptions should not be considered fusion
d.valid := in.valid
d.bits := in.bits.instr
}
fusionDecoder.io.dec := decoders.map(_.io.deq.cf_ctrl.ctrl)
fusionDecoder.io.out.zip(io.out.dropRight(1)).zipWithIndex.foreach{ case ((d, out), i) =>
d.ready := out.ready
when (d.valid) {
out.bits.ctrl := d.bits
// TODO: remove this
// Dirty code for ftq update
val sameFtqPtr = out.bits.cf.ftqPtr.value === io.out(i + 1).bits.cf.ftqPtr.value
val ftqOffset0 = out.bits.cf.ftqOffset
val ftqOffset1 = io.out(i + 1).bits.cf.ftqOffset
val ftqOffsetDiff = ftqOffset1 - ftqOffset0
val cond1 = sameFtqPtr && ftqOffsetDiff === 1.U
val cond2 = sameFtqPtr && ftqOffsetDiff === 2.U
val cond3 = !sameFtqPtr && ftqOffset1 === 0.U
val cond4 = !sameFtqPtr && ftqOffset1 === 1.U
out.bits.ctrl.isFused := Mux(cond1, 1.U, Mux(cond2, 2.U, Mux(cond3, 3.U, 4.U)))
XSError(!cond1 && !cond2 && !cond3 && !cond4, p"new condition $sameFtqPtr $ftqOffset0 $ftqOffset1\n")
}
}
fusionDecoder.io.clear.zip(io.out.map(_.valid)).foreach{ case (c, v) =>
when (c) {
v := false.B
}
}
val loadWaitBitSet = PopCount(io.out.map(o => o.fire() && o.bits.cf.loadWaitBit))
XSPerfAccumulate("loadWaitBitSet", loadWaitBitSet)
val storeSetHit = PopCount(io.out.map(o => o.fire() && o.bits.cf.storeSetHit))
......
......@@ -492,13 +492,15 @@ class DecodeUnit(implicit p: Parameters) extends XSModule with DecodeUnitConstan
ctrl_flow := io.enq.ctrl_flow
var decode_table = XDecode.table ++ FDecode.table ++ FDivSqrtDecode.table ++ X64Decode.table ++ XSTrapDecode.table ++ BDecode.table
val decode_table = XDecode.table ++ FDecode.table ++ FDivSqrtDecode.table ++ X64Decode.table ++ XSTrapDecode.table ++ BDecode.table
// output
cf_ctrl.cf := ctrl_flow
val cs = Wire(new CtrlSignals()).decode(ctrl_flow.instr, decode_table)
cs.singleStep := false.B
cs.isFused := 0.U
val fpDecoder = Module(new FPDecoder)
fpDecoder.io.instr := ctrl_flow.instr
cs.fpu := fpDecoder.io.fpCtrl
......
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.backend.decode
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util.BitPat.bitPatToUInt
import chisel3.util._
import xiangshan._
import utils._
// Common base of every instruction fusion pattern.
//
// `pair` holds two adjacent instructions (raw encodings plus valid bits).
// `csPair` optionally holds their CtrlSignals for the fusion cases that match on
// decoded fields instead of raw encodings; the base class itself does not read it.
// A subclass states when the pair can be fused (`isValid`), which CtrlSignals the
// fused uop carries (`target`) and a name used for performance counters (`fusionName`).
abstract class BaseFusionCase(pair: Seq[Valid[UInt]], csPair: Option[Seq[CtrlSignals]] = None)(implicit p: Parameters)
  extends DecodeUnitConstants {
  require(pair.length == 2, s"a fusion case needs exactly two instructions, got ${pair.length}")

  // raw encodings of the first (index 0) and second (index 1) instruction
  protected def instr: Seq[UInt] = pair.map(_.bits)
  // true when both instructions of the pair are valid
  protected def pairValid: Bool = VecInit(pair.map(_.valid)).asUInt().andR()

  // register fields of the first instruction
  protected def instr1Rs1: UInt = instr(0)(RS1_MSB, RS1_LSB)
  protected def instr1Rs2: UInt = instr(0)(RS2_MSB, RS2_LSB)
  protected def instr1Rd: UInt = instr(0)(RD_MSB, RD_LSB)
  // register fields of the second instruction
  protected def instr2Rs1: UInt = instr(1)(RS1_MSB, RS1_LSB)
  protected def instr2Rs2: UInt = instr(1)(RS2_MSB, RS2_LSB)
  protected def instr2Rd: UInt = instr(1)(RD_MSB, RD_LSB)

  // both instructions write the same destination register
  protected def withSameDest: Bool = instr1Rd === instr2Rd
  // the second instruction reads the first one's destination as rs1 / rs2
  protected def destToRs1: Bool = instr1Rd === instr2Rs1
  protected def destToRs2: Bool = instr1Rd === instr2Rs2

  // Builds the CtrlSignals of the instruction matching `pat`, looked up in the
  // XDecode, X64Decode and BDecode tables. Fails at elaboration time with a
  // descriptive message when `pat` has no entry in these tables.
  protected def getBaseCS(pat: BitPat): CtrlSignals = {
    val allDecodeTable = XDecode.table ++ X64Decode.table ++ BDecode.table
    val baseTable = allDecodeTable
      .find(_._1 == pat)
      .map(_._2)
      .getOrElse(throw new IllegalArgumentException(s"no decode table entry for fusion base $pat"))
    val cs = Wire(new CtrlSignals)
    // fields not driven by the decode table are left as don't-care
    cs := DontCare
    cs.decode(baseTable)
    // For simple instruction fusions, we assume their destination registers are the same.
    cs.ldest := instr1Rd
    cs
  }

  // whether the instruction pair matches this fusion pattern
  def isValid: Bool
  // CtrlSignals of the fused uop
  // TODO: optimize timing
  def target: CtrlSignals
  // clear the next instruction
  // def needClear: Boolean = true
  // name of this fusion case, used in performance counter names
  def fusionName: String
}
// Case: keep the lower 32 bits and clear the upper 32 bits
// Source: `slli r1, r0, 32` + `srli r1, r1, 32`
// Target: `add.uw r1, r0, zero` (pseudo instruction: `zext.w r1, r0`)
class FusedAdduw(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 32
  def inst1Cond = {
    val first = instr(0)
    first === Instructions.SLLI && first(25, 20) === 32.U
  }

  // second instruction: SRLI whose shamt field equals 32
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.SRLI && second(25, 20) === 32.U
  }

  // both encodings match and the second instruction shifts the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    val fused = getBaseCS(Instructions.ADDU_W)
    fused.lsrc(0) := instr1Rs1
    // second source is register 0
    fused.lsrc(1) := 0.U
    fused
  }

  def fusionName: String = "slli32_srli32"
}
// Case: keep the lower 16 bits and clear the upper 48 bits
// Source: `slli r1, r0, 48` + `srli r1, r1, 48`
// Target: `zext.h r1, r0`
class FusedZexth(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 48
  def inst1Cond = {
    val first = instr(0)
    first === Instructions.SLLI && first(25, 20) === 48.U
  }

  // second instruction: SRLI whose shamt field equals 48
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.SRLI && second(25, 20) === 48.U
  }

  // both encodings match and the second instruction shifts the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    val fused = getBaseCS(Instructions.ZEXT_H)
    fused.lsrc(0) := instr1Rs1
    fused
  }

  def fusionName: String = "slli48_srli48"
}
// Alternative zext.h pattern built from word shifts
// Source: `slliw r1, r0, 16` + `srliw r1, r1, 16`
// Target: `zext.h r1, r0`
class FusedZexth1(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends FusedZexth(pair) {
  // first instruction: SLLIW whose shamt field (bits 24:20) equals 16
  override def inst1Cond: Bool = {
    val first = instr(0)
    first === Instructions.SLLIW && first(24, 20) === 16.U
  }

  // second instruction: SRLIW whose shamt field equals 16
  override def inst2Cond: Bool = {
    val second = instr(1)
    second === Instructions.SRLIW && second(24, 20) === 16.U
  }

  override def fusionName: String = "slliw16_srliw16"
}
// Case: sign-extend the lower 16 bits
// Source: `slliw r1, r0, 16` + `sraiw r1, r1, 16`
// Target: `sext.h r1, r0`
class FusedSexth(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLIW whose shamt field (bits 24:20) equals 16
  def inst1Cond = {
    val first = instr(0)
    first === Instructions.SLLIW && first(24, 20) === 16.U
  }

  // second instruction: SRAIW whose shamt field equals 16
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.SRAIW && second(24, 20) === 16.U
  }

  // both encodings match and the second instruction shifts the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    val fused = getBaseCS(Instructions.SEXT_H)
    fused.lsrc(0) := instr1Rs1
    fused
  }

  def fusionName: String = "slliw16_sraiw16"
}
// Case: shift left by one and add
// Source: `slli r1, r0, 1` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sh1add r1, r0, r2`
class FusedSh1add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 1
  def inst1Cond = instr(0) === Instructions.SLLI && instr(0)(25, 20) === 1.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    val cs = getBaseCS(Instructions.SH1ADD)
    // the shifted operand comes from the first instruction's source
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "slli1_add"
}
// Case: shift left by two and add
// Source: `slli r1, r0, 2` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sh2add r1, r0, r2`
class FusedSh2add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 2
  def inst1Cond = instr(0) === Instructions.SLLI && instr(0)(25, 20) === 2.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    val cs = getBaseCS(Instructions.SH2ADD)
    // the shifted operand comes from the first instruction's source
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "slli2_add"
}
// Case: shift left by three and add
// Source: `slli r1, r0, 3` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sh3add r1, r0, r2`
class FusedSh3add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 3
  def inst1Cond = instr(0) === Instructions.SLLI && instr(0)(25, 20) === 3.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    val cs = getBaseCS(Instructions.SH3ADD)
    // the shifted operand comes from the first instruction's source
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "slli3_add"
}
// Case: zero-extend the lower word and shift it left by one
// Source: `slli r1, r0, 32` + `srli r1, r1, 31`
// Target: `szewl1 r1, r0` (customized internal opcode)
class FusedSzewl1(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 32
  def inst1Cond = {
    val first = instr(0)
    first === Instructions.SLLI && first(25, 20) === 32.U
  }

  // second instruction: SRLI whose shamt field equals 31
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.SRLI && second(25, 20) === 31.U
  }

  // both encodings match and the second instruction shifts the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    // start from the zext.h control signals, then switch the operation to szewl1
    val fused = getBaseCS(Instructions.ZEXT_H)
    fused.fuOpType := ALUOpType.szewl1
    fused.lsrc(0) := instr1Rs1
    fused
  }

  def fusionName: String = "slli32_srli31"
}
// Case: zero-extend the lower word and shift it left by two
// Source: `slli r1, r0, 32` + `srli r1, r1, 30`
// Target: `szewl2 r1, r0` (customized internal opcode)
class FusedSzewl2(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 32
  def inst1Cond = {
    val first = instr(0)
    first === Instructions.SLLI && first(25, 20) === 32.U
  }

  // second instruction: SRLI whose shamt field equals 30
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.SRLI && second(25, 20) === 30.U
  }

  // both encodings match and the second instruction shifts the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    // start from the zext.h control signals, then switch the operation to szewl2
    val fused = getBaseCS(Instructions.ZEXT_H)
    fused.fuOpType := ALUOpType.szewl2
    fused.lsrc(0) := instr1Rs1
    fused
  }

  def fusionName: String = "slli32_srli30"
}
// Case: extract the second byte (bits 15:8)
// Source: `srli r1, r0, 8` + `andi r1, r1, 255`
// Target: `byte2 r1, r0` (customized internal opcode)
class FusedByte2(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SRLI whose shamt field (bits 25:20) equals 8
  def inst1Cond = {
    val first = instr(0)
    first === Instructions.SRLI && first(25, 20) === 8.U
  }

  // second instruction: ANDI whose immediate field (bits 31:20) equals 255
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.ANDI && second(31, 20) === 255.U
  }

  // both encodings match and the second instruction masks the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    // start from the zext.h control signals, then switch the operation to byte2
    val fused = getBaseCS(Instructions.ZEXT_H)
    fused.fuOpType := ALUOpType.byte2
    fused.lsrc(0) := instr1Rs1
    fused
  }

  def fusionName: String = "srli8_andi255"
}
// Case: shift left by four and add
// Source: `slli r1, r0, 4` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sh4add r1, r0, r2` (customized internal opcode)
class FusedSh4add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SLLI whose shamt field (bits 25:20) equals 4
  def inst1Cond = instr(0) === Instructions.SLLI && instr(0)(25, 20) === 4.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // start from the sh3add control signals
    val cs = getBaseCS(Instructions.SH3ADD)
    // replace the fuOpType with sh4add
    cs.fuOpType := ALUOpType.sh4add
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "slli4_add"
}
// Case: shift right by 30 and add
// Source: `srli r1, r0, 30` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sr30add r1, r0, r2` (customized internal opcode)
class FusedSr30add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SRLI whose shamt field (bits 25:20) equals 30
  def inst1Cond = instr(0) === Instructions.SRLI && instr(0)(25, 20) === 30.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // start from the sh3add control signals
    val cs = getBaseCS(Instructions.SH3ADD)
    // replace the fuOpType with sr30add
    cs.fuOpType := ALUOpType.sr30add
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "srli30_add"
}
// Case: shift right by 31 and add
// Source: `srli r1, r0, 31` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sr31add r1, r0, r2` (customized internal opcode)
class FusedSr31add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SRLI whose shamt field (bits 25:20) equals 31
  def inst1Cond = instr(0) === Instructions.SRLI && instr(0)(25, 20) === 31.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // start from the sh3add control signals
    val cs = getBaseCS(Instructions.SH3ADD)
    // replace the fuOpType with sr31add
    cs.fuOpType := ALUOpType.sr31add
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "srli31_add"
}
// Case: shift right by 32 and add
// Source: `srli r1, r0, 32` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `sr32add r1, r0, r2` (customized internal opcode)
class FusedSr32add(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: SRLI whose shamt field (bits 25:20) equals 32
  def inst1Cond = instr(0) === Instructions.SRLI && instr(0)(25, 20) === 32.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the shifted value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // start from the sh3add control signals
    val cs = getBaseCS(Instructions.SH3ADD)
    // replace the fuOpType with sr32add
    cs.fuOpType := ALUOpType.sr32add
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "srli32_add"
}
// Case: add one if odd, otherwise unchanged
// Source: `andi r1, r0, 1` + `add r1, r1, r2` (or `add r1, r2, r1`)
// Target: `oddadd r1, r0, r2` (customized internal opcode)
class FusedOddadd(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: ANDI whose immediate field (bits 31:20) equals 1
  def inst1Cond = instr(0) === Instructions.ANDI && instr(0)(31, 20) === 1.U
  def inst2Cond = instr(1) === Instructions.ADD
  // Exactly one source of the second instruction may be the first one's result.
  // With `add r1, r1, r1` both sources need the masked value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // start from the sh3add control signals
    val cs = getBaseCS(Instructions.SH3ADD)
    // replace the fuOpType with oddadd
    cs.fuOpType := ALUOpType.oddadd
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADD is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "andi1_add"
}
// Case: add one if odd (in word format), otherwise unchanged
// Source: `andi r1, r0, 1` + `addw r1, r1, r2` (or `addw r1, r2, r1`)
// Target: `oddaddw r1, r0, r2` (customized internal opcode)
class FusedOddaddw(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: ANDI whose immediate field (bits 31:20) equals 1
  def inst1Cond = instr(0) === Instructions.ANDI && instr(0)(31, 20) === 1.U
  def inst2Cond = instr(1) === Instructions.ADDW
  // Exactly one source of the second instruction may be the first one's result.
  // With `addw r1, r1, r1` both sources need the masked value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // start from the sh3add control signals
    val cs = getBaseCS(Instructions.SH3ADD)
    // replace the fuOpType with oddaddw
    cs.fuOpType := ALUOpType.oddaddw
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the ADDW is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "andi1_addw"
}
// Case: addw and extract its lower 8 bits (fused into addwbyte)
// Source: an ALU uop for which `ALUOpType.isAddw` holds, writing r1, + `andi r1, r1, 255`
// Target: the first uop with its fuOpType replaced by addwbyte
class FusedAddwbyte(pair: Seq[Valid[UInt]], csPair: Option[Seq[CtrlSignals]])(implicit p: Parameters)
  extends BaseFusionCase(pair, csPair) {
  // this case matches on decoded control signals, so they must be supplied
  require(csPair.isDefined, "addw fusion cases need the CtrlSignals of the instruction pair")
  // the first instruction is a addw
  def inst1Cond = csPair.get(0).fuType === FuType.alu && ALUOpType.isAddw(csPair.get(0).fuOpType)
  // second instruction: ANDI whose immediate field (bits 31:20) equals 0xff
  def inst2Cond = instr(1) === Instructions.ANDI && instr(1)(31, 20) === 0xff.U
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && destToRs1
  def target: CtrlSignals = {
    // keep every control signal of the first instruction
    val cs = WireInit(csPair.get(0))
    // replace the fuOpType with addwbyte
    cs.fuOpType := ALUOpType.addwbyte
    cs
  }
  // name fixed from "andw_andi255": the fused pair starts with addw, not andw
  def fusionName: String = "addw_andi255"
}
// Case: addw and extract its lower 1 bit (fused into addwbit)
// Same first-instruction condition as FusedAddwbyte; the second instruction is `andi r1, r1, 1`
class FusedAddwbit(pair: Seq[Valid[UInt]], csPair: Option[Seq[CtrlSignals]])(implicit p: Parameters)
  extends FusedAddwbyte(pair, csPair) {
  // second instruction: ANDI whose immediate field (bits 31:20) equals 1
  override def inst2Cond = instr(1) === Instructions.ANDI && instr(1)(31, 20) === 0x1.U
  override def target: CtrlSignals = {
    // keep every control signal of the first instruction
    val cs = WireInit(csPair.get(0))
    // replace the fuOpType with addwbit
    cs.fuOpType := ALUOpType.addwbit
    cs
  }
  // name fixed from "andw_andi1": the fused pair starts with addw, not andw
  override def fusionName: String = "addw_andi1"
}
// Case: logic operation followed by extraction of its least significant bit
// Source: an ALU uop for which `ALUOpType.isLogic` holds, writing r1, + `andi r1, r1, 1`
// Target: the first uop with its fuOpType mapped through `ALUOpType.logicToLSB`
class FusedLogiclsb(pair: Seq[Valid[UInt]], csPair: Option[Seq[CtrlSignals]])(implicit p: Parameters)
  extends BaseFusionCase(pair, csPair) {
  // this case matches on decoded control signals, so they must be supplied
  require(csPair.isDefined)

  // first instruction: an ALU logic operation
  def inst1Cond = {
    val firstCs = csPair.get(0)
    firstCs.fuType === FuType.alu && ALUOpType.isLogic(firstCs.fuOpType)
  }

  // second instruction: ANDI whose immediate field (bits 31:20) equals 1
  def inst2Cond = {
    val second = instr(1)
    second === Instructions.ANDI && second(31, 20) === 1.U
  }

  // both conditions hold and the second instruction masks the first one's result in place
  def isValid: Bool = Seq(inst1Cond, inst2Cond, withSameDest, destToRs1).reduce(_ && _)

  def target: CtrlSignals = {
    val firstCs = csPair.get(0)
    // keep every control signal of the first instruction, only switch to the lsb opType
    val fused = WireInit(firstCs)
    fused.fuOpType := ALUOpType.logicToLSB(firstCs.fuOpType)
    fused
  }

  def fusionName: String = "logic_andi1"
}
// Case: OR(Cat(src1(63, 8), 0.U(8.W)), src2)
// Source: `andi r1, r0, -256` + `or r1, r1, r2` (or `or r1, r2, r1`)
// Target: `orh48 r1, r0, r2` (customized internal opcode)
class FusedOrh48(pair: Seq[Valid[UInt]])(implicit p: Parameters) extends BaseFusionCase(pair) {
  // first instruction: ANDI whose immediate field (bits 31:20) equals 0xf00, i.e. -256
  def inst1Cond = instr(0) === Instructions.ANDI && instr(0)(31, 20) === 0xf00.U
  def inst2Cond = instr(1) === Instructions.OR
  // Exactly one source of the second instruction may be the first one's result.
  // With `or r1, r1, r1` both sources need the masked value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    val cs = getBaseCS(Instructions.OR)
    // replace the fuOpType with orh48
    cs.fuOpType := ALUOpType.orh48
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the OR is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "andi_f00_or"
}
// Case: mul 7bit data with 32-bit data
// Source: `andi r1, r0, 127` + `mulw r1, r1, r2` (or `mulw r1, r2, r1`)
// Target: `mulw7 r1, r0, r2`
class FusedMulw7(pair: Seq[Valid[UInt]], csPair: Option[Seq[CtrlSignals]])(implicit p: Parameters)
  extends BaseFusionCase(pair, csPair) {
  // the fused uop is derived from the second instruction's control signals
  require(csPair.isDefined, "FusedMulw7 needs the CtrlSignals of the instruction pair")
  // first instruction: ANDI whose immediate field (bits 31:20) equals 127
  def inst1Cond = instr(0) === Instructions.ANDI && instr(0)(31, 20) === 127.U
  def inst2Cond = instr(1) === Instructions.MULW
  // Exactly one source of the second instruction may be the first one's result.
  // With `mulw r1, r1, r1` both sources need the masked value, but the fused uop
  // would read the stale r1 as its second operand, so that pair must not be fused.
  def isValid: Bool = inst1Cond && inst2Cond && withSameDest && (destToRs1 ^ destToRs2)
  def target: CtrlSignals = {
    // use MULW as the base
    val cs = WireInit(csPair.get(1))
    // replace the fuOpType with mulw7
    cs.fuOpType := MDUOpType.mulw7
    // the operand to be masked comes from the first instruction's source
    cs.lsrc(0) := instr1Rs1
    // the other operand is whichever source of the MULW is not the intermediate result
    cs.lsrc(1) := Mux(destToRs1, instr2Rs2, instr2Rs1)
    cs
  }
  def fusionName: String = "andi127_mulw"
}
// Detects instruction fusions between every two adjacent decode slots.
//
// For each pair (i, i + 1), all fusion cases are checked in parallel. When one matches
// and slot i has not itself been cleared by the previous pair, `out(i)` carries the
// CtrlSignals of the fused uop and `clear(i + 1)` is raised for the second instruction.
class FusionDecoder(implicit p: Parameters) extends XSModule {
  val io = IO(new Bundle {
    // detect instruction fusions in these instructions
    val in = Vec(DecodeWidth, Flipped(ValidIO(UInt(32.W))))
    val dec = Vec(DecodeWidth, Input(new CtrlSignals()))
    // whether an instruction fusion is found
    val out = Vec(DecodeWidth - 1, DecoupledIO(new CtrlSignals))
    // fused instruction needs to be cleared
    val clear = Vec(DecodeWidth, Output(Bool()))
  })

  // the first instruction has no predecessor and is never cleared
  io.clear.head := false.B

  // adjacent (i, i + 1) pairs of raw instructions and of their CtrlSignals
  val instrPairs = io.in.dropRight(1).zip(io.in.drop(1)).map(x => Seq(x._1, x._2))
  val csPairs = io.dec.dropRight(1).zip(io.dec.drop(1)).map(x => Seq(x._1, x._2))
  instrPairs.zip(csPairs).zip(io.out).zipWithIndex.foreach{ case (((pair, cs), out), i) =>
    val fusionList = Seq(
      new FusedAdduw(pair),
      new FusedZexth(pair),
      new FusedZexth1(pair),
      new FusedSexth(pair),
      new FusedSh1add(pair),
      new FusedSh2add(pair),
      new FusedSh3add(pair),
      new FusedSzewl1(pair),
      new FusedSzewl2(pair),
      new FusedByte2(pair),
      new FusedSh4add(pair),
      new FusedSr30add(pair),
      new FusedSr31add(pair),
      new FusedSr32add(pair),
      new FusedOddadd(pair),
      new FusedOddaddw(pair),
      new FusedAddwbyte(pair, Some(cs)),
      new FusedAddwbit(pair, Some(cs)),
      new FusedLogiclsb(pair, Some(cs)),
      new FusedOrh48(pair),
      new FusedMulw7(pair, Some(cs))
    )
    val pairValid = VecInit(pair.map(_.valid)).asUInt().andR
    // slot i was already consumed as the second half of the previous pair
    val thisCleared = io.clear(i)
    val fusionVec = VecInit(fusionList.map(_.isValid))
    val anyFusion = fusionVec.asUInt().orR()
    // the pair may be fused only when its first instruction is still present
    val canFuse = pairValid && !thisCleared
    out.valid := canFuse && anyFusion
    XSError(PopCount(fusionVec) > 1.U, "more than one fusion matched\n")
    // fusionVec is expected to be one-hot (checked by the XSError above)
    out.bits := Mux1H(fusionVec, fusionList.map(_.target))
    // TODO: assume every instruction fusion clears the second instruction now
    io.clear(i + 1) := out.valid
    // one counter per fusion case and decode slot
    fusionList.zip(fusionVec).foreach { case (f, v) =>
      XSPerfAccumulate(s"case_${f.fusionName}_$i", canFuse && v && out.ready)
    }
    // a fusion pattern matched but its first instruction was already cleared
    XSPerfAccumulate(s"conflict_fusion_$i", pairValid && thisCleared && anyFusion && out.ready)
  }
  XSPerfAccumulate("fused_instr", PopCount(io.out.map(_.fire)))
}
......@@ -54,6 +54,9 @@ class MulDivExeUnit(implicit p: Parameters) extends ExeUnit(MulDivExeUnitCfg) {
op,
mulInputFuncTable.map(p => (p._1(1, 0), p._2._1(src1)))
)
when (func(3)) {
mul.io.in.bits.src(0) := src1(6, 0)
}
mul.io.in.bits.src(1) := LookupTree(
op,
mulInputFuncTable.map(p => (p._1(1, 0), p._2._2(src2)))
......
......@@ -19,7 +19,7 @@ package xiangshan.backend.fu
import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils.{LookupTreeDefault, LookupTree, ParallelMux, SignExt, ZeroExt}
import utils.{LookupTree, LookupTreeDefault, ParallelMux, SignExt, ZeroExt}
import xiangshan._
class AddModule(implicit p: Parameters) extends XSModule {
......@@ -30,6 +30,7 @@ class AddModule(implicit p: Parameters) extends XSModule {
val addw = Output(UInt((XLEN/2).W))
})
io.add := io.src(0) + io.src(1)
// TODO: why this extra adder?
io.addw := io.srcw + io.src(1)(31,0)
}
......@@ -95,26 +96,35 @@ class RightShiftWordModule(implicit p: Parameters) extends XSModule {
class MiscResultSelect(implicit p: Parameters) extends XSModule {
val io = IO(new Bundle() {
val func = Input(UInt())
val andn, orn, xnor, and, or, xor, sextb, sexth, zexth, rev8, orcb = Input(UInt(XLEN.W))
val func = Input(UInt(5.W))
val andn, orn, xnor, and, or, xor, orh48, sextb, sexth, zexth, rev8, orcb = Input(UInt(XLEN.W))
val src = Input(UInt(XLEN.W))
val miscRes = Output(UInt(XLEN.W))
})
val logicResSel = ParallelMux(List(
ALUOpType.andn -> io.andn,
ALUOpType.and -> io.and,
ALUOpType.orn -> io.orn,
ALUOpType.or -> io.or,
ALUOpType.xnor -> io.xnor,
ALUOpType.xor -> io.xor,
ALUOpType.orh48 -> io.orh48
).map(x => (x._1(2, 0) === io.func(2, 0), x._2)))
val maskedLogicRes = Cat(Fill(63, ~io.func(3)), 1.U(1.W)) & logicResSel
val miscRes = ParallelMux(List(
ALUOpType.andn -> io.andn,
ALUOpType.and -> io.and,
ALUOpType.orn -> io.orn,
ALUOpType.or -> io.or,
ALUOpType.xnor -> io.xnor,
ALUOpType.xor -> io.xor,
ALUOpType.sext_b -> io.sextb,
ALUOpType.sext_h -> io.sexth,
ALUOpType.zext_h -> io.zexth,
ALUOpType.orc_b -> io.orcb,
ALUOpType.rev8 -> io.rev8
).map(x => (x._1(3, 0) === io.func(3, 0), x._2)))
ALUOpType.rev8 -> io.rev8,
ALUOpType.szewl1 -> Cat(0.U(31.W), io.src(31, 0), 0.U(1.W)),
ALUOpType.szewl2 -> Cat(0.U(30.W), io.src(31, 0), 0.U(2.W)),
ALUOpType.byte2 -> Cat(0.U(56.W), io.src(15, 8))
).map(x => (x._1(2, 0) === io.func(2, 0), x._2)))
io.miscRes := miscRes
io.miscRes := Mux(io.func(3) && !io.func(4), miscRes, maskedLogicRes)
}
class ShiftResultSelect(implicit p: Parameters) extends XSModule {
......@@ -173,17 +183,37 @@ class AluDataModule(implicit p: Parameters) extends XSModule {
})
val (src1, src2, func) = (io.src(0), io.src(1), io.func)
val isW = ALUOpType.isWordOp(func)
val addModule = Module(new AddModule)
// For 64-bit adder:
// BITS(2, 1): shamt (0, 1, 2, 3)
// BITS(3 ): different fused cases
val shaddShamt = func(2,1)
val add = addModule.io.add
val addw = addModule.io.addw
addModule.io.src(0) := (Cat(Fill(32, func(0)), Fill(32,1.U)) & src1) << shaddShamt
addModule.io.src(1) := src2
// TODO: use decoder or other libraries to optimize timing
when (func(4)) {
addModule.io.src(0) := ZeroExt(src1(0), XLEN)
}
when (func(3)) {
val sourceVec = VecInit(Seq(
Cat(src1(59, 0), 0.U(4.W)),
ZeroExt(src1(63, 30), XLEN),
ZeroExt(src1(63, 31), XLEN),
ZeroExt(src1(63, 32), XLEN)
))
addModule.io.src(0) := sourceVec(func(2, 1))
}
val add = addModule.io.add
// For 32-bit adder:
// BITS(4 ): different fused cases
// BITS(2, 1): result mask (ffffffff, 0x1, 0xff)
addModule.io.srcw := src1(31,0)
when (func(4)) {
addModule.io.srcw := ZeroExt(src1(0), XLEN)
}
val byteMask = Cat(Fill(56, ~func(1)), 0xff.U(8.W))
val bitMask = Cat(Fill(63, ~func(2)), 0x1.U(1.W))
val addw = addModule.io.addw & byteMask & bitMask
val subModule = Module(new SubModule)
val sub = subModule.io.sub
......@@ -243,6 +273,7 @@ class AluDataModule(implicit p: Parameters) extends XSModule {
val and = src1 & src2
val or = src1 | src2
val xor = src1 ^ src2
val orh48 = Cat(src1(63, 8), 0.U(8.W)) | src2
val sgtu = sub(XLEN)
val sltu = !sgtu
val slt = xor(XLEN-1) ^ sltu
......@@ -284,18 +315,20 @@ class AluDataModule(implicit p: Parameters) extends XSModule {
val shiftRes = shiftResSel.io.shiftRes
val miscResSel = Module(new MiscResultSelect)
miscResSel.io.func := func(3, 0)
miscResSel.io.func := func(4, 0)
miscResSel.io.andn := andn
miscResSel.io.orn := orn
miscResSel.io.xnor := xnor
miscResSel.io.and := and
miscResSel.io.or := or
miscResSel.io.xor := xor
miscResSel.io.orh48 := orh48
miscResSel.io.sextb := sextb
miscResSel.io.sexth := sexth
miscResSel.io.zexth := zexth
miscResSel.io.rev8 := rev8
miscResSel.io.orcb := orcb
miscResSel.io.src := src1
val miscRes = miscResSel.io.miscRes
val wordResSel = Module(new WordResultSelect)
......
......@@ -149,7 +149,7 @@ class Rename(implicit p: Parameters) extends XSModule {
// }
uops(i).roqIdx := roqIdxHead + i.U
uops(i).roqIdx := roqIdxHead + PopCount(io.in.take(i).map(_.valid))
io.out(i).valid := io.in(i).valid && intFreeList.canAllocate && fpFreeList.canAllocate && !io.roqCommits.isWalk
io.out(i).bits := uops(i)
......
......@@ -754,6 +754,7 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
wdata.ftqOffset := req.cf.ftqOffset
wdata.pc := req.cf.pc
wdata.crossPageIPFFix := req.cf.crossPageIPFFix
wdata.isFused := req.ctrl.isFused
// wdata.exceptionVec := req.cf.exceptionVec
}
dispatchData.io.raddr := commitReadAddr_next
......@@ -813,6 +814,14 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
fflagsDataRead := fflagsDataModule.io.rdata
val instrCnt = RegInit(0.U(64.W))
val fuseCommitCnt = PopCount(io.commits.valid.zip(io.commits.info).map{ case (v, i) => v && i.isFused =/= 0.U })
val trueCommitCnt = commitCnt +& fuseCommitCnt
val retireCounter = Mux(state === s_idle, trueCommitCnt, 0.U)
instrCnt := instrCnt + retireCounter
io.csr.perfinfo.retiredInstr := RegNext(retireCounter)
io.roqFull := !allowEnqueue
/**
* debug info
*/
......@@ -836,14 +845,15 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
XSPerfAccumulate("clock_cycle", 1.U)
QueuePerf(RoqSize, PopCount((0 until RoqSize).map(valid(_))), !allowEnqueue)
io.roqFull := !allowEnqueue
XSPerfAccumulate("commitInstr", Mux(io.commits.isWalk, 0.U, PopCount(io.commits.valid)))
XSPerfAccumulate("commitUop", Mux(io.commits.isWalk, 0.U, commitCnt))
XSPerfAccumulate("commitInstr", Mux(io.commits.isWalk, 0.U, trueCommitCnt))
val commitIsMove = deqPtrVec.map(_.value).map(ptr => debug_microOp(ptr).ctrl.isMove)
XSPerfAccumulate("commitInstrMove", Mux(io.commits.isWalk, 0.U, PopCount(io.commits.valid.zip(commitIsMove).map{ case (v, m) => v && m })))
if (EnableIntMoveElim) {
val commitMoveElim = deqPtrVec.map(_.value).map(ptr => debug_microOp(ptr).debugInfo.eliminatedMove)
XSPerfAccumulate("commitInstrMoveElim", Mux(io.commits.isWalk, 0.U, PopCount(io.commits.valid zip commitMoveElim map { case (v, e) => v && e })))
}
XSPerfAccumulate("commitInstrFused", Mux(io.commits.isWalk, 0.U, fuseCommitCnt))
val commitIsLoad = io.commits.info.map(_.commitType).map(_ === CommitType.LOAD)
val commitLoadValid = io.commits.valid.zip(commitIsLoad).map{ case (v, t) => v && t }
XSPerfAccumulate("commitInstrLoad", Mux(io.commits.isWalk, 0.U, PopCount(commitLoadValid)))
......@@ -869,10 +879,6 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
ExcitingUtils.addSink(l1Miss, "TMA_l1miss")
XSPerfAccumulate("TMA_L1miss", deqNotWritebacked && deqUopCommitType === CommitType.LOAD && l1Miss)
val instrCnt = RegInit(0.U(64.W))
val retireCounter = Mux(state === s_idle, commitCnt, 0.U)
instrCnt := instrCnt + retireCounter
io.csr.perfinfo.retiredInstr := RegNext(retireCounter)
//difftest signals
val firstValidCommit = (deqPtr + PriorityMux(io.commits.valid, VecInit(List.tabulate(CommitWidth)(_.U)))).value
......@@ -910,8 +916,10 @@ class Roq(numWbPorts: Int)(implicit p: Parameters) extends XSModule with HasCirc
difftest.io.valid := RegNext(io.commits.valid(i) && !io.commits.isWalk)
difftest.io.pc := RegNext(SignExt(uop.cf.pc, XLEN))
difftest.io.instr := RegNext(uop.cf.instr)
difftest.io.special := RegNext(uop.ctrl.isFused =/= 0.U)
if (EnableIntMoveElim) {
// when committing an eliminated move instruction, we must make sure that skip is properly set to false (output from EXU is random value)
// when committing an eliminated move instruction,
// we must make sure that skip is properly set to false (output from EXU is random value)
difftest.io.skip := RegNext(Mux(uop.eliminatedMove, false.B, exuOut.isMMIO || exuOut.isPerfCnt))
} else {
difftest.io.skip := RegNext(exuOut.isMMIO || exuOut.isPerfCnt)
......
......@@ -822,6 +822,19 @@ class Ftq(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelpe
// Record every instruction the backend reports as committed in commitStateQueue,
// which is indexed as (ftq entry)(offset inside the entry).
for (commit <- io.fromBackend.roq_commits) {
  when(commit.valid) {
    val info = commit.bits
    val thisEntry = info.ftqIdx.value
    // Entry following the committing one, computed with the ftq pointer's own `+`.
    val nextEntry = (info.ftqIdx + 1.U).value

    commitStateQueue(thisEntry)(info.ftqOffset) := c_commited

    // TODO: remove this
    // A fused pair commits as a single uop: the second instruction was dropped in
    // decode and never commits by itself. isFused (when non-zero) says where that
    // second instruction lives relative to the first, so its slot is marked too:
    //   1 -> same entry, offset + 1
    //   2 -> same entry, offset + 2
    //   3 -> next entry, slot 0
    //   4 -> next entry, slot 1
    when(info.isFused === 1.U) {
      commitStateQueue(thisEntry)(info.ftqOffset + 1.U) := c_commited
    }.elsewhen(info.isFused === 2.U) {
      commitStateQueue(thisEntry)(info.ftqOffset + 2.U) := c_commited
    }.elsewhen(info.isFused === 3.U) {
      commitStateQueue(nextEntry)(0) := c_commited
    }.elsewhen(info.isFused === 4.U) {
      commitStateQueue(nextEntry)(1) := c_commited
    }
  }
}
......
......@@ -212,12 +212,26 @@ package object xiangshan {
def orn = "b0_00_00_011".U
def xor = "b0_00_00_100".U
def xnor = "b0_00_00_101".U
def orh48 = "b0_00_00_110".U
def andlsb = "b0_00_11_000".U
def andnlsb = "b0_00_11_001".U
def orlsb = "b0_00_11_010".U
def ornlsb = "b0_00_11_011".U
def xorlsb = "b0_00_11_100".U
def xnorlsb = "b0_00_11_101".U
def sext_b = "b0_00_01_000".U
def sext_h = "b0_00_01_001".U
def zext_h = "b0_00_01_010".U
// TODO: optimize it
def szewl1 = "b0_00_01_011".U
def orc_b = "b0_00_01_100".U
def rev8 = "b0_00_01_101".U
// TODO: optimize it
def szewl2 = "b0_00_01_110".U
// TODO: optimize it
def byte2 = "b0_00_01_111".U
def beq = "b0_00_10_000".U
def bne = "b0_00_10_001".U
......@@ -229,13 +243,17 @@ package object xiangshan {
// add & sub optype
def add_uw = "b0_01_00_000".U
def add = "b0_01_00_001".U
def oddadd = "b0_01_10_001".U
def sh1add_uw = "b0_01_00_010".U
def sh1add = "b0_01_00_011".U
def sh2add_uw = "b0_01_00_100".U
def sh2add = "b0_01_00_101".U
def sh3add_uw = "b0_01_00_110".U
def sh3add = "b0_01_00_111".U
def sh4add = "b0_01_01_001".U
def sr30add = "b0_01_01_011".U
def sr31add = "b0_01_01_101".U
def sr32add = "b0_01_01_111".U
// shift optype
def slli_uw = "b0_10_00_000".U
......@@ -243,7 +261,7 @@ package object xiangshan {
def bclr = "b0_10_00_100".U
def bset = "b0_10_00_101".U
def binv = "b0_10_00_110".U
def srl = "b0_10_01_001".U
def bext = "b0_10_01_010".U
def sra = "b0_10_01_100".U
......@@ -259,11 +277,12 @@ package object xiangshan {
def minu = "b0_11_00_101".U
def max = "b0_11_00_110".U
def min = "b0_11_00_111".U
// RV64 32bit optype
def addw = "b1_01_00_001".U
def addwbyte = "b1_01_00_011".U
def addwbit = "b1_01_00_101".U
def oddaddw = "b1_01_10_001".U
def subw = "b1_11_00_000".U
def sllw = "b1_10_00_000".U
def srlw = "b1_10_01_001".U
......@@ -272,6 +291,9 @@ package object xiangshan {
def rorw = "b1_10_11_000".U
// Bit 7 is the top bit of the 8-bit ALU optype; it is the leading 1 in the
// "RV64 32bit optype" encodings (addw, subw, sllw, ...).
def isWordOp(func: UInt) = func(7)
// True when the top three bits are 1_01, the prefix shared by the word-sized
// add encodings (addw, addwbyte, addwbit, oddaddw).
def isAddw(func: UInt) = func(7, 5) === "b101".U
// True when bits 7..3 are all zero, i.e. the b0_00_00_xxx group that holds the
// plain logic ops (orn, xor, xnor, orh48, ...).
def isLogic(func: UInt) = func(7, 3) === "b00000".U
// Keeps bits 7..5 and 2..0 and forces bits 4..3 to 11, so a b0_00_00_xxx logic
// op becomes the b0_00_11_xxx "*lsb" op with the same low bits (xor -> xorlsb).
// The input is not checked here.
// NOTE(review): no lsb counterpart for orh48 (low bits 110) is visible in the
// opcode list; presumably callers never pass it -- confirm.
def logicToLSB(func: UInt) = Cat(func(7, 5), "b11".U(2.W), func(2, 0))
// True when bits 6..3 are 0010, the prefix of the branch encodings (beq, bne, ...).
// Bit 7 is not examined.
def isBranch(func: UInt) = func(6, 3) === "b0010".U
// Bits 2..1 of the optype; presumably selects the comparison kind -- confirm
// against the full branch opcode list.
def getBranchType(func: UInt) = func(2, 1)
// Bit 0 of the optype: 0 for beq, 1 for bne.
def isBranchInvert(func: UInt) = func(0)
......@@ -288,40 +310,27 @@ package object xiangshan {
def mulhu = "b00011".U
def mulw = "b00100".U
def mulw7 = "b01100".U
// div
// bit encoding: | type (2bit) | isWord(1bit) | isSign(1bit) | opcode(1bit) |
def div = "b01000".U
def divu = "b01010".U
def rem = "b01001".U
def remu = "b01011".U
def divw = "b01100".U
def divuw = "b01110".U
def remw = "b01101".U
def remuw = "b01111".U
// fence
// bit encoding: | type (2bit) | padding(1bit)(zero) | opcode(2bit) |
def fence = "b10000".U
def sfence = "b10001".U
def fencei = "b10010".U
// the highest bits are for instruction types
def typeMSB = 4
def typeLSB = 3
def MulType = "b00".U
def DivType = "b01".U
def FenceType = "b10".U
def isMul(op: UInt) = op(typeMSB, typeLSB) === MulType
def isDiv(op: UInt) = op(typeMSB, typeLSB) === DivType
def isFence(op: UInt) = op(typeMSB, typeLSB) === FenceType
def div = "b10000".U
def divu = "b10010".U
def rem = "b10001".U
def remu = "b10011".U
def divw = "b10100".U
def divuw = "b10110".U
def remw = "b10101".U
def remuw = "b10111".U
def isMul(op: UInt) = !op(4)
def isDiv(op: UInt) = op(4)
def isDivSign(op: UInt) = isDiv(op) && !op(1)
def isW(op: UInt) = op(2)
def isH(op: UInt) = (isDiv(op) && op(0)) || (isMul(op) && op(1,0)=/=0.U)
def getMulOp(op: UInt) = op(1,0)
def isH(op: UInt) = (isDiv(op) && op(0)) || (isMul(op) && op(1, 0) =/= 0.U)
def getMulOp(op: UInt) = op(1, 0)
}
object LSUOpType {
......@@ -372,7 +381,7 @@ package object xiangshan {
}
object BMUOpType {
def clmul = "b0000".U
def clmulh = "b0010".U
def clmulr = "b0100".U
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册