diff --git a/.github/workflows/check-usage.sh b/.github/workflows/check-usage.sh new file mode 100644 index 0000000000000000000000000000000000000000..aab1dd1951e946142f2ce35d6174dd93d6f648cb --- /dev/null +++ b/.github/workflows/check-usage.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo $1 +echo $2 +grep -rn $1 $2/src/main/scala/xiangshan +if [[ $? == 0 ]]; +then + exit 1 +fi +exit 0 diff --git a/.github/workflows/emu.yml b/.github/workflows/emu.yml index 8d058edc9de28c2ef022a223fecf3e670feffd7e..505103ff32b260ce6730c644cbd8807a9c4da52c 100644 --- a/.github/workflows/emu.yml +++ b/.github/workflows/emu.yml @@ -3,7 +3,7 @@ name: EMU Test on: push: - branches: [ master, update-ci ] + branches: [ master, update-ci] pull_request: branches: [ master ] @@ -15,30 +15,19 @@ jobs: - uses: actions/checkout@v2 with: submodules: 'recursive' + - name: Check Wiring + run: bash .github/workflows/check-usage.sh "BoringUtils" $GITHUB_WORKSPACE - name: Set env run: | - echo ::set-env name=NEMU_HOME::/home/ci-runner/xsenv/NEMU - echo ::set-env name=NOOP_HOME::$GITHUB_WORKSPACE + echo "NEMU_HOME=/home/ci-runner/xsenv/NEMU" >> $GITHUB_ENV + echo "NOOP_HOME=$GITHUB_WORKSPACE" >> $GITHUB_ENV + echo "RVTEST_HOME=/home/ci-runner/xsenv/riscv-tests" >> $GITHUB_ENV + echo "AM_HOME=/home/ci-runner/xsenv/nexus-am" >> $GITHUB_ENV - name: Build EMU run: - make ./build/emu SIM_ARGS=--disable-log EMU_THREADS=16 NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME -j20 - - cputest: - runs-on: self-hosted - name: Run cputest - needs: [build-emu] - steps: - - name: Set env - run: | - echo ::set-env name=AM_HOME::/home/ci-runner/xsenv/nexus-am - echo ::set-env name=NEMU_HOME::/home/ci-runner/xsenv/NEMU - echo ::set-env name=NOOP_HOME::$GITHUB_WORKSPACE - + make ./build/emu SIM_ARGS=--disable-all NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME -j60 - name: Run cputest run: | - echo $AM_HOME - echo $NEMU_HOME - echo $NOOP_HOME CPU_TEST_DIR=$AM_HOME/tests/cputest echo $CPU_TEST_DIR ret=0 @@ -54,38 +43,9 @@ jobs: fi done exit 
$ret - - microbench: - runs-on: self-hosted - name: Run microbench - needs: [build-emu] - steps: - - name: Set env + - name: Run riscv-tests run: | - echo ::set-env name=AM_HOME::/home/ci-runner/xsenv/nexus-am - echo ::set-env name=NEMU_HOME::/home/ci-runner/xsenv/NEMU - echo ::set-env name=NOOP_HOME::$GITHUB_WORKSPACE + make -C $RVTEST_HOME/isa/ SUITES+=rv64ui SUITES+=rv64um SUITES+=rv64ua SUITES+=rv64uf SUITES+=rv64ud NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME noop_run 2> /dev/null - name: Run microbench run: | - echo $AM_HOME - echo $NEMU_HOME - echo $NOOP_HOME make -C $AM_HOME/apps/microbench ARCH=riscv64-noop AM_HOME=$AM_HOME NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME mainargs=test run 2> /dev/null - - riscv-tests: - runs-on: self-hosted - name: Run riscv-tests - needs: [build-emu] - steps: - - name: Set env - run: | - echo ::set-env name=NEMU_HOME::/home/ci-runner/xsenv/NEMU - echo ::set-env name=NOOP_HOME::$GITHUB_WORKSPACE - echo ::set-env name=RVTEST_HOME::/home/ci-runner/xsenv/riscv-tests - - name: Run riscv-test - run: | - echo $NEMU_HOME - echo $NOOP_HOME - echo $RVTEST_HOME - make -C $RVTEST_HOME/isa/ SUITES+=rv64ui SUITES+=rv64um SUITES+=rv64ua NEMU_HOME=$NEMU_HOME NOOP_HOME=$NOOP_HOME noop_run 2> /dev/null - diff --git a/.gitignore b/.gitignore index 7f7dca6d823e2e1ec3ab93da1076af672f05a005..fabc511356033b0be00469518c1f94c6994c5983 100644 --- a/.gitignore +++ b/.gitignore @@ -342,8 +342,10 @@ hs_err_pid* .vscode .metals .bloop +.bsp .coursier mill.rdiB stale_outputs_checked +*.snapshot diff --git a/.gitmodules b/.gitmodules index 79125e2f51ff281767f1c5f15b1799320fe3cce2..edeae7c1803e24458b20190ca2f807cc8b747a27 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "rocket-chip"] path = rocket-chip - url = https://github.com/chipsalliance/rocket-chip.git - branch = d6bd3c61993637c3f10544c59e861fae8af29f39 + url = https://github.com/RISCVERS/rocket-chip.git + branch = 147bdcc4a26c74e5d7a47e3d667d456699d6d11f [submodule 
"block-inclusivecache-sifive"] path = block-inclusivecache-sifive url = https://github.com/RISCVERS/block-inclusivecache-sifive.git @@ -10,3 +10,9 @@ path = chiseltest url = https://github.com/ucb-bar/chisel-testers2.git branch = 3e3ecc5b25b7b6bc48341ec07c7a54b7ad53bcb7 +[submodule "api-config-chipsalliance"] + path = api-config-chipsalliance + url = https://github.com/chipsalliance/api-config-chipsalliance +[submodule "berkeley-hardfloat"] + path = berkeley-hardfloat + url = https://github.com/ucb-bar/berkeley-hardfloat diff --git a/Makefile b/Makefile index 5d0fbc527531d6c2d82365134efa4894216e7c50..a67aae05dccabadbb649796cc7928749ca7e5956 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,14 @@ MEM_GEN = ./scripts/vlsi_mem_gen SIMTOP = top.TestMain IMAGE ?= temp +# co-simulation with DRAMsim3 +ifeq ($(WITH_DRAMSIM3),1) +ifndef DRAMSIM3_HOME +$(error DRAMSIM3_HOME is not set) +endif +override SIM_ARGS += --with-dramsim3 +endif + # remote machine with more cores to speedup c++ build REMOTE ?= localhost @@ -19,16 +27,17 @@ help: $(TOP_V): $(SCALA_FILE) mkdir -p $(@D) - mill XiangShan.runMain top.$(TOP) -X verilog -td $(@D) --output-file $(@F) --infer-rw $(FPGATOP) --repl-seq-mem -c:$(FPGATOP):-o:$(@D)/$(@F).conf - $(MEM_GEN) $(@D)/$(@F).conf >> $@ - sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g' $@ - @git log -n 1 >> .__head__ - @git diff >> .__diff__ - @sed -i 's/^/\/\// ' .__head__ - @sed -i 's/^/\/\//' .__diff__ - @cat .__head__ .__diff__ $@ > .__out__ - @mv .__out__ $@ - @rm .__head__ .__diff__ + mill XiangShan.test.runMain $(SIMTOP) -X verilog -td $(@D) --full-stacktrace --output-file $(@F) --disable-all --fpga-platform --remove-assert $(SIM_ARGS) + # mill XiangShan.runMain top.$(TOP) -X verilog -td $(@D) --output-file $(@F) --infer-rw $(FPGATOP) --repl-seq-mem -c:$(FPGATOP):-o:$(@D)/$(@F).conf + # $(MEM_GEN) $(@D)/$(@F).conf >> $@ + # sed -i -e 's/_\(aw\|ar\|w\|r\|b\)_\(\|bits_\)/_\1/g' $@ + # @git log -n 1 >> .__head__ + # @git diff >> .__diff__ + # @sed 
-i 's/^/\/\// ' .__head__ + # @sed -i 's/^/\/\//' .__diff__ + # @cat .__head__ .__diff__ $@ > .__out__ + # @mv .__out__ $@ + # @rm .__head__ .__diff__ deploy: build/top.zip @@ -40,68 +49,98 @@ build/top.zip: $(TOP_V) verilog: $(TOP_V) -SIM_TOP = XSSimTop +SIM_TOP = XSSimTop SIM_TOP_V = $(BUILD_DIR)/$(SIM_TOP).v -SIM_ARGS = $(SIM_TOP_V): $(SCALA_FILE) $(TEST_FILE) mkdir -p $(@D) + date -R mill XiangShan.test.runMain $(SIMTOP) -X verilog -td $(@D) --full-stacktrace --output-file $(@F) $(SIM_ARGS) + sed -i '/module XSSimTop/,/endmodule/d' $(SIM_TOP_V) + sed -i -e 's/$$fatal/$$finish/g' $(SIM_TOP_V) + date -R +EMU_TOP = XSSimSoC EMU_CSRC_DIR = $(abspath ./src/test/csrc) EMU_VSRC_DIR = $(abspath ./src/test/vsrc) EMU_CXXFILES = $(shell find $(EMU_CSRC_DIR) -name "*.cpp") -EMU_VFILES = $(shell find $(EMU_VSRC_DIR) -name "*.v" -or -name "*.sv") +EMU_VFILES = $(shell find $(EMU_VSRC_DIR) -name "*.v" -or -name "*.sv") -EMU_CXXFLAGS = -std=c++11 -static -Wall -I$(EMU_CSRC_DIR) +EMU_CXXFLAGS += -std=c++11 -static -Wall -I$(EMU_CSRC_DIR) EMU_CXXFLAGS += -DVERILATOR -Wno-maybe-uninitialized -EMU_LDFLAGS = -lpthread -lSDL2 -ldl -EMU_THREADS = 1 -ifeq ($(EMU_THREADS), 1) - VTHREAD_FLAGS = -else - VTHREAD_FLAGS = --threads $(EMU_THREADS) --threads-dpi none +EMU_LDFLAGS += -lpthread -lSDL2 -ldl -lz + +VEXTRA_FLAGS = -I$(abspath $(BUILD_DIR)) --x-assign unique -O3 -CFLAGS "$(EMU_CXXFLAGS)" -LDFLAGS "$(EMU_LDFLAGS)" + +# Verilator trace support +EMU_TRACE ?= +ifeq ($(EMU_TRACE),1) +VEXTRA_FLAGS += --trace +endif + +# Verilator multi-thread support +EMU_THREADS ?= 1 +ifneq ($(EMU_THREADS),1) +VEXTRA_FLAGS += --threads $(EMU_THREADS) --threads-dpi none +endif + +# Verilator savable +EMU_SNAPSHOT ?= +ifeq ($(EMU_SNAPSHOT),1) +VEXTRA_FLAGS += --savable +EMU_CXXFLAGS += -DVM_SAVABLE +endif + +# co-simulation with DRAMsim3 +ifeq ($(WITH_DRAMSIM3),1) +EMU_CXXFLAGS += -I$(DRAMSIM3_HOME)/src +EMU_CXXFLAGS += -DWITH_DRAMSIM3 -DDRAMSIM3_CONFIG=\\\"$(DRAMSIM3_HOME)/configs/XiangShan.ini\\\" 
-DDRAMSIM3_OUTDIR=\\\"$(BUILD_DIR)\\\" +EMU_LDFLAGS += $(DRAMSIM3_HOME)/build/libdramsim3.a endif # --trace -VERILATOR_FLAGS = --top-module $(SIM_TOP) \ +VERILATOR_FLAGS = --top-module $(EMU_TOP) \ +define+VERILATOR=1 \ +define+PRINTF_COND=1 \ +define+RANDOMIZE_REG_INIT \ +define+RANDOMIZE_MEM_INIT \ - $(VTHREAD_FLAGS) \ + $(VEXTRA_FLAGS) \ --assert \ - --trace \ - --savable \ --stats-vars \ --output-split 5000 \ - --output-split-cfuncs 5000 \ - -I$(abspath $(BUILD_DIR)) \ - --x-assign unique -O3 -CFLAGS "$(EMU_CXXFLAGS)" \ - -LDFLAGS "$(EMU_LDFLAGS)" + --output-split-cfuncs 5000 -EMU_MK := $(BUILD_DIR)/emu-compile/V$(SIM_TOP).mk +EMU_MK := $(BUILD_DIR)/emu-compile/V$(EMU_TOP).mk EMU_DEPS := $(EMU_VFILES) $(EMU_CXXFILES) EMU_HEADERS := $(shell find $(EMU_CSRC_DIR) -name "*.h") EMU := $(BUILD_DIR)/emu $(EMU_MK): $(SIM_TOP_V) | $(EMU_DEPS) @mkdir -p $(@D) + date -R verilator --cc --exe $(VERILATOR_FLAGS) \ -o $(abspath $(EMU)) -Mdir $(@D) $^ $(EMU_DEPS) + date -R +ifndef NEMU_HOME +$(error NEMU_HOME is not set) +endif REF_SO := $(NEMU_HOME)/build/riscv64-nemu-interpreter-so $(REF_SO): $(MAKE) -C $(NEMU_HOME) ISA=riscv64 SHARE=1 $(EMU): $(EMU_MK) $(EMU_DEPS) $(EMU_HEADERS) $(REF_SO) + date -R ifeq ($(REMOTE),localhost) CPPFLAGS=-DREF_SO=\\\"$(REF_SO)\\\" $(MAKE) VM_PARALLEL_BUILDS=1 OPT_FAST="-O3" -C $(abspath $(dir $(EMU_MK))) -f $(abspath $(EMU_MK)) else - ssh -tt $(REMOTE) 'CPPFLAGS=-DREF_SO=\\\"$(REF_SO)\\\" $(MAKE) -j80 VM_PARALLEL_BUILDS=1 OPT_FAST="-O3" -C $(abspath $(dir $(EMU_MK))) -f $(abspath $(EMU_MK))' + ssh -tt $(REMOTE) 'CPPFLAGS=-DREF_SO=\\\"$(REF_SO)\\\" $(MAKE) -j128 VM_PARALLEL_BUILDS=1 OPT_FAST="-O3" -C $(abspath $(dir $(EMU_MK))) -f $(abspath $(EMU_MK))' endif + date -R SEED ?= $(shell shuf -i 1-10000 -n 1) +VME_SOURCE ?= $(shell pwd) +VME_MODULE ?= # log will only be printed when (B<=GTimer<=E) && (L < loglevel) # use 'emu -h' to see more details @@ -119,21 +158,46 @@ else SNAPSHOT_OPTION = --load-snapshot=$(SNAPSHOT) endif +ifndef NOOP_HOME 
+$(error NOOP_HOME is not set) +endif EMU_FLAGS = -s $(SEED) -b $(B) -e $(E) $(SNAPSHOT_OPTION) $(WAVEFORM) emu: $(EMU) ls build $(EMU) -i $(IMAGE) $(EMU_FLAGS) +# extract verilog module from sim_top.v +# usage: make vme VME_MODULE=Roq +vme: $(SIM_TOP_V) + mill XiangShan.runMain utils.ExtractVerilogModules -m $(VME_MODULE) + +# usage: make phy_evaluate VME_MODULE=Roq REMOTE=100 +phy_evaluate: vme + scp -r ./build/extracted/* $(REMOTE):~/phy_evaluation/remote_run/rtl + ssh -tt $(REMOTE) 'cd ~/phy_evaluation/remote_run && $(MAKE) evaluate DESIGN_NAME=$(VME_MODULE)' + scp -r $(REMOTE):~/phy_evaluation/remote_run/rpts ./build + +# usage: make phy_evaluate_atc VME_MODULE=Roq REMOTE=100 +phy_evaluate_atc: vme + scp -r ./build/extracted/* $(REMOTE):~/phy_evaluation/remote_run/rtl + ssh -tt $(REMOTE) 'cd ~/phy_evaluation/remote_run && $(MAKE) evaluate_atc DESIGN_NAME=$(VME_MODULE)' + scp -r $(REMOTE):~/phy_evaluation/remote_run/rpts ./build + cache: $(MAKE) emu IMAGE=Makefile clean: - rm -rf $(BUILD_DIR) + git submodule foreach git clean -fdx + git clean -fd + rm -rf ./build init: git submodule update --init - @# do not use a recursive init to pull some not used submodules - cd ./rocket-chip/ && git submodule update --init api-config-chipsalliance hardfloat -.PHONY: verilog emu clean help init $(REF_SO) +bump: + git submodule foreach "git fetch origin&&git checkout master&&git reset --hard origin/master" + +bsp: + mill -i mill.contrib.BSP/install +.PHONY: verilog emu clean help init bump bsp $(REF_SO) diff --git a/api-config-chipsalliance b/api-config-chipsalliance new file mode 160000 index 0000000000000000000000000000000000000000..fd8df1105a92065425cd353b6855777e35bd79b4 --- /dev/null +++ b/api-config-chipsalliance @@ -0,0 +1 @@ +Subproject commit fd8df1105a92065425cd353b6855777e35bd79b4 diff --git a/berkeley-hardfloat b/berkeley-hardfloat new file mode 160000 index 0000000000000000000000000000000000000000..267357bdae5973a30565da6ebc728d513827ca5e --- /dev/null +++ 
b/berkeley-hardfloat @@ -0,0 +1 @@ +Subproject commit 267357bdae5973a30565da6ebc728d513827ca5e diff --git a/block-inclusivecache-sifive b/block-inclusivecache-sifive index 5ca43398ac8b1b293291bd4e6e8c233be6c66968..3d6bdf10d7b740588130e3056c8fd29f4175cadb 160000 --- a/block-inclusivecache-sifive +++ b/block-inclusivecache-sifive @@ -1 +1 @@ -Subproject commit 5ca43398ac8b1b293291bd4e6e8c233be6c66968 +Subproject commit 3d6bdf10d7b740588130e3056c8fd29f4175cadb diff --git a/build.sc b/build.sc index 09864c8778b178ccf728eb045361684a5745d8c9..14524033e7288596d0c191b89ca3797391cf7c54 100644 --- a/build.sc +++ b/build.sc @@ -1,6 +1,9 @@ import os.Path import mill._ import mill.modules.Util +import $ivy.`com.lihaoyi::mill-contrib-buildinfo:$MILL_VERSION` +import $ivy.`com.lihaoyi::mill-contrib-bsp:$MILL_VERSION` +import mill.contrib.buildinfo.BuildInfo import scalalib._ import coursier.maven.MavenRepository @@ -29,6 +32,15 @@ val chisel = Agg( ivy"edu.berkeley.cs::chisel3:3.4.0" ) +object `api-config-chipsalliance` extends CommonModule { + override def millSourcePath = super.millSourcePath / "design" / "craft" +} + +object hardfloat extends SbtModule with CommonModule { + override def millSourcePath = os.pwd / "berkeley-hardfloat" + override def ivyDeps = super.ivyDeps() ++ chisel +} + object `rocket-chip` extends SbtModule with CommonModule { override def ivyDeps = super.ivyDeps() ++ Agg( @@ -36,17 +48,8 @@ object `rocket-chip` extends SbtModule with CommonModule { ivy"org.json4s::json4s-jackson:3.6.1" ) ++ chisel - - object `api-config-chipsalliance` extends CommonModule { - override def millSourcePath = super.millSourcePath / 'design / 'craft - } - object macros extends SbtModule with CommonModule - object hardfloat extends SbtModule with CommonModule { - override def ivyDeps = super.ivyDeps() ++ chisel - } - override def moduleDeps = super.moduleDeps ++ Seq( `api-config-chipsalliance`, macros, hardfloat ) @@ -63,13 +66,13 @@ object `block-inclusivecache-sifive` extends 
CommonModule { object chiseltest extends CommonModule with SbtModule { override def ivyDeps = super.ivyDeps() ++ Agg( - ivy"edu.berkeley.cs::treadle:1.3.0", - ivy"org.scalatest::scalatest:3.0.8", - ivy"com.lihaoyi::utest:0.7.4" + ivy"edu.berkeley.cs::treadle:1.3.0", + ivy"org.scalatest::scalatest:3.2.0", + ivy"com.lihaoyi::utest:0.7.4" ) ++ chisel object test extends Tests { - def ivyDeps = Agg(ivy"org.scalacheck::scalacheck:1.14.3") - def testFrameworks = Seq("org.scalatest.tools.Framework") + def ivyDeps = Agg(ivy"org.scalacheck::scalacheck:1.14.3") + def testFrameworks = Seq("org.scalatest.tools.Framework") } } @@ -81,15 +84,14 @@ object XiangShan extends CommonModule with SbtModule { override def ivyDeps = super.ivyDeps() ++ chisel override def moduleDeps = super.moduleDeps ++ Seq( - `rocket-chip`, - `block-inclusivecache-sifive`, - chiseltest + `rocket-chip`, + `block-inclusivecache-sifive`, + chiseltest ) object test extends Tests { override def ivyDeps = super.ivyDeps() ++ Agg( - ivy"org.scalatest::scalatest:3.0.4", - ivy"edu.berkeley.cs::chisel-iotesters:1.2+", + ivy"org.scalatest::scalatest:3.2.0" ) def testFrameworks = Seq( @@ -101,5 +103,4 @@ object XiangShan extends CommonModule with SbtModule { } } -} - +} \ No newline at end of file diff --git a/chiseltest b/chiseltest index 3e3ecc5b25b7b6bc48341ec07c7a54b7ad53bcb7..6a2e1776c91635deb7e1982b2333611ae620e777 160000 --- a/chiseltest +++ b/chiseltest @@ -1 +1 @@ -Subproject commit 3e3ecc5b25b7b6bc48341ec07c7a54b7ad53bcb7 +Subproject commit 6a2e1776c91635deb7e1982b2333611ae620e777 diff --git a/debug/Makefile b/debug/Makefile index 8a09ef7f7596e711a520cdfc053ac4b1d68ee2f9..4253105ee2dd321429354b8ed75e3889a5bf863a 100644 --- a/debug/Makefile +++ b/debug/Makefile @@ -4,7 +4,7 @@ SINGLETEST = ALL=min3 B ?= 0 E ?= 0 -V ?= ALL +V ?= OFF #V ?= OFF EMU_ARGS = B=$(B) E=$(E) V=$(V) @@ -13,7 +13,8 @@ EMU_ARGS = B=$(B) E=$(E) V=$(V) # ------------------------------------------------------------------ cache: - $(MAKE) 
-C $(AM_HOME)/tests/cachetest $(ARCH) ALL=loader $(EMU_ARGS) run 2>&1 | tee > loader.log + $(MAKE) -C $(AM_HOME)/tests/cachetest $(ARCH) ALL=loader $(EMU_ARGS) run + #2>&1 | tee > loader.log #2>&1 | tee > loader.log cpu: @@ -40,6 +41,7 @@ amtest: microbench: $(MAKE) -C $(AM_HOME)/apps/microbench $(ARCH) $(EMU_ARGS) mainargs=test run + #2>&1 | tee > microbench.log #2 > microbench.log cat microbench.log | grep IPC @@ -48,7 +50,7 @@ microbench_train: cat microbench.log | grep IPC coremark: - $(MAKE) -C $(AM_HOME)/apps/coremark $(ARCH) $(EMU_ARGS) mainargs=test run + $(MAKE) -C $(AM_HOME)/apps/coremark $(ARCH) $(EMU_ARGS) mainargs=test run #2 > coremark.log cat coremark.log | grep IPC diff --git a/debug/cputest.sh b/debug/cputest.sh index 7ea91d0d003568ef15b067b34736e5af56b7880b..5510d73acde11dc1fc7b94c3c5fdbb2104111d84 100755 --- a/debug/cputest.sh +++ b/debug/cputest.sh @@ -6,7 +6,7 @@ for test in $(ls $TEST_HOME/tests) do t=${test%.c} echo -n -e "\x1b[0m $t: " - make -C $TEST_HOME ARCH=riscv64-noop E=0 ALL=$t run 2>/dev/null | grep "HIT GOOD TRAP" + make -C $TEST_HOME ARCH=riscv64-noop E=0 ALL=$t run 2>/dev/null | grep -E "HIT GOOD TRAP|IPC" if [[ $? 
== 1 ]]; then echo -e "\x1b[31mfail" diff --git a/debug/sc_stat.sh b/debug/sc_stat.sh new file mode 100755 index 0000000000000000000000000000000000000000..8a929f01df10105b11ddad50528050f7d6a013e3 --- /dev/null +++ b/debug/sc_stat.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +log_dir=$1 +tage_w_sc_w=$(grep "scUpdate" $log_dir | grep "sc(1), tage(1)" -c) +tage_w_sc_r=$(grep "scUpdate" $log_dir | grep "sc(0), tage(1)" -c) +tage_r_sc_w=$(grep "scUpdate" $log_dir | grep "sc(1), tage(0)" -c) +tage_r_sc_r=$(grep "scUpdate" $log_dir | grep "sc(0), tage(0)" -c) + +echo $tage_r_sc_w tage right but mispredicted by sc +echo $tage_w_sc_r tage wrong and rectified by sc +echo `expr $tage_w_sc_w + $tage_r_sc_r` branches remain unchanged, in which $tage_w_sc_w are wrong + diff --git a/rocket-chip b/rocket-chip index d6bd3c61993637c3f10544c59e861fae8af29f39..147bdcc4a26c74e5d7a47e3d667d456699d6d11f 160000 --- a/rocket-chip +++ b/rocket-chip @@ -1 +1 @@ -Subproject commit d6bd3c61993637c3f10544c59e861fae8af29f39 +Subproject commit 147bdcc4a26c74e5d7a47e3d667d456699d6d11f diff --git a/src/main/resources/vsrc/regfile_160x64_10w16r_sim.v b/src/main/resources/vsrc/regfile_160x64_10w16r_sim.v new file mode 100755 index 0000000000000000000000000000000000000000..6095fe185c22d6cac5e298bb1f142909ab63603f --- /dev/null +++ b/src/main/resources/vsrc/regfile_160x64_10w16r_sim.v @@ -0,0 +1,558 @@ + +`timescale 1ns/1ps +//`define WITH_BYPASS + +module regfile_160x64_10w16r_sim ( +input clk, gpr, +input wen0, wen1, wen2, wen3, wen4, wen5, wen6, wen7, wen8, wen9, +input [ 7:0] waddr0, waddr1, waddr2, waddr3, waddr4, waddr5, waddr6, waddr7, waddr8, waddr9, +input [63:0] wdata0, wdata1, wdata2, wdata3, wdata4, wdata5, wdata6, wdata7, wdata8, wdata9, +input [ 7:0] raddr0, raddr1, raddr2, raddr3, raddr4, raddr5, raddr6, raddr7, +input [ 7:0] raddr8, raddr9, raddr10, raddr11, raddr12, raddr13, raddr14, raddr15, +output [63:0] rdata0, rdata1, rdata2, rdata3, rdata4, rdata5, rdata6, rdata7, +output [63:0] rdata8, 
rdata9, rdata10, rdata11, rdata12, rdata13, rdata14, rdata15); + +reg reg_gpr, reg_wen0, reg_wen1, reg_wen2, reg_wen3, reg_wen4, reg_wen5, reg_wen6, reg_wen7, reg_wen8, reg_wen9; +reg [ 7:0] reg_waddr0, reg_waddr1, reg_waddr2, reg_waddr3, reg_waddr4, reg_waddr5, reg_waddr6, reg_waddr7, reg_waddr8, reg_waddr9; +reg [63:0] reg_wdata0, reg_wdata1, reg_wdata2, reg_wdata3, reg_wdata4, reg_wdata5, reg_wdata6, reg_wdata7, reg_wdata8, reg_wdata9; +reg [ 7:0] reg_raddr0, reg_raddr1, reg_raddr2, reg_raddr3, reg_raddr4, reg_raddr5, reg_raddr6, reg_raddr7; +reg [ 7:0] reg_raddr8, reg_raddr9, reg_raddr10, reg_raddr11, reg_raddr12, reg_raddr13, reg_raddr14, reg_raddr15; +always @(posedge clk) begin + reg_gpr <= gpr; + reg_wen0 <= wen0; + reg_wen1 <= wen1; + reg_wen2 <= wen2; + reg_wen3 <= wen3; + reg_wen4 <= wen4; + reg_wen5 <= wen5; + reg_wen6 <= wen6; + reg_wen7 <= wen7; + reg_wen8 <= wen8; + reg_wen9 <= wen9; + if(wen0) begin reg_waddr0 <= waddr0; end + if(wen1) begin reg_waddr1 <= waddr1; end + if(wen2) begin reg_waddr2 <= waddr2; end + if(wen3) begin reg_waddr3 <= waddr3; end + if(wen4) begin reg_waddr4 <= waddr4; end + if(wen5) begin reg_waddr5 <= waddr5; end + if(wen6) begin reg_waddr6 <= waddr6; end + if(wen7) begin reg_waddr7 <= waddr7; end + if(wen8) begin reg_waddr8 <= waddr8; end + if(wen9) begin reg_waddr9 <= waddr9; end + if(wen0) begin reg_wdata0 <= wdata0; end + if(wen1) begin reg_wdata1 <= wdata1; end + if(wen2) begin reg_wdata2 <= wdata2; end + if(wen3) begin reg_wdata3 <= wdata3; end + if(wen4) begin reg_wdata4 <= wdata4; end + if(wen5) begin reg_wdata5 <= wdata5; end + if(wen6) begin reg_wdata6 <= wdata6; end + if(wen7) begin reg_wdata7 <= wdata7; end + if(wen8) begin reg_wdata8 <= wdata8; end + if(wen9) begin reg_wdata9 <= wdata9; end + reg_raddr0 <= raddr0; + reg_raddr1 <= raddr1; + reg_raddr2 <= raddr2; + reg_raddr3 <= raddr3; + reg_raddr4 <= raddr4; + reg_raddr5 <= raddr5; + reg_raddr6 <= raddr6; + reg_raddr7 <= raddr7; + reg_raddr8 <= raddr8; + 
reg_raddr9 <= raddr9; + reg_raddr10 <= raddr10; + reg_raddr11 <= raddr11; + reg_raddr12 <= raddr12; + reg_raddr13 <= raddr13; + reg_raddr14 <= raddr14; + reg_raddr15 <= raddr15; +end + +wire [255:0] wad0_dec, wad1_dec, wad2_dec, wad3_dec, wad4_dec, wad5_dec, wad6_dec, wad7_dec, wad8_dec, wad9_dec; +addr_dec_8x256_with_en U_wad0_dec ( .en(reg_wen0), .addr(reg_waddr0), .dec(wad0_dec) ); +addr_dec_8x256_with_en U_wad1_dec ( .en(reg_wen1), .addr(reg_waddr1), .dec(wad1_dec) ); +addr_dec_8x256_with_en U_wad2_dec ( .en(reg_wen2), .addr(reg_waddr2), .dec(wad2_dec) ); +addr_dec_8x256_with_en U_wad3_dec ( .en(reg_wen3), .addr(reg_waddr3), .dec(wad3_dec) ); +addr_dec_8x256_with_en U_wad4_dec ( .en(reg_wen4), .addr(reg_waddr4), .dec(wad4_dec) ); +addr_dec_8x256_with_en U_wad5_dec ( .en(reg_wen5), .addr(reg_waddr5), .dec(wad5_dec) ); +addr_dec_8x256_with_en U_wad6_dec ( .en(reg_wen6), .addr(reg_waddr6), .dec(wad6_dec) ); +addr_dec_8x256_with_en U_wad7_dec ( .en(reg_wen7), .addr(reg_waddr7), .dec(wad7_dec) ); +addr_dec_8x256_with_en U_wad8_dec ( .en(reg_wen8), .addr(reg_waddr8), .dec(wad8_dec) ); +addr_dec_8x256_with_en U_wad9_dec ( .en(reg_wen9), .addr(reg_waddr9), .dec(wad9_dec) ); + +wire clk_inv = !clk_inv; +wire gpr_inv = !reg_gpr; +integer i; +reg [63:0] reg_MEM [159:0]; +always @(posedge clk_inv or negedge gpr_inv) begin + if(!gpr_inv) begin + reg_MEM[0] <= 64'b0; + end + else begin + if(wad0_dec[0]||wad1_dec[0]||wad2_dec[0]||wad3_dec[0]||wad4_dec[0]||wad5_dec[0]||wad6_dec[0]||wad7_dec[0]||wad8_dec[0]||wad9_dec[0]) begin + reg_MEM[0] <= {64{wad0_dec[0]}}®_wdata0 | {64{wad1_dec[0]}}®_wdata1 | {64{wad2_dec[0]}}®_wdata2 | {64{wad3_dec[0]}}®_wdata3 | + {64{wad4_dec[0]}}®_wdata4 | {64{wad5_dec[0]}}®_wdata5 | {64{wad6_dec[0]}}®_wdata6 | {64{wad7_dec[0]}}®_wdata7 | + {64{wad8_dec[0]}}®_wdata8 | {64{wad9_dec[0]}}®_wdata9; + end + end +end +always @(posedge clk_inv) begin + for(i=1;i<160;i=i+1) begin + 
if(wad0_dec[i]||wad1_dec[i]||wad2_dec[i]||wad3_dec[i]||wad4_dec[i]||wad5_dec[i]||wad6_dec[i]||wad7_dec[i]||wad8_dec[i]||wad9_dec[i]) begin + reg_MEM[i] <= {64{wad0_dec[i]}}®_wdata0 | {64{wad1_dec[i]}}®_wdata1 | {64{wad2_dec[i]}}®_wdata2 | {64{wad3_dec[i]}}®_wdata3 | + {64{wad4_dec[i]}}®_wdata4 | {64{wad5_dec[i]}}®_wdata5 | {64{wad6_dec[i]}}®_wdata6 | {64{wad7_dec[i]}}®_wdata7 | + {64{wad8_dec[i]}}®_wdata8 | {64{wad9_dec[i]}}®_wdata9; + end + end +end + +wire [63:0] rdata0_0 = reg_MEM[reg_raddr0]; +wire [63:0] rdata1_0 = reg_MEM[reg_raddr1]; +wire [63:0] rdata2_0 = reg_MEM[reg_raddr2]; +wire [63:0] rdata3_0 = reg_MEM[reg_raddr3]; +wire [63:0] rdata4_0 = reg_MEM[reg_raddr4]; +wire [63:0] rdata5_0 = reg_MEM[reg_raddr5]; +wire [63:0] rdata6_0 = reg_MEM[reg_raddr6]; +wire [63:0] rdata7_0 = reg_MEM[reg_raddr7]; +wire [63:0] rdata8_0 = reg_MEM[reg_raddr8]; +wire [63:0] rdata9_0 = reg_MEM[reg_raddr9]; +wire [63:0] rdata10_0 = reg_MEM[reg_raddr10]; +wire [63:0] rdata11_0 = reg_MEM[reg_raddr11]; +wire [63:0] rdata12_0 = reg_MEM[reg_raddr12]; +wire [63:0] rdata13_0 = reg_MEM[reg_raddr13]; +wire [63:0] rdata14_0 = reg_MEM[reg_raddr14]; +wire [63:0] rdata15_0 = reg_MEM[reg_raddr15]; + +`ifdef WITH_BYPASS + wire [ 7:0] rd_by0, rd_by1, rd_by2, rd_by3, rd_by4, rd_by5, rd_by6, rd_by7, rd_by8, rd_by9, rd_by10, rd_by11, rd_by12, rd_by13, rd_by14, rd_by15; + wire [63:0] by_data0, by_data1, by_data2, by_data3, by_data4, by_data5, by_data6, by_data7, by_data8, by_data9, by_data10, by_data11, by_data12, by_data13, by_data14, by_data15; + addr_comp_10b U_comp_0 ( .by(rd_by0), .raddr(reg_raddr0), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + 
addr_comp_10b U_comp_1 ( .by(rd_by1), .raddr(reg_raddr1), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_2 ( .by(rd_by2), .raddr(reg_raddr2), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_3 ( .by(rd_by3), .raddr(reg_raddr3), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_4 ( .by(rd_by4), .raddr(reg_raddr4), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_5 ( .by(rd_by5), .raddr(reg_raddr5), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), 
.waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_6 ( .by(rd_by6), .raddr(reg_raddr6), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_7 ( .by(rd_by7), .raddr(reg_raddr7), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_8 ( .by(rd_by8), .raddr(reg_raddr8), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_9 ( .by(rd_by9), .raddr(reg_raddr9), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), 
.waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_10 ( .by(rd_by10), .raddr(reg_raddr10), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_11 ( .by(rd_by11), .raddr(reg_raddr11), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_12 ( .by(rd_by12), .raddr(reg_raddr12), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + addr_comp_10b U_comp_13 ( .by(rd_by13), .raddr(reg_raddr13), + .wen0(reg_wen0), .waddr0(reg_waddr0), .wen1(reg_wen1), .waddr1(reg_waddr1), .wen2(reg_wen2), .waddr2(reg_waddr2), .wen3(reg_wen3), .waddr3(reg_waddr3), + .wen4(reg_wen4), .waddr4(reg_waddr4), .wen5(reg_wen5), .waddr5(reg_waddr5), .wen6(reg_wen6), .waddr6(reg_waddr6), .wen7(reg_wen7), .waddr7(reg_waddr7), + .wen8(reg_wen8), .waddr8(reg_waddr8), .wen9(reg_wen9), .waddr9(reg_waddr9) ); + + assign rdata0 = (|rd_by0) ? by_data0 : rdata0_0; + assign rdata1 = (|rd_by1) ? by_data1 : rdata1_0; + assign rdata2 = (|rd_by2) ? 
by_data2 : rdata2_0; + assign rdata3 = (|rd_by3) ? by_data3 : rdata3_0; + assign rdata4 = (|rd_by4) ? by_data4 : rdata4_0; + assign rdata5 = (|rd_by5) ? by_data5 : rdata5_0; + assign rdata6 = (|rd_by6) ? by_data6 : rdata6_0; + assign rdata7 = (|rd_by7) ? by_data7 : rdata7_0; + assign rdata8 = (|rd_by8) ? by_data8 : rdata8_0; + assign rdata9 = (|rd_by9) ? by_data9 : rdata9_0; + assign rdata10 = (|rd_by10) ? by_data10 : rdata10_0; + assign rdata11 = (|rd_by11) ? by_data11 : rdata11_0; + assign rdata12 = (|rd_by12) ? by_data12 : rdata12_0; + assign rdata13 = (|rd_by13) ? by_data13 : rdata13_0; + assign rdata14 = (|rd_by14) ? by_data14 : rdata14_0; + assign rdata15 = (|rd_by15) ? by_data15 : rdata15_0; +`else + assign rdata0 = rdata0_0; + assign rdata1 = rdata1_0; + assign rdata2 = rdata2_0; + assign rdata3 = rdata3_0; + assign rdata4 = rdata4_0; + assign rdata5 = rdata5_0; + assign rdata6 = rdata6_0; + assign rdata7 = rdata7_0; + assign rdata8 = rdata8_0; + assign rdata9 = rdata9_0; + assign rdata10 = rdata10_0; + assign rdata11 = rdata11_0; + assign rdata12 = rdata12_0; + assign rdata13 = rdata13_0; + assign rdata14 = rdata14_0; + assign rdata15 = rdata15_0; +`endif +endmodule + +`ifdef WITH_BYPASS +module addr_comp_10b ( +input [7:0] raddr, +input wen0, wen1, wen2, wen3, wen4, wen5, wen6, wen7, wen8, wen9, +input [7:0] waddr0, waddr1, waddr2, waddr3, waddr4, waddr5, waddr6, waddr7, waddr8, waddr9, +output [7:0] by); + +assign by[0] = wen0 && (waddr0 == raddr); +assign by[1] = wen1 && (waddr1 == raddr); +assign by[2] = wen2 && (waddr2 == raddr); +assign by[3] = wen3 && (waddr3 == raddr); +assign by[4] = wen4 && (waddr4 == raddr); +assign by[5] = wen5 && (waddr5 == raddr); +assign by[6] = wen6 && (waddr6 == raddr); +assign by[7] = wen7 && (waddr7 == raddr); +assign by[8] = wen8 && (waddr8 == raddr); +assign by[9] = wen9 && (waddr9 == raddr); +endmodule +`endif + +module addr_dec_8x256_with_en ( +input en, +input [7:0] addr, +output [255:0] dec); + +wire [15:0] r, c; 
+addr_dec_4x16 U_dec_r ( .addr(addr[3:0]), .dec(r) ); +addr_dec_4x16 U_dec_c ( .addr(addr[7:4]), .dec(c) ); +wire [255:0] dec_0; +addr_dec_32x256 U_dec_rc ( .r(r), .c(c), .dec(dec_0) ); +assign dec = {256{en}}&dec_0; +endmodule + +module addr_dec_4x16 ( +input [3:0] addr, +output reg [15:0] dec); +always @(addr) begin + case(addr) // synopsys full_case parallel_case + 4'b0000: dec = 16'b0000000000000001; + 4'b0001: dec = 16'b0000000000000010; + 4'b0010: dec = 16'b0000000000000100; + 4'b0011: dec = 16'b0000000000001000; + 4'b0100: dec = 16'b0000000000010000; + 4'b0101: dec = 16'b0000000000100000; + 4'b0110: dec = 16'b0000000001000000; + 4'b0111: dec = 16'b0000000010000000; + 4'b1000: dec = 16'b0000000100000000; + 4'b1001: dec = 16'b0000001000000000; + 4'b1010: dec = 16'b0000010000000000; + 4'b1011: dec = 16'b0000100000000000; + 4'b1100: dec = 16'b0001000000000000; + 4'b1101: dec = 16'b0010000000000000; + 4'b1110: dec = 16'b0100000000000000; + 4'b1111: dec = 16'b1000000000000000; + default: dec = 16'b0000000000000000; + endcase +end +endmodule + +module addr_dec_32x256 ( +input [15:0] r, c, +output [255:0] dec); +assign dec[ 0] = c[ 0] && r[ 0]; +assign dec[ 1] = c[ 0] && r[ 1]; +assign dec[ 2] = c[ 0] && r[ 2]; +assign dec[ 3] = c[ 0] && r[ 3]; +assign dec[ 4] = c[ 0] && r[ 4]; +assign dec[ 5] = c[ 0] && r[ 5]; +assign dec[ 6] = c[ 0] && r[ 6]; +assign dec[ 7] = c[ 0] && r[ 7]; +assign dec[ 8] = c[ 0] && r[ 8]; +assign dec[ 9] = c[ 0] && r[ 9]; +assign dec[ 10] = c[ 0] && r[10]; +assign dec[ 11] = c[ 0] && r[11]; +assign dec[ 12] = c[ 0] && r[12]; +assign dec[ 13] = c[ 0] && r[13]; +assign dec[ 14] = c[ 0] && r[14]; +assign dec[ 15] = c[ 0] && r[15]; + +assign dec[ 16] = c[ 1] && r[ 0]; +assign dec[ 17] = c[ 1] && r[ 1]; +assign dec[ 18] = c[ 1] && r[ 2]; +assign dec[ 19] = c[ 1] && r[ 3]; +assign dec[ 20] = c[ 1] && r[ 4]; +assign dec[ 21] = c[ 1] && r[ 5]; +assign dec[ 22] = c[ 1] && r[ 6]; +assign dec[ 23] = c[ 1] && r[ 7]; +assign dec[ 24] = c[ 1] && r[ 8]; 
+assign dec[ 25] = c[ 1] && r[ 9]; +assign dec[ 26] = c[ 1] && r[10]; +assign dec[ 27] = c[ 1] && r[11]; +assign dec[ 28] = c[ 1] && r[12]; +assign dec[ 29] = c[ 1] && r[13]; +assign dec[ 30] = c[ 1] && r[14]; +assign dec[ 31] = c[ 1] && r[15]; + +assign dec[ 32] = c[ 2] && r[ 0]; +assign dec[ 33] = c[ 2] && r[ 1]; +assign dec[ 34] = c[ 2] && r[ 2]; +assign dec[ 35] = c[ 2] && r[ 3]; +assign dec[ 36] = c[ 2] && r[ 4]; +assign dec[ 37] = c[ 2] && r[ 5]; +assign dec[ 38] = c[ 2] && r[ 6]; +assign dec[ 39] = c[ 2] && r[ 7]; +assign dec[ 40] = c[ 2] && r[ 8]; +assign dec[ 41] = c[ 2] && r[ 9]; +assign dec[ 42] = c[ 2] && r[10]; +assign dec[ 43] = c[ 2] && r[11]; +assign dec[ 44] = c[ 2] && r[12]; +assign dec[ 45] = c[ 2] && r[13]; +assign dec[ 46] = c[ 2] && r[14]; +assign dec[ 47] = c[ 2] && r[15]; + +assign dec[ 48] = c[ 3] && r[ 0]; +assign dec[ 49] = c[ 3] && r[ 1]; +assign dec[ 50] = c[ 3] && r[ 2]; +assign dec[ 51] = c[ 3] && r[ 3]; +assign dec[ 52] = c[ 3] && r[ 4]; +assign dec[ 53] = c[ 3] && r[ 5]; +assign dec[ 54] = c[ 3] && r[ 6]; +assign dec[ 55] = c[ 3] && r[ 7]; +assign dec[ 56] = c[ 3] && r[ 8]; +assign dec[ 57] = c[ 3] && r[ 9]; +assign dec[ 58] = c[ 3] && r[10]; +assign dec[ 59] = c[ 3] && r[11]; +assign dec[ 60] = c[ 3] && r[12]; +assign dec[ 61] = c[ 3] && r[13]; +assign dec[ 62] = c[ 3] && r[14]; +assign dec[ 63] = c[ 3] && r[15]; + +assign dec[ 64] = c[ 4] && r[ 0]; +assign dec[ 65] = c[ 4] && r[ 1]; +assign dec[ 66] = c[ 4] && r[ 2]; +assign dec[ 67] = c[ 4] && r[ 3]; +assign dec[ 68] = c[ 4] && r[ 4]; +assign dec[ 69] = c[ 4] && r[ 5]; +assign dec[ 70] = c[ 4] && r[ 6]; +assign dec[ 71] = c[ 4] && r[ 7]; +assign dec[ 72] = c[ 4] && r[ 8]; +assign dec[ 73] = c[ 4] && r[ 9]; +assign dec[ 74] = c[ 4] && r[10]; +assign dec[ 75] = c[ 4] && r[11]; +assign dec[ 76] = c[ 4] && r[12]; +assign dec[ 77] = c[ 4] && r[13]; +assign dec[ 78] = c[ 4] && r[14]; +assign dec[ 79] = c[ 4] && r[15]; + +assign dec[ 80] = c[ 5] && r[ 0]; +assign dec[ 81] = c[ 5] && r[ 
1]; +assign dec[ 82] = c[ 5] && r[ 2]; +assign dec[ 83] = c[ 5] && r[ 3]; +assign dec[ 84] = c[ 5] && r[ 4]; +assign dec[ 85] = c[ 5] && r[ 5]; +assign dec[ 86] = c[ 5] && r[ 6]; +assign dec[ 87] = c[ 5] && r[ 7]; +assign dec[ 88] = c[ 5] && r[ 8]; +assign dec[ 89] = c[ 5] && r[ 9]; +assign dec[ 90] = c[ 5] && r[10]; +assign dec[ 91] = c[ 5] && r[11]; +assign dec[ 92] = c[ 5] && r[12]; +assign dec[ 93] = c[ 5] && r[13]; +assign dec[ 94] = c[ 5] && r[14]; +assign dec[ 95] = c[ 5] && r[15]; + +assign dec[ 96] = c[ 6] && r[ 0]; +assign dec[ 97] = c[ 6] && r[ 1]; +assign dec[ 98] = c[ 6] && r[ 2]; +assign dec[ 99] = c[ 6] && r[ 3]; +assign dec[100] = c[ 6] && r[ 4]; +assign dec[101] = c[ 6] && r[ 5]; +assign dec[102] = c[ 6] && r[ 6]; +assign dec[103] = c[ 6] && r[ 7]; +assign dec[104] = c[ 6] && r[ 8]; +assign dec[105] = c[ 6] && r[ 9]; +assign dec[106] = c[ 6] && r[10]; +assign dec[107] = c[ 6] && r[11]; +assign dec[108] = c[ 6] && r[12]; +assign dec[109] = c[ 6] && r[13]; +assign dec[110] = c[ 6] && r[14]; +assign dec[111] = c[ 6] && r[15]; + +assign dec[112] = c[ 7] && r[ 0]; +assign dec[113] = c[ 7] && r[ 1]; +assign dec[114] = c[ 7] && r[ 2]; +assign dec[115] = c[ 7] && r[ 3]; +assign dec[116] = c[ 7] && r[ 4]; +assign dec[117] = c[ 7] && r[ 5]; +assign dec[118] = c[ 7] && r[ 6]; +assign dec[119] = c[ 7] && r[ 7]; +assign dec[120] = c[ 7] && r[ 8]; +assign dec[121] = c[ 7] && r[ 9]; +assign dec[122] = c[ 7] && r[10]; +assign dec[123] = c[ 7] && r[11]; +assign dec[124] = c[ 7] && r[12]; +assign dec[125] = c[ 7] && r[13]; +assign dec[126] = c[ 7] && r[14]; +assign dec[127] = c[ 7] && r[15]; + +assign dec[128] = c[ 8] && r[ 0]; +assign dec[129] = c[ 8] && r[ 1]; +assign dec[130] = c[ 8] && r[ 2]; +assign dec[131] = c[ 8] && r[ 3]; +assign dec[132] = c[ 8] && r[ 4]; +assign dec[133] = c[ 8] && r[ 5]; +assign dec[134] = c[ 8] && r[ 6]; +assign dec[135] = c[ 8] && r[ 7]; +assign dec[136] = c[ 8] && r[ 8]; +assign dec[137] = c[ 8] && r[ 9]; +assign dec[138] = c[ 8] && 
r[10]; +assign dec[139] = c[ 8] && r[11]; +assign dec[140] = c[ 8] && r[12]; +assign dec[141] = c[ 8] && r[13]; +assign dec[142] = c[ 8] && r[14]; +assign dec[143] = c[ 8] && r[15]; + +assign dec[144] = c[ 9] && r[ 0]; +assign dec[145] = c[ 9] && r[ 1]; +assign dec[146] = c[ 9] && r[ 2]; +assign dec[147] = c[ 9] && r[ 3]; +assign dec[148] = c[ 9] && r[ 4]; +assign dec[149] = c[ 9] && r[ 5]; +assign dec[150] = c[ 9] && r[ 6]; +assign dec[151] = c[ 9] && r[ 7]; +assign dec[152] = c[ 9] && r[ 8]; +assign dec[153] = c[ 9] && r[ 9]; +assign dec[154] = c[ 9] && r[10]; +assign dec[155] = c[ 9] && r[11]; +assign dec[156] = c[ 9] && r[12]; +assign dec[157] = c[ 9] && r[13]; +assign dec[158] = c[ 9] && r[14]; +assign dec[159] = c[ 9] && r[15]; + +assign dec[160] = c[10] && r[ 0]; +assign dec[161] = c[10] && r[ 1]; +assign dec[162] = c[10] && r[ 2]; +assign dec[163] = c[10] && r[ 3]; +assign dec[164] = c[10] && r[ 4]; +assign dec[165] = c[10] && r[ 5]; +assign dec[166] = c[10] && r[ 6]; +assign dec[167] = c[10] && r[ 7]; +assign dec[168] = c[10] && r[ 8]; +assign dec[169] = c[10] && r[ 9]; +assign dec[170] = c[10] && r[10]; +assign dec[171] = c[10] && r[11]; +assign dec[172] = c[10] && r[12]; +assign dec[173] = c[10] && r[13]; +assign dec[174] = c[10] && r[14]; +assign dec[175] = c[10] && r[15]; + +assign dec[176] = c[11] && r[ 0]; +assign dec[177] = c[11] && r[ 1]; +assign dec[178] = c[11] && r[ 2]; +assign dec[179] = c[11] && r[ 3]; +assign dec[180] = c[11] && r[ 4]; +assign dec[181] = c[11] && r[ 5]; +assign dec[182] = c[11] && r[ 6]; +assign dec[183] = c[11] && r[ 7]; +assign dec[184] = c[11] && r[ 8]; +assign dec[185] = c[11] && r[ 9]; +assign dec[186] = c[11] && r[10]; +assign dec[187] = c[11] && r[11]; +assign dec[188] = c[11] && r[12]; +assign dec[189] = c[11] && r[13]; +assign dec[190] = c[11] && r[14]; +assign dec[191] = c[11] && r[15]; + +assign dec[192] = c[12] && r[ 0]; +assign dec[193] = c[12] && r[ 1]; +assign dec[194] = c[12] && r[ 2]; +assign dec[195] = c[12] 
&& r[ 3]; +assign dec[196] = c[12] && r[ 4]; +assign dec[197] = c[12] && r[ 5]; +assign dec[198] = c[12] && r[ 6]; +assign dec[199] = c[12] && r[ 7]; +assign dec[200] = c[12] && r[ 8]; +assign dec[201] = c[12] && r[ 9]; +assign dec[202] = c[12] && r[10]; +assign dec[203] = c[12] && r[11]; +assign dec[204] = c[12] && r[12]; +assign dec[205] = c[12] && r[13]; +assign dec[206] = c[12] && r[14]; +assign dec[207] = c[12] && r[15]; + +assign dec[208] = c[13] && r[ 0]; +assign dec[209] = c[13] && r[ 1]; +assign dec[210] = c[13] && r[ 2]; +assign dec[211] = c[13] && r[ 3]; +assign dec[212] = c[13] && r[ 4]; +assign dec[213] = c[13] && r[ 5]; +assign dec[214] = c[13] && r[ 6]; +assign dec[215] = c[13] && r[ 7]; +assign dec[216] = c[13] && r[ 8]; +assign dec[217] = c[13] && r[ 9]; +assign dec[218] = c[13] && r[10]; +assign dec[219] = c[13] && r[11]; +assign dec[220] = c[13] && r[12]; +assign dec[221] = c[13] && r[13]; +assign dec[222] = c[13] && r[14]; +assign dec[223] = c[13] && r[15]; + +assign dec[224] = c[14] && r[ 0]; +assign dec[225] = c[14] && r[ 1]; +assign dec[226] = c[14] && r[ 2]; +assign dec[227] = c[14] && r[ 3]; +assign dec[228] = c[14] && r[ 4]; +assign dec[229] = c[14] && r[ 5]; +assign dec[230] = c[14] && r[ 6]; +assign dec[231] = c[14] && r[ 7]; +assign dec[232] = c[14] && r[ 8]; +assign dec[233] = c[14] && r[ 9]; +assign dec[234] = c[14] && r[10]; +assign dec[235] = c[14] && r[11]; +assign dec[236] = c[14] && r[12]; +assign dec[237] = c[14] && r[13]; +assign dec[238] = c[14] && r[14]; +assign dec[239] = c[14] && r[15]; + +assign dec[240] = c[15] && r[ 0]; +assign dec[241] = c[15] && r[ 1]; +assign dec[242] = c[15] && r[ 2]; +assign dec[243] = c[15] && r[ 3]; +assign dec[244] = c[15] && r[ 4]; +assign dec[245] = c[15] && r[ 5]; +assign dec[246] = c[15] && r[ 6]; +assign dec[247] = c[15] && r[ 7]; +assign dec[248] = c[15] && r[ 8]; +assign dec[249] = c[15] && r[ 9]; +assign dec[250] = c[15] && r[10]; +assign dec[251] = c[15] && r[11]; +assign dec[252] = 
c[15] && r[12]; +assign dec[253] = c[15] && r[13]; +assign dec[254] = c[15] && r[14]; +assign dec[255] = c[15] && r[15]; +endmodule + + + + diff --git a/src/main/scala/bus/simplebus/Crossbar.scala b/src/main/scala/bus/simplebus/Crossbar.scala deleted file mode 100644 index 96b4271e7d607cfbcaf02118a9ae202505c1ed17..0000000000000000000000000000000000000000 --- a/src/main/scala/bus/simplebus/Crossbar.scala +++ /dev/null @@ -1,139 +0,0 @@ -/************************************************************************************** -* Copyright (c) 2020 Institute of Computing Technology, CAS -* Copyright (c) 2020 University of Chinese Academy of Sciences -* -* NutShell is licensed under Mulan PSL v2. -* You can use this software according to the terms and conditions of the Mulan PSL v2. -* You may obtain a copy of Mulan PSL v2 at: -* http://license.coscl.org.cn/MulanPSL2 -* -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR -* FIT FOR A PARTICULAR PURPOSE. -* -* See the Mulan PSL v2 for more details. 
-***************************************************************************************/ - -package bus.simplebus - -import chisel3._ -import chisel3.util._ - -import utils._ - -class SimpleBusCrossbar1toN(addressSpace: List[(Long, Long)]) extends Module { - val io = IO(new Bundle { - val in = Flipped(new SimpleBusUC) - val out = Vec(addressSpace.length, new SimpleBusUC) - }) - - val s_idle :: s_resp :: s_error :: Nil = Enum(3) - val state = RegInit(s_idle) - - // select the output channel according to the address - val addr = io.in.req.bits.addr - val outSelVec = VecInit(addressSpace.map( - range => (addr >= range._1.U && addr < (range._1 + range._2).U))) - val outSelIdx = PriorityEncoder(outSelVec) - val outSel = io.out(outSelIdx) - val outSelIdxResp = RegEnable(outSelIdx, outSel.req.fire() && (state === s_idle)) - val outSelResp = io.out(outSelIdxResp) - val reqInvalidAddr = io.in.req.valid && !outSelVec.asUInt.orR - - when(!(!io.in.req.valid || outSelVec.asUInt.orR) || !(!(io.in.req.valid && outSelVec.asUInt.andR))){printf("[ERROR] bad addr %x, time %d\n", addr, GTimer())} - // assert(!io.in.req.valid || outSelVec.asUInt.orR, "address decode error, bad addr = 0x%x\n", addr) - assert(!(io.in.req.valid && outSelVec.asUInt.andR), "address decode error, bad addr = 0x%x\n", addr) - - // bind out.req channel - (io.out zip outSelVec).map { case (o, v) => { - o.req.bits := io.in.req.bits - o.req.valid := v && (io.in.req.valid && (state === s_idle)) - o.resp.ready := v - }} - - switch (state) { - is (s_idle) { - when (outSel.req.fire()) { state := s_resp } - when (reqInvalidAddr) { state := s_error } - } - is (s_resp) { when (outSelResp.resp.fire()) { state := s_idle } } - is (s_error) { when(io.in.resp.fire()){ state := s_idle } } - } - - io.in.resp.valid := outSelResp.resp.fire() || state === s_error - io.in.resp.bits <> outSelResp.resp.bits - // io.in.resp.bits.exc.get := state === s_error - outSelResp.resp.ready := io.in.resp.ready - io.in.req.ready := 
outSel.req.ready || reqInvalidAddr - - Debug() { - when (state === s_idle && io.in.req.valid) { - printf(p"${GTimer()}: xbar: in.req: ${io.in.req.bits}\n") - } - - when (outSel.req.fire()) { - printf(p"${GTimer()}: xbar: outSelIdx = ${outSelIdx}, outSel.req: ${outSel.req.bits}\n") - } - when (outSel.resp.fire()) { - printf(p"${GTimer()}: xbar: outSelIdx= ${outSelIdx}, outSel.resp: ${outSel.resp.bits}\n") - } - - when (io.in.resp.fire()) { - printf(p"${GTimer()}: xbar: in.resp: ${io.in.resp.bits}\n") - } - } -} - -class SimpleBusCrossbarNto1(n: Int, userBits:Int = 0) extends Module { - val io = IO(new Bundle { - val in = Flipped(Vec(n, new SimpleBusUC(userBits))) - val out = new SimpleBusUC(userBits) - }) - - val s_idle :: s_readResp :: s_writeResp :: Nil = Enum(3) - val state = RegInit(s_idle) - - val lockWriteFun = ((x: SimpleBusReqBundle) => x.isWrite() && x.isBurst()) - val inputArb = Module(new LockingArbiter(chiselTypeOf(io.in(0).req.bits), n, 8, Some(lockWriteFun))) - (inputArb.io.in zip io.in.map(_.req)).map{ case (arb, in) => arb <> in } - val thisReq = inputArb.io.out - assert(!(thisReq.valid && !thisReq.bits.isRead() && !thisReq.bits.isWrite())) - val inflightSrc = Reg(UInt(log2Up(n).W)) - - io.out.req.bits := thisReq.bits - // bind correct valid and ready signals - io.out.req.valid := thisReq.valid && (state === s_idle) - thisReq.ready := io.out.req.ready && (state === s_idle) - - io.in.map(_.resp.bits := io.out.resp.bits) - io.in.map(_.resp.valid := false.B) - (io.in(inflightSrc).resp, io.out.resp) match { case (l, r) => { - l.valid := r.valid - r.ready := l.ready - }} - - switch (state) { - is (s_idle) { - when (thisReq.fire()) { - inflightSrc := inputArb.io.chosen - when (thisReq.bits.isRead()) { state := s_readResp } - .elsewhen (thisReq.bits.isWriteLast() || thisReq.bits.isWriteSingle()) { state := s_writeResp } - } - } - is (s_readResp) { when (io.out.resp.fire() && io.out.resp.bits.isReadLast()) { state := s_idle } } - is (s_writeResp) { when 
(io.out.resp.fire()) { state := s_idle } } - } -} - -class SimpleBusCrossbar(n: Int, addressSpace: List[(Long, Long)]) extends Module { - val io = IO(new Bundle { - val in = Flipped(Vec(n, new SimpleBusUC)) - val out = Vec(addressSpace.length, new SimpleBusUC) - }) - - val inXbar = Module(new SimpleBusCrossbarNto1(n)) - val outXbar = Module(new SimpleBusCrossbar1toN(addressSpace)) - inXbar.io.in <> io.in - outXbar.io.in <> inXbar.io.out - io.out <> outXbar.io.out -} diff --git a/src/main/scala/bus/simplebus/DistributedMem.scala b/src/main/scala/bus/simplebus/DistributedMem.scala deleted file mode 100644 index 4fd469da142086bf7be0f980d7e2985bcfda42b3..0000000000000000000000000000000000000000 --- a/src/main/scala/bus/simplebus/DistributedMem.scala +++ /dev/null @@ -1,62 +0,0 @@ -package bus.simplebus - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.loadMemoryFromFile - -import noop.HasNOOPParameter - -class DistributedMem(memByte: Int, dualPort: Boolean, delayCycles: Int = 0, dataFile: String = "") - extends Module with HasNOOPParameter { - val io = IO(new Bundle { - val rw = Flipped(new SimpleBusUC) - val ro = Flipped(new SimpleBusUC) - }) - - val wordNum = memByte / 8 - val nBank = XLEN / 8 - val memAddrBits = log2Up(wordNum) - def Index(addr: UInt): UInt = addr(memAddrBits + 2 - 1, 2) - - val rwIdx = Index(io.rw.req.bits.addr) - val roIdx = Index(io.ro.req.bits.addr) - val wen = io.rw.isWrite() - val wdataVec = VecInit.tabulate(nBank) { i => io.rw.req.bits.wdata(8 * (i + 1) - 1, 8 * i) } - val wmask = VecInit.tabulate(nBank) { i => io.rw.req.bits.wmask(i).asBool() } - - val rwData = Wire(UInt(XLEN.W)) - val roData = Wire(UInt(XLEN.W)) - - val mem = Mem(wordNum, Vec(nBank, UInt(8.W))) - if (dataFile != "") - loadMemoryFromFile(mem, dataFile) - - rwData := Cat(mem.read(rwIdx).reverse) - roData := Cat(mem.read(roIdx).reverse) - when (wen) { mem.write(rwIdx, wdataVec, wmask) } - - def readPort(p: SimpleBusUC, rdata: UInt) = { - val s_idle 
:: s_reading :: Nil = Enum(2) - val state = RegInit(s_idle) - switch (state) { - is (s_idle) { - when (p.req.fire()) { state := Mux(p.resp.fire(), s_idle, s_reading) } - } - is (s_reading) { - when (p.resp.fire()) { state := s_idle } - } - } - - p.req.ready := state === s_idle - p.resp.bits.rdata := rdata - p.resp.valid := (if (delayCycles == 0) p.req.fire() else Counter(state === s_reading, delayCycles)._2) - } - - readPort(io.rw, rwData) - if (dualPort) { - readPort(io.ro, roData) - } - else { - io.ro := DontCare - } -} diff --git a/src/main/scala/bus/simplebus/SimpleBus.scala b/src/main/scala/bus/simplebus/SimpleBus.scala deleted file mode 100644 index 74c48376c6ad6ef0a7d42c5b7feb22f5b6042666..0000000000000000000000000000000000000000 --- a/src/main/scala/bus/simplebus/SimpleBus.scala +++ /dev/null @@ -1,100 +0,0 @@ -package bus.simplebus - -import chisel3._ -import chisel3.util._ - -import noop.HasNOOPParameter -import utils._ -import bus.axi4._ - -sealed abstract class SimpleBusBundle extends Bundle with HasNOOPParameter - -object SimpleBusCmd { - // req - // hit | miss - def read = "b0000".U // read | refill - def write = "b0001".U // write | refill - def readBurst = "b0010".U // read | refill - def writeBurst = "b0011".U // write | refill - def writeLast = "b0111".U // write | refill - def probe = "b1000".U // read | do nothing - def prefetch = "b0100".U // read | refill - - // resp - def readLast = "b0110".U - def writeResp = "b0101".U - def probeHit = "b1100".U - def probeMiss = "b1000".U - - def apply() = UInt(4.W) -} - -class SimpleBusReqBundle(val userBits: Int = 0, val addrBits: Int = 32) extends SimpleBusBundle { - val addr = Output(UInt(addrBits.W)) - val size = Output(UInt(3.W)) - val cmd = Output(SimpleBusCmd()) - val wmask = Output(UInt((DataBits / 8).W)) - val wdata = Output(UInt(DataBits.W)) - val user = if (userBits > 0) Some(Output(UInt(userBits.W))) else None - - override def toPrintable: Printable = { - p"addr = 0x${Hexadecimal(addr)}, cmd = 
${cmd}, size = ${size}, " + - p"wmask = 0x${Hexadecimal(wmask)}, wdata = 0x${Hexadecimal(wdata)}" - } - - def apply(addr: UInt, cmd: UInt, size: UInt, wdata: UInt, wmask: UInt, user: UInt = 0.U) { - this.addr := addr - this.cmd := cmd - this.size := size - this.wdata := wdata - this.wmask := wmask - this.user.map(_ := user) - } - - def isRead() = !cmd(0) && !cmd(3) - def isWrite() = cmd(0) - def isBurst() = cmd(1) - def isReadBurst() = cmd === SimpleBusCmd.readBurst - def isWriteSingle() = cmd === SimpleBusCmd.write - def isWriteLast() = cmd === SimpleBusCmd.writeLast - def isProbe() = cmd === SimpleBusCmd.probe - def isPrefetch() = cmd === SimpleBusCmd.prefetch -} - -class SimpleBusRespBundle(val userBits: Int = 0) extends SimpleBusBundle { - val cmd = Output(SimpleBusCmd()) - val rdata = Output(UInt(DataBits.W)) - val user = if (userBits > 0) Some(Output(UInt(userBits.W))) else None - - override def toPrintable: Printable = p"rdata = ${Hexadecimal(rdata)}, cmd = ${cmd}" - - def isReadLast() = cmd === SimpleBusCmd.readLast - def isProbeHit() = cmd === SimpleBusCmd.probeHit - def isProbeMiss() = cmd === SimpleBusCmd.probeMiss - def isWriteResp() = cmd === SimpleBusCmd.writeResp - def isPrefetch() = cmd === SimpleBusCmd.prefetch -} - -// Uncache -class SimpleBusUC(val userBits: Int = 0, val addrBits: Int = 32) extends SimpleBusBundle { - val req = Decoupled(new SimpleBusReqBundle(userBits, addrBits)) - val resp = Flipped(Decoupled(new SimpleBusRespBundle(userBits))) - - def isWrite() = req.valid && req.bits.isWrite() - def isRead() = req.valid && req.bits.isRead() - def toAXI4Lite() = SimpleBus2AXI4Converter(this, new AXI4Lite) - def toAXI4() = SimpleBus2AXI4Converter(this, new AXI4) - - def dump(name: String) = { - when (req.fire()) { printf(p"${GTimer()},[${name}] ${req.bits}\n") } - when (resp.fire()) { printf(p"${GTimer()},[${name}] ${resp.bits}\n") } - } -} - -// Cache -class SimpleBusC(val userBits: Int = 0) extends SimpleBusBundle { - val mem = new 
SimpleBusUC(userBits) - val coh = Flipped(new SimpleBusUC(userBits)) - - def memtoAXI4() = this.mem.toAXI4 -} diff --git a/src/main/scala/bus/simplebus/ToAXI4.scala b/src/main/scala/bus/simplebus/ToAXI4.scala deleted file mode 100644 index c6d8af32b9a682dd3298663cc71aeca74e0d9313..0000000000000000000000000000000000000000 --- a/src/main/scala/bus/simplebus/ToAXI4.scala +++ /dev/null @@ -1,182 +0,0 @@ -package bus.simplebus - -import chisel3._ -import chisel3.util._ - -import bus.axi4._ -import utils._ - -class AXI42SimpleBusConverter() extends Module { - val io = IO(new Bundle { - val in = Flipped(new AXI4(idBits = 18)) - val out = new SimpleBusUC() - }) - - val (axi, mem) = (io.in, io.out) - val (ar, aw, w, r, b) = (axi.ar.bits, axi.aw.bits, axi.w.bits, axi.r.bits, axi.b.bits) - val (req, resp) = (mem.req.bits, mem.resp.bits) - - // Default value - - val inflight_id_reg = RegInit(0.U) - val axi_na :: axi_read :: axi_write :: Nil = Enum(3) - val inflight_type = RegInit(axi_na) - private def setState(axi_type: UInt, id: UInt) = { - inflight_id_reg := id - inflight_type := axi_type; - } - private def resetState() = { - inflight_type := axi_na - inflight_id_reg := 0.U - } - private def is_inflight() = { - inflight_type =/= axi_na - } - - // Default - val default_mem = 0.U.asTypeOf(new SimpleBusUC) - val default_axi = 0.U.asTypeOf(new AXI4) - req := default_mem.req.bits - r := default_axi.r.bits - b := default_axi.b.bits - - - // Read Path - when (axi.ar.valid) { - mem.req.valid := true.B - req.addr := ar.addr - req.cmd := Mux(ar.len === 0.U, SimpleBusCmd.read, SimpleBusCmd.readBurst) - // TODO: consider ar.burst - req.size := ar.size - req.user.foreach(_ := ar.user) - req.wmask := 0.U - req.wdata := 0.U - - when (mem.req.fire) { - setState(axi_read, ar.id) - } - } - - when (mem.resp.valid) { - axi.r.valid := true.B - r.data := resp.rdata - r.id := inflight_id_reg - // TODO: r.resp handling - r.resp := AXI4Parameters.RESP_OKAY - r.last := resp.isReadLast - 
resp.user.foreach(r.user := _) - - when (axi.r.fire && resp.isReadLast) { - resetState() - } - } - - // Write Path - val aw_reg = Reg(new AXI4BundleA(AXI4Parameters.idBits)) - val bresp_en = RegInit(false.B) - - when (axi.aw.valid && !axi.ar.valid) { - aw_reg := aw - - when (axi.aw.fire) { - setState(axi_write, aw.id) - } - } - - when (axi.w.valid) { - mem.req.valid := true.B - req.cmd := Mux(aw_reg.len === 0.U, SimpleBusCmd.write, - Mux(w.last, SimpleBusCmd.writeLast, SimpleBusCmd.writeBurst)) - req.addr := aw_reg.addr - req.size := aw_reg.size - req.wmask := w.strb - req.wdata := w.data - req.user.foreach(_ := aw.user) - - when (axi.w.fire && w.last) { - bresp_en := true.B - } - } - - when (axi.b.fire) { - bresp_en := false.B - resetState() - } - - // Arbitration - // Slave's ready maybe generated according to valid signal, so let valid signals go through. - mem.req.valid := axi.ar.valid || axi.w.valid - mem.resp.ready := true.B || (inflight_type === axi_read && axi.r.ready) || (inflight_type === axi_write && axi.b.ready) - axi.ar.ready := !is_inflight && mem.req.ready - axi.r.valid := inflight_type === axi_read && mem.resp.valid - // AW should be buffered so no ready is considered. 
- axi.aw.ready := !is_inflight && !axi.ar.valid - axi.w.ready := inflight_type === axi_write && mem.req.ready - axi.b.valid := bresp_en && mem.resp.valid - axi.b.bits.resp := AXI4Parameters.RESP_OKAY -} - - -class SimpleBus2AXI4Converter[OT <: AXI4Lite](outType: OT) extends Module { - val io = IO(new Bundle { - val in = Flipped(new SimpleBusUC) - val out = Flipped(Flipped(outType)) - }) - - val toAXI4Lite = !(io.in.req.valid && io.in.req.bits.isBurst()) && (outType.getClass == classOf[AXI4Lite]).B - val toAXI4 = (outType.getClass == classOf[AXI4]).B - assert(toAXI4Lite || toAXI4) - - val (mem, axi) = (io.in, io.out) - val (ar, aw, w, r, b) = (axi.ar.bits, axi.aw.bits, axi.w.bits, axi.r.bits, axi.b.bits) - - ar.addr := mem.req.bits.addr - ar.prot := AXI4Parameters.PROT_PRIVILEDGED - w.data := mem.req.bits.wdata - w.strb := mem.req.bits.wmask - - def LineBeats = 8 - val wlast = WireInit(true.B) - val rlast = WireInit(true.B) - if (outType.getClass == classOf[AXI4]) { - val axi4 = io.out.asInstanceOf[AXI4] - axi4.ar.bits.id := 0.U - axi4.ar.bits.len := Mux(mem.req.bits.isBurst(), (LineBeats - 1).U, 0.U) - axi4.ar.bits.size := mem.req.bits.size - axi4.ar.bits.burst := AXI4Parameters.BURST_WRAP - axi4.ar.bits.lock := false.B - axi4.ar.bits.cache := 0.U - axi4.ar.bits.qos := 0.U - axi4.ar.bits.user := 0.U - axi4.w.bits.last := mem.req.bits.isWriteLast() || mem.req.bits.isWriteSingle() - wlast := axi4.w.bits.last - rlast := axi4.r.bits.last - } - - aw := ar - mem.resp.bits.rdata := r.data - mem.resp.bits.cmd := Mux(rlast, SimpleBusCmd.readLast, 0.U) - - val wSend = Wire(Bool()) - val awAck = BoolStopWatch(axi.aw.fire(), wSend) - val wAck = BoolStopWatch(axi.w.fire() && wlast, wSend) - wSend := (axi.aw.fire() && axi.w.fire() && wlast) || (awAck && wAck) - val wen = RegEnable(mem.req.bits.isWrite(), mem.req.fire()) - - axi.ar.valid := mem.isRead() - axi.aw.valid := mem.isWrite() && !awAck - axi.w .valid := mem.isWrite() && !wAck - mem.req.ready := 
Mux(mem.req.bits.isWrite(), !wAck && axi.w.ready, axi.ar.ready) - - axi.r.ready := mem.resp.ready - axi.b.ready := mem.resp.ready - mem.resp.valid := Mux(wen, axi.b.valid, axi.r.valid) -} - -object SimpleBus2AXI4Converter { - def apply[OT <: AXI4Lite](in: SimpleBusUC, outType: OT): OT = { - val bridge = Module(new SimpleBus2AXI4Converter(outType)) - bridge.io.in <> in - bridge.io.out - } -} diff --git a/src/main/scala/bus/tilelink/NaiveTL1toN.scala b/src/main/scala/bus/tilelink/NaiveTL1toN.scala deleted file mode 100644 index 345ad2a8817d55aa68e2a642d933037708f6a4e2..0000000000000000000000000000000000000000 --- a/src/main/scala/bus/tilelink/NaiveTL1toN.scala +++ /dev/null @@ -1,89 +0,0 @@ -package bus.tilelink - -import chisel3._ -import chisel3.util._ -import utils.{Debug, GTimer} - -// Only support A and D channel, very naive... - -class NaiveTL1toN -( - addressSpace: List[(Long, Long)], - para: TLParameters -) extends Module{ - val io = IO(new Bundle() { - val in = Flipped(TLCached(para)) - val out = Vec(addressSpace.length, TLCached(para)) - }) - - io.in <> DontCare - io.out <> DontCare - - val s_idle :: s_resp :: s_error :: Nil = Enum(3) - val state = RegInit(s_idle) - - // select the output channel according to the address - val addr = io.in.a.bits.address - val outSelVec = VecInit(addressSpace.map( - range => addr >= range._1.U && addr < (range._1 + range._2).U - )) - val outSelIdx = PriorityEncoder(outSelVec) - val outSel = io.out(outSelIdx) - val outSelIdxResp = RegEnable(outSelIdx, outSel.a.fire() && (state === s_idle)) - val outSelResp = io.out(outSelIdxResp) - val reqInvalidAddr = io.in.a.valid && !outSelVec.asUInt.orR - - when( - !(!io.in.a.valid || outSelVec.asUInt.orR) || (io.in.a.valid && outSelVec.asUInt.andR) - ){ - printf("[ERROR] bad addr %x, time %d\n", addr, GTimer()) - } - // assert(!io.in.req.valid || outSelVec.asUInt.orR, "address decode error, bad addr = 0x%x\n", addr) - assert( - !(io.in.a.valid && outSelVec.asUInt.andR), - "address 
decode error, bad addr = 0x%x\n", addr - ) - - // bind out.req channel - (io.out zip outSelVec).foreach { case (o, v) => - o.a.bits := io.in.a.bits - o.a.valid := v && (io.in.a.valid && (state === s_idle)) - o.d.ready := v - } - - switch (state) { - is (s_idle) { - when (outSel.a.fire()) { state := s_resp } - when (reqInvalidAddr) { state := s_error } - } - is (s_resp) { when (outSelResp.d.fire()) { state := s_idle } } - is (s_error) { when(io.in.d.fire()){ state := s_idle } } - } - - io.in.d.valid := outSelResp.d.fire() || state === s_error - io.in.d.bits <> outSelResp.d.bits - // io.in.resp.bits.exc.get := state === s_error - outSelResp.d.ready := io.in.d.ready - io.in.a.ready := outSel.a.ready || reqInvalidAddr - - Debug() { - when (state === s_idle && io.in.a.valid) { - printf(p"${GTimer()}: req: ") - io.in.a.bits.dump() - } - - when (outSel.a.fire()) { - printf(p"${GTimer()}: xbar: outSelIdx = $outSelIdx, outSel.req: ") - outSel.a.bits.dump() - } - when (outSel.d.fire()) { - printf(p"${GTimer()}: xbar: outSelIdx= $outSelIdx, outSel.resp: ") - outSel.d.bits.dump() - } - - when (io.in.d.fire()) { - printf(p"${GTimer()}: xbar: in.resp: ") - io.in.d.bits.dump() - } - } -} diff --git a/src/main/scala/device/AXI4DummySD.scala b/src/main/scala/device/AXI4DummySD.scala index 7f863bf7e083eff9f1629c62ac1f4f3c413c338e..267d4c40c0787c5689a1f0f8549b62dd8914f6fa 100644 --- a/src/main/scala/device/AXI4DummySD.scala +++ b/src/main/scala/device/AXI4DummySD.scala @@ -53,7 +53,7 @@ class SDHelper extends BlackBox with HasBlackBoxInline { class AXI4DummySD ( - address: AddressSet + address: Seq[AddressSet] )(implicit p: Parameters) extends AXI4SlaveModule(address, executable = false) with HasSDConst { diff --git a/src/main/scala/device/AXI4Flash.scala b/src/main/scala/device/AXI4Flash.scala index 2a92d42a7f58507e0ef6243919c6585d99277eeb..aeee36b7c423d22914771d02d5fe393e35fb0a6d 100644 --- a/src/main/scala/device/AXI4Flash.scala +++ b/src/main/scala/device/AXI4Flash.scala @@ -8,7 
+8,7 @@ import utils._ class AXI4Flash ( - address: AddressSet + address: Seq[AddressSet] )(implicit p: Parameters) extends AXI4SlaveModule(address, executable = false) { diff --git a/src/main/scala/device/AXI4Keyboard.scala b/src/main/scala/device/AXI4Keyboard.scala index c6457c43af93555db4d73d8962dad6deada4636e..a649ac71aabbd495af34f2b9f4262def2a426b50 100644 --- a/src/main/scala/device/AXI4Keyboard.scala +++ b/src/main/scala/device/AXI4Keyboard.scala @@ -14,7 +14,7 @@ class KeyboardIO extends Bundle { // this Module is not tested class AXI4Keyboard ( - address: AddressSet + address: Seq[AddressSet] )(implicit p: Parameters) extends AXI4SlaveModule(address, executable = false, _extra = new KeyboardIO) { diff --git a/src/main/scala/device/AXI4RAM.scala b/src/main/scala/device/AXI4RAM.scala index e88cf66ddfe5276b421e38ec4fabd8d28de113b6..010ce2607f398271d0d927a3bd42022c9318f89d 100644 --- a/src/main/scala/device/AXI4RAM.scala +++ b/src/main/scala/device/AXI4RAM.scala @@ -5,22 +5,24 @@ import chisel3._ import chisel3.util._ import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp, RegionType} import xiangshan.HasXSParameter +import utils.{MaskExpand} class RAMHelper(memByte: BigInt) extends BlackBox with HasXSParameter { val io = IO(new Bundle { - val clk = Input(Clock()) - val rIdx = Input(UInt(DataBits.W)) + val clk = Input(Clock()) + val en = Input(Bool()) + val rIdx = Input(UInt(DataBits.W)) val rdata = Output(UInt(DataBits.W)) - val wIdx = Input(UInt(DataBits.W)) + val wIdx = Input(UInt(DataBits.W)) val wdata = Input(UInt(DataBits.W)) val wmask = Input(UInt(DataBits.W)) - val wen = Input(Bool()) + val wen = Input(Bool()) }) } class AXI4RAM ( - address: AddressSet, + address: Seq[AddressSet], memByte: Long, useBlackBox: Boolean = false, executable: Boolean = true, @@ -32,27 +34,35 @@ class AXI4RAM override lazy val module = new AXI4SlaveModuleImp(this){ + val split = beatBytes / 8 + val bankByte = memByte / split val offsetBits = 
log2Up(memByte) - val offsetMask = (1 << offsetBits) - 1 + require(address.length >= 1) + val baseAddress = address(0).base - def index(addr: UInt) = ((addr & offsetMask.U) >> log2Ceil(beatBytes)).asUInt() + def index(addr: UInt) = ((addr - baseAddress.U)(offsetBits - 1, 0) >> log2Ceil(beatBytes)).asUInt() - def inRange(idx: UInt) = idx < (memByte / 8).U + def inRange(idx: UInt) = idx < (memByte / beatBytes).U val wIdx = index(waddr) + writeBeatCnt val rIdx = index(raddr) + readBeatCnt val wen = in.w.fire() && inRange(wIdx) + require(beatBytes >= 8) val rdata = if (useBlackBox) { - val mem = Module(new RAMHelper(memByte)) - mem.io.clk := clock - mem.io.rIdx := rIdx - mem.io.wIdx := wIdx - mem.io.wdata := in.w.bits.data - mem.io.wmask := fullMask - mem.io.wen := wen - mem.io.rdata + val mems = (0 until split).map {_ => Module(new RAMHelper(bankByte))} + mems.zipWithIndex map { case (mem, i) => + mem.io.clk := clock + mem.io.en := !reset.asBool() && ((state === s_rdata) || (state === s_wdata)) + mem.io.rIdx := (rIdx << log2Up(split)) + i.U + mem.io.wIdx := (wIdx << log2Up(split)) + i.U + mem.io.wdata := in.w.bits.data((i + 1) * 64 - 1, i * 64) + mem.io.wmask := MaskExpand(in.w.bits.strb((i + 1) * 8 - 1, i * 8)) + mem.io.wen := wen + } + val rdata = mems.map {mem => mem.io.rdata} + Cat(rdata.reverse) } else { val mem = Mem(memByte / beatBytes, Vec(beatBytes, UInt(8.W))) diff --git a/src/main/scala/device/AXI4SlaveModule.scala b/src/main/scala/device/AXI4SlaveModule.scala index bac94067b7c5a1e86692e6e191d87af6956b6f08..1d740f8684b72de1e4ddeec708582d9632f5b9aa 100644 --- a/src/main/scala/device/AXI4SlaveModule.scala +++ b/src/main/scala/device/AXI4SlaveModule.scala @@ -10,7 +10,7 @@ import xiangshan.HasXSLog abstract class AXI4SlaveModule[T <: Data] ( - address: AddressSet, + address: Seq[AddressSet], executable: Boolean = true, beatBytes: Int = 8, burstLen: Int = 1, @@ -19,7 +19,7 @@ abstract class AXI4SlaveModule[T <: Data] val node = 
AXI4SlaveNode(Seq(AXI4SlavePortParameters( Seq(AXI4SlaveParameters( - Seq(address), + address, regionType = RegionType.UNCACHED, executable = executable, supportsWrite = TransferSizes(1, beatBytes * burstLen), @@ -41,6 +41,9 @@ class AXI4SlaveModuleImp[T<:Data](outer: AXI4SlaveModule[T]) }) val (in, edge) = outer.node.in.head + // do not let MMIO AXI signals optimized out + chisel3.dontTouch(in) + // val timer = GTimer() when(in.ar.fire()){ @@ -72,9 +75,9 @@ class AXI4SlaveModuleImp[T<:Data](outer: AXI4SlaveModule[T]) assert(in.ar.bits.burst === AXI4Parameters.BURST_INCR, "only support busrt ince!") } - private val s_idle :: s_rdata :: s_wdata :: s_wresp :: Nil = Enum(4) + val s_idle :: s_rdata :: s_wdata :: s_wresp :: Nil = Enum(4) - private val state = RegInit(s_idle) + val state = RegInit(s_idle) switch(state){ is(s_idle){ @@ -150,7 +153,7 @@ class AXI4SlaveModuleImp[T<:Data](outer: AXI4SlaveModule[T]) (c.value, in.w.bits.last) } - in.aw.ready := state === s_idle + in.aw.ready := state === s_idle && !in.ar.valid in.w.ready := state === s_wdata in.b.bits.resp := AXI4Parameters.RESP_OKAY diff --git a/src/main/scala/device/AXI4Timer.scala b/src/main/scala/device/AXI4Timer.scala index 5bcb798b8ddd9ce85e3ca95d81133f260327840d..c97c15bc3955c5a6cc73c0c72423469611cbddc0 100644 --- a/src/main/scala/device/AXI4Timer.scala +++ b/src/main/scala/device/AXI4Timer.scala @@ -1,7 +1,6 @@ package device import chisel3._ -import chisel3.util.experimental.BoringUtils import chipsalliance.rocketchip.config.Parameters import freechips.rocketchip.diplomacy.AddressSet import utils._ @@ -13,7 +12,7 @@ class TimerIO extends Bundle { class AXI4Timer ( sim: Boolean = false, - address: AddressSet + address: Seq[AddressSet] )(implicit p: Parameters) extends AXI4SlaveModule(address, executable = false, _extra = new TimerIO) { @@ -31,12 +30,6 @@ class AXI4Timer val tick = (nextCnt === freq) when (tick) { mtime := mtime + inc } - if (sim) { - val isWFI = WireInit(false.B) - 
BoringUtils.addSink(isWFI, "isWFI") - when (isWFI) { mtime := mtime + 100000.U } - } - val mapping = Map( RegMap(0x4000, mtimecmp), RegMap(0x8000, freq), diff --git a/src/main/scala/device/AXI4UART.scala b/src/main/scala/device/AXI4UART.scala index ae2b168f4e9315b137df60fef8cb94ba20f16190..beea3cb9525005a4518b8d57f7bd2b1eac2688c4 100644 --- a/src/main/scala/device/AXI4UART.scala +++ b/src/main/scala/device/AXI4UART.scala @@ -4,7 +4,6 @@ import chisel3._ import chisel3.util._ import bus.axi4._ import chipsalliance.rocketchip.config.Parameters -import chisel3.util.experimental.BoringUtils import freechips.rocketchip.diplomacy.AddressSet import utils._ @@ -21,7 +20,7 @@ class UARTIO extends Bundle { class AXI4UART ( - address: AddressSet + address: Seq[AddressSet] )(implicit p: Parameters) extends AXI4SlaveModule(address, executable = false, _extra = new UARTIO) { diff --git a/src/main/scala/device/AXI4VGA.scala b/src/main/scala/device/AXI4VGA.scala index 9c94aabe0bc480e030c67e4ee5cb4fd7167779f6..d4f96d2536bbbb5adda553b72f06c9320a50b0b8 100644 --- a/src/main/scala/device/AXI4VGA.scala +++ b/src/main/scala/device/AXI4VGA.scala @@ -54,7 +54,7 @@ class VGACtrlBundle extends Bundle { class VGACtrl ( - address: AddressSet + address: Seq[AddressSet] )(implicit p: Parameters) extends AXI4SlaveModule(address, _extra = new VGACtrlBundle, executable = false) with HasVGAParameter { override lazy val module = new AXI4SlaveModuleImp[VGACtrlBundle](this) { @@ -106,8 +106,8 @@ class FBHelper extends BlackBox with HasBlackBoxInline { class AXI4VGA ( sim: Boolean = false, - fbAddress: AddressSet, - ctrlAddress: AddressSet + fbAddress: Seq[AddressSet], + ctrlAddress: Seq[AddressSet] )(implicit p: Parameters) extends LazyModule with HasVGAParameter { diff --git a/src/main/scala/device/TLTimer.scala b/src/main/scala/device/TLTimer.scala index 1137fa1a7516c7bc4b2310bb4c6b02c2cb0cda83..3f94050a683380fef44640f8803584acbf48ed0d 100644 --- a/src/main/scala/device/TLTimer.scala +++ 
b/src/main/scala/device/TLTimer.scala @@ -4,7 +4,6 @@ import chisel3._ import chisel3.util._ import freechips.rocketchip.tilelink._ import chipsalliance.rocketchip.config._ -import chisel3.util.experimental.BoringUtils import freechips.rocketchip.diplomacy._ import freechips.rocketchip.regmapper.RegField import utils.{HasTLDump, XSDebug} @@ -35,12 +34,6 @@ class TLTimer(address: Seq[AddressSet], sim: Boolean)(implicit p: Parameters) ex val tick = (nextCnt === freq) when (tick) { mtime := mtime + inc } - if (sim) { - val isWFI = WireInit(false.B) - ExcitingUtils.addSink(isWFI, "isWFI") - when (isWFI) { mtime := mtime + 100000.U } - } - node.regmap( mapping = 0x0000 -> RegField.bytes(msip), 0x4000 -> RegField.bytes(mtimecmp), diff --git a/src/main/scala/fpu/FPUSubModule.scala b/src/main/scala/fpu/FPUSubModule.scala deleted file mode 100644 index 414884dff54fbb4d1c0a8399b1a2038ff6f8e674..0000000000000000000000000000000000000000 --- a/src/main/scala/fpu/FPUSubModule.scala +++ /dev/null @@ -1,57 +0,0 @@ -package fpu - -import chisel3._ -import chisel3.util._ - - -class FPUSubModuleInput extends Bundle{ - val op = UInt(3.W) - val isDouble = Bool() - val a, b, c = UInt(64.W) - val rm = UInt(3.W) -} - -class FPUSubModuleOutput extends Bundle{ - val fflags = new Fflags - val result = UInt(64.W) -} - -class FPUSubModuleIO extends Bundle{ - val in = Flipped(DecoupledIO(new FPUSubModuleInput)) - val out = DecoupledIO(new FPUSubModuleOutput) -} - -trait HasPipelineReg { this: FPUSubModule => - def latency: Int - - val ready = Wire(Bool()) - val cnt = RegInit(0.U((log2Up(latency)+1).W)) - - ready := (cnt < latency.U) || (cnt === latency.U && io.out.ready) - cnt := cnt + io.in.fire() - io.out.fire() - - val valids = io.in.valid +: Array.fill(latency)(RegInit(false.B)) - for(i <- 1 to latency){ - when(ready){ valids(i) := valids(i-1) } - } - - def PipelineReg[T<:Data](i: Int)(next: T) = RegEnable(next, enable = valids(i-1) && ready) - def S1Reg[T<:Data](next: T):T = 
PipelineReg[T](1)(next) - def S2Reg[T<:Data](next: T):T = PipelineReg[T](2)(next) - def S3Reg[T<:Data](next: T):T = PipelineReg[T](3)(next) - def S4Reg[T<:Data](next: T):T = PipelineReg[T](4)(next) - def S5Reg[T<:Data](next: T):T = PipelineReg[T](5)(next) - - io.in.ready := ready - io.out.valid := valids.last -} - -trait HasUIntToSIntHelper { - implicit class UIntToSIntHelper(x: UInt){ - def toSInt: SInt = Cat(0.U(1.W), x).asSInt() - } -} - -abstract class FPUSubModule extends Module with HasUIntToSIntHelper { - val io = IO(new FPUSubModuleIO) -} diff --git a/src/main/scala/fpu/package.scala b/src/main/scala/fpu/package.scala deleted file mode 100644 index e32b2d8a13a6a3e91c6b5831ce8077f16fd4937e..0000000000000000000000000000000000000000 --- a/src/main/scala/fpu/package.scala +++ /dev/null @@ -1,121 +0,0 @@ -import chisel3._ -import chisel3.util._ - -package object fpu { - - object FPUOpType { - def funcWidth = 6 - def FpuOp(fu: String, op: String): UInt = ("b" + fu + op).U(funcWidth.W) - - // FMA - def fadd:UInt = FpuOp("000", "000") - def fsub:UInt = FpuOp("000", "001") - def fmadd:UInt = FpuOp("000", "100") - def fmsub:UInt = FpuOp("000", "101") - def fnmsub:UInt = FpuOp("000", "110") - def fnmadd:UInt = FpuOp("000", "111") - def fmul:UInt = FpuOp("000", "010") - - // FCMP - def fmin:UInt = FpuOp("001", "000") - def fmax:UInt = FpuOp("001", "001") - def fle:UInt = FpuOp("001", "010") - def flt:UInt = FpuOp("001", "011") - def feq:UInt = FpuOp("001", "100") - - // FMV - def fmv_f2i:UInt= FpuOp("010", "000") - def fmv_i2f:UInt= FpuOp("010", "001") - def fclass:UInt = FpuOp("010", "010") - def fsgnj:UInt = FpuOp("010", "110") - def fsgnjn:UInt = FpuOp("010", "101") - def fsgnjx:UInt = FpuOp("010", "100") - - // FloatToInt - def f2w:UInt = FpuOp("011", "000") - def f2wu:UInt = FpuOp("011", "001") - def f2l:UInt = FpuOp("011", "010") - def f2lu:UInt = FpuOp("011", "011") - - // IntToFloat - def w2f:UInt = FpuOp("100", "000") - def wu2f:UInt = FpuOp("100", "001") - 
def l2f:UInt = FpuOp("100", "010") - def lu2f:UInt = FpuOp("100", "011") - - // FloatToFloat - def s2d:UInt = FpuOp("101", "000") - def d2s:UInt = FpuOp("110", "000") - - // Div/Sqrt - def fdiv:UInt = FpuOp("111", "000") - def fsqrt:UInt = FpuOp("111", "001") - } - - object FPUIOFunc { - def in_raw = 0.U(1.W) - def in_unbox = 1.U(1.W) - - def out_raw = 0.U(2.W) - def out_box = 1.U(2.W) - def out_sext = 2.U(2.W) - def out_zext = 3.U(2.W) - - def apply(inputFunc: UInt, outputFunc:UInt) = Cat(inputFunc, outputFunc) - } - - class Fflags extends Bundle { - val invalid = Bool() // 4 - val infinite = Bool() // 3 - val overflow = Bool() // 2 - val underflow = Bool() // 1 - val inexact = Bool() // 0 - } - - object RoudingMode { - val RNE = "b000".U(3.W) - val RTZ = "b001".U(3.W) - val RDN = "b010".U(3.W) - val RUP = "b011".U(3.W) - val RMM = "b100".U(3.W) - } - - class FloatPoint(val expWidth: Int, val mantWidth:Int) extends Bundle{ - val sign = Bool() - val exp = UInt(expWidth.W) - val mant = UInt(mantWidth.W) - def defaultNaN: UInt = Cat(0.U(1.W), Fill(expWidth+1,1.U(1.W)), Fill(mantWidth-1,0.U(1.W))) - def posInf: UInt = Cat(0.U(1.W), Fill(expWidth, 1.U(1.W)), 0.U(mantWidth.W)) - def negInf: UInt = Cat(1.U(1.W), posInf.tail(1)) - def maxNorm: UInt = Cat(0.U(1.W), Fill(expWidth-1, 1.U(1.W)), 0.U(1.W), Fill(mantWidth, 1.U(1.W))) - def expBias: UInt = Fill(expWidth-1, 1.U(1.W)) - def expBiasInt: Int = (1 << (expWidth-1)) - 1 - def mantExt: UInt = Cat(exp=/=0.U, mant) - def apply(x: UInt): FloatPoint = x.asTypeOf(new FloatPoint(expWidth, mantWidth)) - } - - object Float32 extends FloatPoint(8, 23) - object Float64 extends FloatPoint(11, 52) - - def expOverflow(sexp: SInt, expWidth: Int): Bool = sexp >= Cat(0.U(1.W), Fill(expWidth, 1.U(1.W))).asSInt() - def expOverflow(uexp: UInt, expWidth: Int): Bool = expOverflow(Cat(0.U(1.W), uexp).asSInt(), expWidth) - - def boxF32ToF64(x: UInt): UInt = Cat(Fill(32, 1.U(1.W)), x(31, 0)) - def unboxF64ToF32(x: UInt): UInt = Mux(x(63, 
32)===Fill(32, 1.U(1.W)), x(31, 0), Float32.defaultNaN) - - def extF32ToF64(x: UInt): UInt = { - val f32 = Float32(x) - Cat( - f32.sign, - Mux(f32.exp === 0.U, - 0.U(Float64.expWidth.W), - Mux((~f32.exp).asUInt() === 0.U, - Cat("b111".U(3.W), f32.exp), - Cat("b0111".U(4.W) + f32.exp.head(1), f32.exp.tail(1)) - ) - ), - Cat(f32.mant, 0.U((Float64.mantWidth - Float32.mantWidth).W)) - ) - } -} - diff --git a/src/main/scala/noop/BPU.scala b/src/main/scala/noop/BPU.scala deleted file mode 100644 index 8722596fac7c60ccf4a0d8f789d0974e09b89198..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/BPU.scala +++ /dev/null @@ -1,229 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -class TableAddr(val idxBits: Int) extends NOOPBundle { - def tagBits = VAddrBits - 2 - idxBits - - //val res = UInt((AddrBits - VAddrBits).W) - val tag = UInt(tagBits.W) - val idx = UInt(idxBits.W) - val pad = UInt(2.W)//TODO - - def fromUInt(x: UInt) = x.asTypeOf(UInt(VAddrBits.W)).asTypeOf(this) - def getTag(x: UInt) = fromUInt(x).tag - def getIdx(x: UInt) = fromUInt(x).idx -} - -object BTBtype { - def B = "b00".U // branch - def J = "b01".U // jump - def I = "b10".U // indirect - def R = "b11".U // return - - def apply() = UInt(2.W) -} - -class BPUUpdateReq extends NOOPBundle { - val valid = Output(Bool()) - val pc = Output(UInt(VAddrBits.W)) - val isMissPredict = Output(Bool()) - val actualTarget = Output(UInt(VAddrBits.W)) - val actualTaken = Output(Bool()) // for branch - val fuOpType = Output(FuOpType()) - val btbType = Output(BTBtype()) - val isRVC = Output(Bool()) // for ras, save PC+2 to stack if is RVC -} - -class BPU1 extends NOOPModule { - val io = IO(new Bundle { - val in = new Bundle { val pc = Flipped(Valid((UInt(VAddrBits.W)))) } - val out = new RedirectIO - val flush = Input(Bool()) - val brIdx = Output(UInt(3.W)) - val lateJump = Output(Bool()) - }) - - val flush = 
BoolStopWatch(io.flush, io.in.pc.valid, startHighPriority = true) - - // BTB - val NRbtb = 512 - val btbAddr = new TableAddr(log2Up(NRbtb)) - def btbEntry() = new Bundle { - val tag = UInt(btbAddr.tagBits.W) - val _type = UInt(2.W) - val target = UInt(VAddrBits.W) - val brIdx = UInt(3.W) - val valid = Bool() - } - - val btb = Module(new SRAMTemplate(btbEntry(), set = NRbtb, shouldReset = true, holdRead = true, singlePort = true)) - // flush BTB when executing fence.i - val flushBTB = WireInit(false.B) - val flushTLB = WireInit(false.B) - BoringUtils.addSink(flushBTB, "MOUFlushICache") - BoringUtils.addSink(flushTLB, "MOUFlushTLB") - btb.reset := reset.asBool || (flushBTB || flushTLB) - - Debug(false) { - when (reset.asBool || (flushBTB || flushTLB)) { - printf("[BPU-RESET] %d bpu-reset flushBTB:%d flushTLB:%d\n", GTimer(), flushBTB, flushTLB) - } - } - - btb.io.r.req.valid := io.in.pc.valid - btb.io.r.req.bits.setIdx := btbAddr.getIdx(io.in.pc.bits) - - - val btbRead = Wire(btbEntry()) - btbRead := btb.io.r.resp.data(0) - // since there is one cycle latency to read SyncReadMem, - // we should latch the input pc for one cycle - val pcLatch = RegEnable(io.in.pc.bits, io.in.pc.valid) - val btbHit = btbRead.tag === btbAddr.getTag(pcLatch) && !flush && RegNext(btb.io.r.req.fire(), init = false.B) && !(pcLatch(1) && btbRead.brIdx(0)) && btbRead.valid - // btbHit will ignore pc(1,0). 
pc(1,0) is used to build brIdx - // !(pcLatch(1) && btbRead.brIdx(0)) is used to deal with the following case: - // ------------------------------------------------- - // 0 jump rvc // marked as "take branch" in BTB - // 2 xxx rvc <-- jump to here - // ------------------------------------------------- - val lateJump = btbRead.brIdx(2) && btbHit - io.lateJump := lateJump - // val lateJumpLatch = RegNext(lateJump) - // val lateJumpTarget = RegEnable(btbRead.target, lateJump) - Debug(false){ - //printf("[BTBHT] lateJump %x lateJumpLatch %x lateJumpTarget %x\n", lateJump, lateJumpLatch, lateJumpTarget) - when(btbHit){ - printf("[BTBHT1] %d pc=%x tag=%x,%x index=%x bridx=%x tgt=%x,%x flush %x type:%x\n", GTimer(), pcLatch, btbRead.tag, btbAddr.getTag(pcLatch), btbAddr.getIdx(pcLatch), btbRead.brIdx, btbRead.target, io.out.target, flush,btbRead._type) - printf("[BTBHT2] btbRead.brIdx %x mask %x\n", btbRead.brIdx, Cat(lateJump, Fill(2, io.out.valid))) - printf("[BTBHT5] btbReqValid:%d btbReqSetIdx:%x\n",btb.io.r.req.valid, btb.io.r.req.bits.setIdx) - } - } - - // PHT - val pht = Mem(NRbtb, UInt(2.W)) - val phtTaken = RegEnable(pht.read(btbAddr.getIdx(io.in.pc.bits))(1), io.in.pc.valid) - - // RAS - - val NRras = 16 - val ras = Mem(NRras, UInt(VAddrBits.W)) - // val raBrIdxs = Mem(NRras, UInt(2.W)) - val sp = Counter(NRras) - val rasTarget = RegEnable(ras.read(sp.value), io.in.pc.valid) - // val rasBrIdx = RegEnable(raBrIdxs.read(sp.value), io.in.pc.valid) - - // update - val req = WireInit(0.U.asTypeOf(new BPUUpdateReq)) - val btbWrite = WireInit(0.U.asTypeOf(btbEntry())) - BoringUtils.addSink(req, "bpuUpdateReq") - - Debug(false){ - when(req.valid){ - printf("[BTBUP] pc=%x tag=%x index=%x bridx=%x tgt=%x type=%x\n", req.pc, btbAddr.getTag(req.pc), btbAddr.getIdx(req.pc), Cat(req.pc(1), ~req.pc(1)), req.actualTarget, req.btbType) - } - } - - //val fflag = req.btbType===3.U && btb.io.w.req.valid && btb.io.w.req.bits.setIdx==="hc9".U - //when(fflag && GTimer()>2888000.U) { 
- // printf("%d\n", GTimer()) - // printf("[BTBHT6] btbWrite.type is BTBtype.R/RET!!! Inpc:%x btbWrite.brIdx:%x setIdx:%x\n", io.in.pc.bits, btbWrite.brIdx, btb.io.w.req.bits.setIdx) - // printf("[BTBHT6] tag:%x target:%x _type:%x bridx:%x\n", btbWrite.tag,btbWrite.target,btbWrite._type,btbWrite.brIdx) - // printf(p"[BTBHT6] req:${req} \n") - //} - //printf("[BTBHT5] tag: target:%x type:%d brIdx:%d\n", req.actualTarget, req.btbType, Cat(req.pc(2,0)==="h6".U && !req.isRVC, req.pc(1), ~req.pc(1))) - - btbWrite.tag := btbAddr.getTag(req.pc) - btbWrite.target := req.actualTarget - btbWrite._type := req.btbType - btbWrite.brIdx := Cat(req.pc(2,0)==="h6".U && !req.isRVC, req.pc(1), ~req.pc(1)) - btbWrite.valid := true.B - // NOTE: We only update BTB at a miss prediction. - // If a miss prediction is found, the pipeline will be flushed - // in the next cycle. Therefore it is safe to use single-port - // SRAM to implement BTB, since write requests have higher priority - // than read request. Again, since the pipeline will be flushed - // in the next cycle, the read request will be useless. 
- btb.io.w.req.valid := req.isMissPredict && req.valid - btb.io.w.req.bits.setIdx := btbAddr.getIdx(req.pc) - btb.io.w.req.bits.data := btbWrite - - //Debug(true) { - //when (btb.io.w.req.valid && btbWrite.tag === btbAddr.getTag("hffffffff803541a4".U)) { - // printf("[BTBWrite] %d setIdx:%x req.valid:%d pc:%x target:%x bridx:%x\n", GTimer(), btbAddr.getIdx(req.pc), req.valid, req.pc, req.actualTarget, btbWrite.brIdx) - //} - //} - - //when (GTimer() > 77437484.U && btb.io.w.req.valid) { - // printf("[BTBWrite-ALL] %d setIdx:%x req.valid:%d pc:%x target:%x bridx:%x\n", GTimer(), btbAddr.getIdx(req.pc), req.valid, req.pc, req.actualTarget, btbWrite.brIdx) - //} - - val cnt = RegNext(pht.read(btbAddr.getIdx(req.pc))) - val reqLatch = RegNext(req) - when (reqLatch.valid && ALUOpType.isBranch(reqLatch.fuOpType)) { - val taken = reqLatch.actualTaken - val newCnt = Mux(taken, cnt + 1.U, cnt - 1.U) - val wen = (taken && (cnt =/= "b11".U)) || (!taken && (cnt =/= "b00".U)) - when (wen) { - pht.write(btbAddr.getIdx(reqLatch.pc), newCnt) - //Debug(){ - //printf("BPUPDATE: pc %x cnt %x\n", reqLatch.pc, newCnt) - //} - } - } - when (req.valid) { - when (req.fuOpType === ALUOpType.call) { - ras.write(sp.value + 1.U, Mux(req.isRVC, req.pc + 2.U, req.pc + 4.U)) - // raBrIdxs.write(sp.value + 1.U, Mux(req.pc(1), 2.U, 1.U)) - sp.value := sp.value + 1.U - } - .elsewhen (req.fuOpType === ALUOpType.ret) { - when(sp.value === 0.U) { - //printf("ATTTTT: sp.value is 0.U\n") //TODO: sp.value may equal to 0.U - } - sp.value := Mux(sp.value===0.U, 0.U, sp.value - 1.U) //TODO: sp.value may less than 0.U - } - } - - io.out.target := Mux(btbRead._type === BTBtype.R, rasTarget, btbRead.target) - // io.out.target := Mux(lateJumpLatch && !flush, lateJumpTarget, Mux(btbRead._type === BTBtype.R, rasTarget, btbRead.target)) - // io.out.brIdx := btbRead.brIdx & Fill(3, io.out.valid) - io.brIdx := btbRead.brIdx & Cat(true.B, lateJump, Fill(2, io.out.valid)) - io.out.valid := btbHit && Mux(btbRead._type 
=== BTBtype.B, phtTaken, true.B && rasTarget=/=0.U) //TODO: add rasTarget=/=0.U, need fix - // io.out.valid := btbHit && Mux(btbRead._type === BTBtype.B, phtTaken, true.B) && !lateJump || lateJumpLatch && !flush && !lateJump - // Note: - // btbHit && Mux(btbRead._type === BTBtype.B, phtTaken, true.B) && !lateJump : normal branch predict - // lateJumpLatch && !flush && !lateJump : cross line branch predict, bpu will require imem to fetch the next 16bit of current inst in next instline - // `&& !lateJump` is used to make sure this logic will run correctly when imem stalls (pcUpdate === false) - // by using `instline`, we mean a 64 bit instfetch result from imem - // ROCKET uses a 32 bit instline, and its IDU logic is more simple than this implentation. -} - -class BPU2 extends NOOPModule { - val io = IO(new Bundle { - val in = Flipped(Valid(new CtrlFlowIO)) - val out = new RedirectIO - }) - - val instr = io.in.bits.instr - val immJ = SignExt(Cat(instr(31), instr(19, 12), instr(20), instr(30, 21), 0.U(1.W)), XLEN) - val immB = SignExt(Cat(instr(31), instr(7), instr(30, 25), instr(11, 8), 0.U(1.W)), XLEN) - val table = Array( - RV32I_BRUInstr.JAL -> List(immJ, true.B), - RV32I_BRUInstr.BNE -> List(immB, instr(31)), - RV32I_BRUInstr.BEQ -> List(immB, instr(31)), - RV32I_BRUInstr.BLT -> List(immB, instr(31)), - RV32I_BRUInstr.BGE -> List(immB, instr(31)), - RV32I_BRUInstr.BLTU -> List(immB, instr(31)), - RV32I_BRUInstr.BGEU -> List(immB, instr(31)) - ) - val default = List(immB, false.B) - val offset :: predict :: Nil = ListLookup(instr, default, table) - - io.out.target := io.in.bits.pc + offset - io.out.valid := io.in.valid && predict(0) -} diff --git a/src/main/scala/noop/Bundle.scala b/src/main/scala/noop/Bundle.scala deleted file mode 100644 index 071168c2a391028ca95f220255aab9ea32be59a8..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/Bundle.scala +++ /dev/null @@ -1,127 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -class 
CtrlSignalIO extends NOOPBundle { - val src1Type = Output(SrcType()) - val src2Type = Output(SrcType()) - val src3Type = Output(SrcType()) - val fuType = Output(FuType()) - val fuOpType = Output(FuOpType()) - val rfSrc1 = Output(UInt(5.W)) - val rfSrc2 = Output(UInt(5.W)) - val rfWen = Output(Bool()) - val fpWen = Output(Bool()) - val fpInputFunc = Output(UInt(1.W)) - val fpOutputFunc = Output(UInt(2.W)) - val rfDest = Output(UInt(5.W)) - val isNoopTrap = Output(Bool()) - val isSrc1Forward = Output(Bool()) - val isSrc2Forward = Output(Bool()) -} - -class DataSrcIO extends NOOPBundle { - val src1 = Output(UInt(XLEN.W)) - val src2 = Output(UInt(XLEN.W)) - val imm = Output(UInt(XLEN.W)) -} - -class RedirectIO extends NOOPBundle { - val target = Output(UInt(VAddrBits.W)) - // val brIdx = Output(UInt(3.W)) // for RVC - val valid = Output(Bool()) -} - -// class IRIDCtrlFlowIO extends NOOPBundle { -// val instr = Output(UInt(64.W)) -// val pc = Output(UInt(VAddrBits.W)) -// val pnpc = Output(UInt(VAddrBits.W)) -// val brIdx = Output(UInt(3.W)) -// val redirect = new RedirectIO -// } - -class CtrlFlowIO extends NOOPBundle { - val instr = Output(UInt(64.W)) - val pc = Output(UInt(VAddrBits.W)) - val pnpc = Output(UInt(VAddrBits.W)) - val redirect = new RedirectIO - val exceptionVec = Output(Vec(16, Bool())) - val intrVec = Output(Vec(12, Bool())) - val brIdx = Output(UInt(4.W)) - val crossPageIPFFix = Output(Bool()) -} - -class DecodeIO extends NOOPBundle { - val cf = new CtrlFlowIO - val ctrl = new CtrlSignalIO - val data = new DataSrcIO -} - -class WriteBackIO extends NOOPBundle { - val rfWen = Output(Bool()) - val fpWen = Output(Bool()) - val rfDest = Output(UInt(5.W)) - val rfData = Output(UInt(XLEN.W)) -} - -class CommitIO extends NOOPBundle { - val decode = new DecodeIO - val isMMIO = Output(Bool()) - val intrNO = Output(UInt(XLEN.W)) - val commits = Output(Vec(FuType.num, UInt(XLEN.W))) -} - -class FunctionUnitIO extends NOOPBundle { - val in = Flipped(Decoupled(new 
Bundle { - val src1 = Output(UInt(XLEN.W)) - val src2 = Output(UInt(XLEN.W)) - val func = Output(FuOpType()) - })) - val out = Decoupled(Output(UInt(XLEN.W))) -} - -class ForwardIO extends NOOPBundle { - val valid = Output(Bool()) - val wb = new WriteBackIO - val fuType = Output(FuType()) -} - -class MMUIO extends NOOPBundle { - // val ptev = Output(Bool()) - // val pteu = Output(Bool()) - // val ptex = Output(Bool()) - // val valid = Output(Bool()) - // val isStore = Output(Bool()) - - val priviledgeMode = Input(UInt(2.W)) - val status_sum = Input(Bool()) - val status_mxr = Input(Bool()) - - val loadPF = Output(Bool()) - val storePF = Output(Bool()) - val addr = Output(UInt(VAddrBits.W)) - - def isPF() = loadPF || storePF -} - -class MemMMUIO extends NOOPBundle { - val imem = new MMUIO - val dmem = new MMUIO -} - -class TLBExuIO extends NOOPBundle { - val satp = Output(UInt(XLEN.W)) - val sfence = new Bundle { - val valid = Output(Bool()) - val asid = Output(UInt(9.W)) - val vaddr = Output(UInt(XLEN.W)) - } - - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt, satp: UInt) = {//func no use here for just sfence.vma only - this.sfence.valid := valid - this.sfence.vaddr := src1 - this.sfence.asid := src2(8,0) - this.satp := satp - } -} \ No newline at end of file diff --git a/src/main/scala/noop/Cache.scala b/src/main/scala/noop/Cache.scala deleted file mode 100644 index fb173b1e0f14b169ae75311b03e955f4787a0c2e..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/Cache.scala +++ /dev/null @@ -1,561 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import bus.simplebus._ -import bus.axi4._ -import utils._ - -case class CacheConfig ( - ro: Boolean = false, - name: String = "cache", - userBits: Int = 0, - cacheLevel: Int = 1, - - totalSize: Int = 32, // Kbytes - ways: Int = 4 -) - -sealed trait HasCacheConst { - implicit val cacheConfig: CacheConfig - - val PAddrBits: Int - val XLEN: 
Int - - val cacheName = cacheConfig.name - val userBits = cacheConfig.userBits - - val ro = cacheConfig.ro - val hasCoh = !ro - val hasCohInt = (if (hasCoh) 1 else 0) - val hasPrefetch = cacheName == "l2cache" - - val cacheLevel = cacheConfig.cacheLevel - val TotalSize = cacheConfig.totalSize - val Ways = cacheConfig.ways - val LineSize = XLEN // byte - val LineBeats = LineSize / 8 //DATA WIDTH 64 - val Sets = TotalSize * 1024 / LineSize / Ways - val OffsetBits = log2Up(LineSize) - val IndexBits = log2Up(Sets) - val WordIndexBits = log2Up(LineBeats) - val TagBits = PAddrBits - OffsetBits - IndexBits - - val debug = true - - def addrBundle = new Bundle { - val tag = UInt(TagBits.W) - val index = UInt(IndexBits.W) - val wordIndex = UInt(WordIndexBits.W) - val byteOffset = UInt((if (XLEN == 64) 3 else 2).W) - } - - def CacheMetaArrayReadBus() = new SRAMReadBus(new MetaBundle, set = Sets, way = Ways) - def CacheDataArrayReadBus() = new SRAMReadBus(new DataBundle, set = Sets * LineBeats, way = Ways) - def CacheMetaArrayWriteBus() = new SRAMWriteBus(new MetaBundle, set = Sets, way = Ways) - def CacheDataArrayWriteBus() = new SRAMWriteBus(new DataBundle, set = Sets * LineBeats, way = Ways) - - def getMetaIdx(addr: UInt) = addr.asTypeOf(addrBundle).index - def getDataIdx(addr: UInt) = Cat(addr.asTypeOf(addrBundle).index, addr.asTypeOf(addrBundle).wordIndex) - - def isSameWord(a1: UInt, a2: UInt) = ((a1 >> 2) === (a2 >> 2)) - def isSetConflict(a1: UInt, a2: UInt) = (a1.asTypeOf(addrBundle).index === a2.asTypeOf(addrBundle).index) -} - -sealed abstract class CacheBundle(implicit cacheConfig: CacheConfig) extends Bundle with HasNOOPParameter with HasCacheConst -sealed abstract class CacheModule(implicit cacheConfig: CacheConfig) extends Module with HasNOOPParameter with HasCacheConst - -sealed class MetaBundle(implicit val cacheConfig: CacheConfig) extends CacheBundle { - val tag = Output(UInt(TagBits.W)) - val valid = Output(Bool()) - val dirty = Output(Bool()) - - def 
apply(tag: UInt, valid: Bool, dirty: Bool) = { - this.tag := tag - this.valid := valid - this.dirty := dirty - this - } -} - -sealed class DataBundle(implicit val cacheConfig: CacheConfig) extends CacheBundle { - val data = Output(UInt(DataBits.W)) - - def apply(data: UInt) = { - this.data := data - this - } -} - -sealed class Stage1IO(implicit val cacheConfig: CacheConfig) extends CacheBundle { - val req = new SimpleBusReqBundle(userBits = userBits) -} - -// meta read -sealed class CacheStage1(implicit val cacheConfig: CacheConfig) extends CacheModule { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new SimpleBusReqBundle(userBits = userBits))) - val out = Decoupled(new Stage1IO) - val metaReadBus = CacheMetaArrayReadBus() - val dataReadBus = CacheDataArrayReadBus() - - val s2s3Empty = Input(Bool()) // FIXME: remove me when do not use nut's cache - }) - - if (ro) when (io.in.fire()) { assert(!io.in.bits.isWrite()) } - Debug(){ - if (debug) { - when(io.in.fire()){ - printf("[L1$] " +name+" cache stage1, addr in: %x, user: %x\n", io.in.bits.addr, io.in.bits.user.getOrElse(0.U)) - } - } - } - - // read meta array and data array - val readBusValid = io.in.valid && io.out.ready - io.metaReadBus.apply(valid = readBusValid, setIdx = getMetaIdx(io.in.bits.addr)) - io.dataReadBus.apply(valid = readBusValid, setIdx = getDataIdx(io.in.bits.addr)) - - io.out.bits.req := io.in.bits - io.out.valid := io.in.valid && io.metaReadBus.req.ready && io.dataReadBus.req.ready && io.s2s3Empty // FIXME: remove me when do not use nut's cache - io.in.ready := (!io.in.valid || io.out.fire()) && io.metaReadBus.req.ready && io.dataReadBus.req.ready && io.s2s3Empty // FIXME: remove me when do not use nut's cache - - Debug() { - if (debug) { - printf("%d: [" + cacheName + " stage1]: in.ready = %d, in.valid = %d, out.valid = %d, out.ready = %d, addr = %x, cmd = %x, dataReadBus.req.valid = %d\n", - GTimer(), io.in.ready, io.in.valid, io.out.valid, io.out.ready, io.in.bits.addr, 
io.in.bits.cmd, io.dataReadBus.req.valid) - } - } -} - -sealed class Stage2IO(implicit val cacheConfig: CacheConfig) extends CacheBundle { - val req = new SimpleBusReqBundle(userBits = userBits) - val metas = Vec(Ways, new MetaBundle) - val datas = Vec(Ways, new DataBundle) - val hit = Output(Bool()) - val waymask = Output(UInt(Ways.W)) - val mmio = Output(Bool()) - val isForwardData = Output(Bool()) - val forwardData = Output(CacheDataArrayWriteBus().req.bits) -} - -// check -sealed class CacheStage2(implicit val cacheConfig: CacheConfig) extends CacheModule { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new Stage1IO)) - val out = Decoupled(new Stage2IO) - val metaReadResp = Flipped(Vec(Ways, new MetaBundle)) - val dataReadResp = Flipped(Vec(Ways, new DataBundle)) - val metaWriteBus = Input(CacheMetaArrayWriteBus()) - val dataWriteBus = Input(CacheDataArrayWriteBus()) - }) - - val req = io.in.bits.req - val addr = req.addr.asTypeOf(addrBundle) - - val isForwardMeta = io.in.valid && io.metaWriteBus.req.valid && io.metaWriteBus.req.bits.setIdx === getMetaIdx(req.addr) - val isForwardMetaReg = RegInit(false.B) - when (isForwardMeta) { isForwardMetaReg := true.B } - when (io.in.fire() || !io.in.valid) { isForwardMetaReg := false.B } - val forwardMetaReg = RegEnable(io.metaWriteBus.req.bits, isForwardMeta) - - val metaWay = Wire(Vec(Ways, chiselTypeOf(forwardMetaReg.data))) - forwardMetaReg.waymask.getOrElse("b1".U).asBools.zipWithIndex.map { case (w, i) => - metaWay(i) := Mux(isForwardMetaReg && w, forwardMetaReg.data, io.metaReadResp(i)) - } - - val hitVec = VecInit(metaWay.map(m => m.valid && (m.tag === addr.tag) && io.in.valid)).asUInt - val victimWaymask = if (Ways > 1) (1.U << LFSR64()(log2Up(Ways)-1,0)) else "b1".U - - val invalidVec = VecInit(metaWay.map(m => !m.valid)).asUInt - val hasInvalidWay = invalidVec.orR - val refillInvalidWaymask = Mux(invalidVec >= 8.U, "b1000".U, - Mux(invalidVec >= 4.U, "b0100".U, - Mux(invalidVec >= 2.U, "b0010".U, 
"b0001".U))) - - // val waymask = Mux(io.out.bits.hit, hitVec, victimWaymask) - val waymask = Mux(io.out.bits.hit, hitVec, Mux(hasInvalidWay, refillInvalidWaymask, victimWaymask)) - assert(!(io.in.valid && PopCount(waymask) > 1.U)) - - io.out.bits.metas := metaWay - io.out.bits.hit := io.in.valid && hitVec.orR - io.out.bits.waymask := waymask - io.out.bits.datas := io.dataReadResp - io.out.bits.mmio := xiangshan.AddressSpace.isMMIO(ZeroExt(req.addr, 40)) // FIXME: isMMIO should have PAddrBits Length ?? - - val isForwardData = io.in.valid && (io.dataWriteBus.req match { case r => - r.valid && r.bits.setIdx === getDataIdx(req.addr) - }) - val isForwardDataReg = RegInit(false.B) - when (isForwardData) { isForwardDataReg := true.B } - when (io.in.fire() || !io.in.valid) { isForwardDataReg := false.B } - val forwardDataReg = RegEnable(io.dataWriteBus.req.bits, isForwardData) - io.out.bits.isForwardData := isForwardDataReg || isForwardData - io.out.bits.forwardData := Mux(isForwardData, io.dataWriteBus.req.bits, forwardDataReg) - - io.out.bits.req <> req - io.out.valid := io.in.valid - io.in.ready := !io.in.valid || io.out.fire() - - Debug() { - if (debug) { - printf("%d: [" + cacheName + " S2]: isFD:%d isFDreg:%d inFire:%d invalid:%d \n", GTimer(), isForwardData, isForwardDataReg, io.in.fire(), io.in.valid) - } - } -} - -// writeback -sealed class CacheStage3(implicit val cacheConfig: CacheConfig) extends CacheModule { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new Stage2IO)) - val out = Decoupled(new SimpleBusRespBundle(userBits = userBits)) - val isFinish = Output(Bool()) - val flush = Input(Bool()) - val dataReadBus = CacheDataArrayReadBus() - val dataWriteBus = CacheDataArrayWriteBus() - val metaWriteBus = CacheMetaArrayWriteBus() - - val mem = new SimpleBusUC - val mmio = new SimpleBusUC - val cohResp = Decoupled(new SimpleBusRespBundle) - - // use to distinguish prefetch request and normal request - val dataReadRespToL1 = Output(Bool()) - }) - - val 
metaWriteArb = Module(new Arbiter(CacheMetaArrayWriteBus().req.bits, 2)) - val dataWriteArb = Module(new Arbiter(CacheDataArrayWriteBus().req.bits, 2)) - - val req = io.in.bits.req - val addr = req.addr.asTypeOf(addrBundle) - val mmio = io.in.valid && io.in.bits.mmio - val hit = io.in.valid && io.in.bits.hit - val miss = io.in.valid && !io.in.bits.hit - val probe = io.in.valid && hasCoh.B && req.isProbe() - val hitReadBurst = hit && req.isReadBurst() - val meta = Mux1H(io.in.bits.waymask, io.in.bits.metas) - assert(!(mmio && hit), "MMIO request should not hit in cache") - - // this is ugly - if (cacheName == "dcache") { - BoringUtils.addSource(mmio, "lsuMMIO") - } - - val useForwardData = io.in.bits.isForwardData && io.in.bits.waymask === io.in.bits.forwardData.waymask.getOrElse("b1".U) - val dataReadArray = Mux1H(io.in.bits.waymask, io.in.bits.datas).data - val dataRead = Mux(useForwardData, io.in.bits.forwardData.data.data, dataReadArray) - val wordMask = Mux(!ro.B && req.isWrite(), MaskExpand(req.wmask), 0.U(DataBits.W)) - - val writeL2BeatCnt = Counter(LineBeats) - when(io.out.fire() && (req.cmd === SimpleBusCmd.writeBurst || req.isWriteLast())) { - writeL2BeatCnt.inc() - } - - val hitWrite = hit && req.isWrite() - val dataHitWriteBus = Wire(CacheDataArrayWriteBus()).apply( - data = Wire(new DataBundle).apply(MaskData(dataRead, req.wdata, wordMask)), - valid = hitWrite, setIdx = Cat(addr.index, Mux(req.cmd === SimpleBusCmd.writeBurst || req.isWriteLast(), writeL2BeatCnt.value, addr.wordIndex)), waymask = io.in.bits.waymask) - - val metaHitWriteBus = Wire(CacheMetaArrayWriteBus()).apply( - valid = hitWrite && !meta.dirty, setIdx = getMetaIdx(req.addr), waymask = io.in.bits.waymask, - data = Wire(new MetaBundle).apply(tag = meta.tag, valid = true.B, dirty = (!ro).B) - ) - - val s_idle :: s_memReadReq :: s_memReadResp :: s_memWriteReq :: s_memWriteResp :: s_mmioReq :: s_mmioResp :: s_wait_resp :: s_release :: Nil = Enum(9) - val state = RegInit(s_idle) - val 
needFlush = RegInit(false.B) - - when (io.flush && (state =/= s_idle)) { needFlush := true.B } - when (io.out.fire() && needFlush) { needFlush := false.B } - - val readBeatCnt = Counter(LineBeats) - val writeBeatCnt = Counter(LineBeats) - - val s2_idle :: s2_dataReadWait :: s2_dataOK :: Nil = Enum(3) - val state2 = RegInit(s2_idle) - - io.dataReadBus.apply(valid = (state === s_memWriteReq || state === s_release) && (state2 === s2_idle), - setIdx = Cat(addr.index, Mux(state === s_release, readBeatCnt.value, writeBeatCnt.value))) - val dataWay = RegEnable(io.dataReadBus.resp.data, state2 === s2_dataReadWait) - val dataHitWay = Mux1H(io.in.bits.waymask, dataWay).data - - switch (state2) { - is (s2_idle) { when (io.dataReadBus.req.fire()) { state2 := s2_dataReadWait } } - is (s2_dataReadWait) { state2 := s2_dataOK } - is (s2_dataOK) { when (io.mem.req.fire() || io.cohResp.fire() || hitReadBurst && io.out.ready) { state2 := s2_idle } } - } - - // critical word first read - val raddr = (if (XLEN == 64) Cat(req.addr(PAddrBits-1,3), 0.U(3.W)) - else Cat(req.addr(PAddrBits-1,2), 0.U(2.W))) - // dirty block addr - val waddr = Cat(meta.tag, addr.index, 0.U(OffsetBits.W)) - val cmd = Mux(state === s_memReadReq, SimpleBusCmd.readBurst, - Mux((writeBeatCnt.value === (LineBeats - 1).U), SimpleBusCmd.writeLast, SimpleBusCmd.writeBurst)) - io.mem.req.bits.apply(addr = Mux(state === s_memReadReq, raddr, waddr), - cmd = cmd, size = (if (XLEN == 64) "b11".U else "b10".U), - wdata = dataHitWay, wmask = Fill(DataBytes, 1.U)) - - io.mem.resp.ready := true.B - io.mem.req.valid := (state === s_memReadReq) || ((state === s_memWriteReq) && (state2 === s2_dataOK)) - - // mmio - io.mmio.req.bits := req - io.mmio.resp.ready := true.B - io.mmio.req.valid := (state === s_mmioReq) - - val afterFirstRead = RegInit(false.B) - val alreadyOutFire = RegEnable(true.B, init = false.B, io.out.fire()) - val readingFirst = !afterFirstRead && io.mem.resp.fire() && (state === s_memReadResp) - val 
inRdataRegDemand = RegEnable(Mux(mmio, io.mmio.resp.bits.rdata, io.mem.resp.bits.rdata), - Mux(mmio, state === s_mmioResp, readingFirst)) - - // probe - io.cohResp.valid := ((state === s_idle) && probe) || - ((state === s_release) && (state2 === s2_dataOK)) - io.cohResp.bits.rdata := dataHitWay - val releaseLast = Counter(state === s_release && io.cohResp.fire(), LineBeats)._2 - io.cohResp.bits.cmd := Mux(state === s_release, Mux(releaseLast, SimpleBusCmd.readLast, 0.U), - Mux(hit, SimpleBusCmd.probeHit, SimpleBusCmd.probeMiss)) - - val respToL1Fire = hitReadBurst && io.out.ready && state2 === s2_dataOK - val respToL1Last = Counter((state === s_idle || state === s_release && state2 === s2_dataOK) && hitReadBurst && io.out.ready, LineBeats)._2 - - switch (state) { - is (s_idle) { - afterFirstRead := false.B - alreadyOutFire := false.B - - when (probe) { - when (io.cohResp.fire()) { - state := Mux(hit, s_release, s_idle) - readBeatCnt.value := addr.wordIndex - } - } .elsewhen (hitReadBurst && io.out.ready) { - state := s_release - readBeatCnt.value := Mux(addr.wordIndex === (LineBeats - 1).U, 0.U, (addr.wordIndex + 1.U)) - } .elsewhen ((miss || mmio) && !io.flush) { - state := Mux(mmio, s_mmioReq, Mux(!ro.B && meta.dirty, s_memWriteReq, s_memReadReq)) - } - } - - is (s_mmioReq) { when (io.mmio.req.fire()) { state := s_mmioResp } } - is (s_mmioResp) { when (io.mmio.resp.fire()) { state := s_wait_resp } } - - is (s_release) { - when (io.cohResp.fire() || respToL1Fire) { readBeatCnt.inc() } - when (probe && io.cohResp.fire() && releaseLast || respToL1Fire && respToL1Last) { state := s_idle } - } - - is (s_memReadReq) { when (io.mem.req.fire()) { - state := s_memReadResp - readBeatCnt.value := addr.wordIndex - }} - - is (s_memReadResp) { - when (io.mem.resp.fire()) { - afterFirstRead := true.B - readBeatCnt.inc() - when (req.cmd === SimpleBusCmd.writeBurst) { writeL2BeatCnt.value := 0.U } - when (io.mem.resp.bits.isReadLast()) { state := s_wait_resp } - } - } - - is 
(s_memWriteReq) { - when (io.mem.req.fire()) { writeBeatCnt.inc() } - when (io.mem.req.bits.isWriteLast() && io.mem.req.fire()) { state := s_memWriteResp } - } - - is (s_memWriteResp) { when (io.mem.resp.fire()) { state := s_memReadReq } } - is (s_wait_resp) { when (io.out.fire() || needFlush || alreadyOutFire) { state := s_idle } } - } - - val dataRefill = MaskData(io.mem.resp.bits.rdata, req.wdata, Mux(readingFirst, wordMask, 0.U(DataBits.W))) - val dataRefillWriteBus = Wire(CacheDataArrayWriteBus).apply( - valid = (state === s_memReadResp) && io.mem.resp.fire(), setIdx = Cat(addr.index, readBeatCnt.value), - data = Wire(new DataBundle).apply(dataRefill), waymask = io.in.bits.waymask) - - dataWriteArb.io.in(0) <> dataHitWriteBus.req - dataWriteArb.io.in(1) <> dataRefillWriteBus.req - io.dataWriteBus.req <> dataWriteArb.io.out - - val metaRefillWriteBus = Wire(CacheMetaArrayWriteBus()).apply( - valid = (state === s_memReadResp) && io.mem.resp.fire() && io.mem.resp.bits.isReadLast(), - data = Wire(new MetaBundle).apply(valid = true.B, tag = addr.tag, dirty = !ro.B && req.isWrite()), - setIdx = getMetaIdx(req.addr), waymask = io.in.bits.waymask - ) - - metaWriteArb.io.in(0) <> metaHitWriteBus.req - metaWriteArb.io.in(1) <> metaRefillWriteBus.req - io.metaWriteBus.req <> metaWriteArb.io.out - - if (cacheLevel == 2) { - when ((state === s_memReadResp) && io.mem.resp.fire() && req.isReadBurst()) { - // readBurst request miss - io.out.bits.rdata := dataRefill - io.out.bits.cmd := Mux(io.mem.resp.bits.isReadLast(), SimpleBusCmd.readLast, SimpleBusCmd.readBurst) - }.elsewhen (req.isWriteLast() || req.cmd === SimpleBusCmd.writeBurst) { - // writeBurst/writeLast request, no matter hit or miss - io.out.bits.rdata := Mux(hit, dataRead, inRdataRegDemand) - io.out.bits.cmd := DontCare - }.elsewhen (hitReadBurst && state === s_release) { - // readBurst request hit - io.out.bits.rdata := dataHitWay - io.out.bits.cmd := Mux(respToL1Last, SimpleBusCmd.readLast, 
SimpleBusCmd.readBurst) - }.otherwise { - io.out.bits.rdata := Mux(hit, dataRead, inRdataRegDemand) - io.out.bits.cmd := req.cmd - } - } else { - io.out.bits.rdata := Mux(hit, dataRead, inRdataRegDemand) - io.out.bits.cmd := Mux(io.in.bits.req.isRead(), SimpleBusCmd.readLast, Mux(io.in.bits.req.isWrite(), SimpleBusCmd.writeResp, DontCare))//DontCare, added by lemover - } - io.out.bits.user.zip(req.user).map { case (o,i) => o := i } - - io.out.valid := io.in.valid && Mux(req.isBurst() && (cacheLevel == 2).B, - Mux(req.isWrite() && (hit || !hit && state === s_wait_resp), true.B, (state === s_memReadResp && io.mem.resp.fire() && req.cmd === SimpleBusCmd.readBurst)) || (respToL1Fire && respToL1Last && state === s_release), - Mux(probe, false.B, Mux(hit, true.B, Mux(req.isWrite() || mmio, state === s_wait_resp, afterFirstRead && !alreadyOutFire))) - ) - - // With critical-word first, the pipeline registers between - // s2 and s3 can not be overwritten before a missing request - // is totally handled. We use io.isFinish to indicate when the - // request really ends. 
- io.isFinish := Mux(probe, io.cohResp.fire() && Mux(miss, state === s_idle, (state === s_release) && releaseLast), - Mux(hit || req.isWrite(), io.out.fire(), (state === s_wait_resp) && (io.out.fire() || alreadyOutFire)) - ) - - io.in.ready := io.out.ready && (state === s_idle && !hitReadBurst) && !miss && !probe - io.dataReadRespToL1 := hitReadBurst && (state === s_idle && io.out.ready || state === s_release && state2 === s2_dataOK) - - assert(!(metaHitWriteBus.req.valid && metaRefillWriteBus.req.valid)) - assert(!(dataHitWriteBus.req.valid && dataRefillWriteBus.req.valid)) - assert(!(!ro.B && io.flush), "only allow to flush icache") - Debug() { - if (debug) { - printf("%d: [" + cacheName + " S3]: in.ready = %d, in.valid = %d, hit = %x, state = %d, addr = %x cmd:%d probe:%d isFinish:%d\n", - GTimer(), io.in.ready, io.in.valid, hit, state, req.addr, req.cmd, probe, io.isFinish) - printf("%d: [" + cacheName + " S3]: out.valid:%d rdata:%x cmd:%d user:%x \n", - GTimer(), io.out.valid, io.out.bits.rdata, io.out.bits.cmd, io.out.bits.user.getOrElse(0.U)) - printf("%d: [" + cacheName + " S3]: DHW: (%d, %d), data:%x MHW:(%d, %d)\n", - GTimer(), dataHitWriteBus.req.valid, dataHitWriteBus.req.ready, dataHitWriteBus.req.bits.data.asUInt, metaHitWriteBus.req.valid, metaHitWriteBus.req.ready) - printf("%d: [" + cacheName + " S3]: useFD:%d isFD:%d FD:%x DreadArray:%x dataRead:%x inwaymask:%x FDwaymask:%x \n", - GTimer(), useForwardData, io.in.bits.isForwardData, io.in.bits.forwardData.data.data, dataReadArray, dataRead, io.in.bits.waymask, io.in.bits.forwardData.waymask.getOrElse("b1".U)) - } - } -} - -class Cache(implicit val cacheConfig: CacheConfig) extends CacheModule { - val io = IO(new Bundle { - val in = Flipped(new SimpleBusUC(userBits = userBits)) - val flush = Input(UInt(2.W)) - val out = new SimpleBusC - val mmio = new SimpleBusUC - val empty = Output(Bool()) - }) - - // cpu pipeline - val s1 = Module(new CacheStage1) - val s2 = Module(new CacheStage2) - val s3 = 
Module(new CacheStage3) - val metaArray = Module(new SRAMTemplateWithArbiter(nRead = 1, new MetaBundle, set = Sets, way = Ways, shouldReset = true)) - val dataArray = Module(new SRAMTemplateWithArbiter(nRead = 2, new DataBundle, set = Sets * LineBeats, way = Ways)) - - if (cacheName == "icache") { - // flush icache when executing fence.i - val flushICache = WireInit(false.B) - BoringUtils.addSink(flushICache, "MOUFlushICache") - metaArray.reset := reset.asBool || flushICache - } - - val arb = Module(new Arbiter(new SimpleBusReqBundle(userBits = userBits), hasCohInt + 1)) - arb.io.in(hasCohInt + 0) <> io.in.req - - s1.io.in <> arb.io.out - /* - val s2BlockByPrefetch = if (cacheLevel == 2) { - s2.io.out.valid && s3.io.in.valid && s3.io.in.bits.req.isPrefetch() && !s3.io.in.ready - } else { false.B } - */ - PipelineConnect(s1.io.out, s2.io.in, s2.io.out.fire(), io.flush(0)) - PipelineConnect(s2.io.out, s3.io.in, s3.io.isFinish, io.flush(1) || s2.io.out.bits.mmio && s2.io.out.bits.req.isPrefetch()/* || s2BlockByPrefetch*/) - io.in.resp <> s3.io.out - s3.io.flush := io.flush(1) - io.out.mem <> s3.io.mem - io.mmio <> s3.io.mmio - io.empty := !s2.io.in.valid && !s3.io.in.valid - s1.io.s2s3Empty := io.empty // FIXME: remove me when do not use nut's cache - - io.in.resp.valid := Mux(s3.io.out.valid && s3.io.out.bits.isPrefetch(), false.B, s3.io.out.valid || s3.io.dataReadRespToL1) - - if (hasCoh) { - val cohReq = io.out.coh.req.bits - // coh does not have user signal, any better code? 
- val coh = Wire(new SimpleBusReqBundle(userBits = userBits)) - coh.apply(addr = cohReq.addr, cmd = cohReq.cmd, size = cohReq.cmd, wdata = cohReq.wdata, wmask = cohReq.wmask) - arb.io.in(0).bits := coh - arb.io.in(0).valid := io.out.coh.req.valid - io.out.coh.req.ready := arb.io.in(0).ready - io.out.coh.resp <> s3.io.cohResp - } else { - io.out.coh.req.ready := true.B - io.out.coh.resp := DontCare - io.out.coh.resp.valid := false.B - s3.io.cohResp.ready := true.B - } - - metaArray.io.r(0) <> s1.io.metaReadBus - dataArray.io.r(0) <> s1.io.dataReadBus - dataArray.io.r(1) <> s3.io.dataReadBus - - metaArray.io.w <> s3.io.metaWriteBus - dataArray.io.w <> s3.io.dataWriteBus - - s2.io.metaReadResp := s1.io.metaReadBus.resp.data - s2.io.dataReadResp := s1.io.dataReadBus.resp.data - s2.io.dataWriteBus := s3.io.dataWriteBus - s2.io.metaWriteBus := s3.io.metaWriteBus - - BoringUtils.addSource(s3.io.in.valid && s3.io.in.bits.hit, "perfCntCondM" + cacheName + "Hit") - - Debug() { - if (debug) { - when(true.B) { - io.in.dump(cacheName + ".in") - printf("%d:" + cacheName + "InReq(%d, %d) InResp(%d, %d) \n", GTimer(), io.in.req.valid, io.in.req.ready, io.in.resp.valid, io.in.resp.ready) - printf("%d:" + cacheName + " {IN s1:(%d,%d), s2:(%d,%d), s3:(%d,%d)} {OUT s1:(%d,%d), s2:(%d,%d), s3:(%d,%d)}\n", - GTimer(), s1.io.in.valid, s1.io.in.ready, s2.io.in.valid, s2.io.in.ready, s3.io.in.valid, s3.io.in.ready, s1.io.out.valid, s1.io.out.ready, s2.io.out.valid, s2.io.out.ready, s3.io.out.valid, s3.io.out.ready) - when (s1.io.in.valid) { printf("%d ", GTimer()) ; printf(p"[${cacheName}.S1]: ${s1.io.in.bits}\n") } - when (s2.io.in.valid) { printf("%d ", GTimer()) ; printf(p"[${cacheName}.S2]: ${s2.io.in.bits.req}\n") } - when (s3.io.in.valid) { printf("%d ", GTimer()) ; printf(p"[${cacheName}.S3]: ${s3.io.in.bits.req}\n") } - // s3.io.mem.dump(cacheName + ".mem") - }} - } -} - -object Cache { - def apply(in: SimpleBusUC, mmio: Seq[SimpleBusUC], flush: UInt, empty: Bool, enable: Boolean = 
true)(implicit cacheConfig: CacheConfig) = { - if (enable) { - val cache = Module(new Cache) - cache.io.flush := flush - cache.io.in <> in - mmio(0) <> cache.io.mmio - empty := cache.io.empty - cache.io.out - } else { - assert(false, "XiangShan should not reach here!") - val addrspace = List(AddressSpace.dram) ++ AddressSpace.mmio - val xbar = Module(new SimpleBusCrossbar1toN(addrspace)) - val busC = WireInit(0.U.asTypeOf(new SimpleBusC)) - busC.mem <> xbar.io.out(0) - xbar.io.in <> in - (mmio zip xbar.io.out.drop(1)) foreach { case (mmio_in, xbar_out) => - mmio_in <> xbar_out - } - empty := false.B - busC - } - } -} diff --git a/src/main/scala/noop/Decode.scala b/src/main/scala/noop/Decode.scala deleted file mode 100644 index 9c4a9c7fafdcfce46f2bbfedd2cfe763b9bd27cb..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/Decode.scala +++ /dev/null @@ -1,74 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import noop.isa.{RVDInstr, RVFInstr} - -trait HasInstrType { - def InstrN = "b0000".U - def InstrI = "b0100".U - def InstrR = "b0101".U - def InstrS = "b0010".U - def InstrB = "b0001".U - def InstrU = "b0110".U - def InstrJ = "b0111".U - def InstrA = "b1110".U - def InstrSA = "b1111".U // Atom Inst: SC - - def isrfWen(instrType : UInt): Bool = instrType(2) -} - -// trait CompInstConst { -// val RVCRegNumTable = Array( -// BitPat("b000") -> 8.U, -// BitPat("b001") -> 9.U, -// BitPat("b010") -> 10.U, -// BitPat("b011") -> 11.U, -// BitPat("b100") -> 12.U, -// BitPat("b101") -> 13.U, -// BitPat("b110") -> 14.U, -// BitPat("b111") -> 15.U -// ) -// } - -object SrcType { - def reg = "b00".U - def pc = "b01".U - def imm = "b01".U - def fp = "b10".U - def apply() = UInt(2.W) -} - -object FuType { - def num = 6 - def alu = "b000".U - def lsu = "b001".U - def mdu = "b010".U - def csr = "b011".U - def mou = "b100".U - def fpu = "b101".U - def apply() = UInt(log2Up(num).W) -} - -object FuOpType { - def apply() = UInt(6.W) -} - -object 
Instructions extends HasInstrType with HasNOOPParameter { - def NOP = 0x00000013.U - val DecodeDefault = List(InstrN, FuType.csr, CSROpType.jmp) - def DecodeTable = RVIInstr.table ++ NOOPTrap.table ++ - (if (HasMExtension) RVMInstr.table else Nil) ++ - (if (HasCExtension) RVCInstr.table else Nil) ++ - (if (HasFPU) RVFInstr.table ++ RVDInstr.table else Nil) ++ - Priviledged.table ++ - RVAInstr.table ++ - RVZicsrInstr.table ++ RVZifenceiInstr.table -} - -object CInstructions extends HasInstrType with HasNOOPParameter{ - def NOP = 0x00000013.U - val DecodeDefault = List(RVCInstr.ImmNone, RVCInstr.DtCare, RVCInstr.DtCare, RVCInstr.DtCare) - // val DecodeDefault = List(InstrN, FuType.csr, CSROpType.jmp) - def CExtraDecodeTable = RVCInstr.cExtraTable -} diff --git a/src/main/scala/noop/EXU.scala b/src/main/scala/noop/EXU.scala deleted file mode 100644 index 47902ebde861cb7abd4307d11f41758e52d5b125..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/EXU.scala +++ /dev/null @@ -1,151 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import utils._ -import bus.simplebus._ -import noop.fu.FPU - -class EXU(implicit val p: NOOPConfig) extends NOOPModule { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new DecodeIO)) - val out = Decoupled(new CommitIO) - val flush = Input(Bool()) - val dmem = new SimpleBusUC(addrBits = VAddrBits) - val forward = new ForwardIO - val memMMU = Flipped(new MemMMUIO) - }) - - val src1 = io.in.bits.data.src1 - val src2 = io.in.bits.data.src2 - - val (fuType, fuOpType) = (io.in.bits.ctrl.fuType, io.in.bits.ctrl.fuOpType) - - val fuValids = Wire(Vec(FuType.num, Bool())) - (0 until FuType.num).map (i => fuValids(i) := (fuType === i.U) && io.in.valid && !io.flush) - - val alu = Module(new ALU) - val aluOut = alu.access(valid = fuValids(FuType.alu), src1 = src1, src2 = src2, func = fuOpType) - alu.io.cfIn := io.in.bits.cf - alu.io.offset := io.in.bits.data.imm - 
alu.io.out.ready := true.B - - val lsu = Module(new LSU) - val lsuTlbPF = WireInit(false.B) - val lsuOut = lsu.access(valid = fuValids(FuType.lsu), src1 = src1, src2 = io.in.bits.data.imm, func = fuOpType, dtlbPF = lsuTlbPF) - lsu.io.wdata := src2 - lsu.io.instr := io.in.bits.cf.instr - io.out.bits.isMMIO := lsu.io.isMMIO || (AddressSpace.isMMIO(io.in.bits.cf.pc) && io.out.valid) - io.dmem <> lsu.io.dmem - lsu.io.out.ready := true.B - - val mdu = Module(new MDU) - val mduOut = mdu.access(valid = fuValids(FuType.mdu), src1 = src1, src2 = src2, func = fuOpType) - mdu.io.out.ready := true.B - - val csr = Module(new CSR) - val csrOut = csr.access(valid = fuValids(FuType.csr), src1 = src1, src2 = src2, func = fuOpType) - csr.io.cfIn := io.in.bits.cf - csr.io.cfIn.exceptionVec(loadAddrMisaligned) := lsu.io.loadAddrMisaligned - csr.io.cfIn.exceptionVec(storeAddrMisaligned) := lsu.io.storeAddrMisaligned - csr.io.instrValid := io.in.valid && !io.flush - io.out.bits.intrNO := csr.io.intrNO - csr.io.out.ready := true.B - - csr.io.imemMMU <> io.memMMU.imem - csr.io.dmemMMU <> io.memMMU.dmem - - val mou = Module(new MOU) - // mou does not write register - mou.access(valid = fuValids(FuType.mou), src1 = src1, src2 = src2, func = fuOpType) - mou.io.cfIn := io.in.bits.cf - mou.io.out.ready := true.B - - val (fpuOut,fpuOutValid) = if(HasFPU){ - val fpu = Module(new FPU) - Debug(){ - when(io.in.valid){ - printf(p"[EXU] at pc=${Hexadecimal(io.in.bits.cf.pc)} " + - p"fpu in valid=${fpu.io.in.valid} " + - p"fpu out valid=${fpu.io.out.valid}\n") - } - } - fpu.io.out.ready := true.B - csr.io.fpu_csr <> fpu.io.fpu_csr - fpu.io.fpWen := io.in.bits.ctrl.fpWen - fpu.io.inputFunc := io.in.bits.ctrl.fpInputFunc - fpu.io.outputFunc := io.in.bits.ctrl.fpOutputFunc - fpu.io.instr := io.in.bits.cf.instr - (fpu.access(fuValids(FuType.fpu), src1, src2, io.in.bits.data.imm, io.in.bits.ctrl.fuOpType), fpu.io.out.valid) - } else { - csr.io.fpu_csr <> DontCare - (0.U,false.B) - } - - - 
io.out.bits.decode := DontCare - (io.out.bits.decode.ctrl, io.in.bits.ctrl) match { case (o, i) => - o.rfWen := i.rfWen && (!lsuTlbPF && !lsu.io.loadAddrMisaligned && !lsu.io.storeAddrMisaligned || !fuValids(FuType.lsu)) && !(csr.io.wenFix && fuValids(FuType.csr)) - o.rfDest := i.rfDest - o.fuType := i.fuType - o.fpWen := i.fpWen && (!lsuTlbPF && !lsu.io.loadAddrMisaligned && !lsu.io.storeAddrMisaligned || !fuValids(FuType.lsu)) && !(csr.io.wenFix && fuValids(FuType.csr)) - } - io.out.bits.decode.cf.pc := io.in.bits.cf.pc - - io.out.bits.decode.cf.instr := io.in.bits.cf.instr - io.out.bits.decode.cf.redirect <> - Mux(mou.io.redirect.valid, mou.io.redirect, - Mux(csr.io.redirect.valid, csr.io.redirect, alu.io.redirect)) - Debug(){ - //when(mou.io.redirect.valid || csr.io.redirect.valid || alu.io.redirect.valid){ - printf("[REDIRECT] inValid:%d mou %x csr %x alu %x \n", io.in.valid, mou.io.redirect.valid, csr.io.redirect.valid, alu.io.redirect.valid) - printf("[REDIRECT] flush: %d mou %x csr %x alu %x\n", io.flush, mou.io.redirect.target, csr.io.redirect.target, alu.io.redirect.target) - //} - } - - // FIXME: should handle io.out.ready == false - io.out.valid := io.in.valid && MuxLookup(fuType, true.B, List( - FuType.lsu -> lsu.io.out.valid, - FuType.mdu -> mdu.io.out.valid, - FuType.fpu -> fpuOutValid - )) - - io.out.bits.commits(FuType.alu) := aluOut - io.out.bits.commits(FuType.lsu) := lsuOut - io.out.bits.commits(FuType.csr) := csrOut - io.out.bits.commits(FuType.mdu) := mduOut - io.out.bits.commits(FuType.mou) := 0.U - io.out.bits.commits(FuType.fpu) := fpuOut - - io.in.ready := !io.in.valid || io.out.fire() - - io.forward.valid := io.in.valid - io.forward.wb.rfWen := io.in.bits.ctrl.rfWen - io.forward.wb.fpWen := io.in.bits.ctrl.fpWen - io.forward.wb.rfDest := io.in.bits.ctrl.rfDest - io.forward.wb.rfData := Mux(alu.io.out.fire(), aluOut, lsuOut) - io.forward.fuType := io.in.bits.ctrl.fuType - - val isBru = ALUOpType.isBru(fuOpType) - 
BoringUtils.addSource(alu.io.out.fire() && !isBru, "perfCntCondMaluInstr") - BoringUtils.addSource(alu.io.out.fire() && isBru, "perfCntCondMbruInstr") - BoringUtils.addSource(lsu.io.out.fire(), "perfCntCondMlsuInstr") - BoringUtils.addSource(mdu.io.out.fire(), "perfCntCondMmduInstr") - BoringUtils.addSource(csr.io.out.fire(), "perfCntCondMcsrInstr") - - if (!p.FPGAPlatform) { - val nooptrap = io.in.bits.ctrl.isNoopTrap && io.in.valid - val cycleCnt = WireInit(0.U(XLEN.W)) - val instrCnt = WireInit(0.U(XLEN.W)) - - BoringUtils.addSink(cycleCnt, "simCycleCnt") - BoringUtils.addSink(instrCnt, "simInstrCnt") - - BoringUtils.addSource(nooptrap, "trapValid") - BoringUtils.addSource(io.in.bits.data.src1, "trapCode") - BoringUtils.addSource(io.in.bits.cf.pc, "trapPC") - BoringUtils.addSource(cycleCnt, "trapCycleCnt") - BoringUtils.addSource(instrCnt, "trapInstrCnt") - } -} diff --git a/src/main/scala/noop/IDU1.scala b/src/main/scala/noop/IDU1.scala deleted file mode 100644 index 54ac5e19a78a017126451bb89d2fc07ab934df4b..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/IDU1.scala +++ /dev/null @@ -1,192 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -class IDU1 extends NOOPModule with HasInstrType with HasExceptionNO { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new CtrlFlowIO)) - val out = Decoupled(new CtrlFlowIO) - val flush = Input(Bool()) - val redirect = new RedirectIO - }) - - val instr = Wire(UInt(32.W)) - val isRVC = instr(1,0) =/= "b11".U - - //RVC support FSM - //only ensure pnpc given by this FSM is right. 
May need flush after 6 offset 32 bit inst - val s_idle :: s_extra :: s_waitnext :: s_waitnext_thenj :: Nil = Enum(4) - val state = RegInit(UInt(2.W), s_idle) - val pcOffsetR = RegInit(UInt(3.W), 0.U) - val pcOffset = Mux(state === s_idle, io.in.bits.pc(2,0), pcOffsetR) - val instIn = Cat(0.U(16.W), io.in.bits.instr) - // val nextState = WireInit(0.U(2.W)) - val canGo = WireInit(false.B) - val canIn = WireInit(false.B) - val brIdx = io.in.bits.brIdx - // val brIdx = 0.U - val rvcFinish = pcOffset === 0.U && (!isRVC || brIdx(0)) || pcOffset === 4.U && (!isRVC || brIdx(0)) || pcOffset === 2.U && (isRVC || brIdx(1)) || pcOffset === 6.U && isRVC - // if brIdx(0) (branch taken at inst with offest 0), ignore the rest part of this instline - // just get next pc and instline from IFU - val rvcNext = pcOffset === 0.U && (isRVC && !brIdx(0)) || pcOffset === 4.U && (isRVC && !brIdx(0)) || pcOffset === 2.U && !isRVC && !brIdx(1) - val rvcSpecial = pcOffset === 6.U && !isRVC && !brIdx(2) - val rvcSpecialJump = pcOffset === 6.U && !isRVC && brIdx(2) - val pnpcIsSeq = brIdx(3) - // val pnpcIsSeqRight = io.in.bits.pnpc === (Cat(io.in.bits.pc(VAddrBits-1,2), 0.U(2.W)) + 4.U) // TODO: add a new user bit bpRight to do this - // assert(pnpcIsSeq === pnpcIsSeqRight) - val flushIFU = (state === s_idle || state === s_extra) && rvcSpecial && io.in.valid && !pnpcIsSeq - when(flushIFU){printf("flushIFU at pc %x offset %x timer:%d\n", io.in.bits.pc, pcOffset, GTimer())} - assert(!flushIFU) - val loadNextInstline = (state === s_idle || state === s_extra) && (rvcSpecial || rvcSpecialJump) && io.in.valid && pnpcIsSeq - // val loadNextInstline =false.B - val pcOut = WireInit(0.U(VAddrBits.W)) - val pnpcOut = WireInit(0.U(VAddrBits.W)) - val specialPCR = Reg(UInt(VAddrBits.W)) // reg for full inst that cross 2 inst line - val specialNPCR = Reg(UInt(VAddrBits.W)) // reg for pnc for full inst jump that cross 2 inst line - val specialInstR = Reg(UInt(16.W)) - val specialIPFR = RegInit(Bool(), 
false.B) - val redirectPC = Cat(io.in.bits.pc(VAddrBits-1,3), 0.U(3.W))+"b1010".U // IDU can got get full inst from a single inst line - val rvcForceLoadNext = (pcOffset === 2.U && !isRVC && io.in.bits.pnpc(2,0) === 4.U && !brIdx(1)) - //------------------------------------------------------ - // rvcForceLoadNext is used to deal with: - // case 1: - // 8010004a: 406007b7 lui a5,0x40600 - // 8010004e: 470d li a4,3 - // 80100050: 00e78623 sb a4,12(a5) # 4060000c <_start-0x3faffff4> - // For icache req inst in seq, if there is no rvcForceLoadNext, - // after 8010004e there will be 8010004c instead of 80100050 - //------------------------------------------------------ - // case 2: - // 80100046: 406007b7 lui a5,0x40600 - // 8010004a: 470d li a4,3 - // force load next instline into ID stage, if bp wrong, it will be flushed by flushIFU - //------------------------------------------------------ - // if there is a j inst in current inst line, a redirect req will be sent by ALU before invalid inst exception being committed - // when brIdx(1), next instline will just be branch target, eatline is no longer needed - - // only for test, add this to pipeline when do real implementation - // val predictBranch = io.in.valid && Mux(io.in.bits.pc(1), io.in.bits.pc + 2.U === io.in.bits.pnpc, io.in.bits.pc + 4.U === io.in.bits.pnpc) - // val flush = rvcSpecial - instr := Mux((state === s_waitnext || state === s_waitnext_thenj), Cat(instIn(15,0), specialInstR), LookupTree(pcOffset, List( - "b000".U -> instIn(31,0), - "b010".U -> instIn(31+16,16), - "b100".U -> instIn(63,32), - "b110".U -> instIn(63+16,32+16) - ))) - - io.redirect.target := redirectPC - io.redirect.valid := flushIFU - - when(!io.flush){ - switch(state){ - is(s_idle){//decode current pc in pipeline - canGo := rvcFinish || rvcNext - canIn := rvcFinish || rvcForceLoadNext - pcOut := io.in.bits.pc - pnpcOut := Mux(rvcFinish, io.in.bits.pnpc, Mux(isRVC, io.in.bits.pc+2.U, io.in.bits.pc+4.U)) - when(io.out.fire() && 
rvcFinish){state := s_idle} - when(io.out.fire() && rvcNext){ - state := s_extra - pcOffsetR := pcOffset + Mux(isRVC, 2.U, 4.U) - } - when(rvcSpecial && io.in.valid){ - state := s_waitnext - specialPCR := pcOut - specialInstR := io.in.bits.instr(63,63-16+1) - specialIPFR := io.in.bits.exceptionVec(instrPageFault) - } - when(rvcSpecialJump && io.in.valid){ - state := s_waitnext_thenj - specialPCR := pcOut - specialNPCR := io.in.bits.pnpc - specialInstR := io.in.bits.instr(63,63-16+1) - specialIPFR := io.in.bits.exceptionVec(instrPageFault) - } - } - is(s_extra){//get 16 aligned inst, pc controled by this FSM - canGo := rvcFinish || rvcNext - canIn := rvcFinish || rvcForceLoadNext - pcOut := Cat(io.in.bits.pc(VAddrBits-1,3), pcOffsetR(2,0)) - pnpcOut := Mux(rvcFinish, io.in.bits.pnpc, Mux(isRVC, pcOut+2.U, pcOut+4.U)) - when(io.out.fire() && rvcFinish){state := s_idle} - when(io.out.fire() && rvcNext){ - state := s_extra - pcOffsetR := pcOffset + Mux(isRVC, 2.U, 4.U) - } - when(rvcSpecial && io.in.valid){ - state := s_waitnext - specialPCR := pcOut - specialInstR := io.in.bits.instr(63,63-16+1) - specialIPFR := io.in.bits.exceptionVec(instrPageFault) - } - when(rvcSpecialJump && io.in.valid){ - state := s_waitnext_thenj - specialPCR := pcOut - specialNPCR := io.in.bits.pnpc - specialInstR := io.in.bits.instr(63,63-16+1) - specialIPFR := io.in.bits.exceptionVec(instrPageFault) - } - } - is(s_waitnext){//require next 64bits, for this inst has size 32 and offset 6 - //ignore bp result, use pc+4 instead - pcOut := specialPCR - pnpcOut := specialPCR + 4.U - // pnpcOut := Mux(rvcFinish, io.in.bits.pnpc, Mux(isRVC, pcOut+2.U, pcOut+4.U)) - canGo := io.in.valid - canIn := false.B - when(io.out.fire()){ - state := s_extra - pcOffsetR := "b010".U - } - } - is(s_waitnext_thenj){//require next 64bits, for this inst has size 32 and offset 6 - //use bp result - pcOut := specialPCR - pnpcOut := specialNPCR - // pnpcOut := Mux(rvcFinish, io.in.bits.pnpc, Mux(isRVC, pcOut+2.U, 
pcOut+4.U)) - canGo := io.in.valid - canIn := true.B - when(io.out.fire()){ - state := s_idle - } - } - // is(s_readnext){//npc right, get next 64 inst bits, flush pipeline is not needed - // //ignore bp result, use pc+4 instead - // pcOut := specialPCR - // pnpcOut := specialPCR + 4.U - // // pnpcOut := Mux(rvcFinish, io.in.bits.pnpc, Mux(isRVC, pcOut+2.U, pcOut+4.U)) - // canGo := io.in.valid - // canIn := false.B - // when(io.out.fire()){ - // state := s_extra - // pcOffsetR := "b010".U - // } - // } - } - }.otherwise{ - state := s_idle - canGo := DontCare - canIn := DontCare - pcOut := DontCare - pnpcOut := DontCare - } - - //output signals - io.out.bits := DontCare - io.out.bits.redirect.valid := false.B - io.out.bits.pc := pcOut - io.out.bits.pnpc := pnpcOut - io.out.bits.instr := instr - io.out.bits.brIdx := io.in.bits.brIdx - - io.out.valid := io.in.valid && canGo - io.in.ready := (!io.in.valid || (io.out.fire() && canIn) || loadNextInstline) - - io.out.bits.exceptionVec := io.in.bits.exceptionVec/*.map(_ := false.B)*/ //Fix by zhangzifei from false.B - io.out.bits.exceptionVec(instrPageFault) := io.in.bits.exceptionVec(instrPageFault) || specialIPFR && (state === s_waitnext_thenj || state === s_waitnext) - io.out.bits.crossPageIPFFix := io.in.bits.exceptionVec(instrPageFault) && (state === s_waitnext_thenj || state === s_waitnext) && !specialIPFR -} diff --git a/src/main/scala/noop/IDU2.scala b/src/main/scala/noop/IDU2.scala deleted file mode 100644 index 041b5018e9ed2d4aabf6172e274050e2ec4d922d..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/IDU2.scala +++ /dev/null @@ -1,217 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import noop.isa.{RVDInstr, RVFInstr, RVF_LSUInstr, RVD_LSUInstr} -import utils._ - -class IDU2(implicit val p: NOOPConfig) extends NOOPModule with HasInstrType { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new CtrlFlowIO)) - val out = 
Decoupled(new DecodeIO) - val flush = Input(Bool()) - }) - - val hasIntr = Wire(Bool()) - val hasIntrOrExceptino = hasIntr || io.in.bits.exceptionVec(instrPageFault) - val instr = io.in.bits.instr(31, 0) - val decodeList = ListLookup(instr, Instructions.DecodeDefault, Instructions.DecodeTable) - val commonInstrType :: commonFuType :: commonFuOpType :: Nil = decodeList - - val intrInstrType :: intrFuType :: intrFuOpType :: Nil = Instructions.DecodeDefault - - //(isFp, src1Type, src2Type, src3Type, rfWen, fpWen, fuOpType, inputFunc, outputFunc) - val fpExtraDecodeTable = RVFInstr.extraTable ++ RVDInstr.extraTable - val isFp :: fpSrc1Type :: fpSrc2Type :: fpSrc3Type :: fpRfWen :: fpWen :: fpFuOpType :: fpInputFunc :: fpOutputFunc :: Nil = - if(HasFPU) ListLookup(instr, RVFInstr.extraTableDefault, fpExtraDecodeTable) else RVFInstr.extraTableDefault - - val floatLdStInstrs = List( - RVF_LSUInstr.FLW, - RVF_LSUInstr.FSW, - RVD_LSUInstr.FLD, - RVCInstr.C_FLD, - RVCInstr.C_FLDSP, - RVD_LSUInstr.FSD, - RVCInstr.C_FSD, - RVCInstr.C_FSDSP - ) - - def treeCmp(key: UInt, cmpList: List[BitPat]): Bool = { - cmpList.size match { - case 1 => - key === cmpList.head - case n => - treeCmp(key, cmpList take n/2) || treeCmp(key, cmpList drop n/2) - } - } - - val isFloatLdSd = if(HasFPU) treeCmp(instr, floatLdStInstrs) else false.B - - val isRVFD = isFp.asBool() - val instrType = Mux(hasIntrOrExceptino, - intrInstrType, - commonInstrType - ) - val fuType = Mux(hasIntrOrExceptino, - intrFuType, - Mux(isRVFD && !isFloatLdSd, - FuType.fpu, - commonFuType - ) - ) - val fuOpType = Mux(hasIntrOrExceptino, - intrFuOpType, - Mux(isRVFD, fpFuOpType, commonFuOpType) - ) - - - val isRVC = instr(1,0) =/= "b11".U - val rvcImmType :: rvcSrc1Type :: rvcSrc2Type :: rvcDestType :: Nil = - ListLookup(instr, CInstructions.DecodeDefault, CInstructions.CExtraDecodeTable) - - io.out.bits := DontCare - - io.out.bits.ctrl.fuType := fuType - io.out.bits.ctrl.fuOpType := fuOpType - io.out.bits.ctrl.fpInputFunc := 
fpInputFunc - io.out.bits.ctrl.fpOutputFunc := fpOutputFunc - - val SrcTypeTable = List( - InstrI -> (SrcType.reg, SrcType.imm), - InstrR -> (SrcType.reg, SrcType.reg), - InstrS -> (SrcType.reg, SrcType.reg), - InstrSA-> (SrcType.reg, SrcType.reg), - InstrB -> (SrcType.reg, SrcType.reg), - InstrU -> (SrcType.pc , SrcType.imm), - InstrJ -> (SrcType.pc , SrcType.imm), - InstrN -> (SrcType.pc , SrcType.imm) - ) - val src1Type = Mux(isRVFD, - fpSrc1Type, - LookupTree(instrType, SrcTypeTable.map(p => (p._1, p._2._1))) - ) - val src2Type = Mux(isRVFD, - fpSrc2Type, - LookupTree(instrType, SrcTypeTable.map(p => (p._1, p._2._2))) - ) - - val (rs, rt, rd) = (instr(19, 15), instr(24, 20), instr(11, 7)) - // see riscv-spec vol1, Table 16.1: Compressed 16-bit RVC instruction formats. - val rs1 = instr(11,7) - val rs2 = instr(6,2) - val rs1p = LookupTree(instr(9,7), RVCInstr.RVCRegNumTable.map(p => (p._1, p._2))) - val rs2p = LookupTree(instr(4,2), RVCInstr.RVCRegNumTable.map(p => (p._1, p._2))) - val rvc_shamt = Cat(instr(12),instr(6,2)) - // val rdp_rs1p = LookupTree(instr(9,7), RVCRegNumTable) - // val rdp = LookupTree(instr(4,2), RVCRegNumTable) - - val RegLookUpTable = List( - RVCInstr.DtCare -> 0.U, - RVCInstr.REGrs -> rs, - RVCInstr.REGrt -> rt, - RVCInstr.REGrd -> rd, - RVCInstr.REGrs1 -> rs1, - RVCInstr.REGrs2 -> rs2, - RVCInstr.REGrs1p -> rs1p, - RVCInstr.REGrs2p -> rs2p, - RVCInstr.REGx1 -> 1.U, - RVCInstr.REGx2 -> 2.U - ) - - val rvc_src1 = LookupTree(rvcSrc1Type, RegLookUpTable.map(p => (p._1, p._2))) - val rvc_src2 = LookupTree(rvcSrc2Type, RegLookUpTable.map(p => (p._1, p._2))) - val rvc_dest = LookupTree(rvcDestType, RegLookUpTable.map(p => (p._1, p._2))) - - val rfSrc1 = Mux(isRVC, rvc_src1, rs) - val rfSrc2 = Mux(isRVC, rvc_src2, rt) - val rfDest = Mux(isRVC, rvc_dest, rd) - - val rfWen = !hasIntrOrExceptino && Mux(isRVFD, fpRfWen.asBool(), isrfWen(instrType)) - - // TODO: refactor decode logic - // make non-register addressing to zero, since isu.sb.isBusy(0) 
=== false.B - io.out.bits.ctrl.rfSrc1 := Mux(src1Type === SrcType.pc, 0.U, rfSrc1) - io.out.bits.ctrl.rfSrc2 := Mux(src2Type === SrcType.imm, 0.U, rfSrc2) - io.out.bits.ctrl.rfWen := rfWen - io.out.bits.ctrl.fpWen := fpWen.asBool() - io.out.bits.ctrl.rfDest := Mux(fpWen.asBool() || rfWen, rfDest, 0.U) - - io.out.bits.data := DontCare - val imm = LookupTree(instrType, List( - InstrI -> SignExt(instr(31, 20), XLEN), - InstrS -> SignExt(Cat(instr(31, 25), instr(11, 7)), XLEN), - InstrSA -> SignExt(Cat(instr(31, 25), instr(11, 7)), XLEN), - InstrB -> SignExt(Cat(instr(31), instr(7), instr(30, 25), instr(11, 8), 0.U(1.W)), XLEN), - InstrU -> SignExt(Cat(instr(31, 12), 0.U(12.W)), XLEN),//fixed - InstrJ -> SignExt(Cat(instr(31), instr(19, 12), instr(20), instr(30, 21), 0.U(1.W)), XLEN) - )) - val immrvc = LookupTree(rvcImmType, List( - // InstrIW -> Cat(Fill(20+32, instr(31)), instr(31, 20)),//fixed - RVCInstr.ImmNone -> 0.U(XLEN.W), - RVCInstr.ImmLWSP -> ZeroExt(Cat(instr(3,2), instr(12), instr(6,4), 0.U(2.W)), XLEN), - RVCInstr.ImmLDSP -> ZeroExt(Cat(instr(4,2), instr(12), instr(6,5), 0.U(3.W)), XLEN), - RVCInstr.ImmSWSP -> ZeroExt(Cat(instr(8,7), instr(12,9), 0.U(2.W)), XLEN), - RVCInstr.ImmSDSP -> ZeroExt(Cat(instr(9,7), instr(12,10), 0.U(3.W)), XLEN), - RVCInstr.ImmSW -> ZeroExt(Cat(instr(5), instr(12,10), instr(6), 0.U(2.W)), XLEN), - RVCInstr.ImmSD -> ZeroExt(Cat(instr(6,5), instr(12,10), 0.U(3.W)), XLEN), - RVCInstr.ImmLW -> ZeroExt(Cat(instr(5), instr(12,10), instr(6), 0.U(2.W)), XLEN), - RVCInstr.ImmLD -> ZeroExt(Cat(instr(6,5), instr(12,10), 0.U(3.W)), XLEN), - RVCInstr.ImmJ -> SignExt(Cat(instr(12), instr(8), instr(10,9), instr(6), instr(7), instr(2), instr(11), instr(5,3), 0.U(1.W)), XLEN), - RVCInstr.ImmB -> SignExt(Cat(instr(12), instr(6,5), instr(2), instr(11,10), instr(4,3), 0.U(1.W)), XLEN), - RVCInstr.ImmLI -> SignExt(Cat(instr(12), instr(6,2)), XLEN), - RVCInstr.ImmLUI -> SignExt(Cat(instr(12), instr(6,2), 0.U(12.W)), XLEN), - RVCInstr.ImmADDI -> 
SignExt(Cat(instr(12), instr(6,2)), XLEN), - RVCInstr.ImmADDI16SP-> SignExt(Cat(instr(12), instr(4,3), instr(5), instr(2), instr(6), 0.U(4.W)), XLEN), - RVCInstr.ImmADD4SPN-> ZeroExt(Cat(instr(10,7), instr(12,11), instr(5), instr(6), 0.U(2.W)), XLEN) - // ImmFLWSP -> - // ImmFLDSP -> - )) - io.out.bits.data.imm := Mux(isRVC, immrvc, imm) - - when (fuType === FuType.alu) { - def isLink(reg: UInt) = (reg === 1.U || reg === 5.U) - when (isLink(rfDest) && fuOpType === ALUOpType.jal) { io.out.bits.ctrl.fuOpType := ALUOpType.call } - when (fuOpType === ALUOpType.jalr) { - when (isLink(rfSrc1)) { io.out.bits.ctrl.fuOpType := ALUOpType.ret } - when (isLink(rfDest)) { io.out.bits.ctrl.fuOpType := ALUOpType.call } - } - } - // fix LUI - io.out.bits.ctrl.src1Type := Mux(instr(6,0) === "b0110111".U, SrcType.reg, src1Type) - io.out.bits.ctrl.src2Type := src2Type - io.out.bits.ctrl.src3Type := fpSrc3Type - - // io.out.bits.ctrl.isInvOpcode := (instrType === InstrN) && io.in.valid - io.out.bits.ctrl.isNoopTrap := (instr(31,0) === NOOPTrap.TRAP) && io.in.valid - - //output signals - - io.out.valid := io.in.valid - io.in.ready := !io.in.valid || io.out.fire() && !hasIntr - io.out.bits.cf <> io.in.bits - - Debug(){ - when(io.out.fire()){printf("[IDU] issue: pc %x npc %x instr %x\n", io.out.bits.cf.pc, io.out.bits.cf.pnpc, io.out.bits.cf.instr)} - } - - val intrVec = WireInit(0.U(12.W)) - BoringUtils.addSink(intrVec, "intrVecIDU") - io.out.bits.cf.intrVec.zip(intrVec.asBools).map{ case(x, y) => x := y } - hasIntr := intrVec.orR - - io.out.bits.cf.exceptionVec.map(_ := false.B) - io.out.bits.cf.exceptionVec(illegalInstr) := (!isRVFD && instrType === InstrN && !hasIntr) && io.in.valid - io.out.bits.cf.exceptionVec(instrPageFault) := io.in.bits.exceptionVec(instrPageFault) - - io.out.bits.ctrl.isNoopTrap := (instr === NOOPTrap.TRAP) && io.in.valid - - if (!p.FPGAPlatform) { - val isWFI = (instr === Priviledged.WFI) && io.in.valid - BoringUtils.addSource(isWFI, "isWFI") - } -} - -// Note 
-// C.LWSP is only valid when rd̸=x0; the code points with rd=x0 are reserved -// C.LDSP is only valid when rd̸=x0; the code points with rd=x0 are reserved. diff --git a/src/main/scala/noop/IFU.scala b/src/main/scala/noop/IFU.scala deleted file mode 100644 index 1c363088e91ebc604a39248c1d3587fd420ac825..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/IFU.scala +++ /dev/null @@ -1,115 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ -import bus.simplebus._ - -trait HasResetVector { - val resetVector = 0x40000000L//TODO: set reset vec -} - -class IFU extends NOOPModule with HasResetVector { - val io = IO(new Bundle { - - val imem = new SimpleBusUC(userBits = VAddrBits*2 + 4, addrBits = VAddrBits) - // val pc = Input(UInt(VAddrBits.W)) - val out = Decoupled(new CtrlFlowIO) - - val redirect = Flipped(new RedirectIO) - val flushVec = Output(UInt(4.W)) - val bpFlush = Output(Bool()) - val ipf = Input(Bool()) - }) - - // pc - val pc = RegInit(resetVector.U(VAddrBits.W)) - val pcUpdate = io.redirect.valid || io.imem.req.fire() - val snpc = Mux(pc(1), pc + 2.U, pc + 4.U) // sequential next pc - - val bp1 = Module(new BPU1) - - // - val lateJump = bp1.io.lateJump - val lateJumpLatch = RegInit(false.B) - when(pcUpdate || bp1.io.flush) { - lateJumpLatch := Mux(bp1.io.flush, false.B, lateJump && !lateJumpLatch) - } - val lateJumpTarget = RegEnable(bp1.io.out.target, lateJump) - val lateJumpForceSeq = lateJump && bp1.io.out.valid - val lateJumpForceTgt = lateJumpLatch && !bp1.io.flush - - // predicted next pc - val pnpc = Mux(lateJump, snpc, bp1.io.out.target) - val pbrIdx = bp1.io.brIdx - val npc = Mux(io.redirect.valid, io.redirect.target, Mux(lateJumpLatch, lateJumpTarget, Mux(bp1.io.out.valid, pnpc, snpc))) - val npcIsSeq = Mux(io.redirect.valid , false.B, Mux(lateJumpLatch, false.B, Mux(lateJump, true.B, Mux(bp1.io.out.valid, false.B, true.B)))) - // Debug(){ - // 
printf("[NPC] %x %x %x %x %x %x\n",lateJumpLatch, lateJumpTarget, lateJump, bp1.io.out.valid, pnpc, snpc) - // } - - // val npc = Mux(io.redirect.valid, io.redirect.target, Mux(io.redirectRVC.valid, io.redirectRVC.target, snpc)) - val brIdx = Wire(UInt(4.W)) - // brIdx(0) -> branch at pc offset 0 (mod 4) - // brIdx(1) -> branch at pc offset 2 (mod 4) - // brIdx(2) -> branch at pc offset 6 (mod 8), and this inst is not rvc inst - brIdx := Cat(npcIsSeq, Mux(io.redirect.valid, 0.U, pbrIdx)) - //TODO: BP will be disabled shortly after a redirect request - - bp1.io.in.pc.valid := io.imem.req.fire() // only predict when Icache accepts a request - bp1.io.in.pc.bits := npc // predict one cycle early - // bp1.io.flush := io.redirect.valid - bp1.io.flush := io.redirect.valid - //val bp2 = Module(new BPU2) - //bp2.io.in.bits := io.out.bits - //bp2.io.in.valid := io.imem.resp.fire() - - when (pcUpdate) { - pc := npc - // printf("[IF1] pc=%x\n", pc) - } - - Debug(){ - when(pcUpdate) { - printf("[IFUPC] pc:%x pcUpdate:%d npc:%x RedValid:%d RedTarget:%x LJL:%d LJTarget:%x LJ:%d snpc:%x bpValid:%d pnpn:%x \n",pc, pcUpdate, npc, io.redirect.valid,io.redirect.target,lateJumpLatch,lateJumpTarget,lateJump,snpc,bp1.io.out.valid,pnpc) - //printf(p"[IFUIN] redirect: ${io.redirect} \n") - } - } - - io.flushVec := Mux(io.redirect.valid, "b1111".U, 0.U) - io.bpFlush := false.B - - io.imem.req.bits.apply(addr = Cat(pc(VAddrBits-1,1),0.U(1.W)), //cache will treat it as Cat(pc(63,3),0.U(3.W)) - size = "b11".U, cmd = SimpleBusCmd.read, wdata = 0.U, wmask = 0.U, user = Cat(brIdx(3,0), npc(VAddrBits-1, 0), pc(VAddrBits-1, 0))) - io.imem.req.valid := io.out.ready - //TODO: add ctrlFlow.exceptionVec - io.imem.resp.ready := io.out.ready || io.flushVec(0) - - io.out.bits := DontCare - //inst path only uses 32bit inst, get the right inst according to pc(2) - - Debug(){ - when(io.imem.req.fire()){ - printf("[IFI] pc=%x user=%x %x %x %x \n", io.imem.req.bits.addr, io.imem.req.bits.user.getOrElse(0.U), 
io.redirect.valid, pbrIdx, brIdx) - } - when (io.out.fire()) { - printf("[IFO] pc=%x inst=%x\n", io.out.bits.pc, io.out.bits.instr) - } - } - - // io.out.bits.instr := (if (XLEN == 64) io.imem.resp.bits.rdata.asTypeOf(Vec(2, UInt(32.W)))(io.out.bits.pc(2)) - // else io.imem.resp.bits.rdata) - io.out.bits.instr := io.imem.resp.bits.rdata - io.imem.resp.bits.user.map{ case x => - io.out.bits.pc := x(VAddrBits-1,0) - io.out.bits.pnpc := x(VAddrBits*2-1,VAddrBits) - io.out.bits.brIdx := x(VAddrBits*2 + 3, VAddrBits*2) - } - io.out.bits.exceptionVec(instrPageFault) := io.ipf - io.out.valid := io.imem.resp.valid && !io.flushVec(0) - - BoringUtils.addSource(BoolStopWatch(io.imem.req.valid, io.imem.resp.fire()), "perfCntCondMimemStall") - BoringUtils.addSource(io.flushVec.orR, "perfCntCondMifuFlush") -} diff --git a/src/main/scala/noop/ISU.scala b/src/main/scala/noop/ISU.scala deleted file mode 100644 index 25e137eb201c2ed39ecb7fb62e8b5b1baf47a090..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/ISU.scala +++ /dev/null @@ -1,159 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -trait HasRegFileParameter { - val NRReg = 32 -} - -class RegFile(width:Int, hasZero:Boolean = true) extends HasRegFileParameter with HasNOOPParameter { - val rf = Mem(NRReg, UInt(width.W)) - def read(addr: UInt) : UInt = if(hasZero) Mux(addr === 0.U, 0.U, rf(addr)) else rf(addr) - def write(addr: UInt, data: UInt) = { rf(addr) := data } -} - -class ScoreBoard(hasZero:Boolean = true) extends HasRegFileParameter { - val busy = RegInit(0.U(NRReg.W)) - def isBusy(idx: UInt): Bool = busy(idx) - def mask(idx: UInt) = (1.U(NRReg.W) << idx)(NRReg-1, 0) - def update(setMask: UInt, clearMask: UInt) = { - // When clearMask(i) and setMask(i) are both set, setMask(i) wins. - // This can correctly record the busy bit when reg(i) is written - // and issued at the same cycle. 
- // Note that rf(0) is always free when hasZero==true. - if(hasZero) busy := Cat(((busy & ~clearMask) | setMask)(NRReg-1, 1), 0.U(1.W)) - else busy := ((busy & ~clearMask) | setMask) - } -} - -class ISU(implicit val p: NOOPConfig) extends NOOPModule with HasRegFileParameter { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new DecodeIO)) - val out = Decoupled(new DecodeIO) - val wb = Flipped(new WriteBackIO) - val flush = Input(Bool()) - val forward = Flipped(new ForwardIO) - }) - - io.out.bits := DontCare - val rfSrc1 = io.in.bits.ctrl.rfSrc1 - val rfSrc2 = io.in.bits.ctrl.rfSrc2 - val rfDest = io.in.bits.ctrl.rfDest - - def isDepend(rfSrc: UInt, rfDest: UInt, wen: Bool): Bool = (rfSrc =/= 0.U) && (rfSrc === rfDest) && wen - - val forwardRfWen = io.forward.wb.rfWen && io.forward.valid - val dontForward = (io.forward.fuType =/= FuType.alu) && (io.forward.fuType =/= FuType.lsu) - val src1DependEX = isDepend(rfSrc1, io.forward.wb.rfDest, forwardRfWen) - val src2DependEX = isDepend(rfSrc2, io.forward.wb.rfDest, forwardRfWen) - val src1DependWB = isDepend(rfSrc1, io.wb.rfDest, io.wb.rfWen) - val src2DependWB = isDepend(rfSrc2, io.wb.rfDest, io.wb.rfWen) - - val src1ForwardNextCycle = src1DependEX && !dontForward - val src2ForwardNextCycle = src2DependEX && !dontForward - val src1Forward = src1DependWB && Mux(dontForward, !src1DependEX, true.B) - val src2Forward = src2DependWB && Mux(dontForward, !src2DependEX, true.B) - - val sb = new ScoreBoard - val src1Ready = !sb.isBusy(rfSrc1) || src1ForwardNextCycle || src1Forward - val src2Ready = !sb.isBusy(rfSrc2) || src2ForwardNextCycle || src2Forward - - val fpr = new RegFile(width = XLEN, hasZero = false) - - val (fprSrcReady,fprSrcData):(Bool,Array[UInt]) = if(HasFPU){ - val fsb = new ScoreBoard(hasZero = false) - val forwardFpWen = io.forward.wb.fpWen && io.forward.valid - - when (io.wb.fpWen) { - fpr.write(io.wb.rfDest, io.wb.rfData) - } - - val fsbClearMask = Mux(io.wb.fpWen && !isDepend(io.wb.rfDest, 
io.forward.wb.rfDest, forwardFpWen), - fsb.mask(io.wb.rfDest), 0.U(NRReg.W)) - val fsbSetMask = Mux(io.out.fire() && io.in.bits.ctrl.fpWen, fsb.mask(rfDest), 0.U) - when (io.flush) { fsb.update(0.U, Fill(NRReg, 1.U(1.W))) } - .otherwise { fsb.update(fsbSetMask, fsbClearMask) } - - val instr = io.in.bits.cf.instr - - val (fpSrc1,fpSrc2,fpSrc3) = (rfSrc1, rfSrc2, instr(31, 27)) - val srcs = Seq(fpSrc1, fpSrc2, fpSrc3).zip(Seq( - io.in.bits.ctrl.src1Type, - io.in.bits.ctrl.src2Type, - io.in.bits.ctrl.src3Type - )) - val dataVec = Array.fill(3)(Wire(UInt(XLEN.W))) - // result - (srcs.zipWithIndex.map({ - case ((src, t),i) => - val dependEX = isDepend(src, io.forward.wb.rfDest, forwardFpWen) - val dependWB = isDepend(src, io.wb.rfDest, io.wb.fpWen) - val forwardEX = dependEX && !dontForward - val forwardWB = dependWB && Mux(dontForward, !dependEX, true.B) - dataVec(i) := MuxCase(fpr.read(src), Seq( - forwardEX -> io.forward.wb.rfData, - forwardWB -> io.wb.rfData - )) - (!fsb.busy(src) || forwardEX || forwardWB) || (t =/= SrcType.fp) - }).reduceLeft(_ && _), dataVec) - } else (true.B, Array.fill(3)(0.U)) - - io.out.valid := io.in.valid && src1Ready && src2Ready && fprSrcReady - - val rf = new RegFile(XLEN) -// io.out.bits.data.src1 := Mux1H(List( -// (io.in.bits.ctrl.src1Type === SrcType.pc) -> SignExt(io.in.bits.cf.pc, AddrBits), -// src1ForwardNextCycle -> io.forward .wb.rfData, -// (src1Forward && !src1ForwardNextCycle) -> io.wb.rfData, -// ((io.in.bits.ctrl.src1Type =/= SrcType.pc) && !src1ForwardNextCycle && !src1Forward) -> rf.read(rfSrc1) -// )) -// io.out.bits.data.src2 := Mux1H(List( -// (io.in.bits.ctrl.src2Type =/= SrcType.reg) -> io.in.bits.data.imm, -// src2ForwardNextCycle -> io.forward.wb.rfData, -// (src2Forward && !src2ForwardNextCycle) -> io.wb.rfData, -// ((io.in.bits.ctrl.src2Type === SrcType.reg) && !src2ForwardNextCycle && !src2Forward) -> rf.read(rfSrc2) -// )) - - io.out.bits.data.src1 := MuxCase(rf.read(rfSrc1), Seq( - (io.in.bits.ctrl.src1Type 
=== SrcType.fp) -> fprSrcData(0), - (io.in.bits.ctrl.src1Type === SrcType.pc) -> SignExt(io.in.bits.cf.pc, AddrBits), - src1ForwardNextCycle -> io.forward.wb.rfData, - src1Forward -> io.wb.rfData - )) - io.out.bits.data.src2 := MuxCase(rf.read(rfSrc2), Seq( - (io.in.bits.ctrl.src2Type === SrcType.fp) -> fprSrcData(1), - (io.in.bits.ctrl.src2Type =/= SrcType.reg) -> io.in.bits.data.imm, - src2ForwardNextCycle -> io.forward.wb.rfData, - src2Forward -> io.wb.rfData - )) - - io.out.bits.data.imm := Mux(io.in.bits.ctrl.src3Type===SrcType.fp, fprSrcData(2), io.in.bits.data.imm) - - io.out.bits.cf <> io.in.bits.cf - io.out.bits.ctrl := io.in.bits.ctrl - io.out.bits.ctrl.isSrc1Forward := src1ForwardNextCycle - io.out.bits.ctrl.isSrc2Forward := src2ForwardNextCycle - - when (io.wb.rfWen) { rf.write(io.wb.rfDest, io.wb.rfData) } - - val wbClearMask = Mux(io.wb.rfWen && !isDepend(io.wb.rfDest, io.forward.wb.rfDest, forwardRfWen), sb.mask(io.wb.rfDest), 0.U(NRReg.W)) - val isuFireSetMask = Mux(io.out.fire() && io.in.bits.ctrl.rfWen, sb.mask(rfDest), 0.U) - when (io.flush) { sb.update(0.U, Fill(NRReg, 1.U(1.W))) } - .otherwise { sb.update(isuFireSetMask, wbClearMask) } - - io.in.ready := !io.in.valid || io.out.fire() - - // read after write - BoringUtils.addSource(io.in.valid && !io.out.valid, "perfCntCondMrawStall") - BoringUtils.addSource(io.out.valid && !io.out.fire(), "perfCntCondMexuBusy") - - if (!p.FPGAPlatform) { - val gRegs = (0 until NRReg).map(i => rf.read(i.U)) - val fRegs = (0 until NRReg).map(i => if(HasFPU) fpr.read(i.U) else 0.U) - BoringUtils.addSource(VecInit(gRegs ++ fRegs), "difftestRegs") - } -} diff --git a/src/main/scala/noop/NOOP.scala b/src/main/scala/noop/NOOP.scala deleted file mode 100644 index eadde5b5e97488c285e57895f492f3172d927785..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/NOOP.scala +++ /dev/null @@ -1,117 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils 
-import bus.simplebus._ -import bus.axi4._ -import utils._ - -trait HasNOOPParameter { - val XLEN = 64 - val HasMExtension = true - val HasCExtension = true - val HasDiv = true - val HasIcache = true - val HasDcache = true - val EnableStoreQueue = false - val AddrBits = 64 // AddrBits is used in some cases - val VAddrBits = 39 // VAddrBits is Virtual Memory addr bits - val PAddrBits = 32 // PAddrBits is Phyical Memory addr bits - val AddrBytes = AddrBits / 8 // unused - val DataBits = XLEN - val DataBytes = DataBits / 8 - val HasFPU = true -} - -abstract class NOOPModule extends Module with HasNOOPParameter with HasExceptionNO -abstract class NOOPBundle extends Bundle with HasNOOPParameter - -case class NOOPConfig ( - FPGAPlatform: Boolean = true, - EnableDebug: Boolean = false -) - -object AddressSpace { - // (start, size) - def mmio = List((0x0000000040000000L, 0x0000000010000000L)) - def dram = (0x0000000080000000L, 0x0000000010000000L) - - //def isMMIO(addr: UInt) = mmio.map(range => ((addr & ~((range._2 - 1).U(32.W))) === range._1.U)).reduce(_ || _) - def isMMIO(addr: UInt) = addr(31,28) === "h4".U -} - -class NOOP(implicit val p: NOOPConfig) extends NOOPModule { - val io = IO(new Bundle { - val imem = new SimpleBusC - val dmem = new SimpleBusC - val mmio = new SimpleBusUC - val frontend = Flipped(new SimpleBusUC) - }) - - val ifu = Module(new IFU) - val idu1 = Module(new IDU1) - val idu2 = Module(new IDU2) - val isu = Module(new ISU) - val exu = Module(new EXU) - val wbu = Module(new WBU) - - def pipelineConnect2[T <: Data](left: DecoupledIO[T], right: DecoupledIO[T], - isFlush: Bool, entries: Int = 4, pipe: Boolean = false) = { - right <> FlushableQueue(left, isFlush, entries = entries, pipe = pipe) - } - - pipelineConnect2(ifu.io.out, idu1.io.in, ifu.io.flushVec(0)) - PipelineConnect(idu1.io.out, idu2.io.in, idu2.io.out.fire(), ifu.io.flushVec(1)) - PipelineConnect(idu2.io.out, isu.io.in, isu.io.out.fire(), ifu.io.flushVec(1)) - PipelineConnect(isu.io.out, 
exu.io.in, exu.io.out.fire(), ifu.io.flushVec(2)) - PipelineConnect(exu.io.out, wbu.io.in, true.B, ifu.io.flushVec(3)) - idu1.io.flush := ifu.io.flushVec(1) - idu2.io.flush := ifu.io.flushVec(1) - isu.io.flush := ifu.io.flushVec(2) - exu.io.flush := ifu.io.flushVec(3) - - Debug() { - printf("------------------------ TIMER: %d ------------------------\n", GTimer()) - printf("flush = %b, ifu:(%d,%d), idu1:(%d,%d), idu2:(%d,%d), isu:(%d,%d), exu:(%d,%d), wbu: (%d,%d)\n", - ifu.io.flushVec.asUInt, ifu.io.out.valid, ifu.io.out.ready, - idu1.io.in.valid, idu1.io.in.ready, idu2.io.in.valid, idu2.io.in.ready, isu.io.in.valid, isu.io.in.ready, - exu.io.in.valid, exu.io.in.ready, wbu.io.in.valid, wbu.io.in.ready) - when (ifu.io.out.valid) { printf("IFU: pc = 0x%x, instr = 0x%x, pnpc = 0x%x\n", ifu.io.out.bits.pc, ifu.io.out.bits.instr, ifu.io.out.bits.pnpc)} ; - when (idu1.io.in.valid) { printf("ID1: pc = 0x%x, instr = 0x%x, pnpc = 0x%x\n", idu1.io.in.bits.pc, idu1.io.in.bits.instr, idu1.io.in.bits.pnpc) } - when (idu2.io.in.valid) { printf("ID2: pc = 0x%x, instr = 0x%x, pnpc = 0x%x\n", idu2.io.in.bits.pc, idu2.io.in.bits.instr, idu2.io.in.bits.pnpc) } - when (isu.io.in.valid) { printf("ISU: pc = 0x%x, pnpc = 0x%x\n", isu.io.in.bits.cf.pc, isu.io.in.bits.cf.pnpc)} ; - when (exu.io.in.valid) { printf("EXU: pc = 0x%x, pnpc = 0x%x\n", exu.io.in.bits.cf.pc, exu.io.in.bits.cf.pnpc)} ; - when (wbu.io.in.valid) { printf("WBU: pc = 0x%x rfWen:%d rfDest:%d rfData:%x Futype:%x\n", wbu.io.in.bits.decode.cf.pc, wbu.io.in.bits.decode.ctrl.rfWen, wbu.io.in.bits.decode.ctrl.rfDest, wbu.io.wb.rfData, wbu.io.in.bits.decode.ctrl.fuType )} - // when (io.in.valid) { printf("TIMER: %d WBU: pc = 0x%x wen %x wdata %x mmio %x intrNO %x\n", GTimer(), io.in.bits.decode.cf.pc, io.wb.rfWen, io.wb.rfData, io.in.bits.isMMIO, io.in.bits.intrNO) } - - // printf(p"IFUO: redirectIO:${ifu.io.out.bits.redirect}\n") ; printf("IFUO: exceptionVec: %x\n", ifu.io.out.bits.exceptionVec.asUInt)} - // printf(p"IDUO: 
redirectIO:${idu.io.out.bits.cf.redirect} redirectIOC:${idu.io.redirect}\n") ; printf("IDUO: exceptionVec:%x\n", idu.io.out.bits.cf.exceptionVec.asUInt)} - // printf(p"ISUO: ${isu.io.out.bits.cf.redirect}\n") ; printf("ISUO: exceptionVec:%x\n", isu.io.out.bits.cf.exceptionVec.asUInt)} - when (exu.io.out.bits.decode.cf.redirect.valid) { printf("EXUO: redirect valid:%d target:%x\n", exu.io.out.bits.decode.cf.redirect.valid, exu.io.out.bits.decode.cf.redirect.target) } - // when (wbu.io.in.valid) { printf("WBU: pc = 0x%x rfWen:%d rfDest:%d rfData:%x Futype:%x commits(0):%x commits(1):%x commits(3):%x\n", wbu.io.in.bits.decode.cf.pc, wbu.io.in.bits.decode.ctrl.rfWen, wbu.io.in.bits.decode.ctrl.rfDest, wbu.io.wb.rfData, wbu.io.in.bits.decode.ctrl.fuType, wbu.io.in.bits.commits(0), wbu.io.in.bits.commits(1), wbu.io.in.bits.commits(3)) } - - } - - isu.io.wb <> wbu.io.wb - ifu.io.redirect <> wbu.io.redirect - // forward - isu.io.forward <> exu.io.forward - - val mmioXbar = Module(new SimpleBusCrossbarNto1(if (HasDcache) 2 else 3)) - val dmemXbar = Module(new SimpleBusCrossbarNto1(4)) - - val itlb = TLB(in = ifu.io.imem, mem = dmemXbar.io.in(1), flush = ifu.io.flushVec(0) | ifu.io.bpFlush, csrMMU = exu.io.memMMU.imem)(TLBConfig(name = "itlb", userBits = VAddrBits*2 + 4, totalEntry = 4)) - ifu.io.ipf := itlb.io.ipf - io.imem <> Cache(in = itlb.io.out, mmio = mmioXbar.io.in.take(1), flush = Fill(2, ifu.io.flushVec(0) | ifu.io.bpFlush), empty = itlb.io.cacheEmpty)( - CacheConfig(ro = true, name = "icache", userBits = VAddrBits*2 + 4)) - - val dtlb = TLB(in = exu.io.dmem, mem = dmemXbar.io.in(2), flush = false.B, csrMMU = exu.io.memMMU.dmem)(TLBConfig(name = "dtlb", totalEntry = 64)) - dmemXbar.io.in(0) <> dtlb.io.out - io.dmem <> Cache(in = dmemXbar.io.out, mmio = mmioXbar.io.in.drop(1), flush = "b00".U, empty = dtlb.io.cacheEmpty, enable = HasDcache)(CacheConfig(ro = false, name = "dcache")) - - // Make DMA access through L1 DCache to keep coherence - dmemXbar.io.in(3) <> 
io.frontend - - io.mmio <> mmioXbar.io.out -} diff --git a/src/main/scala/noop/NOOPTrap.scala b/src/main/scala/noop/NOOPTrap.scala deleted file mode 100644 index efb4d0dc8a072f01a8b94ce4bf17ed150bfa9a56..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/NOOPTrap.scala +++ /dev/null @@ -1,14 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object NOOPTrap extends HasInstrType { - def StateGoodTrap = 0.U - def StateBadTrap = 1.U - def StateInvOpcode = 2.U - def StateRunning = 3.U - - def TRAP = BitPat("b????????????_?????_000_?????_1101011") - val table = Array(TRAP -> List(InstrI, FuType.alu, ALUOpType.add)) -} diff --git a/src/main/scala/noop/TLB.scala b/src/main/scala/noop/TLB.scala deleted file mode 100644 index d6fb58ee2676486949fcd68f0c55c0b0f8a17924..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/TLB.scala +++ /dev/null @@ -1,609 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import bus.simplebus._ -import bus.axi4._ -import utils._ - -trait Sv39Const extends HasNOOPParameter{ - val Level = 3 - val offLen = 12 - val ppn0Len = 9 - val ppn1Len = 9 - val ppn2Len = PAddrBits - offLen - ppn0Len - ppn1Len // 2 - val ppnLen = ppn2Len + ppn1Len + ppn0Len - val vpn2Len = 9 - val vpn1Len = 9 - val vpn0Len = 9 - val vpnLen = vpn2Len + vpn1Len + vpn0Len - - //val paddrLen = PAddrBits - //val vaddrLen = VAddrBits - val satpLen = XLEN - val satpModeLen = 4 - val asidLen = 16 - val flagLen = 8 - - val ptEntryLen = XLEN - val satpResLen = XLEN - ppnLen - satpModeLen - asidLen - //val vaResLen = 25 // unused - //val paResLen = 25 // unused - val pteResLen = XLEN - ppnLen - 2 - flagLen - - def vaBundle = new Bundle { - val vpn2 = UInt(vpn2Len.W) - val vpn1 = UInt(vpn1Len.W) - val vpn0 = UInt(vpn0Len.W) - val off = UInt( offLen.W) - } - - def vaBundle2 = new Bundle { - val vpn = UInt(vpnLen.W) - val off = UInt(offLen.W) - } - - def vaBundle3 = new Bundle { - 
val vpn = UInt(vpnLen.W) - val off = UInt(offLen.W) - } - - def vpnBundle = new Bundle { - val vpn2 = UInt(vpn2Len.W) - val vpn1 = UInt(vpn1Len.W) - val vpn0 = UInt(vpn0Len.W) - } - - def paBundle = new Bundle { - val ppn2 = UInt(ppn2Len.W) - val ppn1 = UInt(ppn1Len.W) - val ppn0 = UInt(ppn0Len.W) - val off = UInt( offLen.W) - } - - def paBundle2 = new Bundle { - val ppn = UInt(ppnLen.W) - val off = UInt(offLen.W) - } - - def paddrApply(ppn: UInt, vpnn: UInt):UInt = { - Cat(Cat(ppn, vpnn), 0.U(3.W)) - } - - def pteBundle = new Bundle { - val reserved = UInt(pteResLen.W) - val ppn = UInt(ppnLen.W) - val rsw = UInt(2.W) - val flag = new Bundle { - val d = UInt(1.W) - val a = UInt(1.W) - val g = UInt(1.W) - val u = UInt(1.W) - val x = UInt(1.W) - val w = UInt(1.W) - val r = UInt(1.W) - val v = UInt(1.W) - } - } - - def satpBundle = new Bundle { - val mode = UInt(satpModeLen.W) - val asid = UInt(asidLen.W) - val res = UInt(satpResLen.W) - val ppn = UInt(ppnLen.W) - } - - def flagBundle = new Bundle { - val d = Bool()//UInt(1.W) - val a = Bool()//UInt(1.W) - val g = Bool()//UInt(1.W) - val u = Bool()//UInt(1.W) - val x = Bool()//UInt(1.W) - val w = Bool()//UInt(1.W) - val r = Bool()//UInt(1.W) - val v = Bool()//UInt(1.W) - } - - def maskPaddr(ppn:UInt, vaddr:UInt, mask:UInt) = { - MaskData(vaddr, Cat(ppn, 0.U(offLen.W)), Cat(Fill(ppn2Len, 1.U(1.W)), mask, 0.U(offLen.W))) - } - - def MaskEQ(mask: UInt, pattern: UInt, vpn: UInt) = { - (Cat("h1ff".U(vpn2Len.W), mask) & pattern) === (Cat("h1ff".U(vpn2Len.W), mask) & vpn) - } - -} - -case class TLBConfig ( - name: String = "tlb", - userBits: Int = 0, - - totalEntry: Int = 4, - ways: Int = 4 -) - -sealed trait HasTlbConst extends Sv39Const{ - implicit val tlbConfig: TLBConfig - - val AddrBits: Int - val PAddrBits: Int - val VAddrBits: Int - val XLEN: Int - - val tlbname = tlbConfig.name - val userBits = tlbConfig.userBits - - val maskLen = vpn0Len + vpn1Len // 18 - val metaLen = vpnLen + asidLen + maskLen + flagLen // 27 + 16 
+ 18 + 8 = 69, is asid necessary - val dataLen = ppnLen + PAddrBits // - val tlbLen = metaLen + dataLen - val Ways = tlbConfig.ways - val TotalEntry = tlbConfig.totalEntry - val Sets = TotalEntry / Ways - val IndexBits = log2Up(Sets) - val TagBits = vpnLen - IndexBits - - val debug = true //&& tlbname == "dtlb" - - def vaddrTlbBundle = new Bundle { - val tag = UInt(TagBits.W) - val index = UInt(IndexBits.W) - val off = UInt(offLen.W) - } - - def metaBundle = new Bundle { - val vpn = UInt(vpnLen.W) - val asid = UInt(asidLen.W) - val mask = UInt(maskLen.W) // to support super page - val flag = UInt(flagLen.W) - } - - def dataBundle = new Bundle { - val ppn = UInt(ppnLen.W) - val pteaddr = UInt(PAddrBits.W) // pte addr, used to write back pte when flag changes (flag.d, flag.v) - } - - def tlbBundle = new Bundle { - val vpn = UInt(vpnLen.W) - val asid = UInt(asidLen.W) - val mask = UInt(maskLen.W) - val flag = UInt(flagLen.W) - val ppn = UInt(ppnLen.W) - val pteaddr = UInt(PAddrBits.W) - } - - def tlbBundle2 = new Bundle { - val meta = UInt(metaLen.W) - val data = UInt(dataLen.W) - } - - def getIndex(vaddr: UInt) : UInt = { - vaddr.asTypeOf(vaddrTlbBundle).index - } -} - -sealed abstract class TlbBundle(implicit tlbConfig: TLBConfig) extends Bundle with HasNOOPParameter with HasTlbConst with Sv39Const -sealed abstract class TlbModule(implicit tlbConfig: TLBConfig) extends Module with HasNOOPParameter with HasTlbConst with Sv39Const with HasCSRConst - -class TLBMDWriteBundle (val IndexBits: Int, val Ways: Int, val tlbLen: Int) extends Bundle with HasNOOPParameter with Sv39Const { - val wen = Output(Bool()) - val windex = Output(UInt(IndexBits.W)) - val waymask = Output(UInt(Ways.W)) - val wdata = Output(UInt(tlbLen.W)) - - def apply(wen: UInt, windex: UInt, waymask: UInt, vpn: UInt, asid: UInt, mask: UInt, flag: UInt, ppn: UInt, pteaddr: UInt) { - this.wen := wen - this.windex := windex - this.waymask := waymask - this.wdata := Cat(vpn, asid, mask, flag, ppn, pteaddr) - 
} -} - -class TLBMD(implicit val tlbConfig: TLBConfig) extends TlbModule { - val io = IO(new Bundle { - val tlbmd = Output(Vec(Ways, UInt(tlbLen.W))) - val write = Flipped(new TLBMDWriteBundle(IndexBits = IndexBits, Ways = Ways, tlbLen = tlbLen)) - val rindex = Input(UInt(IndexBits.W)) - val ready = Output(Bool()) - }) - - //val tlbmd = Reg(Vec(Ways, UInt(tlbLen.W))) - val tlbmd = Mem(Sets, Vec(Ways, UInt(tlbLen.W))) - io.tlbmd := tlbmd(io.rindex) - - //val reset = WireInit(false.B) - val resetState = RegInit(true.B)//RegEnable(true.B, init = true.B, reset) - val (resetSet, resetFinish) = Counter(resetState, Sets) - when (resetFinish) { resetState := false.B } - - val writeWen = io.write.wen//WireInit(false.B) - val writeSetIdx = io.write.windex - val writeWayMask = io.write.waymask - val writeData = io.write.wdata - - val wen = Mux(resetState, true.B, writeWen) - val setIdx = Mux(resetState, resetSet, writeSetIdx) - val waymask = Mux(resetState, Fill(Ways, "b1".U), writeWayMask) - val dataword = Mux(resetState, 0.U, writeData) - val wdata = VecInit(Seq.fill(Ways)(dataword)) - - when (wen) { tlbmd.write(setIdx, wdata, waymask.asBools) } - - io.ready := !resetState - def rready() = !resetState - def wready() = !resetState -} - -class TLB(implicit val tlbConfig: TLBConfig) extends TlbModule{ - val io = IO(new Bundle { - val in = Flipped(new SimpleBusUC(userBits = userBits, addrBits = VAddrBits)) - val out = new SimpleBusUC(userBits = userBits) - - val mem = new SimpleBusUC(userBits = userBits) - val flush = Input(Bool()) - val csrMMU = new MMUIO - val cacheEmpty = Input(Bool()) - val ipf = Output(Bool()) - }) - - val satp = WireInit(0.U(XLEN.W)) - BoringUtils.addSink(satp, "CSRSATP") - - // tlb exec - val tlbExec = Module(new TLBExec) - val tlbEmpty = Module(new TLBEmpty) - val mdTLB = Module(new TLBMD) - val mdUpdate = Wire(Bool()) - - tlbExec.io.flush := io.flush - tlbExec.io.satp := satp - tlbExec.io.mem <> io.mem - tlbExec.io.pf <> io.csrMMU - tlbExec.io.md <> 
RegEnable(mdTLB.io.tlbmd, mdUpdate) - tlbExec.io.mdReady := mdTLB.io.ready - mdTLB.io.rindex := getIndex(io.in.req.bits.addr) - mdTLB.io.write <> tlbExec.io.mdWrite - - io.ipf := false.B - - // meta reset - val flushTLB = WireInit(false.B) - BoringUtils.addSink(flushTLB, "MOUFlushTLB") - mdTLB.reset := reset.asBool || flushTLB - Debug() { - when(flushTLB && GTimer() > 77437080.U) { - printf("%d sfence_vma req.pc:%x valid:%d\n", GTimer(), io.in.req.bits.addr, io.in.req.valid) - } - } - - // VM enable && io - val vmEnable = satp.asTypeOf(satpBundle).mode === 8.U && (io.csrMMU.priviledgeMode < ModeM) - - def PipelineConnectTLB[T <: Data](left: DecoupledIO[T], right: DecoupledIO[T], update: Bool, rightOutFire: Bool, isFlush: Bool, vmEnable: Bool) = { - val valid = RegInit(false.B) - when (rightOutFire) { valid := false.B } - when (left.valid && right.ready && vmEnable) { valid := true.B } - when (isFlush) { valid := false.B } - - left.ready := right.ready - right.bits <> RegEnable(left.bits, left.valid && right.ready) - right.valid := valid //&& !isFlush - - update := left.valid && right.ready - } - - tlbEmpty.io.in <> DontCare - tlbEmpty.io.out.ready := DontCare - PipelineConnectTLB(io.in.req, tlbExec.io.in, mdUpdate, tlbExec.io.isFinish, io.flush, vmEnable) - if(tlbname == "dtlb") { - PipelineConnect(tlbExec.io.out, tlbEmpty.io.in, tlbEmpty.io.out.fire(), io.flush) - } - when(!vmEnable) { - tlbExec.io.out.ready := true.B // let existed request go out - if( tlbname == "dtlb") { tlbEmpty.io.out.ready := true.B } - io.out.req.valid := io.in.req.valid - io.in.req.ready := io.out.req.ready - io.out.req.bits.addr := io.in.req.bits.addr(PAddrBits-1, 0) - io.out.req.bits.size := io.in.req.bits.size - io.out.req.bits.cmd := io.in.req.bits.cmd - io.out.req.bits.wmask := io.in.req.bits.wmask - io.out.req.bits.wdata := io.in.req.bits.wdata - io.out.req.bits.user.map(_ := io.in.req.bits.user.getOrElse(0.U)) - }.otherwise { - if (tlbname == "dtlb") { io.out.req <> tlbEmpty.io.out} 
- else { io.out.req <> tlbExec.io.out } - } - io.out.resp <> io.in.resp - - // lsu need dtlb signals - if(tlbname == "dtlb") { - val alreadyOutFinish = RegEnable(true.B, init=false.B, tlbExec.io.out.valid && !tlbExec.io.out.ready) - when(alreadyOutFinish && tlbExec.io.out.fire()) { alreadyOutFinish := false.B} - val tlbFinish = (tlbExec.io.out.valid && !alreadyOutFinish) || tlbExec.io.pf.isPF() - BoringUtils.addSource(tlbFinish, "DTLBFINISH") - BoringUtils.addSource(io.csrMMU.isPF(), "DTLBPF") - BoringUtils.addSource(vmEnable, "DTLBENABLE") - } - - // instruction page fault - if (tlbname == "itlb") { - when (tlbExec.io.ipf && vmEnable) { - tlbExec.io.out.ready := io.cacheEmpty && io.in.resp.ready - io.out.req.valid := false.B - } - - when (tlbExec.io.ipf && vmEnable && io.cacheEmpty) { - io.in.resp.valid := true.B - io.in.resp.bits.rdata := 0.U - io.in.resp.bits.cmd := SimpleBusCmd.readLast - io.in.resp.bits.user.map(_ := tlbExec.io.in.bits.user.getOrElse(0.U)) - io.ipf := tlbExec.io.ipf - } - } - - Debug() { - if (debug) { - printf("[TLB-" + tlbname+ "]: Timer:%d---------\n", GTimer()) - printf("[TLB-" + tlbname+ "]: InReq(%d, %d) InResp(%d, %d) OutReq(%d, %d) OutResp(%d, %d) vmEnable:%d mode:%d\n", io.in.req.valid, io.in.req.ready, io.in.resp.valid, io.in.resp.ready, io.out.req.valid, io.out.req.ready, io.out.resp.valid, io.out.resp.ready, vmEnable, io.csrMMU.priviledgeMode) - printf("[TLB-" + tlbname+ "]: InReq: addr:%x cmd:%d wdata:%x OutReq: addr:%x cmd:%x wdata:%x\n", io.in.req.bits.addr, io.in.req.bits.cmd, io.in.req.bits.wdata, io.out.req.bits.addr, io.out.req.bits.cmd, io.out.req.bits.wdata) - printf("[TLB-" + tlbname+ "]: OutResp: rdata:%x cmd:%x Inresp: rdata:%x cmd:%x\n", io.out.resp.bits.rdata, io.out.resp.bits.cmd, io.in.resp.bits.rdata, io.in.resp.bits.cmd) - printf("[TLB-" + tlbname+ "]: satp:%x flush:%d cacheEmpty:%d instrPF:%d loadPF:%d storePF:%d \n", satp, io.flush, io.cacheEmpty, io.ipf, io.csrMMU.loadPF, io.csrMMU.storePF) - } - } - -} - 
-class TLBExec(implicit val tlbConfig: TLBConfig) extends TlbModule{ - val io = IO(new Bundle { - val in = Flipped(Decoupled(new SimpleBusReqBundle(userBits = userBits, addrBits = VAddrBits))) - val out = Decoupled(new SimpleBusReqBundle(userBits = userBits)) - - val md = Input(Vec(Ways, UInt(tlbLen.W))) - val mdWrite = new TLBMDWriteBundle(IndexBits = IndexBits, Ways = Ways, tlbLen = tlbLen) - val mdReady = Input(Bool()) - - val mem = new SimpleBusUC(userBits = userBits) - val flush = Input(Bool()) - val satp = Input(UInt(XLEN.W)) - val pf = new MMUIO - val ipf = Output(Bool()) - val isFinish = Output(Bool()) - }) - - val md = io.md//RegEnable(mdTLB.io.tlbmd, io.in.ready) - - // lazy renaming - val req = io.in.bits - val vpn = req.addr.asTypeOf(vaBundle2).vpn.asTypeOf(vpnBundle) - val pf = io.pf - val satp = io.satp.asTypeOf(satpBundle) - - // pf init - pf.loadPF := false.B - pf.storePF := false.B - pf.addr := req.addr - - // check hit or miss - val hitVec = VecInit(md.map(m => m.asTypeOf(tlbBundle).flag.asTypeOf(flagBundle).v && (m.asTypeOf(tlbBundle).asid === satp.asid) && MaskEQ(m.asTypeOf(tlbBundle).mask, m.asTypeOf(tlbBundle).vpn, vpn.asUInt))).asUInt - val hit = io.in.valid && hitVec.orR - val miss = io.in.valid && !hitVec.orR - - val victimWaymask = if (Ways > 1) (1.U << LFSR64()(log2Up(Ways)-1,0)) else "b1".U - val waymask = Mux(hit, hitVec, victimWaymask) - - val loadPF = WireInit(false.B) - val storePF = WireInit(false.B) - - // hit - val hitMeta = Mux1H(waymask, md).asTypeOf(tlbBundle2).meta.asTypeOf(metaBundle) - val hitData = Mux1H(waymask, md).asTypeOf(tlbBundle2).data.asTypeOf(dataBundle) - val hitFlag = hitMeta.flag.asTypeOf(flagBundle) - val hitMask = hitMeta.mask - // hit write back pte.flag - val hitinstrPF = WireInit(false.B) - val hitWB = hit && (!hitFlag.a || !hitFlag.d && req.isWrite()) && !hitinstrPF && !(loadPF || storePF || io.pf.isPF()) - val hitRefillFlag = Cat(req.isWrite().asUInt, 1.U(1.W), 0.U(6.W)) | hitFlag.asUInt - val hitWBStore 
= RegEnable(Cat(0.U(10.W), hitData.ppn, 0.U(2.W), hitRefillFlag), hitWB) - - // hit permission check - val hitCheck = hit /*&& hitFlag.v */&& !(pf.priviledgeMode === ModeU && !hitFlag.u) && !(pf.priviledgeMode === ModeS && hitFlag.u && !pf.status_sum) - val hitExec = hitCheck && hitFlag.x - val hitLoad = hitCheck && (hitFlag.r || pf.status_mxr && hitFlag.x) - val hitStore = hitCheck && hitFlag.w - - val isAMO = WireInit(false.B) - if (tlbname == "dtlb") { - BoringUtils.addSink(isAMO, "ISAMO") - } - - io.pf.loadPF := RegNext(loadPF, init =false.B) - io.pf.storePF := RegNext(storePF, init = false.B) - - if (tlbname == "itlb") { hitinstrPF := !hitExec && hit} - if (tlbname == "dtlb") { - loadPF := !hitLoad && req.isRead() && hit && !isAMO - storePF := (!hitStore && req.isWrite() && hit) || (!hitLoad && req.isRead() && hit && isAMO) - } - - // miss - val s_idle :: s_memReadReq :: s_memReadResp :: s_write_pte :: s_wait_resp :: s_miss_slpf :: Nil = Enum(6) - val state = RegInit(s_idle) - val level = RegInit(Level.U(log2Up(Level).W)) - - val memRespStore = Reg(UInt(XLEN.W)) - val missMask = WireInit("h3ffff".U(maskLen.W)) - val missMaskStore = Reg(UInt(maskLen.W)) - val missMetaRefill = WireInit(false.B) - val missRefillFlag = WireInit(0.U(8.W)) - val memRdata = io.mem.resp.bits.rdata.asTypeOf(pteBundle) - val raddr = Reg(UInt(PAddrBits.W)) - val alreadyOutFire = RegEnable(true.B, init = false.B, io.out.fire) - - //handle flush - val needFlush = RegInit(false.B) - val ioFlush = io.flush - val isFlush = needFlush || ioFlush - when (ioFlush && (state =/= s_idle)) { needFlush := true.B} - when (io.out.fire() && needFlush) { needFlush := false.B} - - val missIPF = RegInit(false.B) - - // state machine to handle miss(ptw) and pte-writing-back - switch (state) { - is (s_idle) { - when (!ioFlush && hitWB) { - state := s_write_pte - needFlush := false.B - alreadyOutFire := false.B - }.elsewhen (miss && !ioFlush) { - state := s_memReadReq - raddr := paddrApply(satp.ppn, vpn.vpn2) 
// - level := Level.U - needFlush := false.B - alreadyOutFire := false.B - } - } - - is (s_memReadReq) { - when (isFlush) { - state := s_idle - needFlush := false.B - }.elsewhen (io.mem.req.fire()) { state := s_memReadResp} - } - - is (s_memReadResp) { - val missflag = memRdata.flag.asTypeOf(flagBundle) - when (io.mem.resp.fire()) { - when (isFlush) { - state := s_idle - needFlush := false.B - }.elsewhen (!(missflag.r || missflag.x) && (level===3.U || level===2.U)) { - when(!missflag.v || (!missflag.r && missflag.w)) { //TODO: fix needflush - if(tlbname == "itlb") { state := s_wait_resp } else { state := s_miss_slpf } - if(tlbname == "itlb") { missIPF := true.B } - if(tlbname == "dtlb") { - loadPF := req.isRead() && !isAMO - storePF := req.isWrite() || isAMO - } - Debug() { - if(debug) { - printf("%d " + tlbname +" tlbException!!! ", GTimer()) - printf(p" req:${req} Memreq:${io.mem.req} MemResp:${io.mem.resp}") - printf(" level:%d",level) - printf("\n") - } - } - }.otherwise { - state := s_memReadReq - raddr := paddrApply(memRdata.ppn, Mux(level === 3.U, vpn.vpn1, vpn.vpn0)) - } - }.elsewhen (level =/= 0.U) { //TODO: fix needFlush - val permCheck = missflag.v && !(pf.priviledgeMode === ModeU && !missflag.u) && !(pf.priviledgeMode === ModeS && missflag.u && !pf.status_sum) - val permExec = permCheck && missflag.x - val permLoad = permCheck && (missflag.r || pf.status_mxr && missflag.x) - val permStore = permCheck && missflag.w - val updateAD = !missflag.a || (!missflag.d && req.isWrite()) - val updateData = Cat( 0.U(56.W), req.isWrite(), 1.U(1.W), 0.U(6.W) ) - missRefillFlag := Cat(req.isWrite(), 1.U(1.W), 0.U(6.W)) | missflag.asUInt - memRespStore := io.mem.resp.bits.rdata | updateData - if(tlbname == "itlb") { - when (!permExec) { missIPF := true.B ; state := s_wait_resp} - .otherwise { - state := Mux(updateAD, s_write_pte, s_wait_resp) - missMetaRefill := true.B - } - } - if(tlbname == "dtlb") { - when((!permLoad && req.isRead()) || (!permStore && req.isWrite())) 
{ - state := s_miss_slpf - loadPF := req.isRead() && !isAMO - storePF := req.isWrite() || isAMO - }.otherwise { - state := Mux(updateAD, s_write_pte, s_wait_resp) - missMetaRefill := true.B - } - } - missMask := Mux(level===3.U, 0.U(maskLen.W), Mux(level===2.U, "h3fe00".U(maskLen.W), "h3ffff".U(maskLen.W))) - missMaskStore := missMask - } - level := level - 1.U - } - } - - is (s_write_pte) { - when (isFlush) { - state := s_idle - needFlush := false.B - }.elsewhen (io.mem.req.fire()) { state := s_wait_resp } - } - - is (s_wait_resp) { when (io.out.fire() || ioFlush || alreadyOutFire){ - state := s_idle - missIPF := false.B - alreadyOutFire := false.B - }} - - is (s_miss_slpf) { - state := s_idle - } - } - - // mem - val cmd = Mux(state === s_write_pte, SimpleBusCmd.write, SimpleBusCmd.read) - io.mem.req.bits.apply(addr = Mux(hitWB, hitData.pteaddr, raddr), cmd = cmd, size = (if (XLEN == 64) "b11".U else "b10".U), wdata = Mux( hitWB, hitWBStore, memRespStore), wmask = 0xff.U) - io.mem.req.valid := ((state === s_memReadReq || state === s_write_pte) && !isFlush) - io.mem.resp.ready := true.B - - // tlb refill - io.mdWrite.apply(wen = RegNext((missMetaRefill && !isFlush) || (hitWB && state === s_idle && !isFlush), init = false.B), - windex = RegNext(getIndex(req.addr)), waymask = RegNext(waymask), vpn = RegNext(vpn.asUInt), - asid = RegNext(Mux(hitWB, hitMeta.asid, satp.asid)), mask = RegNext(Mux(hitWB, hitMask, missMask)), - flag = RegNext(Mux(hitWB, hitRefillFlag, missRefillFlag)), ppn = RegNext(Mux(hitWB, hitData.ppn, memRdata.ppn)), - pteaddr = RegNext((Mux(hitWB, hitData.pteaddr, raddr)))) - - // io - io.out.bits := req - io.out.bits.addr := Mux(hit, maskPaddr(hitData.ppn, req.addr(PAddrBits-1, 0), hitMask), maskPaddr(memRespStore.asTypeOf(pteBundle).ppn, req.addr(PAddrBits-1, 0), missMaskStore)) - io.out.valid := io.in.valid && Mux(hit && !hitWB, !(io.pf.isPF() || loadPF || storePF), state === s_wait_resp)// && !alreadyOutFire - - io.in.ready := io.out.ready && 
(state === s_idle) && !miss && !hitWB && io.mdReady && (!io.pf.isPF() && !loadPF && !storePF)//maybe be optimized - - io.ipf := Mux(hit, hitinstrPF, missIPF) - io.isFinish := io.out.fire() || io.pf.isPF() - - Debug() { - if (debug) { - printf("[TLBExec-" + tlbname+ "]: Timer:%d---------\n", GTimer()) - printf("[TLBExec-" + tlbname+ "]: In(%d, %d) Out(%d, %d) InAddr:%x OutAddr:%x cmd:%d \n", io.in.valid, io.in.ready, io.out.valid, io.out.ready, req.addr, io.out.bits.addr, req.cmd) - printf("[TLBExec-" + tlbname+ "]: isAMO:%d io.Flush:%d needFlush:%d alreadyOutFire:%d isFinish:%d\n",isAMO, io.flush, needFlush, alreadyOutFire, io.isFinish) - printf("[TLBExec-" + tlbname+ "]: hit:%d hitWB:%d hitVPN:%x hitFlag:%x hitPPN:%x hitRefillFlag:%x hitWBStore:%x hitCheck:%d hitExec:%d hitLoad:%d hitStore:%d\n", hit, hitWB, hitMeta.vpn, hitFlag.asUInt, hitData.ppn, hitRefillFlag, hitWBStore, hitCheck, hitExec, hitLoad, hitStore) - printf("[TLBExec-" + tlbname+ "]: miss:%d state:%d level:%d raddr:%x memRdata:%x missMask:%x missRefillFlag:%x missMetaRefill:%d\n", miss, state, level, raddr, memRdata.asUInt, missMask, missRefillFlag, missMetaRefill) - printf("[TLBExec-" + tlbname+ "]: meta/data: (0)%x|%b|%x (1)%x|%b|%x (2)%x|%b|%x (3)%x|%b|%x rread:%d\n", md(0).asTypeOf(tlbBundle).vpn, md(0).asTypeOf(tlbBundle).flag, md(0).asTypeOf(tlbBundle).ppn, md(1).asTypeOf(tlbBundle).vpn, md(1).asTypeOf(tlbBundle).flag, md(1).asTypeOf(tlbBundle).ppn, md(2).asTypeOf(tlbBundle).vpn, md(2).asTypeOf(tlbBundle).flag, md(2).asTypeOf(tlbBundle).ppn, md(3).asTypeOf(tlbBundle).vpn, md(3).asTypeOf(tlbBundle).flag, md(3).asTypeOf(tlbBundle).ppn, io.mdReady) - printf("[TLBExec-" + tlbname+ "]: md: wen:%d windex:%x waymask:%x vpn:%x asid:%x mask:%x flag:%x asid:%x ppn:%x pteaddr:%x\n", io.mdWrite.wen, io.mdWrite.windex, io.mdWrite.waymask, io.mdWrite.wdata.asTypeOf(tlbBundle).vpn, io.mdWrite.wdata.asTypeOf(tlbBundle).asid, io.mdWrite.wdata.asTypeOf(tlbBundle).mask, io.mdWrite.wdata.asTypeOf(tlbBundle).flag, 
io.mdWrite.wdata.asTypeOf(tlbBundle).asid, io.mdWrite.wdata.asTypeOf(tlbBundle).ppn, io.mdWrite.wdata.asTypeOf(tlbBundle).pteaddr) - printf("[TLBExec-" + tlbname+ "]: MemReq(%d, %d) MemResp(%d, %d) addr:%x cmd:%d rdata:%x cmd:%d\n", io.mem.req.valid, io.mem.req.ready, io.mem.resp.valid, io.mem.resp.ready, io.mem.req.bits.addr, io.mem.req.bits.cmd, io.mem.resp.bits.rdata, io.mem.resp.bits.cmd) - printf("[TLBExec-" + tlbname+ "]: io.ipf:%d hitinstrPF:%d missIPF:%d pf.loadPF:%d pf.storePF:%d loadPF:%d storePF:%d\n", io.ipf, hitinstrPF, missIPF, io.pf.loadPF, io.pf.storePF, loadPF, storePF) - } - } -} - -class TLBEmpty(implicit val tlbConfig: TLBConfig) extends TlbModule { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new SimpleBusReqBundle(userBits = userBits))) - val out = Decoupled(new SimpleBusReqBundle(userBits = userBits)) - }) - - io.out <> io.in -} - -object TLB { - def apply(in: SimpleBusUC, mem: SimpleBusUC, flush: Bool, csrMMU: MMUIO)(implicit tlbConfig: TLBConfig) = { - val tlb = Module(new TLB) - tlb.io.in <> in - tlb.io.mem <> mem - tlb.io.flush := flush - tlb.io.csrMMU <> csrMMU - tlb - } -} \ No newline at end of file diff --git a/src/main/scala/noop/WBU.scala b/src/main/scala/noop/WBU.scala deleted file mode 100644 index c70a79270cac9bed9da537efe1a2189e4b7d468d..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/WBU.scala +++ /dev/null @@ -1,43 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import utils._ - -class WBU(implicit val p: NOOPConfig) extends NOOPModule{ - val io = IO(new Bundle { - val in = Flipped(Decoupled(new CommitIO)) - val wb = new WriteBackIO - val redirect = new RedirectIO - }) - - io.wb.rfWen := io.in.bits.decode.ctrl.rfWen && io.in.valid - io.wb.fpWen := io.in.bits.decode.ctrl.fpWen && io.in.valid - io.wb.rfDest := io.in.bits.decode.ctrl.rfDest - io.wb.rfData := io.in.bits.commits(io.in.bits.decode.ctrl.fuType) - io.in.ready := true.B - - 
io.redirect := io.in.bits.decode.cf.redirect - io.redirect.valid := io.in.bits.decode.cf.redirect.valid && io.in.valid - - Debug(){ - when (io.in.valid) { printf("[COMMIT] TIMER: %d WBU: pc = 0x%x inst %x wen %x wdata %x mmio %x intrNO %x\n", GTimer(), io.in.bits.decode.cf.pc, io.in.bits.decode.cf.instr, io.wb.rfWen, io.wb.rfData, io.in.bits.isMMIO, io.in.bits.intrNO) } - } - - BoringUtils.addSource(io.in.valid, "perfCntCondMinstret") - if (!p.FPGAPlatform) { - BoringUtils.addSource(RegNext(io.in.valid), "difftestCommit") - BoringUtils.addSource(RegNext(SignExt(io.in.bits.decode.cf.pc, AddrBits)), "difftestThisPC") - BoringUtils.addSource(RegNext(io.in.bits.decode.cf.instr), "difftestThisINST") - BoringUtils.addSource(RegNext(io.in.bits.isMMIO), "difftestIsMMIO") - BoringUtils.addSource(RegNext(io.in.bits.decode.cf.instr(1,0)=/="b11".U), "difftestIsRVC") - BoringUtils.addSource(RegNext(io.in.bits.intrNO), "difftestIntrNO") - } else { - BoringUtils.addSource(io.in.valid, "ilaWBUvalid") - BoringUtils.addSource(io.in.bits.decode.cf.pc, "ilaWBUpc") - BoringUtils.addSource(io.wb.rfWen, "ilaWBUrfWen") - BoringUtils.addSource(io.wb.rfDest, "ilaWBUrfDest") - BoringUtils.addSource(io.wb.rfData, "ilaWBUrfData") - } -} diff --git a/src/main/scala/noop/fu/ALU.scala b/src/main/scala/noop/fu/ALU.scala deleted file mode 100644 index 3a3f74354b40b105d9e65a540aeecd100344b8fa..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/fu/ALU.scala +++ /dev/null @@ -1,170 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -object ALUOpType { - def add = "b000000".U - def sll = "b000001".U - def slt = "b000010".U - def sltu = "b000011".U - def xor = "b000100".U - def srl = "b000101".U - def or = "b000110".U - def and = "b000111".U - def sub = "b001000".U - def sra = "b001101".U - - def addw = "b100000".U - def subw = "b101000".U - def sllw = "b100001".U - def srlw = "b100101".U - def sraw = 
"b101101".U - - def isWordOp(func: UInt) = func(5) - - def jal = "b011000".U - def jalr = "b011010".U - // def cjalr= "b111010".U // pc + 2 instead of 4 - def beq = "b010000".U - def bne = "b010001".U - def blt = "b010100".U - def bge = "b010101".U - def bltu = "b010110".U - def bgeu = "b010111".U - - // for RAS - def call = "b011100".U - def ret = "b011110".U - - def isBru(func: UInt) = func(4)//[important] - def pcPlus2(func: UInt) = func(5)//[important] - def isBranch(func: UInt) = !func(3) - def isJump(func: UInt) = isBru(func) && !isBranch(func) - def getBranchType(func: UInt) = func(2, 1) - def isBranchInvert(func: UInt) = func(0) -} - -class ALUIO extends FunctionUnitIO { - val cfIn = Flipped(new CtrlFlowIO) - val redirect = new RedirectIO - val offset = Input(UInt(XLEN.W)) -} - -class ALU extends NOOPModule { - val io = IO(new ALUIO) - - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - io.out.bits - } - - val isAdderSub = (func =/= ALUOpType.add) && (func =/= ALUOpType.addw) && !ALUOpType.isJump(func) - val adderRes = (src1 +& (src2 ^ Fill(XLEN, isAdderSub))) + isAdderSub - val xorRes = src1 ^ src2 - val sltu = !adderRes(XLEN) - val slt = xorRes(XLEN-1) ^ sltu - - val shsrc1 = LookupTreeDefault(func, src1, List( - ALUOpType.srlw -> ZeroExt(src1(31,0), 64), - ALUOpType.sraw -> SignExt(src1(31,0), 64) - )) - val shamt = Mux(ALUOpType.isWordOp(func), src2(4, 0), src2(5, 0)) - val res = LookupTreeDefault(func(3, 0), adderRes, List( - ALUOpType.sll -> ((shsrc1 << shamt)(XLEN-1, 0)), - ALUOpType.slt -> ZeroExt(slt, XLEN), - ALUOpType.sltu -> ZeroExt(sltu, XLEN), - ALUOpType.xor -> xorRes, - ALUOpType.srl -> (shsrc1 >> shamt), - ALUOpType.or -> (src1 | src2), - ALUOpType.and -> (src1 & src2), - ALUOpType.sra -> ((shsrc1.asSInt >> shamt).asUInt) - )) - val aluRes = 
Mux(ALUOpType.isWordOp(func), SignExt(res(31,0), 64), res) - - val branchOpTable = List( - ALUOpType.getBranchType(ALUOpType.beq) -> !xorRes.orR, - ALUOpType.getBranchType(ALUOpType.blt) -> slt, - ALUOpType.getBranchType(ALUOpType.bltu) -> sltu - ) - - val isBranch = ALUOpType.isBranch(func) - val isBru = ALUOpType.isBru(func) - // val pcPlus2 = ALUOpType.pcPlus2(func) - val taken = LookupTree(ALUOpType.getBranchType(func), branchOpTable) ^ ALUOpType.isBranchInvert(func) - val target = Mux(isBranch, io.cfIn.pc + io.offset, adderRes)(VAddrBits-1,0) - val predictWrong = (io.redirect.target =/= io.cfIn.pnpc) - val isRVC = (io.cfIn.instr(1,0) =/= "b11".U) - io.redirect.target := Mux(!taken && isBranch, Mux(isRVC, io.cfIn.pc + 2.U, io.cfIn.pc + 4.U), target) - // with branch predictor, this is actually to fix the wrong prediction - io.redirect.valid := valid && isBru && predictWrong - // may be can be moved to ISU to calculate pc + 4 - // this is actually for jal and jalr to write pc + 4/2 to rd - io.out.bits := Mux(isBru, Mux(!isRVC, SignExt(io.cfIn.pc, AddrBits) + 4.U, SignExt(io.cfIn.pc, AddrBits) + 2.U), aluRes) - // when(pcPlus2 && isBru){ - // printf("CJALR %x %x \n ", io.cfIn.instr, io.cfIn.pc) - // } - - Debug(){ - when(valid && isBru){ - printf("[BRU] tgt %x, valid:%d, npc: %x, pdwrong: %x\n", io.redirect.target, io.redirect.valid, io.cfIn.pnpc, predictWrong) - printf("[BRU] taken:%d addrRes:%x src1:%x src2:%x func:%x\n", taken, adderRes, src1, src2, func) - } - } - - Debug(false){ - when(valid && isBru){ - printf("[BPW] pc %x tgt %x, npc: %x, pdwrong: %x type: %x%x%x%x\n", io.cfIn.pc, io.redirect.target, io.cfIn.pnpc, predictWrong, isBranch, (func === ALUOpType.jal || func === ALUOpType.call), func === ALUOpType.jalr, func === ALUOpType.ret) - } - - when(true.B) { - printf("[ALUIN0] valid:%d isBru:%d isBranch:%d \n", valid, isBru, isBranch) - printf("[ALUIN1] pc %x instr %x tgt %x, npc: %x, pdwrong: %x type: %x%x%x%x\n", io.cfIn.pc, io.cfIn.instr, 
io.redirect.target, io.cfIn.pnpc, predictWrong, isBranch, (func === ALUOpType.jal || func === ALUOpType.call), func === ALUOpType.jalr, func === ALUOpType.ret) - printf("[ALUIN2] func:%b ", func) - printf(" bpuUpdateReq: valid:%d pc:%x isMissPredict:%d actualTarget:%x actualTaken:%x fuOpType:%x btbType:%x isRVC:%d \n", valid && isBru, io.cfIn.pc, predictWrong, target, taken, func, LookupTree(func, RV32I_BRUInstr.bruFuncTobtbTypeTable), isRVC) - printf("[ALUIN3]tgt %x, npc: %x, pdwrong: %x\n", io.redirect.target, io.cfIn.pnpc, predictWrong) - printf("[ALUIN4]taken:%d addrRes:%x src1:%x src2:%x func:%x\n", taken, adderRes, src1, src2, func) - } - } - - io.in.ready := true.B - io.out.valid := valid - - val bpuUpdateReq = WireInit(0.U.asTypeOf(new BPUUpdateReq)) - bpuUpdateReq.valid := valid && isBru - bpuUpdateReq.pc := io.cfIn.pc - bpuUpdateReq.isMissPredict := predictWrong - bpuUpdateReq.actualTarget := target - bpuUpdateReq.actualTaken := taken - bpuUpdateReq.fuOpType := func - bpuUpdateReq.btbType := LookupTree(func, RV32I_BRUInstr.bruFuncTobtbTypeTable) - bpuUpdateReq.isRVC := isRVC - - BoringUtils.addSource(RegNext(bpuUpdateReq), "bpuUpdateReq") - - val right = valid && isBru && !predictWrong - val wrong = valid && isBru && predictWrong - BoringUtils.addSource(right && isBranch, "MbpBRight") - BoringUtils.addSource(wrong && isBranch, "MbpBWrong") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h0".U && isRVC, "Custom1") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h0".U && !isRVC, "Custom2") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h2".U && isRVC, "Custom3") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h2".U && !isRVC, "Custom4") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h4".U && isRVC, "Custom5") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h4".U && !isRVC, "Custom6") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h6".U && 
isRVC, "Custom7") - BoringUtils.addSource(wrong && isBranch && io.cfIn.pc(2,0)==="h6".U && !isRVC, "Custom8") - BoringUtils.addSource(right && (func === ALUOpType.jal || func === ALUOpType.call), "MbpJRight") - BoringUtils.addSource(wrong && (func === ALUOpType.jal || func === ALUOpType.call), "MbpJWrong") - BoringUtils.addSource(right && func === ALUOpType.jalr, "MbpIRight") - BoringUtils.addSource(wrong && func === ALUOpType.jalr, "MbpIWrong") - BoringUtils.addSource(right && func === ALUOpType.ret, "MbpRRight") - BoringUtils.addSource(wrong && func === ALUOpType.ret, "MbpRWrong") -} diff --git a/src/main/scala/noop/fu/CSR.scala b/src/main/scala/noop/fu/CSR.scala deleted file mode 100644 index 9d28b1fc2361fa3a97a1d01a32bba07cd478b11b..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/fu/CSR.scala +++ /dev/null @@ -1,830 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import noop.fu.FpuCsrIO -import utils._ - -object CSROpType { - def jmp = "b000".U - def wrt = "b001".U - def set = "b010".U - def clr = "b011".U - def wrti = "b101".U - def seti = "b110".U - def clri = "b111".U -} - -trait HasCSRConst { - // User Trap Setup - val Ustatus = 0x000 - val Uie = 0x004 - val Utvec = 0x005 - - // User Trap Handling - val Uscratch = 0x040 - val Uepc = 0x041 - val Ucause = 0x042 - val Utval = 0x043 - val Uip = 0x044 - - // User Floating-Point CSRs (not implemented) - val Fflags = 0x001 - val Frm = 0x002 - val Fcsr = 0x003 - - // User Counter/Timers - val Cycle = 0xC00 - val Time = 0xC01 - val Instret = 0xC02 - - // Supervisor Trap Setup - val Sstatus = 0x100 - val Sedeleg = 0x102 - val Sideleg = 0x103 - val Sie = 0x104 - val Stvec = 0x105 - val Scounteren = 0x106 - - // Supervisor Trap Handling - val Sscratch = 0x140 - val Sepc = 0x141 - val Scause = 0x142 - val Stval = 0x143 - val Sip = 0x144 - - // Supervisor Protection and Translation - val Satp = 0x180 - - // Machine Information Registers - 
val Mvendorid = 0xF11 - val Marchid = 0xF12 - val Mimpid = 0xF13 - val Mhartid = 0xF14 - - // Machine Trap Setup - val Mstatus = 0x300 - val Misa = 0x301 - val Medeleg = 0x302 - val Mideleg = 0x303 - val Mie = 0x304 - val Mtvec = 0x305 - val Mcounteren = 0x306 - - // Machine Trap Handling - val Mscratch = 0x340 - val Mepc = 0x341 - val Mcause = 0x342 - val Mtval = 0x343 - val Mip = 0x344 - - // Machine Memory Protection - // TBD - val Pmpcfg0 = 0x3A0 - val Pmpcfg1 = 0x3A1 - val Pmpcfg2 = 0x3A2 - val Pmpcfg3 = 0x3A3 - val PmpaddrBase = 0x3B0 - - // Machine Counter/Timers - // Currently, NOOP uses perfcnt csr set instead of standard Machine Counter/Timers - // 0xB80 - 0x89F are also used as perfcnt csr - - // Machine Counter Setup (not implemented) - // Debug/Trace Registers (shared with Debug Mode) (not implemented) - // Debug Mode Registers (not implemented) - - def privEcall = 0x000.U - def privMret = 0x302.U - def privSret = 0x102.U - def privUret = 0x002.U - - def ModeM = 0x3.U - def ModeH = 0x2.U - def ModeS = 0x1.U - def ModeU = 0x0.U - - def IRQ_UEIP = 0 - def IRQ_SEIP = 1 - def IRQ_MEIP = 3 - - def IRQ_UTIP = 4 - def IRQ_STIP = 5 - def IRQ_MTIP = 7 - - def IRQ_USIP = 8 - def IRQ_SSIP = 9 - def IRQ_MSIP = 11 - - val IntPriority = Seq( - IRQ_MEIP, IRQ_MSIP, IRQ_MTIP, - IRQ_SEIP, IRQ_SSIP, IRQ_STIP, - IRQ_UEIP, IRQ_USIP, IRQ_UTIP - ) -} - -trait HasExceptionNO { - def instrAddrMisaligned = 0 - def instrAccessFault = 1 - def illegalInstr = 2 - def breakPoint = 3 - def loadAddrMisaligned = 4 - def loadAccessFault = 5 - def storeAddrMisaligned = 6 - def storeAccessFault = 7 - def ecallU = 8 - def ecallS = 9 - def ecallM = 11 - def instrPageFault = 12 - def loadPageFault = 13 - def storePageFault = 15 - - val ExcPriority = Seq( - breakPoint, // TODO: different BP has different priority - instrPageFault, - instrAccessFault, - illegalInstr, - instrAddrMisaligned, - ecallM, ecallS, ecallU, - storeAddrMisaligned, - loadAddrMisaligned, - storePageFault, - loadPageFault, 
- storeAccessFault, - loadAccessFault - ) -} - - -class CSRIO extends FunctionUnitIO { - val cfIn = Flipped(new CtrlFlowIO) - val redirect = new RedirectIO - val fpu_csr = Flipped(new FpuCsrIO) - // for exception check - val instrValid = Input(Bool()) - // for differential testing - val intrNO = Output(UInt(XLEN.W)) - val imemMMU = Flipped(new MMUIO) - val dmemMMU = Flipped(new MMUIO) - val wenFix = Output(Bool()) -} - -class CSR(implicit val p: NOOPConfig) extends NOOPModule with HasCSRConst{ - val io = IO(new CSRIO) - - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - io.out.bits - } - - // CSR define - - class Priv extends Bundle { - val m = Output(Bool()) - val h = Output(Bool()) - val s = Output(Bool()) - val u = Output(Bool()) - } - - val csrNotImplemented = RegInit(UInt(XLEN.W), 0.U) - - class MstatusStruct extends Bundle { - val sd = Output(UInt(1.W)) - val pad1 = Output(UInt((XLEN-37).W)) - val sxl = Output(UInt(2.W)) - val uxl = Output(UInt(2.W)) - val pad0 = Output(UInt(9.W)) - val tsr = Output(UInt(1.W)) - val tw = Output(UInt(1.W)) - val tvm = Output(UInt(1.W)) - val mxr = Output(UInt(1.W)) - val sum = Output(UInt(1.W)) - val mprv = Output(UInt(1.W)) - val xs = Output(UInt(2.W)) - val fs = Output(UInt(2.W)) - val mpp = Output(UInt(2.W)) - val hpp = Output(UInt(2.W)) - val spp = Output(UInt(1.W)) - val pie = new Priv - val ie = new Priv - assert(this.getWidth == XLEN) - } - - class Interrupt extends Bundle { - val e = new Priv - val t = new Priv - val s = new Priv - } - - // Machine-Level CSRs - - val mtvec = RegInit(UInt(XLEN.W), 0.U) - val mcounteren = RegInit(UInt(XLEN.W), 0.U) - val mcause = RegInit(UInt(XLEN.W), 0.U) - val mtval = RegInit(UInt(XLEN.W), 0.U) - val mepc = Reg(UInt(XLEN.W)) - - val mie = RegInit(0.U(XLEN.W)) - val mipWire = 
WireInit(0.U.asTypeOf(new Interrupt)) - val mipReg = RegInit(0.U.asTypeOf(new Interrupt).asUInt) - val mipFixMask = "h777".U - val mip = (mipWire.asUInt | mipReg).asTypeOf(new Interrupt) - - def getMisaMxl(mxl: Int): UInt = {mxl.U << (XLEN-2)} - def getMisaExt(ext: Char): UInt = {1.U << (ext.toInt - 'a'.toInt)} - var extList = List('a', 's', 'i', 'u') - if(HasMExtension){ extList = extList :+ 'm'} - if(HasCExtension){ extList = extList :+ 'c'} - if(HasFPU){ extList = extList ++ List('f', 'd')} - val misaInitVal = getMisaMxl(2) | extList.foldLeft(0.U)((sum, i) => sum | getMisaExt(i)) //"h8000000000141105".U - val misa = RegInit(UInt(XLEN.W), misaInitVal) - // MXL = 2 | 0 | EXT = b 00 0000 0100 0001 0001 0000 0101 - // (XLEN-1, XLEN-2) | |(25, 0) ZY XWVU TSRQ PONM LKJI HGFE DCBA - - val mvendorid = RegInit(UInt(XLEN.W), 0.U) // this is a non-commercial implementation - val marchid = RegInit(UInt(XLEN.W), 0.U) // return 0 to indicate the field is not implemented - val mimpid = RegInit(UInt(XLEN.W), 0.U) // provides a unique encoding of the version of the processor implementation - val mhartid = RegInit(UInt(XLEN.W), 0.U) // the hardware thread running the code - val mstatus = RegInit(UInt(XLEN.W), "h00001800".U) - // val mstatus = RegInit(UInt(XLEN.W), "h8000c0100".U) - // mstatus Value Table - // | sd | - // | pad1 | - // | sxl | hardlinked to 10, use 00 to pass xv6 test - // | uxl | hardlinked to 00 - // | pad0 | - // | tsr | - // | tw | - // | tvm | - // | mxr | - // | sum | - // | mprv | - // | xs | 00 | - // | fs | - // | mpp | 00 | - // | hpp | 00 | - // | spp | 0 | - // | pie | 0000 | - // | ie | 0000 | uie hardlinked to 0, as N ext is not implemented - val mstatusStruct = mstatus.asTypeOf(new MstatusStruct) - def mstatusUpdateSideEffect(mstatus: UInt): UInt = { - val mstatusOld = WireInit(mstatus.asTypeOf(new MstatusStruct)) - val mstatusNew = Cat(mstatusOld.fs === "b11".U, mstatus(XLEN-2, 0)) - mstatusNew - } - - val medeleg = RegInit(UInt(XLEN.W), 0.U) - val 
mideleg = RegInit(UInt(XLEN.W), 0.U) - val mscratch = RegInit(UInt(XLEN.W), 0.U) - - val pmpcfg0 = RegInit(UInt(XLEN.W), 0.U) - val pmpcfg1 = RegInit(UInt(XLEN.W), 0.U) - val pmpcfg2 = RegInit(UInt(XLEN.W), 0.U) - val pmpcfg3 = RegInit(UInt(XLEN.W), 0.U) - val pmpaddr0 = RegInit(UInt(XLEN.W), 0.U) - val pmpaddr1 = RegInit(UInt(XLEN.W), 0.U) - val pmpaddr2 = RegInit(UInt(XLEN.W), 0.U) - val pmpaddr3 = RegInit(UInt(XLEN.W), 0.U) - - // Superviser-Level CSRs - - // val sstatus = RegInit(UInt(XLEN.W), "h00000000".U) - val sstatusWmask = "hc6122".U - // Sstatus Write Mask - // ------------------------------------------------------- - // 19 9 5 2 - // 0 1100 0000 0001 0010 0010 - // 0 c 0 1 2 2 - // ------------------------------------------------------- - val sstatusRmask = sstatusWmask | "h8000000300018000".U - // Sstatus Read Mask = (SSTATUS_WMASK | (0xf << 13) | (1ull << 63) | (3ull << 32)) - val stvec = RegInit(UInt(XLEN.W), 0.U) - // val sie = RegInit(0.U(XLEN.W)) - val sieMask = "h222".U & mideleg - val sipMask = "h222".U & mideleg - //val satp = RegInit(UInt(XLEN.W), "h8000000000087fbe".U) - val satp = RegInit(UInt(XLEN.W), 0.U) - val sepc = RegInit(UInt(XLEN.W), 0.U) - val scause = RegInit(UInt(XLEN.W), 0.U) - val stval = Reg(UInt(XLEN.W)) - val sscratch = RegInit(UInt(XLEN.W), 0.U) - val scounteren = RegInit(UInt(XLEN.W), 0.U) - BoringUtils.addSource(satp, "CSRSATP") - - // User-Level CSRs - val uepc = Reg(UInt(XLEN.W)) - - // fcsr - class FcsrStruct extends Bundle{ - val reserved = UInt((XLEN-3-5).W) - val frm = UInt(3.W) - val fflags = UInt(5.W) - assert(this.getWidth == XLEN) - } - val fcsr = RegInit(0.U(XLEN.W)) - // set mstatus->sd and mstatus->fs when true - val csrw_dirty_fp_state = WireInit(false.B) - - def frm_wfn(wdata: UInt): UInt = { - val fcsrOld = WireInit(fcsr.asTypeOf(new FcsrStruct)) - csrw_dirty_fp_state := true.B - fcsrOld.frm := wdata(2,0) - fcsrOld.asUInt() - } - def frm_rfn(rdata: UInt): UInt = rdata(7,5) - - def fflags_wfn(wdata: UInt): 
UInt = { - val fcsrOld = WireInit(fcsr.asTypeOf(new FcsrStruct)) - csrw_dirty_fp_state := true.B - fcsrOld.fflags := wdata(4,0) - fcsrOld.asUInt() - } - def fflags_rfn(rdata:UInt): UInt = rdata(4,0) - - def fcsr_wfn(wdata: UInt): UInt = { - val fcsrOld = WireInit(fcsr.asTypeOf(new FcsrStruct)) - csrw_dirty_fp_state := true.B - Cat(fcsrOld.reserved, wdata.asTypeOf(fcsrOld).frm, wdata.asTypeOf(fcsrOld).fflags) - } - - val fcsrMapping = Map( - MaskedRegMap(Fflags, fcsr, wfn = fflags_wfn, rfn = fflags_rfn), - MaskedRegMap(Frm, fcsr, wfn = frm_wfn, rfn = frm_rfn), - MaskedRegMap(Fcsr, fcsr, wfn = fcsr_wfn) - ) - - // Atom LR/SC Control Bits - val setLr = WireInit(Bool(), false.B) - val setLrVal = WireInit(Bool(), false.B) - val setLrAddr = WireInit(UInt(AddrBits.W), DontCare) //TODO : need check - val lr = RegInit(Bool(), false.B) - val lrAddr = RegInit(UInt(AddrBits.W), 0.U) - BoringUtils.addSink(setLr, "set_lr") - BoringUtils.addSink(setLrVal, "set_lr_val") - BoringUtils.addSink(setLrAddr, "set_lr_addr") - BoringUtils.addSource(lr, "lr") - BoringUtils.addSource(lrAddr, "lr_addr") - - when(setLr){ - lr := setLrVal - lrAddr := setLrAddr - } - - // Hart Priviledge Mode - val priviledgeMode = RegInit(UInt(2.W), ModeM) - - // perfcnt - val hasPerfCnt = !p.FPGAPlatform - val nrPerfCnts = if (hasPerfCnt) 0x80 else 0x3 - val perfCnts = List.fill(nrPerfCnts)(RegInit(0.U(XLEN.W))) - val perfCntsLoMapping = (0 until nrPerfCnts).map { case i => MaskedRegMap(0xb00 + i, perfCnts(i)) } - val perfCntsHiMapping = (0 until nrPerfCnts).map { case i => MaskedRegMap(0xb80 + i, perfCnts(i)(63, 32)) } - - // CSR reg map - val mapping = Map( - - // User Trap Setup - // MaskedRegMap(Ustatus, ustatus), - // MaskedRegMap(Uie, uie, 0.U, MaskedRegMap.Unwritable), - // MaskedRegMap(Utvec, utvec), - - // User Trap Handling - // MaskedRegMap(Uscratch, uscratch), - // MaskedRegMap(Uepc, uepc), - // MaskedRegMap(Ucause, ucause), - // MaskedRegMap(Utval, utval), - // MaskedRegMap(Uip, uip), - - // User 
Counter/Timers - // MaskedRegMap(Cycle, cycle), - // MaskedRegMap(Time, time), - // MaskedRegMap(Instret, instret), - - // Supervisor Trap Setup - MaskedRegMap(Sstatus, mstatus, sstatusWmask, mstatusUpdateSideEffect, sstatusRmask), - - // MaskedRegMap(Sedeleg, Sedeleg), - // MaskedRegMap(Sideleg, Sideleg), - MaskedRegMap(Sie, mie, sieMask, MaskedRegMap.NoSideEffect, sieMask), - MaskedRegMap(Stvec, stvec), - MaskedRegMap(Scounteren, scounteren), - - // Supervisor Trap Handling - MaskedRegMap(Sscratch, sscratch), - MaskedRegMap(Sepc, sepc), - MaskedRegMap(Scause, scause), - MaskedRegMap(Stval, stval), - MaskedRegMap(Sip, mip.asUInt, sipMask, MaskedRegMap.Unwritable, sipMask), - - // Supervisor Protection and Translation - MaskedRegMap(Satp, satp), - - // Machine Information Registers - MaskedRegMap(Mvendorid, mvendorid, 0.U, MaskedRegMap.Unwritable), - MaskedRegMap(Marchid, marchid, 0.U, MaskedRegMap.Unwritable), - MaskedRegMap(Mimpid, mimpid, 0.U, MaskedRegMap.Unwritable), - MaskedRegMap(Mhartid, mhartid, 0.U, MaskedRegMap.Unwritable), - - // Machine Trap Setup - // MaskedRegMap(Mstatus, mstatus, "hffffffffffffffee".U, (x=>{printf("mstatus write: %x time: %d\n", x, GTimer()); x})), - MaskedRegMap(Mstatus, mstatus, "hffffffffffffffff".U, mstatusUpdateSideEffect), - MaskedRegMap(Misa, misa), // now MXL, EXT is not changeable - MaskedRegMap(Medeleg, medeleg, "hbbff".U), - MaskedRegMap(Mideleg, mideleg, "h222".U), - MaskedRegMap(Mie, mie), - MaskedRegMap(Mtvec, mtvec), - MaskedRegMap(Mcounteren, mcounteren), - - // Machine Trap Handling - MaskedRegMap(Mscratch, mscratch), - MaskedRegMap(Mepc, mepc), - MaskedRegMap(Mcause, mcause), - MaskedRegMap(Mtval, mtval), - MaskedRegMap(Mip, mip.asUInt, 0.U, MaskedRegMap.Unwritable), - - // Machine Memory Protection - MaskedRegMap(Pmpcfg0, pmpcfg0), - MaskedRegMap(Pmpcfg1, pmpcfg1), - MaskedRegMap(Pmpcfg2, pmpcfg2), - MaskedRegMap(Pmpcfg3, pmpcfg3), - MaskedRegMap(PmpaddrBase + 0, pmpaddr0), - MaskedRegMap(PmpaddrBase + 1, 
pmpaddr1), - MaskedRegMap(PmpaddrBase + 2, pmpaddr2), - MaskedRegMap(PmpaddrBase + 3, pmpaddr3) - - ) ++ - perfCntsLoMapping ++ (if (XLEN == 32) perfCntsHiMapping else Nil) ++ - (if(HasFPU) fcsrMapping else Nil) - - val addr = src2(11, 0) - val rdata = Wire(UInt(XLEN.W)) - val csri = ZeroExt(io.cfIn.instr(19,15), XLEN) //unsigned imm for csri. [TODO] - val wdata = LookupTree(func, List( - CSROpType.wrt -> src1, - CSROpType.set -> (rdata | src1), - CSROpType.clr -> (rdata & ~src1), - CSROpType.wrti -> csri,//TODO: csri --> src2 - CSROpType.seti -> (rdata | csri), - CSROpType.clri -> (rdata & ~csri) - )) - - val wen = (valid && func =/= CSROpType.jmp) - // Debug(){when(wen){printf("[CSR] addr %x wdata %x func %x rdata %x\n", addr, wdata, func, rdata)}} - MaskedRegMap.generate(mapping, addr, rdata, wen, wdata) - val isIllegalAddr = MaskedRegMap.isIllegalAddr(mapping, addr) - val resetSatp = addr === Satp.U && wen // write to satp will cause the pipeline be flushed - io.out.bits := rdata - - // Fix Mip/Sip write - val fixMapping = Map( - MaskedRegMap(Mip, mipReg.asUInt, mipFixMask), - MaskedRegMap(Sip, mipReg.asUInt, sipMask, MaskedRegMap.NoSideEffect, sipMask) - ) - val rdataDummy = Wire(UInt(XLEN.W)) - MaskedRegMap.generate(fixMapping, addr, rdataDummy, wen, wdata) - - when(io.fpu_csr.fflags.asUInt() =/= 0.U){ - fcsr := fflags_wfn(io.fpu_csr.fflags.asUInt()) - } - // set fs and sd in mstatus - when(csrw_dirty_fp_state || io.fpu_csr.dirty_fs){ - val mstatusNew = WireInit(mstatus.asTypeOf(new MstatusStruct)) - mstatusNew.fs := "b11".U - mstatusNew.sd := true.B - mstatus := mstatusNew.asUInt() - } - io.fpu_csr.frm := fcsr.asTypeOf(new FcsrStruct).frm - - // CSR inst decode - val ret = Wire(Bool()) - val isEcall = addr === privEcall && func === CSROpType.jmp - val isMret = addr === privMret && func === CSROpType.jmp - val isSret = addr === privSret && func === CSROpType.jmp - val isUret = addr === privUret && func === CSROpType.jmp - - Debug(false){ - when(wen){ - 
printf("[CSR] csr write: pc %x addr %x rdata %x wdata %x func %x\n", io.cfIn.pc, addr, rdata, wdata, func) - printf("[MST] time %d pc %x mstatus %x mideleg %x medeleg %x mode %x\n", GTimer(), io.cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) - } - } - - // MMU Permission Check - - // def MMUPermissionCheck(ptev: Bool, pteu: Bool): Bool = ptev && !(priviledgeMode === ModeU && !pteu) && !(priviledgeMode === ModeS && pteu && mstatusStruct.sum.asBool) - // def MMUPermissionCheckLoad(ptev: Bool, pteu: Bool): Bool = ptev && !(priviledgeMode === ModeU && !pteu) && !(priviledgeMode === ModeS && pteu && mstatusStruct.sum.asBool) && (pter || (mstatusStruct.mxr && ptex)) - // imem - // val imemPtev = true.B - // val imemPteu = true.B - // val imemPtex = true.B - // val imemReq = true.B - // val imemPermissionCheckPassed = MMUPermissionCheck(imemPtev, imemPteu) - // val hasInstrPageFault = imemReq && !(imemPermissionCheckPassed && imemPtex) - // assert(!hasInstrPageFault) - - // dmem - // val dmemPtev = true.B - // val dmemPteu = true.B - // val dmemReq = true.B - // val dmemPermissionCheckPassed = MMUPermissionCheck(dmemPtev, dmemPteu) - // val dmemIsStore = true.B - - // val hasLoadPageFault = dmemReq && !dmemIsStore && !(dmemPermissionCheckPassed) - // val hasStorePageFault = dmemReq && dmemIsStore && !(dmemPermissionCheckPassed) - // assert(!hasLoadPageFault) - // assert(!hasStorePageFault) - - //TODO: Havn't test if io.dmemMMU.priviledgeMode is correct yet - io.imemMMU.priviledgeMode := priviledgeMode - io.dmemMMU.priviledgeMode := Mux(mstatusStruct.mprv.asBool, mstatusStruct.mpp, priviledgeMode) - io.imemMMU.status_sum := mstatusStruct.sum.asBool - io.dmemMMU.status_sum := mstatusStruct.sum.asBool - io.imemMMU.status_mxr := DontCare - io.dmemMMU.status_mxr := mstatusStruct.mxr.asBool - - val hasInstrPageFault = io.cfIn.exceptionVec(instrPageFault) && valid - val hasLoadPageFault = io.dmemMMU.loadPF - val hasStorePageFault = io.dmemMMU.storePF - val 
hasStoreAddrMisaligned = io.cfIn.exceptionVec(storeAddrMisaligned) - val hasLoadAddrMisaligned = io.cfIn.exceptionVec(loadAddrMisaligned) - - when(hasInstrPageFault || hasLoadPageFault || hasStorePageFault){ - val tval = Mux(hasInstrPageFault, Mux(io.cfIn.crossPageIPFFix, SignExt(io.cfIn.pc + 2.U, XLEN), SignExt(io.cfIn.pc, XLEN)), SignExt(io.dmemMMU.addr, XLEN)) - when(priviledgeMode === ModeM){ - mtval := tval - }.otherwise{ - stval := tval - } - } - - val lsuAddr = WireInit(0.U(64.W)) - BoringUtils.addSink(lsuAddr, "LSUADDR") - when(hasLoadAddrMisaligned || hasStoreAddrMisaligned) - { - mtval := SignExt(lsuAddr, XLEN) - } - - // Exception and Intr - - // interrupts - - val ideleg = (mideleg & mip.asUInt) - def priviledgedEnableDetect(x: Bool): Bool = Mux(x, ((priviledgeMode === ModeS) && mstatusStruct.ie.s) || (priviledgeMode < ModeS), - ((priviledgeMode === ModeM) && mstatusStruct.ie.m) || (priviledgeMode < ModeM)) - - val intrVecEnable = Wire(Vec(12, Bool())) - intrVecEnable.zip(ideleg.asBools).map{case(x,y) => x := priviledgedEnableDetect(y)} - val intrVec = mie(11,0) & mip.asUInt & intrVecEnable.asUInt - BoringUtils.addSource(intrVec, "intrVecIDU") - // val intrNO = PriorityEncoder(intrVec) - - val intrNO = IntPriority.foldRight(0.U)((i: Int, sum: UInt) => Mux(io.cfIn.intrVec(i), i.U, sum)) - // val intrNO = PriorityEncoder(io.cfIn.intrVec) - val raiseIntr = io.cfIn.intrVec.asUInt.orR - - val mtip = WireInit(false.B) - val meip = WireInit(false.B) - BoringUtils.addSink(mtip, "mtip") - BoringUtils.addSink(meip, "meip") - mipWire.t.m := mtip - mipWire.e.m := meip - - // exceptions - - // TODO: merge iduExceptionVec, csrExceptionVec as raiseExceptionVec - val csrExceptionVec = Wire(Vec(16, Bool())) - csrExceptionVec.map(_ := false.B) - csrExceptionVec(ecallM) := priviledgeMode === ModeM && io.in.valid && isEcall - csrExceptionVec(ecallS) := priviledgeMode === ModeS && io.in.valid && isEcall - csrExceptionVec(ecallU) := priviledgeMode === ModeU && io.in.valid && 
isEcall - // csrExceptionVec(instrPageFault) := hasInstrPageFault - csrExceptionVec(illegalInstr) := isIllegalAddr && wen // Trigger an illegal instr exception when unimplemented csr is being read/written - csrExceptionVec(loadPageFault) := hasLoadPageFault - csrExceptionVec(storePageFault) := hasStorePageFault - val iduExceptionVec = io.cfIn.exceptionVec - val raiseExceptionVec = csrExceptionVec.asUInt() | iduExceptionVec.asUInt() - val raiseException = raiseExceptionVec.orR - val exceptionNO = ExcPriority.foldRight(0.U)((i: Int, sum: UInt) => Mux(raiseExceptionVec(i), i.U, sum)) - io.wenFix := raiseException - - val causeNO = (raiseIntr << (XLEN-1)) | Mux(raiseIntr, intrNO, exceptionNO) - io.intrNO := Mux(raiseIntr, causeNO, 0.U) - - val raiseExceptionIntr = (raiseException || raiseIntr) && io.instrValid - val retTarget = Wire(UInt(VAddrBits.W)) - val trapTarget = Wire(UInt(VAddrBits.W)) - io.redirect.valid := (valid && func === CSROpType.jmp) || raiseExceptionIntr || resetSatp - io.redirect.target := Mux(resetSatp, io.cfIn.pnpc, Mux(raiseExceptionIntr, trapTarget, retTarget)) - - Debug(){ - when(raiseExceptionIntr){ - printf("[CSR] int/exc: pc %x int (%d):%x exc: (%d):%x\n",io.cfIn.pc, intrNO, io.cfIn.intrVec.asUInt, exceptionNO, raiseExceptionVec.asUInt) - printf("[MST] time %d pc %x mstatus %x mideleg %x medeleg %x mode %x\n", GTimer(), io.cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) - } - when(io.redirect.valid){ - printf("[CSR] redirect to %x\n", io.redirect.target) - } - } - - // Debug(false){ - // when(raiseExceptionIntr){ - // printf("[CSR] raiseExceptionIntr!\n[CSR] int/exc: pc %x int (%d):%x exc: (%d):%x\n",io.cfIn.pc, intrNO, io.cfIn.intrVec.asUInt, exceptionNO, raiseExceptionVec.asUInt) - // printf("[MST] time %d pc %x mstatus %x mideleg %x medeleg %x mode %x\n", GTimer(), io.cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) - // } - - // when(valid && isMret){ - // printf("[CSR] Mret to %x!\n[CSR] int/exc: pc %x int (%d):%x exc: 
(%d):%x\n",retTarget, io.cfIn.pc, intrNO, io.cfIn.intrVec.asUInt, exceptionNO, raiseExceptionVec.asUInt) - // printf("[MST] time %d pc %x mstatus %x mideleg %x medeleg %x mode %x\n", GTimer(), io.cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) - // } - - // when(valid && isSret){ - // printf("[CSR] Sret to %x!\n[CSR] int/exc: pc %x int (%d):%x exc: (%d):%x\n",retTarget, io.cfIn.pc, intrNO, io.cfIn.intrVec.asUInt, exceptionNO, raiseExceptionVec.asUInt) - // printf("[MST] time %d pc %x mstatus %x mideleg %x medeleg %x mode %x\n", GTimer(), io.cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) - // } - //printf("[CSR] Red(%d, %x) raiseExcepIntr:%d valid:%d instrValid:%x \n", io.redirect.valid, io.redirect.target, raiseExceptionIntr, valid, io.instrValid) - // } - - // Branch control - - val deleg = Mux(raiseIntr, mideleg , medeleg) - // val delegS = ((deleg & (1 << (causeNO & 0xf))) != 0) && (priviledgeMode < ModeM); - val delegS = (deleg(causeNO(3,0))) && (priviledgeMode < ModeM) - val tvalWen = !(hasInstrPageFault || hasLoadPageFault || hasStorePageFault || hasLoadAddrMisaligned || hasStoreAddrMisaligned) || raiseIntr // in noop-riscv64, no exception will come together with PF - - ret := isMret || isSret || isUret - trapTarget := Mux(delegS, stvec, mtvec)(VAddrBits-1, 0) - retTarget := DontCare - // TODO redirect target - // val illegalEret = TODO - - when (valid && isMret) { - val mstatusOld = WireInit(mstatus.asTypeOf(new MstatusStruct)) - val mstatusNew = WireInit(mstatus.asTypeOf(new MstatusStruct)) - // mstatusNew.mpp.m := ModeU //TODO: add mode U - mstatusNew.ie.m := mstatusOld.pie.m - priviledgeMode := mstatusOld.mpp - mstatusNew.pie.m := true.B - mstatusNew.mpp := ModeU - mstatus := mstatusNew.asUInt - lr := false.B - retTarget := mepc(VAddrBits-1, 0) - } - - when (valid && isSret) { - val mstatusOld = WireInit(mstatus.asTypeOf(new MstatusStruct)) - val mstatusNew = WireInit(mstatus.asTypeOf(new MstatusStruct)) - // mstatusNew.mpp.m := ModeU //TODO: 
add mode U - mstatusNew.ie.s := mstatusOld.pie.s - priviledgeMode := Cat(0.U(1.W), mstatusOld.spp) - mstatusNew.pie.s := true.B - mstatusNew.spp := ModeU - mstatus := mstatusNew.asUInt - lr := false.B - retTarget := sepc(VAddrBits-1, 0) - } - - when (valid && isUret) { - val mstatusOld = WireInit(mstatus.asTypeOf(new MstatusStruct)) - val mstatusNew = WireInit(mstatus.asTypeOf(new MstatusStruct)) - // mstatusNew.mpp.m := ModeU //TODO: add mode U - mstatusNew.ie.u := mstatusOld.pie.u - priviledgeMode := ModeU - mstatusNew.pie.u := true.B - mstatus := mstatusNew.asUInt - retTarget := uepc(VAddrBits-1, 0) - } - - when (raiseExceptionIntr) { - val mstatusOld = WireInit(mstatus.asTypeOf(new MstatusStruct)) - val mstatusNew = WireInit(mstatus.asTypeOf(new MstatusStruct)) - - when (delegS) { - scause := causeNO - sepc := SignExt(io.cfIn.pc, XLEN) - mstatusNew.spp := priviledgeMode - mstatusNew.pie.s := mstatusOld.ie.s - mstatusNew.ie.s := false.B - priviledgeMode := ModeS - when(tvalWen){stval := 0.U} // TODO: should not use =/= - // printf("[*] mstatusNew.spp %x\n", mstatusNew.spp) - // trapTarget := stvec(VAddrBits-1. 0) - }.otherwise { - mcause := causeNO - mepc := SignExt(io.cfIn.pc, XLEN) - mstatusNew.mpp := priviledgeMode - mstatusNew.pie.m := mstatusOld.ie.m - mstatusNew.ie.m := false.B - priviledgeMode := ModeM - when(tvalWen){mtval := 0.U} // TODO: should not use =/= - // trapTarget := mtvec(VAddrBits-1. 
0) - } - // mstatusNew.pie.m := LookupTree(priviledgeMode, List( - // ModeM -> mstatusOld.ie.m, - // ModeH -> mstatusOld.ie.h, //ERROR - // ModeS -> mstatusOld.ie.s, - // ModeU -> mstatusOld.ie.u - // )) - - mstatus := mstatusNew.asUInt - } - - io.in.ready := true.B - io.out.valid := valid - - Debug(false) { - printf("[CSR2] Red(%d, %x) raiseExcepIntr:%d isSret:%d retTarget:%x sepc:%x delegs:%d deleg:%x cfInpc:%x valid:%d instrValid:%x \n", io.redirect.valid, io.redirect.target, raiseExceptionIntr, isSret, retTarget, sepc, delegS, deleg, io.cfIn.pc, valid, io.instrValid) - } - - Debug(false) { - when(raiseExceptionIntr && delegS ) { - printf("[CSR2] Red(%d, %x) raiseExcepIntr:%d isSret:%d retTarget:%x sepc:%x delegs:%d deleg:%x cfInpc:%x valid:%d instrValid:%x \n", io.redirect.valid, io.redirect.target, raiseExceptionIntr, isSret, retTarget, sepc, delegS, deleg, io.cfIn.pc, valid, io.instrValid) - printf("[CSR3] sepc is writen!!! pc:%x time:%d\n", io.cfIn.pc, GTimer()) - } - } - - // perfcnt - - val perfCntList = Map( - "Mcycle" -> (0xb00, "perfCntCondMcycle" ), - "Minstret" -> (0xb02, "perfCntCondMinstret" ), - "MimemStall" -> (0xb03, "perfCntCondMimemStall" ), - "MaluInstr" -> (0xb04, "perfCntCondMaluInstr" ), - "MbruInstr" -> (0xb05, "perfCntCondMbruInstr" ), - "MlsuInstr" -> (0xb06, "perfCntCondMlsuInstr" ), - "MmduInstr" -> (0xb07, "perfCntCondMmduInstr" ), - "McsrInstr" -> (0xb08, "perfCntCondMcsrInstr" ), - "MloadInstr" -> (0xb09, "perfCntCondMloadInstr" ), - "MloadStall" -> (0xb0a, "perfCntCondMloadStall" ), - "MstoreStall" -> (0xb0b, "perfCntCondMstoreStall"), - "MmmioInstr" -> (0xb0c, "perfCntCondMmmioInstr" ), - "MicacheHit" -> (0xb0d, "perfCntCondMicacheHit" ), - "MdcacheHit" -> (0xb0e, "perfCntCondMdcacheHit" ), - "MmulInstr" -> (0xb0f, "perfCntCondMmulInstr" ), - "MifuFlush" -> (0xb10, "perfCntCondMifuFlush" ), - "MrawStall" -> (0xb11, "perfCntCondMrawStall" ), - "MexuBusy" -> (0xb12, "perfCntCondMexuBusy" ), - "MbpBRight" -> (0xb13, "MbpBRight" ), - 
"MbpBWrong" -> (0xb14, "MbpBWrong" ), - "MbpJRight" -> (0xb15, "MbpJRight" ), - "MbpJWrong" -> (0xb16, "MbpJWrong" ), - "MbpIRight" -> (0xb17, "MbpIRight" ), - "MbpIWrong" -> (0xb18, "MbpIWrong" ), - "MbpRRight" -> (0xb19, "MbpRRight" ), - "MbpRWrong" -> (0xb1a, "MbpRWrong" ), - "Custom1" -> (0xb1b, "Custom1" ), - "Custom2" -> (0xb1c, "Custom2" ), - "Custom3" -> (0xb1d, "Custom3" ), - "Custom4" -> (0xb1e, "Custom4" ), - "Custom5" -> (0xb1f, "Custom5" ), - "Custom6" -> (0xb20, "Custom6" ), - "Custom7" -> (0xb21, "Custom7" ), - "Custom8" -> (0xb22, "Custom8" ), - "Ml2cacheHit" -> (0xb23, "perfCntCondMl2cacheHit") - ) - val perfCntCond = List.fill(0x80)(WireInit(false.B)) - (perfCnts zip perfCntCond).map { case (c, e) => { when (e) { c := c + 1.U } } } - - BoringUtils.addSource(WireInit(true.B), "perfCntCondMcycle") - perfCntList.map { case (name, (addr, boringId)) => { - BoringUtils.addSink(perfCntCond(addr & 0x7f), boringId) - if (!hasPerfCnt) { - // do not enable perfcnts except for Mcycle and Minstret - if (addr != perfCntList("Mcycle")._1 && addr != perfCntList("Minstret")._1) { - perfCntCond(addr & 0x7f) := false.B - } - } - }} - - val nooptrap = WireInit(false.B) - BoringUtils.addSink(nooptrap, "nooptrap") - def readWithScala(addr: Int): UInt = mapping(addr)._1 - - if (!p.FPGAPlatform) { - // to monitor - BoringUtils.addSource(readWithScala(perfCntList("Mcycle")._1), "simCycleCnt") - BoringUtils.addSource(readWithScala(perfCntList("Minstret")._1), "simInstrCnt") - - // display all perfcnt when nooptrap is executed - when (nooptrap) { - printf("======== PerfCnt =========\n") - perfCntList.toSeq.sortBy(_._2._1).map { case (name, (addr, boringId)) => - printf("%d <- " + name + "\n", readWithScala(addr)) } - } - - // for differential testing - BoringUtils.addSource(RegNext(priviledgeMode), "difftestMode") - BoringUtils.addSource(RegNext(mstatus), "difftestMstatus") - BoringUtils.addSource(RegNext(mstatus & sstatusRmask), "difftestSstatus") - 
BoringUtils.addSource(RegNext(mepc), "difftestMepc") - BoringUtils.addSource(RegNext(sepc), "difftestSepc") - BoringUtils.addSource(RegNext(mcause), "difftestMcause") - BoringUtils.addSource(RegNext(scause), "difftestScause") - } else { - BoringUtils.addSource(readWithScala(perfCntList("Minstret")._1), "ilaInstrCnt") - } -} diff --git a/src/main/scala/noop/fu/FPU.scala b/src/main/scala/noop/fu/FPU.scala deleted file mode 100644 index b2ceffe884108404d8c6242a4f199789fbea7e84..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/fu/FPU.scala +++ /dev/null @@ -1,126 +0,0 @@ -package noop.fu - -import chisel3.{util, _} -import chisel3.util._ -import utils._ -import noop._ -import fpu._ -import fpu.FPUIOFunc._ -import fpu.divsqrt.DivSqrt -import fpu.fma.FMA - -class FpInstr extends NOOPBundle { - val func5 = UInt(5.W) - val fmt = UInt(2.W) - val rs2 = UInt(5.W) - val rs1 = UInt(5.W) - val rm = UInt(3.W) - val rd = UInt(5.W) - val op = UInt(7.W) - assert(this.getWidth == 32) -} - -class FpuCsrIO extends NOOPBundle { - val fflags = Output(new Fflags) - val isIllegal = Output(Bool()) - val dirty_fs = Output(Bool()) - val frm = Input(UInt(3.W)) -} - -class FPUIO extends FunctionUnitIO{ - // use XLEN because fpu share data path with cpu - val src3 = Input(UInt(XLEN.W)) - val fpu_csr = new FpuCsrIO - val fpWen = Input(Bool()) - val instr = Input(UInt(32.W)) - val inputFunc = Input(UInt(1.W)) - val outputFunc = Input(UInt(2.W)) -} - - - - -class FPU extends NOOPModule{ -// require(XLEN >= FLEN) - val io = IO(new FPUIO) - val (valid, src1, src2, src3, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.src3, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, src3: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.src3 := src3 - this.func := func - io.out.bits - } - - val instr = io.instr.asTypeOf(new FpInstr) - val isRVD = instr.fmt(0) - val src = VecInit(Seq(src1, src2, src3)).map(x => - 
Mux(io.inputFunc === in_unbox, unboxF64ToF32(x), x) - ) - - val roudingMode = Mux(instr.rm===7.U, io.fpu_csr.frm, instr.rm) - val op = func(2, 0) - val fu = func(5, 3) - - val s_ready :: s_wait :: Nil = Enum(2) - val state = RegInit(s_ready) - switch(state){ - is(s_ready){ - when(io.in.valid){ - state := s_wait - } - } - is(s_wait){ - when(io.out.fire()){ - state := s_ready - } - } - } - - val subModuleInput = Wire(new FPUSubModuleInput) - subModuleInput.a := src(0) - subModuleInput.b := src(1) - subModuleInput.c := src(2) - subModuleInput.op := op - subModuleInput.isDouble := isRVD - subModuleInput.rm := roudingMode - - val subModules = Array[FPUSubModule]( - Module(new FMA), // 0 - Module(new FCMP), // 1 - Module(new FMV(XLEN)), // 2 - Module(new FloatToInt), // 3 - Module(new IntToFloat), // 4 - Module(new F32toF64), // 5 - Module(new F64toF32), // 6 - Module(new DivSqrt) //7 - ) - val outFuncReg = RegEnable(io.outputFunc, io.in.fire()) - val fuReg = RegEnable(fu, io.in.fire()) - for((module, idx) <- subModules.zipWithIndex){ - module.io.in.bits := subModuleInput - module.io.in.valid := io.in.fire() && idx.U===fu - module.io.out.ready := true.B - } - - val subModuleOutput = Wire(Decoupled(new FPUSubModuleOutput)) - subModuleOutput := LookupTree(fuReg, subModules.zipWithIndex.map({ - case (module, idx) => - idx.U -> module.io.out - })) - val result = subModuleOutput.bits.result - - io.in.ready := state===s_ready - io.out.valid := subModuleOutput.valid - io.out.bits := MuxLookup(outFuncReg, result, Seq( - out_sext -> SignExt(result(31, 0), XLEN), - out_box -> boxF32ToF64(result) - )) - - //TODO: check illegal rounding mode exception - io.fpu_csr.isIllegal := false.B - io.fpu_csr.dirty_fs := io.in.fire() && io.fpWen - io.fpu_csr.fflags := Mux(io.out.valid, subModuleOutput.bits.fflags, 0.U.asTypeOf(new Fflags)) -} - diff --git a/src/main/scala/noop/fu/LSU.scala b/src/main/scala/noop/fu/LSU.scala deleted file mode 100644 index 
2933faebfbff590fda5ae82f11040cc0cf87c170..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/fu/LSU.scala +++ /dev/null @@ -1,470 +0,0 @@ -package noop -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import utils._ -import bus.simplebus._ -import fpu.boxF32ToF64 - -object LSUOpType { - def lb = "b000000".U - def lh = "b000001".U - def lw = "b000010".U - def ld = "b000011".U - def lbu = "b000100".U - def lhu = "b000101".U - def lwu = "b000110".U - def flw = "b010110".U // box 32-bit data to 64-bit with 1s - def sb = "b001000".U - def sh = "b001001".U - def sw = "b001010".U - def sd = "b001011".U - - def lr = "b100000".U - def sc = "b100001".U - def amoswap = "b100010".U - def amoadd = "b100011".U - def amoxor = "b100100".U - def amoand = "b100101".U - def amoor = "b100110".U - def amomin = "b110111".U - def amomax = "b110000".U - def amominu = "b110001".U - def amomaxu = "b110010".U - - def isStore(func: UInt): Bool = func(3) - def isAtom(func: UInt): Bool = func(5) - def isLoad(func: UInt): Bool = !isStore(func) & !isAtom(func) - def isLR(func: UInt): Bool = func === lr - def isSC(func: UInt): Bool = func === sc - def isAMO(func: UInt): Bool = isAtom(func) && !isLR(func) && !isSC(func) - - def atomW = "010".U - def atomD = "011".U -} - -class LSUIO extends FunctionUnitIO { - val wdata = Input(UInt(XLEN.W)) - val instr = Input(UInt(32.W)) // Atom insts need aq rl funct3 bit from instr - val dmem = new SimpleBusUC(addrBits = VAddrBits) - val isMMIO = Output(Bool()) - val dtlbPF = Output(Bool()) - val loadAddrMisaligned = Output(Bool()) - val storeAddrMisaligned = Output(Bool()) -} - -class StoreQueueEntry extends NOOPBundle{ - val src1 = UInt(XLEN.W) - val src2 = UInt(XLEN.W) - val wdata = UInt(XLEN.W) - val func = UInt(6.W) -} - -class AtomALU extends NOOPModule { - val io = IO(new NOOPBundle{ - val src1 = Input(UInt(XLEN.W)) - val src2 = Input(UInt(XLEN.W)) - val func = Input(UInt(6.W)) - val isWordOp = 
Input(Bool()) - val result = Output(UInt(XLEN.W)) - }) - - // src1: load result - // src2: reg result - val src1 = io.src1 - val src2 = io.src2 - val func = io.func - val isAdderSub = (func =/= LSUOpType.amoadd) - val adderRes = (src1 +& (src2 ^ Fill(XLEN, isAdderSub))) + isAdderSub - val xorRes = src1 ^ src2 - val sltu = !adderRes(XLEN) - val slt = xorRes(XLEN-1) ^ sltu - - val res = LookupTreeDefault(func(5, 0), adderRes, List( - LSUOpType.amoswap -> src2, - LSUOpType.amoadd -> adderRes, - LSUOpType.amoxor -> xorRes, - LSUOpType.amoand -> (src1 & src2), - LSUOpType.amoor -> (src1 | src2), - LSUOpType.amomin -> Mux(slt(0), src1, src2), - LSUOpType.amomax -> Mux(slt(0), src2, src1), - LSUOpType.amominu -> Mux(sltu(0), src1, src2), - LSUOpType.amomaxu -> Mux(sltu(0), src2, src1) - )) - - io.result := Mux(io.isWordOp, SignExt(res(31,0), 64), res) -} - -class LSU extends NOOPModule { - val io = IO(new LSUIO) - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt, dtlbPF: Bool): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - dtlbPF := io.dtlbPF - io.out.bits - } - val lsExecUnit = Module(new LSExecUnit) - lsExecUnit.io.instr := DontCare - io.dtlbPF := lsExecUnit.io.dtlbPF - - val storeReq = valid & LSUOpType.isStore(func) - val loadReq = valid & LSUOpType.isLoad(func) - val atomReq = valid & LSUOpType.isAtom(func) - val amoReq = valid & LSUOpType.isAMO(func) - val lrReq = valid & LSUOpType.isLR(func) - val scReq = valid & LSUOpType.isSC(func) - BoringUtils.addSource(amoReq, "ISAMO") - BoringUtils.addSource(amoReq, "ISAMO2") - - val aq = io.instr(26) - val rl = io.instr(25) - val funct3 = io.instr(14, 12) - - val atomWidthW = !funct3(0) - val atomWidthD = funct3(0) - - // Atom LR/SC Control Bits - val setLr = Wire(Bool()) - val setLrVal = Wire(Bool()) - val setLrAddr = Wire(UInt(AddrBits.W)) - val lr = WireInit(Bool(), 
false.B) - val lrAddr = WireInit(UInt(AddrBits.W), DontCare) - BoringUtils.addSource(setLr, "set_lr") - BoringUtils.addSource(setLrVal, "set_lr_val") - BoringUtils.addSource(setLrAddr, "set_lr_addr") - BoringUtils.addSink(lr, "lr") - BoringUtils.addSink(lrAddr, "lr_addr") - - val scInvalid = !(src1 === lrAddr) && scReq - - // PF signal from TLB - val dtlbFinish = WireInit(false.B) - val dtlbPF = WireInit(false.B) - val dtlbEnable = WireInit(false.B) - BoringUtils.addSink(dtlbFinish, "DTLBFINISH") - BoringUtils.addSink(dtlbPF, "DTLBPF") - BoringUtils.addSink(dtlbEnable, "DTLBENABLE") - - // LSU control FSM state - val s_idle :: s_load :: s_lr :: s_sc :: s_amo_l :: s_amo_a :: s_amo_s :: Nil = Enum(7) - - // LSU control FSM - val state = RegInit(s_idle) - val atomMemReg = Reg(UInt(XLEN.W)) - val atomRegReg = Reg(UInt(XLEN.W)) - val atomALU = Module(new AtomALU) - atomALU.io.src1 := atomMemReg - atomALU.io.src2 := io.wdata - atomALU.io.func := func - atomALU.io.isWordOp := atomWidthW - - // StoreQueue - // TODO: inst fence needs storeQueue to be finished - val enableStoreQueue = EnableStoreQueue // StoreQueue is disabled for page fault detection - val storeQueue = Module(new Queue(new StoreQueueEntry, 4)) - storeQueue.io.enq.valid := state === s_idle && storeReq - storeQueue.io.enq.bits.src1 := src1 - storeQueue.io.enq.bits.src2 := src2 - storeQueue.io.enq.bits.wdata := io.wdata - storeQueue.io.enq.bits.func := func - storeQueue.io.deq.ready := lsExecUnit.io.out.fire() - - lsExecUnit.io.in.valid := false.B - lsExecUnit.io.out.ready := DontCare - lsExecUnit.io.in.bits.src1 := DontCare - lsExecUnit.io.in.bits.src2 := DontCare - lsExecUnit.io.in.bits.func := DontCare - lsExecUnit.io.wdata := DontCare - io.out.valid := false.B - io.in.ready := false.B - - switch (state) { - is(s_idle){ - if(enableStoreQueue){ - lsExecUnit.io.in.valid := Mux(storeQueue.io.deq.valid, storeQueue.io.deq.valid, io.in.valid) - lsExecUnit.io.out.ready := io.out.ready - lsExecUnit.io.in.bits.src1 
:= Mux(storeQueue.io.deq.valid, storeQueue.io.deq.bits.src1, src1) - lsExecUnit.io.in.bits.src2 := Mux(storeQueue.io.deq.valid, storeQueue.io.deq.bits.src2, src2) - lsExecUnit.io.in.bits.func := Mux(storeQueue.io.deq.valid, storeQueue.io.deq.bits.func, func) - lsExecUnit.io.wdata := Mux(storeQueue.io.deq.valid, storeQueue.io.deq.bits.wdata, io.wdata) - io.in.ready := Mux(storeReq, storeQueue.io.enq.ready, false.B) || scInvalid - io.out.valid := Mux(storeReq, storeQueue.io.enq.ready, false.B) || scInvalid - }else{ - lsExecUnit.io.in.valid := io.in.valid && !atomReq - lsExecUnit.io.out.ready := io.out.ready - lsExecUnit.io.in.bits.src1 := src1 - lsExecUnit.io.in.bits.src2 := src2 - lsExecUnit.io.in.bits.func := func - lsExecUnit.io.wdata := io.wdata - io.in.ready := lsExecUnit.io.out.fire() || scInvalid - io.out.valid := lsExecUnit.io.out.valid || scInvalid - } - - // when(storeReq){ - // state := s_idle - // } - if(enableStoreQueue){ - when(loadReq){state := Mux(storeQueue.io.deq.valid, s_idle, s_load)} - when(amoReq){state := Mux(storeQueue.io.deq.valid, s_idle, s_amo_l)} - when(lrReq){state := Mux(storeQueue.io.deq.valid, s_idle, s_lr)} - when(scReq){state := Mux(storeQueue.io.deq.valid, s_idle, s_sc)} - }else{ - when(amoReq){state := s_amo_l} - when(lrReq){state := s_lr} - when(scReq){state := Mux(scInvalid, s_idle, s_sc)} - } - } - - is(s_load){ - lsExecUnit.io.in.valid := true.B - lsExecUnit.io.out.ready := io.out.ready - lsExecUnit.io.in.bits.src1 := src1 - lsExecUnit.io.in.bits.src2 := src2 - lsExecUnit.io.in.bits.func := func - lsExecUnit.io.wdata := DontCare - io.in.ready := lsExecUnit.io.out.fire() - io.out.valid := lsExecUnit.io.out.valid - when(lsExecUnit.io.out.fire()){state := s_idle}//load finished - } - - is(s_amo_l){ - lsExecUnit.io.in.valid := true.B - lsExecUnit.io.out.ready := true.B - lsExecUnit.io.in.bits.src1 := src1 - lsExecUnit.io.in.bits.src2 := 0.U - lsExecUnit.io.in.bits.func := Mux(atomWidthD, LSUOpType.ld, LSUOpType.lw) - 
lsExecUnit.io.wdata := DontCare - io.in.ready := false.B - io.out.valid := false.B - when(lsExecUnit.io.out.fire()){ - state := s_amo_a; - Debug(){printf("[AMO-L] lsExecUnit.io.out.bits %x addr %x src2 %x\n", lsExecUnit.io.out.bits, lsExecUnit.io.in.bits.src1, io.wdata)} - } - atomMemReg := lsExecUnit.io.out.bits - atomRegReg := lsExecUnit.io.out.bits - } - - is(s_amo_a){ - lsExecUnit.io.in.valid := false.B - lsExecUnit.io.out.ready := false.B - lsExecUnit.io.in.bits.src1 := DontCare - lsExecUnit.io.in.bits.src2 := DontCare - lsExecUnit.io.in.bits.func := DontCare - lsExecUnit.io.wdata := DontCare - io.in.ready := false.B - io.out.valid := false.B - state := s_amo_s - atomMemReg := atomALU.io.result - Debug(){printf("[AMO-A] src1 %x src2 %x res %x\n", atomMemReg, io.wdata, atomALU.io.result)} - } - - is(s_amo_s){ - lsExecUnit.io.in.valid := true.B - lsExecUnit.io.out.ready := io.out.ready - lsExecUnit.io.in.bits.src1 := src1 - lsExecUnit.io.in.bits.src2 := 0.U - lsExecUnit.io.in.bits.func := Mux(atomWidthD, LSUOpType.sd, LSUOpType.sw) - lsExecUnit.io.wdata := atomMemReg - io.in.ready := lsExecUnit.io.out.fire() - io.out.valid := lsExecUnit.io.out.fire() - when(lsExecUnit.io.out.fire()){ - state := s_idle; - Debug(){printf("[AMO-S] atomRegReg %x addr %x\n", atomRegReg, lsExecUnit.io.in.bits.src1)} - } - } - is(s_lr){ - lsExecUnit.io.in.valid := true.B - lsExecUnit.io.out.ready := io.out.ready - lsExecUnit.io.in.bits.src1 := src1 - lsExecUnit.io.in.bits.src2 := 0.U - lsExecUnit.io.in.bits.func := Mux(atomWidthD, LSUOpType.ld, LSUOpType.lw) - lsExecUnit.io.wdata := DontCare - io.in.ready := lsExecUnit.io.out.fire() - io.out.valid := lsExecUnit.io.out.fire() - when(lsExecUnit.io.out.fire()){ - state := s_idle; - Debug(){printf("[LR]\n")} - } - } - is(s_sc){ - lsExecUnit.io.in.valid := true.B - lsExecUnit.io.out.ready := io.out.ready - lsExecUnit.io.in.bits.src1 := src1 - lsExecUnit.io.in.bits.src2 := 0.U - lsExecUnit.io.in.bits.func := Mux(atomWidthD, LSUOpType.sd, 
LSUOpType.sw) - lsExecUnit.io.wdata := io.wdata - io.in.ready := lsExecUnit.io.out.fire() - io.out.valid := lsExecUnit.io.out.fire() - when(lsExecUnit.io.out.fire()){ - state := s_idle; - Debug(){printf("[SC] \n")} - } - } - } - when(dtlbPF || io.loadAddrMisaligned || io.storeAddrMisaligned){ - state := s_idle - io.out.valid := true.B - io.in.ready := true.B - } - - // controled by FSM - // io.in.ready := lsExecUnit.io.in.ready - // lsExecUnit.io.wdata := io.wdata - // io.out.valid := lsExecUnit.io.out.valid - - //Set LR/SC bits - setLr := io.out.fire() && (lrReq || scReq) - setLrVal := lrReq - setLrAddr := src1 - - io.dmem <> lsExecUnit.io.dmem - io.out.bits := Mux(scReq, scInvalid, Mux(state === s_amo_s, atomRegReg, lsExecUnit.io.out.bits)) - - val lsuMMIO = WireInit(false.B) - BoringUtils.addSink(lsuMMIO, "lsuMMIO") - - val mmioReg = RegInit(false.B) - when (!mmioReg) { mmioReg := lsuMMIO } - when (io.out.valid) { mmioReg := false.B } - io.isMMIO := mmioReg && io.out.valid - - io.loadAddrMisaligned := lsExecUnit.io.loadAddrMisaligned - io.storeAddrMisaligned := lsExecUnit.io.storeAddrMisaligned -} - -class LSExecUnit extends NOOPModule { - val io = IO(new LSUIO) - - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - io.out.bits - } - - def genWmask(addr: UInt, sizeEncode: UInt): UInt = { - LookupTree(sizeEncode, List( - "b00".U -> 0x1.U, //0001 << addr(2:0) - "b01".U -> 0x3.U, //0011 - "b10".U -> 0xf.U, //1111 - "b11".U -> 0xff.U //11111111 - )) << addr(2, 0) - } - def genWdata(data: UInt, sizeEncode: UInt): UInt = { - LookupTree(sizeEncode, List( - "b00".U -> Fill(8, data(7, 0)), - "b01".U -> Fill(4, data(15, 0)), - "b10".U -> Fill(2, data(31, 0)), - "b11".U -> data - )) - } - - val dmem = io.dmem - val addr = src1 + src2 - val addrLatch = RegNext(addr) - val 
isStore = valid && LSUOpType.isStore(func) - val partialLoad = !isStore && (func =/= LSUOpType.ld) - - val s_idle :: s_wait_tlb :: s_wait_resp :: s_partialLoad :: Nil = Enum(4) - val state = RegInit(s_idle) - - val dtlbFinish = WireInit(false.B) - val dtlbPF = WireInit(false.B) - val dtlbEnable = WireInit(false.B) - BoringUtils.addSink(dtlbFinish, "DTLBFINISH") - BoringUtils.addSink(dtlbPF, "DTLBPF") - BoringUtils.addSink(dtlbEnable, "DTLBENABLE") - - io.dtlbPF := dtlbPF - - switch (state) { - is (s_idle) { - when (dmem.req.fire() && dtlbEnable) { state := s_wait_tlb } - when (dmem.req.fire() && !dtlbEnable) { state := s_wait_resp } - //when (dmem.req.fire()) { state := Mux(isStore, s_partialLoad, s_wait_resp) } - } - is (s_wait_tlb) { - when (dtlbFinish && dtlbPF ) { state := s_idle } - when (dtlbFinish && !dtlbPF) { state := s_wait_resp/*Mux(isStore, s_partialLoad, s_wait_resp) */} - } - is (s_wait_resp) { when (dmem.resp.fire()) { state := Mux(partialLoad, s_partialLoad, s_idle) } } - is (s_partialLoad) { state := s_idle } - } - - Debug(){ - //when (dmem.req.fire()){ - printf("[LSU] IN(%d, %d) OUT(%d, %d) addr %x, size %x, wdata_raw %x, isStore %x \n", io.in.valid, io.in.ready, io.out.valid, io.out.ready, addr, func(1,0), io.wdata, isStore) - printf("[LSU] dtlbFinish:%d dtlbEnable:%d dtlbPF:%d state:%d addr:%x dmemReqFire:%d dmemRespFire:%d dmemRdata:%x \n",dtlbFinish, dtlbEnable, dtlbPF, state, dmem.req.bits.addr, dmem.req.fire(), dmem.resp.fire(), dmem.resp.bits.rdata) - //} - //when (dtlbFinish && dtlbEnable) { - printf("[LSU] dtlbFinish:%d dtlbEnable:%d dtlbPF:%d state:%d addr:%x dmemReqFire:%d dmemRespFire:%d dmemRdata:%x \n",dtlbFinish, dtlbEnable, dtlbPF, state, dmem.req.bits.addr, dmem.req.fire(), dmem.resp.fire(), dmem.resp.bits.rdata) - //} - } - - val size = func(1,0) - dmem.req.bits.apply(addr = addr(VAddrBits-1, 0), size = size, wdata = genWdata(io.wdata, size), - wmask = genWmask(addr, size), cmd = Mux(isStore, SimpleBusCmd.write, 
SimpleBusCmd.read)) - dmem.req.valid := valid && (state === s_idle) && !io.loadAddrMisaligned && !io.storeAddrMisaligned - dmem.resp.ready := true.B - - io.out.valid := Mux( dtlbPF || io.loadAddrMisaligned || io.storeAddrMisaligned, true.B, Mux(partialLoad, state === s_partialLoad, dmem.resp.fire() && (state === s_wait_resp))) - io.in.ready := (state === s_idle) || dtlbPF - - val rdata = dmem.resp.bits.rdata - val rdataLatch = RegNext(rdata) - val rdataSel = LookupTree(addrLatch(2, 0), List( - "b000".U -> rdataLatch(63, 0), - "b001".U -> rdataLatch(63, 8), - "b010".U -> rdataLatch(63, 16), - "b011".U -> rdataLatch(63, 24), - "b100".U -> rdataLatch(63, 32), - "b101".U -> rdataLatch(63, 40), - "b110".U -> rdataLatch(63, 48), - "b111".U -> rdataLatch(63, 56) - )) - val rdataPartialLoad = LookupTree(func, List( - LSUOpType.lb -> SignExt(rdataSel(7, 0) , XLEN), - LSUOpType.lh -> SignExt(rdataSel(15, 0), XLEN), - LSUOpType.lw -> SignExt(rdataSel(31, 0), XLEN), - LSUOpType.lbu -> ZeroExt(rdataSel(7, 0) , XLEN), - LSUOpType.lhu -> ZeroExt(rdataSel(15, 0), XLEN), - LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN), - LSUOpType.flw -> boxF32ToF64(rdataSel(31,0)) - )) - val addrAligned = LookupTree(func(1,0), List( - "b00".U -> true.B, //b - "b01".U -> (addr(0) === 0.U), //h - "b10".U -> (addr(1,0) === 0.U), //w - "b11".U -> (addr(2,0) === 0.U) //d - )) - - io.out.bits := Mux(partialLoad, rdataPartialLoad, rdata) - - io.isMMIO := DontCare - - val isAMO = WireInit(false.B) - BoringUtils.addSink(isAMO, "ISAMO2") - BoringUtils.addSource(addr, "LSUADDR") - - io.loadAddrMisaligned := valid && !isStore && !isAMO && !addrAligned - io.storeAddrMisaligned := valid && (isStore || isAMO) && !addrAligned - - when(io.loadAddrMisaligned || io.storeAddrMisaligned) { - //printf("[LSU] misaligned addr detected\n") - } - - BoringUtils.addSource(dmem.isRead() && dmem.req.fire(), "perfCntCondMloadInstr") - BoringUtils.addSource(BoolStopWatch(dmem.isRead(), dmem.resp.fire()), 
"perfCntCondMloadStall") - BoringUtils.addSource(BoolStopWatch(dmem.isWrite(), dmem.resp.fire()), "perfCntCondMstoreStall") - BoringUtils.addSource(io.isMMIO, "perfCntCondMmmioInstr") - Debug() { - when (dmem.req.fire() && (addr === "h80104708".U || genWdata(io.wdata, size)(31,0) === "h80000218".U)){ - //printf("[LSUBP] time %d, addr %x, size %x, wdata_raw %x, wdata %x, isStore %x \n", GTimer(), addr, func(1,0), io.wdata, genWdata(io.wdata, size), isStore) - } - } -} diff --git a/src/main/scala/noop/fu/MDU.scala b/src/main/scala/noop/fu/MDU.scala deleted file mode 100644 index 20fecd768bd608b601b66e6fd05655f46491b681..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/fu/MDU.scala +++ /dev/null @@ -1,170 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -object MDUOpType { - def mul = "b0000".U - def mulh = "b0001".U - def mulhsu = "b0010".U - def mulhu = "b0011".U - def div = "b0100".U - def divu = "b0101".U - def rem = "b0110".U - def remu = "b0111".U - - def mulw = "b1000".U - def divw = "b1100".U - def divuw = "b1101".U - def remw = "b1110".U - def remuw = "b1111".U - - def isDiv(op: UInt) = op(2) - def isDivSign(op: UInt) = isDiv(op) && !op(0) - def isW(op: UInt) = op(3) -} - -class MulDivIO(val len: Int) extends Bundle { - val in = Flipped(DecoupledIO(Vec(2, Output(UInt(len.W))))) - val sign = Input(Bool()) - val out = DecoupledIO(Output(UInt((len * 2).W))) -} - -class Multiplier(len: Int) extends NOOPModule { - val io = IO(new MulDivIO(len)) - val latency = 1 - - def DSPInPipe[T <: Data](a: T) = RegNext(a) - def DSPOutPipe[T <: Data](a: T) = RegNext(RegNext(RegNext(a))) - val mulRes = (DSPInPipe(io.in.bits(0)).asSInt * DSPInPipe(io.in.bits(1)).asSInt) - io.out.bits := DSPOutPipe(mulRes).asUInt - io.out.valid := DSPOutPipe(DSPInPipe(io.in.fire())) - - val busy = RegInit(false.B) - when (io.in.valid && !busy) { busy := true.B } - when (io.out.valid) { busy := 
false.B } - io.in.ready := (if (latency == 0) true.B else !busy) -} - -class Divider(len: Int = 64) extends NOOPModule { - val io = IO(new MulDivIO(len)) - - def abs(a: UInt, sign: Bool): (Bool, UInt) = { - val s = a(len - 1) && sign - (s, Mux(s, -a, a)) - } - - val s_idle :: s_log2 :: s_shift :: s_compute :: s_finish :: Nil = Enum(5) - val state = RegInit(s_idle) - val newReq = (state === s_idle) && io.in.fire() - - val (a, b) = (io.in.bits(0), io.in.bits(1)) - val divBy0 = b === 0.U(len.W) - - val shiftReg = Reg(UInt((1 + len * 2).W)) - val hi = shiftReg(len * 2, len) - val lo = shiftReg(len - 1, 0) - - val (aSign, aVal) = abs(a, io.sign) - val (bSign, bVal) = abs(b, io.sign) - val aSignReg = RegEnable(aSign, newReq) - val qSignReg = RegEnable((aSign ^ bSign) && !divBy0, newReq) - val bReg = RegEnable(bVal, newReq) - val aValx2Reg = RegEnable(Cat(aVal, "b0".U), newReq) - - val cnt = Counter(len) - when (newReq) { - state := s_log2 - } .elsewhen (state === s_log2) { - // `canSkipShift` is calculated as following: - // bEffectiveBit = Log2(bVal, XLEN) + 1.U - // aLeadingZero = 64.U - aEffectiveBit = 64.U - (Log2(aVal, XLEN) + 1.U) - // canSkipShift = aLeadingZero + bEffectiveBit - // = 64.U - (Log2(aVal, XLEN) + 1.U) + Log2(bVal, XLEN) + 1.U - // = 64.U + Log2(bVal, XLEN) - Log2(aVal, XLEN) - // = (64.U | Log2(bVal, XLEN)) - Log2(aVal, XLEN) // since Log2(bVal, XLEN) < 64.U - val canSkipShift = (64.U | Log2(bReg)) - Log2(aValx2Reg) - // When divide by 0, the quotient should be all 1's. - // Therefore we can not shift in 0s here. - // We do not skip any shift to avoid this. 
- cnt.value := Mux(divBy0, 0.U, Mux(canSkipShift >= (len-1).U, (len-1).U, canSkipShift)) - state := s_shift - } .elsewhen (state === s_shift) { - shiftReg := aValx2Reg << cnt.value - state := s_compute - } .elsewhen (state === s_compute) { - val enough = hi.asUInt >= bReg.asUInt - shiftReg := Cat(Mux(enough, hi - bReg, hi)(len - 1, 0), lo, enough) - cnt.inc() - when (cnt.value === (len-1).U) { state := s_finish } - } .elsewhen (state === s_finish) { - state := s_idle - } - - val r = hi(len, 1) - val resQ = Mux(qSignReg, -lo, lo) - val resR = Mux(aSignReg, -r, r) - io.out.bits := Cat(resR, resQ) - - io.out.valid := (if (HasDiv) (state === s_finish) else io.in.valid) // FIXME: should deal with ready = 0 - io.in.ready := (state === s_idle) -} - -class MDUIO extends FunctionUnitIO { -} - -class MDU extends NOOPModule { - val io = IO(new MDUIO) - - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - io.out.bits - } - - val isDiv = MDUOpType.isDiv(func) - val isDivSign = MDUOpType.isDivSign(func) - val isW = MDUOpType.isW(func) - - val mul = Module(new Multiplier(XLEN + 1)) - val div = Module(new Divider(64)) - List(mul.io, div.io).map { case x => - x.sign := isDivSign - x.out.ready := io.out.ready - } - - val signext = SignExt(_: UInt, XLEN+1) - val zeroext = ZeroExt(_: UInt, XLEN+1) - val mulInputFuncTable = List( - MDUOpType.mul -> (zeroext, zeroext), - MDUOpType.mulh -> (signext, signext), - MDUOpType.mulhsu -> (signext, zeroext), - MDUOpType.mulhu -> (zeroext, zeroext) - ) - mul.io.in.bits(0) := LookupTree(func(1,0), mulInputFuncTable.map(p => (p._1(1,0), p._2._1(src1)))) - mul.io.in.bits(1) := LookupTree(func(1,0), mulInputFuncTable.map(p => (p._1(1,0), p._2._2(src2)))) - - val divInputFunc = (x: UInt) => Mux(isW, Mux(isDivSign, SignExt(x(31,0), XLEN), 
ZeroExt(x(31,0), XLEN)), x) - div.io.in.bits(0) := divInputFunc(src1) - div.io.in.bits(1) := divInputFunc(src2) - - mul.io.in.valid := io.in.valid && !isDiv - div.io.in.valid := io.in.valid && isDiv - - val mulRes = Mux(func(1,0) === MDUOpType.mul(1,0), mul.io.out.bits(XLEN-1,0), mul.io.out.bits(2*XLEN-1,XLEN)) - val divRes = Mux(func(1) /* rem */, div.io.out.bits(2*XLEN-1,XLEN), div.io.out.bits(XLEN-1,0)) - val res = Mux(isDiv, divRes, mulRes) - io.out.bits := Mux(isW, SignExt(res(31,0),XLEN), res) - - val isDivReg = Mux(io.in.fire(), isDiv, RegNext(isDiv)) - io.in.ready := Mux(isDiv, div.io.in.ready, mul.io.in.ready) - io.out.valid := Mux(isDivReg, div.io.out.valid, mul.io.out.valid) - - BoringUtils.addSource(mul.io.out.fire(), "perfCntCondMmulInstr") -} diff --git a/src/main/scala/noop/fu/MOU.scala b/src/main/scala/noop/fu/MOU.scala deleted file mode 100644 index ff40bcc293792bdcd2771ddf02ce83198821ec82..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/fu/MOU.scala +++ /dev/null @@ -1,54 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -// memory order unit -object MOUOpType { - def fence = "b00".U - def fencei = "b01".U - def sfence_vma = "b10".U -} - -class MOUIO extends FunctionUnitIO { - val cfIn = Flipped(new CtrlFlowIO) - val redirect = new RedirectIO -} - -class MOU extends NOOPModule { - val io = IO(new MOUIO) - - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - io.out.bits - } - - io.redirect.target := io.cfIn.pc + 4.U - io.redirect.valid := valid - val flushICache = valid && (func === MOUOpType.fencei) - BoringUtils.addSource(flushICache, "MOUFlushICache") - Debug(false){ - when(flushICache){ - printf("%d: [MOU] Flush I$ at %x\n", GTimer(), io.cfIn.pc) - 
} - } - - val flushTLB = valid && (func === MOUOpType.sfence_vma) - BoringUtils.addSource(flushTLB, "MOUFlushTLB") - Debug(false) { - when (flushTLB) { - printf("%d: [MOU] Flush TLB at %x\n", GTimer(), io.cfIn.pc) - } - } - - io.out.bits := 0.U - io.in.ready := true.B - io.out.valid := valid -} diff --git a/src/main/scala/noop/isa/Priviledged.scala b/src/main/scala/noop/isa/Priviledged.scala deleted file mode 100644 index 09faa568935b32f80dfb58c8dd21db16fadb2541..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/Priviledged.scala +++ /dev/null @@ -1,24 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object Priviledged extends HasInstrType { - def ECALL = BitPat("b000000000000_00000_000_00000_1110011") - def MRET = BitPat("b001100000010_00000_000_00000_1110011") - def SRET = BitPat("b000100000010_00000_000_00000_1110011") - def SFANCE_VMA = BitPat("b0001001_?????_?????_000_00000_1110011") - def FENCE = BitPat("b????????????_?????_000_?????_0001111") - def WFI = BitPat("b0001000_00101_00000_000_00000_1110011") - - val table = Array( - ECALL -> List(InstrI, FuType.csr, CSROpType.jmp), - MRET -> List(InstrI, FuType.csr, CSROpType.jmp), - SRET -> List(InstrI, FuType.csr, CSROpType.jmp), - SFANCE_VMA -> List(InstrR, FuType.mou, MOUOpType.sfence_vma), - FENCE -> List(InstrS, FuType.alu, ALUOpType.add), // nop InstrS -> !wen - WFI -> List(InstrI, FuType.alu, ALUOpType.add) // nop - // FENCE -> List(InstrB, FuType.mou, MOUOpType.fencei) - - ) -} diff --git a/src/main/scala/noop/isa/RVA.scala b/src/main/scala/noop/isa/RVA.scala deleted file mode 100644 index ba9c0d000f644e125791926e2cfdc4ad046fb32a..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVA.scala +++ /dev/null @@ -1,42 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object RVAInstr extends HasInstrType { - // Note: use instr(14,12) to distinguish D/W inst - // def LR = BitPat("b00010??00000_?????_???_?????_0101111") - // def SC = 
BitPat("b00011??00000_?????_???_?????_0101111") - def LR_D = BitPat("b00010_??_00000_?????_011_?????_0101111") - def SC_D = BitPat("b00011_??_?????_?????_011_?????_0101111") - def LR_W = BitPat("b00010_??_00000_?????_010_?????_0101111") - def SC_W = BitPat("b00011_??_?????_?????_010_?????_0101111") - def AMOSWAP = BitPat("b00001_??_?????_?????_01?_?????_0101111") - def AMOADD = BitPat("b00000_??_?????_?????_01?_?????_0101111") - def AMOXOR = BitPat("b00100_??_?????_?????_01?_?????_0101111") - def AMOAND = BitPat("b01100_??_?????_?????_01?_?????_0101111") - def AMOOR = BitPat("b01000_??_?????_?????_01?_?????_0101111") - def AMOMIN = BitPat("b10000_??_?????_?????_01?_?????_0101111") - def AMOMAX = BitPat("b10100_??_?????_?????_01?_?????_0101111") - def AMOMINU = BitPat("b11000_??_?????_?????_01?_?????_0101111") - def AMOMAXU = BitPat("b11100_??_?????_?????_01?_?????_0101111") - // funct3 === 010 or 011 - - val table = Array( - // LR -> List(InstrI, FuType.lsu, LSUOpType.lr), - LR_D -> List(InstrI, FuType.lsu, LSUOpType.lr), - LR_W -> List(InstrI, FuType.lsu, LSUOpType.lr), - // SC -> List(InstrS, FuType.lsu, LSUOpType.sc), - SC_D -> List(InstrSA, FuType.lsu, LSUOpType.sc), - SC_W -> List(InstrSA, FuType.lsu, LSUOpType.sc), - AMOSWAP -> List(InstrR, FuType.lsu, LSUOpType.amoswap), - AMOADD -> List(InstrR, FuType.lsu, LSUOpType.amoadd), - AMOXOR -> List(InstrR, FuType.lsu, LSUOpType.amoxor), - AMOAND -> List(InstrR, FuType.lsu, LSUOpType.amoand), - AMOOR -> List(InstrR, FuType.lsu, LSUOpType.amoor), - AMOMIN -> List(InstrR, FuType.lsu, LSUOpType.amomin), - AMOMAX -> List(InstrR, FuType.lsu, LSUOpType.amomax), - AMOMINU -> List(InstrR, FuType.lsu, LSUOpType.amominu), - AMOMAXU -> List(InstrR, FuType.lsu, LSUOpType.amomaxu) - ) -} diff --git a/src/main/scala/noop/isa/RVC.scala b/src/main/scala/noop/isa/RVC.scala deleted file mode 100644 index 2b5a5888947b7af7141c98c864de4eba45531eb0..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVC.scala +++ 
/dev/null @@ -1,204 +0,0 @@ -// This package is used to deal with RVC decode -package noop - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -import utils._ - -trait HasRVCConst { - - val RVCRegNumTable = List( - "b000".U -> 8.U, - "b001".U -> 9.U, - "b010".U -> 10.U, - "b011".U -> 11.U, - "b100".U -> 12.U, - "b101".U -> 13.U, - "b110".U -> 14.U, - "b111".U -> 15.U - ) - - // Imm src - def ImmNone = "b10000".U - def ImmLWSP = "b00000".U - def ImmLDSP = "b00001".U - def ImmSWSP = "b00010".U - def ImmSDSP = "b00011".U - def ImmSW = "b00100".U - def ImmSD = "b00101".U - def ImmLW = "b00110".U - def ImmLD = "b00111".U - def ImmJ = "b01000".U - def ImmB = "b01001".U - def ImmLI = "b01010".U - def ImmLUI = "b01011".U - def ImmADDI = "b01100".U - def ImmADDI16SP = "b01101".U - def ImmADD4SPN = "b01110".U - - // REG src - def DtCare = "b0000".U // reg x0 - def REGrs = "b0011".U - def REGrt = "b0001".U - def REGrd = "b0010".U - def REGrs1 = "b0100".U - def REGrs2 = "b0101".U - def REGrs1p = "b0110".U - def REGrs2p = "b0111".U - def REGx1 = "b1000".U - def REGx2 = "b1001".U -} - -object RVCInstr extends HasInstrType with HasRVCConst { - - // RVC 00 -// def C_XX = BitPat("b????????????????_???_?_10_987_65_432_10") - def C_ILLEGAL = BitPat("b0000000000000000_000_0_00_000_00_000_00") - def C_ADDI4SPN = BitPat("b????????????????_000_?_??_???_??_???_00") - def C_FLD = BitPat("b????????????????_001_?_??_???_??_???_00") -// def C_LQ = BitPat("b????????????????_001_?_??_???_??_???_00") - def C_LW = BitPat("b????????????????_010_?_??_???_??_???_00") -// def C_FLW = BitPat("b????????????????_011_?_??_???_??_???_00") // RV32FC Only - def C_LD = BitPat("b????????????????_011_?_??_???_??_???_00") - // def C_LI = BitPat("b????????????????_100_?_??_???_??_???_00") //reserved - def C_FSD = BitPat("b????????????????_101_?_??_???_??_???_00") -// def C_SQ = BitPat("b????????????????_101_?_??_???_??_???_00") - def C_SW = 
BitPat("b????????????????_110_?_??_???_??_???_00") -// def C_FSW = BitPat("b????????????????_111_?_??_???_??_???_00") // RV32FC Only - def C_SD = BitPat("b????????????????_111_?_??_???_??_???_00") - - // RVC 01 - def C_NOP = BitPat("b????????????????_000_?_00_000_??_???_01") - def C_ADDI = BitPat("b????????????????_000_?_??_???_??_???_01") - def C_JAL = BitPat("b????????????????_001_?_??_???_??_???_01") - def C_ADDIW = BitPat("b????????????????_001_?_??_???_??_???_01") - def C_LI = BitPat("b????????????????_010_?_??_???_??_???_01") - def C_ADDI16SP= BitPat("b????????????????_011_?_00_010_??_???_01") - def C_LUI = BitPat("b????????????????_011_?_??_???_??_???_01") - def C_SRLI = BitPat("b????????????????_100_?_00_???_??_???_01") -// def C_SRLI64 = BitPat("b????????????????_100_0_01_???_00_000_01") - def C_SRAI = BitPat("b????????????????_100_?_01_???_??_???_01") -// def C_SAI64 = BitPat("b????????????????_100_0_01_???_00_000_01") - def C_ANDI = BitPat("b????????????????_100_?_10_???_??_???_01") - def C_SUB = BitPat("b????????????????_100_0_11_???_00_???_01") - def C_XOR = BitPat("b????????????????_100_0_11_???_01_???_01") - def C_OR = BitPat("b????????????????_100_0_11_???_10_???_01") - def C_AND = BitPat("b????????????????_100_0_11_???_11_???_01") - def C_SUBW = BitPat("b????????????????_100_1_11_???_00_???_01") - def C_ADDW = BitPat("b????????????????_100_1_11_???_01_???_01") -// def C_RES = BitPat("b????????????????_100_1_11_???_??_???_01") -// def C_RES = BitPat("b????????????????_100_1_11_???_??_???_01") - def C_J = BitPat("b????????????????_101_?_??_???_??_???_01") - def C_BEQZ = BitPat("b????????????????_110_?_??_???_??_???_01") - def C_BNEZ = BitPat("b????????????????_111_?_??_???_??_???_01") - - //RVC 11 - def C_SLLI = BitPat("b????????????????_000_?_??_???_??_???_10") -// def C_SLLI64 = BitPat("b????????????????_000_0_??_???_00_000_10") - def C_FLDSP = BitPat("b????????????????_001_?_??_???_??_???_10") -// def C_LQSP = 
BitPat("b????????????????_001_?_??_???_??_???_10") - def C_LWSP = BitPat("b????????????????_010_?_??_???_??_???_10") -// def C_FLWSP = BitPat("b????????????????_011_?_??_???_??_???_10") // RV32FC Only - def C_LDSP = BitPat("b????????????????_011_?_??_???_??_???_10") - def C_JR = BitPat("b????????????????_100_0_??_???_00_000_10") - def C_MV = BitPat("b????????????????_100_0_??_???_??_???_10") - def C_EBREAK = BitPat("b????????????????_100_1_00_000_00_000_10") - def C_JALR = BitPat("b????????????????_100_1_??_???_00_000_10") - def C_ADD = BitPat("b????????????????_100_1_??_???_??_???_10") - def C_FSDSP = BitPat("b????????????????_101_?_??_???_??_???_10") -// def C_SQSP = BitPat("b????????????????_101_?_??_???_??_???_10") - def C_SWSP = BitPat("b????????????????_110_?_??_???_??_???_10") -// def C_FSWSP = BitPat("b????????????????_111_?_??_???_??_???_10") // RV32FC Only - def C_SDSP = BitPat("b????????????????_111_?_??_???_??_???_10") - - // TODO: HINT - // TODO: RES - -// def is_C_ADDI4SPN(op: UInt) = op(12,5) =/= 0.U - - val table = Array( - C_ILLEGAL -> List(InstrN, FuType.csr, CSROpType.jmp), - C_ADDI4SPN -> List(InstrI, FuType.alu, ALUOpType.add), - C_LW -> List(InstrI, FuType.lsu, LSUOpType.lw), - C_LD -> List(InstrI, FuType.lsu, LSUOpType.ld), - C_SW -> List(InstrS, FuType.lsu, LSUOpType.sw), - C_SD -> List(InstrS, FuType.lsu, LSUOpType.sd), - C_NOP -> List(InstrI, FuType.alu, ALUOpType.add), - C_ADDI -> List(InstrI, FuType.alu, ALUOpType.add), - // C_JAL -> List(InstrI, FuType.alu, ALUOpType.add),//RV32C only - C_ADDIW -> List(InstrI, FuType.alu, ALUOpType.addw), - C_LI -> List(InstrI, FuType.alu, ALUOpType.add), - C_ADDI16SP -> List(InstrI, FuType.alu, ALUOpType.add), - C_LUI -> List(InstrI, FuType.alu, ALUOpType.add), - C_SRLI -> List(InstrI, FuType.alu, ALUOpType.srl), - C_SRAI -> List(InstrI, FuType.alu, ALUOpType.sra), - C_ANDI -> List(InstrI, FuType.alu, ALUOpType.and), - C_SUB -> List(InstrR, FuType.alu, ALUOpType.sub), - C_XOR -> List(InstrR, 
FuType.alu, ALUOpType.xor), - C_OR -> List(InstrR, FuType.alu, ALUOpType.or), - C_AND -> List(InstrR, FuType.alu, ALUOpType.and), - C_SUBW -> List(InstrR, FuType.alu, ALUOpType.subw), - C_ADDW -> List(InstrR, FuType.alu, ALUOpType.addw), - C_J -> List(InstrJ, FuType.alu, ALUOpType.jal), - C_BEQZ -> List(InstrB, FuType.alu, ALUOpType.beq), - C_BNEZ -> List(InstrB, FuType.alu, ALUOpType.bne), - C_SLLI -> List(InstrI, FuType.alu, ALUOpType.sll), - C_LWSP -> List(InstrI, FuType.lsu, LSUOpType.lw), - C_LDSP -> List(InstrI, FuType.lsu, LSUOpType.ld), - C_JR -> List(InstrI, FuType.alu, ALUOpType.jalr), - C_MV -> List(InstrR, FuType.alu, ALUOpType.add), - C_EBREAK -> List(InstrI, FuType.alu, ALUOpType.add), - C_JALR -> List(InstrI, FuType.alu, ALUOpType.jalr), - C_ADD -> List(InstrR, FuType.alu, ALUOpType.add), - C_SWSP -> List(InstrS, FuType.lsu, LSUOpType.sw), - C_SDSP -> List(InstrS, FuType.lsu, LSUOpType.sd) - ) - - val cExtraTable = Array( - C_ADDI4SPN -> List(ImmADD4SPN, REGx2, DtCare, REGrs2p), - C_FLD -> List(ImmLD, REGrs1p, DtCare, REGrs2p), - C_LW -> List(ImmLW, REGrs1p, DtCare, REGrs2p), - C_LD -> List(ImmLD, REGrs1p, DtCare, REGrs2p), - C_FSD -> List(ImmSD, REGrs1p, REGrs2p, DtCare), - C_SW -> List(ImmSW, REGrs1p, REGrs2p, DtCare), - C_SD -> List(ImmSD, REGrs1p, REGrs2p, DtCare), - C_NOP -> List(ImmNone, DtCare, DtCare, DtCare), - C_ADDI -> List(ImmADDI, REGrd, DtCare, REGrd), - // C_JAL -> List(), - C_ADDIW -> List(ImmADDI, REGrd, DtCare, REGrd), - C_LI -> List(ImmLI, DtCare, DtCare, REGrd), - C_ADDI16SP -> List(ImmADDI16SP, REGx2, DtCare, REGx2), - C_LUI -> List(ImmLUI, DtCare, DtCare, REGrd), - C_SRLI -> List(ImmLI, REGrs1p, DtCare, REGrs1p), - C_SRAI -> List(ImmLI, REGrs1p, DtCare, REGrs1p), - C_ANDI -> List(ImmLI, REGrs1p, DtCare, REGrs1p), - C_SUB -> List(ImmNone, REGrs1p, REGrs2p, REGrs1p), - C_XOR -> List(ImmNone, REGrs1p, REGrs2p, REGrs1p), - C_OR -> List(ImmNone, REGrs1p, REGrs2p, REGrs1p), - C_AND -> List(ImmNone, REGrs1p, REGrs2p, REGrs1p), - C_SUBW 
-> List(ImmNone, REGrs1p, REGrs2p, REGrs1p), - C_ADDW -> List(ImmNone, REGrs1p, REGrs2p, REGrs1p), - C_J -> List(ImmJ, DtCare, DtCare, DtCare), - C_BEQZ -> List(ImmB, REGrs1p, DtCare, DtCare), // rd: x0 - C_BNEZ -> List(ImmB, REGrs1p, DtCare, DtCare), // rd: x0 - C_SLLI -> List(ImmLI, REGrd, DtCare, REGrd), - C_FLDSP -> List(ImmLDSP, REGx2, DtCare, REGrd), - // C_LQSP -> List(), - C_LWSP -> List(ImmLWSP, REGx2, DtCare, REGrd), - C_LDSP -> List(ImmLDSP, REGx2, DtCare, REGrd), - C_JR -> List(ImmNone, REGrs1, DtCare, DtCare), - C_MV -> List(ImmNone, REGrs2, DtCare, REGrd), - C_EBREAK -> List(ImmNone, DtCare, DtCare, DtCare), //not implemented - C_JALR -> List(ImmNone, REGrs1, DtCare, REGx1), - C_ADD -> List(ImmNone, REGrd, REGrs2, REGrd), - C_FSDSP -> List(ImmSDSP, REGx2, REGrs2, DtCare), - // C_SQSP -> List(), - C_SWSP -> List(ImmSWSP, REGx2, REGrs2, DtCare), - C_SDSP -> List(ImmSDSP, REGx2, REGrs2, DtCare) - ) - - //TODO: support pc = 2 aligned address - //TODO: branch predictor support pc = 2 align -} diff --git a/src/main/scala/noop/isa/RVD.scala b/src/main/scala/noop/isa/RVD.scala deleted file mode 100644 index 495b16bf27d37e987389dc575876907a22820560..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVD.scala +++ /dev/null @@ -1,107 +0,0 @@ -package noop.isa - -import Chisel.BitPat -import noop._ -import noop.SrcType.{fp, imm, reg} -import RVF_FPUInstr.{Y, N} -import RVCInstr._ -import fpu.FPUIOFunc._ -import fpu.FPUOpType._ - -object RVD_LSUInstr extends HasInstrType{ - def FLD = BitPat("b?????????????????011?????0000111") - def FSD = BitPat("b?????????????????011?????0100111") - val table = Array( - FLD -> List(InstrI, FuType.lsu, LSUOpType.ld), - C_FLD -> List(InstrI, FuType.lsu, LSUOpType.ld), - C_FLDSP -> List(InstrI, FuType.lsu, LSUOpType.ld), - FSD -> List(InstrS, FuType.lsu, LSUOpType.sd), - C_FSD -> List(InstrS, FuType.lsu, LSUOpType.sd), - C_FSDSP -> List(InstrS, FuType.lsu, LSUOpType.sd) - ) -} - -object RVD_FPUInstr extends 
HasNOOPParameter { - def FADD_D = BitPat("b0000001??????????????????1010011") - def FSUB_D = BitPat("b0000101??????????????????1010011") - def FMUL_D = BitPat("b0001001??????????????????1010011") - def FDIV_D = BitPat("b0001101??????????????????1010011") - def FSGNJ_D = BitPat("b0010001??????????000?????1010011") - def FSGNJN_D = BitPat("b0010001??????????001?????1010011") - def FSGNJX_D = BitPat("b0010001??????????010?????1010011") - def FMIN_D = BitPat("b0010101??????????000?????1010011") - def FMAX_D = BitPat("b0010101??????????001?????1010011") - def FCVT_S_D = BitPat("b010000000001?????????????1010011") - def FCVT_D_S = BitPat("b010000100000?????????????1010011") - def FSQRT_D = BitPat("b010110100000?????????????1010011") - def FLE_D = BitPat("b1010001??????????000?????1010011") - def FLT_D = BitPat("b1010001??????????001?????1010011") - def FEQ_D = BitPat("b1010001??????????010?????1010011") - def FCVT_W_D = BitPat("b110000100000?????????????1010011") - def FCVT_WU_D = BitPat("b110000100001?????????????1010011") - def FCVT_L_D = BitPat("b110000100010?????????????1010011") - def FCVT_LU_D = BitPat("b110000100011?????????????1010011") - def FMV_X_D = BitPat("b111000100000?????000?????1010011") - def FCLASS_D = BitPat("b111000100000?????001?????1010011") - def FCVT_D_W = BitPat("b110100100000?????????????1010011") - def FCVT_D_WU = BitPat("b110100100001?????????????1010011") - def FCVT_D_L = BitPat("b110100100010?????????????1010011") - def FCVT_D_LU = BitPat("b110100100011?????????????1010011") - def FMV_D_X = BitPat("b111100100000?????000?????1010011") - def FLD = BitPat("b?????????????????011?????0000111") - def FSD = BitPat("b?????????????????011?????0100111") - def FMADD_D = BitPat("b?????01??????????????????1000011") - def FMSUB_D = BitPat("b?????01??????????????????1000111") - def FNMSUB_D = BitPat("b?????01??????????????????1001011") - def FNMADD_D = BitPat("b?????01??????????????????1001111") - // (isFp, src1Type, src2Type, src3Type, rfWen, fpWen, 
fuOpType, inputFunc, outputFunc) - val table = Array( - FLD -> List(Y, reg, imm, imm, N, Y, LSUOpType.ld, in_raw, out_raw), - C_FLD -> List(Y, reg, imm, imm, N, Y, LSUOpType.ld, in_raw, out_raw), - C_FLDSP -> List(Y, reg, imm, imm, N, Y, LSUOpType.ld, in_raw, out_raw), - FSD -> List(Y, reg, fp, imm, N, N, LSUOpType.sd, in_raw, out_raw), - C_FSD -> List(Y, reg, fp, imm, N, N, LSUOpType.sd, in_raw, out_raw), - C_FSDSP -> List(Y, reg, fp, imm, N, N, LSUOpType.sd, in_raw, out_raw), - // fp fp -> fp - FADD_D -> List(Y, fp, fp, imm, N, Y, fadd, in_raw, out_raw), - FSUB_D -> List(Y, fp, fp, imm, N, Y, fsub, in_raw, out_raw), - FMUL_D -> List(Y, fp, fp, imm, N, Y, fmul, in_raw, out_raw), - FDIV_D -> List(Y, fp, fp, imm, N, Y, fdiv, in_raw, out_raw), - FMIN_D -> List(Y, fp, fp, imm, N, Y, fmin, in_raw, out_raw), - FMAX_D -> List(Y, fp, fp, imm, N, Y, fmax, in_raw, out_raw), - FSGNJ_D -> List(Y, fp, fp, imm, N, Y, fsgnj, in_raw, out_raw), - FSGNJN_D -> List(Y, fp, fp, imm, N, Y, fsgnjn, in_raw, out_raw), - FSGNJX_D -> List(Y, fp, fp, imm, N, Y, fsgnjx, in_raw, out_raw), - // fp -> fp - FSQRT_D -> List(Y, fp, imm, imm, N, Y, fsqrt, in_raw, out_raw), - FCVT_S_D -> List(Y, fp, imm, imm, N, Y, d2s, in_raw, out_box), - FCVT_D_S -> List(Y, fp, imm, imm, N, Y, s2d, in_unbox, out_raw), - // fp fp fp -> fp - FMADD_D -> List(Y, fp, fp, fp, N, Y, fmadd, in_raw, out_raw), - FNMADD_D -> List(Y, fp, fp, fp, N, Y, fnmadd, in_raw, out_raw), - FMSUB_D -> List(Y, fp, fp, fp, N, Y, fmsub, in_raw, out_raw), - FNMSUB_D -> List(Y, fp, fp, fp, N, Y, fnmsub, in_raw, out_raw), - // fp -> gp - FCLASS_D -> List(Y, fp, imm, imm, Y, N, fclass, in_raw, out_raw), - FMV_X_D -> List(Y, fp, imm, imm, Y, N, fmv_f2i, in_raw, out_raw), - FCVT_W_D -> List(Y, fp, imm, imm, Y, N, f2w, in_raw, out_sext), - FCVT_WU_D -> List(Y, fp, imm, imm, Y, N, f2wu, in_raw, out_sext), - FCVT_L_D -> List(Y, fp, imm, imm, Y, N, f2l, in_raw, out_raw), - FCVT_LU_D -> List(Y, fp, imm, imm, Y, N, f2lu, in_raw, out_raw), - // fp fp -> 
gp - FLE_D -> List(Y, fp, fp, imm, Y, N, fle, in_raw, out_raw), - FLT_D -> List(Y, fp, fp, imm, Y, N, flt, in_raw, out_raw), - FEQ_D -> List(Y, fp, fp, imm, Y, N, feq, in_raw, out_raw), - // gp -> fp - FMV_D_X -> List(Y, reg, imm, imm, N, Y, fmv_i2f, in_raw, out_raw), - FCVT_D_W -> List(Y, reg, imm, imm, N, Y, w2f, in_raw, out_raw), - FCVT_D_WU -> List(Y, reg, imm, imm, N, Y, wu2f, in_raw, out_raw), - FCVT_D_L -> List(Y, reg, imm, imm, N, Y, l2f, in_raw, out_raw), - FCVT_D_LU -> List(Y, reg, imm, imm, N, Y, lu2f, in_raw, out_raw) - ) -} - -object RVDInstr { - val table = RVD_LSUInstr.table - val extraTable = RVD_FPUInstr.table -} \ No newline at end of file diff --git a/src/main/scala/noop/isa/RVF.scala b/src/main/scala/noop/isa/RVF.scala deleted file mode 100644 index 6ab8abea10a70a63db850ecca434afaeeb7600e4..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVF.scala +++ /dev/null @@ -1,100 +0,0 @@ -package noop.isa - -import Chisel.BitPat -import chisel3._ -import noop._ -import noop.SrcType._ -import fpu.FPUOpType._ -import fpu.FPUIOFunc._ - -object RVF_LSUInstr extends HasInstrType{ - def FLW = BitPat("b?????????????????010?????0000111") - def FSW = BitPat("b?????????????????010?????0100111") - val table = Array( - FLW -> List(InstrI, FuType.lsu, LSUOpType.flw), - FSW -> List(InstrS, FuType.lsu, LSUOpType.sw) - ) -} - -object RVF_FPUInstr extends HasNOOPParameter { - def FADD_S = BitPat("b0000000??????????????????1010011") - def FSUB_S = BitPat("b0000100??????????????????1010011") - def FMUL_S = BitPat("b0001000??????????????????1010011") - def FDIV_S = BitPat("b0001100??????????????????1010011") - def FSGNJ_S = BitPat("b0010000??????????000?????1010011") - def FSGNJN_S = BitPat("b0010000??????????001?????1010011") - def FSGNJX_S = BitPat("b0010000??????????010?????1010011") - def FMIN_S = BitPat("b0010100??????????000?????1010011") - def FMAX_S = BitPat("b0010100??????????001?????1010011") - def FSQRT_S = 
BitPat("b010110000000?????????????1010011") - def FLE_S = BitPat("b1010000??????????000?????1010011") - def FLT_S = BitPat("b1010000??????????001?????1010011") - def FEQ_S = BitPat("b1010000??????????010?????1010011") - def FCVT_W_S = BitPat("b110000000000?????????????1010011") - def FCVT_WU_S = BitPat("b110000000001?????????????1010011") - def FCVT_L_S = BitPat("b110000000010?????????????1010011") - def FCVT_LU_S = BitPat("b110000000011?????????????1010011") - def FMV_X_W = BitPat("b111000000000?????000?????1010011") - def FCLASS_S = BitPat("b111000000000?????001?????1010011") - def FCVT_S_W = BitPat("b110100000000?????????????1010011") - def FCVT_S_WU = BitPat("b110100000001?????????????1010011") - def FCVT_S_L = BitPat("b110100000010?????????????1010011") - def FCVT_S_LU = BitPat("b110100000011?????????????1010011") - def FMV_W_X = BitPat("b111100000000?????000?????1010011") - def FLW = BitPat("b?????????????????010?????0000111") - def FSW = BitPat("b?????????????????010?????0100111") - def FMADD_S = BitPat("b?????00??????????????????1000011") - def FMSUB_S = BitPat("b?????00??????????????????1000111") - def FNMSUB_S = BitPat("b?????00??????????????????1001011") - def FNMADD_S = BitPat("b?????00??????????????????1001111") - - def Y: Bool = true.B - def N: Bool = false.B - - // (isFp, src1Type, src2Type, src3Type, rfWen, fpWen, fuOpType, inputFunc, outputFunc) - val DecodeDefault = List(N, imm, imm, imm, N, N, fadd, in_raw, out_raw) - val table = Array( - FLW -> List(Y, reg, imm, imm, N, Y, LSUOpType.flw, in_raw, out_raw), - FSW -> List(Y, reg, fp, imm, N, N, LSUOpType.sw, in_raw, out_raw), - // fp fp -> fp - FADD_S -> List(Y, fp, fp, imm, N, Y, fadd, in_unbox, out_box), - FSUB_S -> List(Y, fp, fp, imm, N, Y, fsub, in_unbox, out_box), - FMUL_S -> List(Y, fp, fp, imm, N, Y, fmul, in_unbox, out_box), - FDIV_S -> List(Y, fp, fp, imm, N, Y, fdiv, in_unbox, out_box), - FMIN_S -> List(Y, fp, fp, imm, N, Y, fmin, in_unbox, out_box), - FMAX_S -> List(Y, fp, fp, imm, N, 
Y, fmax, in_unbox, out_box), - FSGNJ_S -> List(Y, fp, fp, imm, N, Y, fsgnj, in_unbox, out_box), - FSGNJN_S -> List(Y, fp, fp, imm, N, Y, fsgnjn, in_unbox, out_box), - FSGNJX_S -> List(Y, fp, fp, imm, N, Y, fsgnjx, in_unbox, out_box), - // fp -> fp - FSQRT_S -> List(Y, fp, imm, imm, N, Y, fsqrt, in_unbox, out_box), - // fp fp fp -> fp - FMADD_S -> List(Y, fp, fp, fp, N, Y, fmadd, in_unbox, out_box), - FNMADD_S -> List(Y, fp, fp, fp, N, Y, fnmadd, in_unbox, out_box), - FMSUB_S -> List(Y, fp, fp, fp, N, Y, fmsub, in_unbox, out_box), - FNMSUB_S -> List(Y, fp, fp, fp, N, Y, fnmsub, in_unbox, out_box), - // fp -> gp - FCLASS_S -> List(Y, fp, imm, imm, Y, N, fclass, in_unbox, out_raw), - FMV_X_W -> List(Y, fp, imm, imm, Y, N, fmv_f2i, in_raw, out_sext), - FCVT_W_S -> List(Y, fp, imm, imm, Y, N, f2w, in_unbox, out_sext), - FCVT_WU_S -> List(Y, fp, imm, imm, Y, N, f2wu, in_unbox, out_sext), - FCVT_L_S -> List(Y, fp, imm, imm, Y, N, f2l, in_unbox, out_raw), - FCVT_LU_S -> List(Y, fp, imm, imm, Y, N, f2lu, in_unbox, out_raw) , - // fp fp -> gp - FLE_S -> List(Y, fp, fp, imm, Y, N, fle, in_unbox, out_raw), - FLT_S -> List(Y, fp, fp, imm, Y, N, flt, in_unbox, out_raw), - FEQ_S -> List(Y, fp, fp, imm, Y, N, feq, in_unbox, out_raw), - // gp -> fp - FMV_W_X -> List(Y, reg, imm, imm, N, Y, fmv_i2f, in_raw, out_box), - FCVT_S_W -> List(Y, reg, imm, imm, N, Y, w2f, in_raw, out_box), - FCVT_S_WU -> List(Y, reg, imm, imm, N, Y, wu2f, in_raw, out_box), - FCVT_S_L -> List(Y, reg, imm, imm, N, Y, l2f, in_raw, out_box), - FCVT_S_LU -> List(Y, reg, imm, imm, N, Y, lu2f, in_raw, out_box) - ) -} - -object RVFInstr{ - val table = RVF_LSUInstr.table - val extraTable = RVF_FPUInstr.table - val extraTableDefault = RVF_FPUInstr.DecodeDefault -} diff --git a/src/main/scala/noop/isa/RVI.scala b/src/main/scala/noop/isa/RVI.scala deleted file mode 100644 index f4a70266cf384e6030899481f9af2dca2c7ce602..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVI.scala +++ /dev/null @@ 
-1,155 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object RV32I_ALUInstr extends HasInstrType with HasNOOPParameter { - def ADDI = BitPat("b????????????_?????_000_?????_0010011") - def SLLI = if (XLEN == 32) BitPat("b0000000?????_?????_001_?????_0010011") - else BitPat("b000000??????_?????_001_?????_0010011") - def SLTI = BitPat("b????????????_?????_010_?????_0010011") - def SLTIU = BitPat("b????????????_?????_011_?????_0010011") - def XORI = BitPat("b????????????_?????_100_?????_0010011") - def SRLI = if (XLEN == 32) BitPat("b0000000?????_?????_101_?????_0010011") - else BitPat("b000000??????_?????_101_?????_0010011") - def ORI = BitPat("b????????????_?????_110_?????_0010011") - def ANDI = BitPat("b????????????_?????_111_?????_0010011") - def SRAI = if (XLEN == 32) BitPat("b0100000?????_?????_101_?????_0010011") - else BitPat("b010000??????_?????_101_?????_0010011") - - def ADD = BitPat("b0000000_?????_?????_000_?????_0110011") - def SLL = BitPat("b0000000_?????_?????_001_?????_0110011") - def SLT = BitPat("b0000000_?????_?????_010_?????_0110011") - def SLTU = BitPat("b0000000_?????_?????_011_?????_0110011") - def XOR = BitPat("b0000000_?????_?????_100_?????_0110011") - def SRL = BitPat("b0000000_?????_?????_101_?????_0110011") - def OR = BitPat("b0000000_?????_?????_110_?????_0110011") - def AND = BitPat("b0000000_?????_?????_111_?????_0110011") - def SUB = BitPat("b0100000_?????_?????_000_?????_0110011") - def SRA = BitPat("b0100000_?????_?????_101_?????_0110011") - - def AUIPC = BitPat("b????????????????????_?????_0010111") - def LUI = BitPat("b????????????????????_?????_0110111") - - val table = Array( - ADDI -> List(InstrI, FuType.alu, ALUOpType.add), - SLLI -> List(InstrI, FuType.alu, ALUOpType.sll), - SLTI -> List(InstrI, FuType.alu, ALUOpType.slt), - SLTIU -> List(InstrI, FuType.alu, ALUOpType.sltu), - XORI -> List(InstrI, FuType.alu, ALUOpType.xor), - SRLI -> List(InstrI, FuType.alu, ALUOpType.srl), - ORI -> List(InstrI, 
FuType.alu, ALUOpType.or ), - ANDI -> List(InstrI, FuType.alu, ALUOpType.and), - SRAI -> List(InstrI, FuType.alu, ALUOpType.sra), - - ADD -> List(InstrR, FuType.alu, ALUOpType.add), - SLL -> List(InstrR, FuType.alu, ALUOpType.sll), - SLT -> List(InstrR, FuType.alu, ALUOpType.slt), - SLTU -> List(InstrR, FuType.alu, ALUOpType.sltu), - XOR -> List(InstrR, FuType.alu, ALUOpType.xor), - SRL -> List(InstrR, FuType.alu, ALUOpType.srl), - OR -> List(InstrR, FuType.alu, ALUOpType.or ), - AND -> List(InstrR, FuType.alu, ALUOpType.and), - SUB -> List(InstrR, FuType.alu, ALUOpType.sub), - SRA -> List(InstrR, FuType.alu, ALUOpType.sra), - - AUIPC -> List(InstrU, FuType.alu, ALUOpType.add), - LUI -> List(InstrU, FuType.alu, ALUOpType.add) - ) -} - -object RV32I_BRUInstr extends HasInstrType { - def JAL = BitPat("b????????????????????_?????_1101111") - def JALR = BitPat("b????????????_?????_000_?????_1100111") - - def BNE = BitPat("b???????_?????_?????_001_?????_1100011") - def BEQ = BitPat("b???????_?????_?????_000_?????_1100011") - def BLT = BitPat("b???????_?????_?????_100_?????_1100011") - def BGE = BitPat("b???????_?????_?????_101_?????_1100011") - def BLTU = BitPat("b???????_?????_?????_110_?????_1100011") - def BGEU = BitPat("b???????_?????_?????_111_?????_1100011") - - val table = Array( - JAL -> List(InstrJ, FuType.alu, ALUOpType.jal), - JALR -> List(InstrI, FuType.alu, ALUOpType.jalr), - - BEQ -> List(InstrB, FuType.alu, ALUOpType.beq), - BNE -> List(InstrB, FuType.alu, ALUOpType.bne), - BLT -> List(InstrB, FuType.alu, ALUOpType.blt), - BGE -> List(InstrB, FuType.alu, ALUOpType.bge), - BLTU -> List(InstrB, FuType.alu, ALUOpType.bltu), - BGEU -> List(InstrB, FuType.alu, ALUOpType.bgeu) - ) - - val bruFuncTobtbTypeTable = List( - ALUOpType.beq -> BTBtype.B, - ALUOpType.bne -> BTBtype.B, - ALUOpType.blt -> BTBtype.B, - ALUOpType.bge -> BTBtype.B, - ALUOpType.bltu -> BTBtype.B, - ALUOpType.bgeu -> BTBtype.B, - ALUOpType.call -> BTBtype.J, - ALUOpType.ret -> BTBtype.R, - 
ALUOpType.jal -> BTBtype.J, - ALUOpType.jalr -> BTBtype.I - ) -} - -object RV32I_LSUInstr extends HasInstrType { - def LB = BitPat("b????????????_?????_000_?????_0000011") - def LH = BitPat("b????????????_?????_001_?????_0000011") - def LW = BitPat("b????????????_?????_010_?????_0000011") - def LBU = BitPat("b????????????_?????_100_?????_0000011") - def LHU = BitPat("b????????????_?????_101_?????_0000011") - def SB = BitPat("b???????_?????_?????_000_?????_0100011") - def SH = BitPat("b???????_?????_?????_001_?????_0100011") - def SW = BitPat("b???????_?????_?????_010_?????_0100011") - - val table = Array( - LB -> List(InstrI, FuType.lsu, LSUOpType.lb ), - LH -> List(InstrI, FuType.lsu, LSUOpType.lh ), - LW -> List(InstrI, FuType.lsu, LSUOpType.lw ), - LBU -> List(InstrI, FuType.lsu, LSUOpType.lbu), - LHU -> List(InstrI, FuType.lsu, LSUOpType.lhu), - SB -> List(InstrS, FuType.lsu, LSUOpType.sb ), - SH -> List(InstrS, FuType.lsu, LSUOpType.sh ), - SW -> List(InstrS, FuType.lsu, LSUOpType.sw) - ) -} - -object RV64IInstr extends HasInstrType { - def ADDIW = BitPat("b???????_?????_?????_000_?????_0011011") - def SLLIW = BitPat("b0000000_?????_?????_001_?????_0011011") - def SRLIW = BitPat("b0000000_?????_?????_101_?????_0011011") - def SRAIW = BitPat("b0100000_?????_?????_101_?????_0011011") - def SLLW = BitPat("b0000000_?????_?????_001_?????_0111011") - def SRLW = BitPat("b0000000_?????_?????_101_?????_0111011") - def SRAW = BitPat("b0100000_?????_?????_101_?????_0111011") - def ADDW = BitPat("b0000000_?????_?????_000_?????_0111011") - def SUBW = BitPat("b0100000_?????_?????_000_?????_0111011") - - def LWU = BitPat("b???????_?????_?????_110_?????_0000011") - def LD = BitPat("b???????_?????_?????_011_?????_0000011") - def SD = BitPat("b???????_?????_?????_011_?????_0100011") - - val table = Array( - ADDIW -> List(InstrI, FuType.alu, ALUOpType.addw), - SLLIW -> List(InstrI, FuType.alu, ALUOpType.sllw), - SRLIW -> List(InstrI, FuType.alu, ALUOpType.srlw), - SRAIW -> 
List(InstrI, FuType.alu, ALUOpType.sraw), - SLLW -> List(InstrR, FuType.alu, ALUOpType.sllw), - SRLW -> List(InstrR, FuType.alu, ALUOpType.srlw), - SRAW -> List(InstrR, FuType.alu, ALUOpType.sraw), - ADDW -> List(InstrR, FuType.alu, ALUOpType.addw), - SUBW -> List(InstrR, FuType.alu, ALUOpType.subw), - - LWU -> List(InstrI, FuType.lsu, LSUOpType.lwu), - LD -> List(InstrI, FuType.lsu, LSUOpType.ld ), - SD -> List(InstrS, FuType.lsu, LSUOpType.sd) - ) -} - -object RVIInstr extends HasNOOPParameter { - val table = RV32I_ALUInstr.table ++ RV32I_BRUInstr.table ++ RV32I_LSUInstr.table ++ - (if (XLEN == 64) RV64IInstr.table else Nil) -} diff --git a/src/main/scala/noop/isa/RVM.scala b/src/main/scala/noop/isa/RVM.scala deleted file mode 100644 index 804c680c951f8f32b5dfd27377c0945dde49c1f5..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVM.scala +++ /dev/null @@ -1,57 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object RV32MInstr extends HasInstrType with HasNOOPParameter { - def MUL = BitPat("b0000001_?????_?????_000_?????_0110011") - def MULH = BitPat("b0000001_?????_?????_001_?????_0110011") - def MULHSU = BitPat("b0000001_?????_?????_010_?????_0110011") - def MULHU = BitPat("b0000001_?????_?????_011_?????_0110011") - def DIV = BitPat("b0000001_?????_?????_100_?????_0110011") - def DIVU = BitPat("b0000001_?????_?????_101_?????_0110011") - def REM = BitPat("b0000001_?????_?????_110_?????_0110011") - def REMU = BitPat("b0000001_?????_?????_111_?????_0110011") - def MULW = BitPat("b0000001_?????_?????_000_?????_0111011") - def DIVW = BitPat("b0000001_?????_?????_100_?????_0111011") - def DIVUW = BitPat("b0000001_?????_?????_101_?????_0111011") - def REMW = BitPat("b0000001_?????_?????_110_?????_0111011") - def REMUW = BitPat("b0000001_?????_?????_111_?????_0111011") - - val mulTable = Array( - MUL -> List(InstrR, FuType.mdu, MDUOpType.mul), - MULH -> List(InstrR, FuType.mdu, MDUOpType.mulh), - MULHSU -> List(InstrR, 
FuType.mdu, MDUOpType.mulhsu), - MULHU -> List(InstrR, FuType.mdu, MDUOpType.mulhu) - ) - val divTable = Array( - DIV -> List(InstrR, FuType.mdu, MDUOpType.div), - DIVU -> List(InstrR, FuType.mdu, MDUOpType.divu), - REM -> List(InstrR, FuType.mdu, MDUOpType.rem), - REMU -> List(InstrR, FuType.mdu, MDUOpType.remu) - ) - val table = mulTable ++ (if (HasDiv) divTable else Nil) -} - -object RV64MInstr extends HasInstrType with HasNOOPParameter { - def MULW = BitPat("b0000001_?????_?????_000_?????_0111011") - def DIVW = BitPat("b0000001_?????_?????_100_?????_0111011") - def DIVUW = BitPat("b0000001_?????_?????_101_?????_0111011") - def REMW = BitPat("b0000001_?????_?????_110_?????_0111011") - def REMUW = BitPat("b0000001_?????_?????_111_?????_0111011") - - val mulTable = Array( - MULW -> List(InstrR, FuType.mdu, MDUOpType.mulw) - ) - val divTable = Array( - DIVW -> List(InstrR, FuType.mdu, MDUOpType.divw), - DIVUW -> List(InstrR, FuType.mdu, MDUOpType.divuw), - REMW -> List(InstrR, FuType.mdu, MDUOpType.remw), - REMUW -> List(InstrR, FuType.mdu, MDUOpType.remuw) - ) - val table = mulTable ++ (if (HasDiv) divTable else Nil) -} - -object RVMInstr extends HasNOOPParameter { - val table = RV32MInstr.table ++ (if (XLEN == 64) RV64MInstr.table else Nil) -} diff --git a/src/main/scala/noop/isa/RVZicsr.scala b/src/main/scala/noop/isa/RVZicsr.scala deleted file mode 100644 index 8906ee39f571f55c2269affff3a6040442ea523b..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVZicsr.scala +++ /dev/null @@ -1,22 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object RVZicsrInstr extends HasInstrType { - def CSRRW = BitPat("b????????????_?????_001_?????_1110011") - def CSRRS = BitPat("b????????????_?????_010_?????_1110011") - def CSRRC = BitPat("b????????????_?????_011_?????_1110011") - def CSRRWI = BitPat("b????????????_?????_101_?????_1110011") - def CSRRSI = BitPat("b????????????_?????_110_?????_1110011") - def CSRRCI = 
BitPat("b????????????_?????_111_?????_1110011") - - val table = Array( - CSRRW -> List(InstrI, FuType.csr, CSROpType.wrt), - CSRRS -> List(InstrI, FuType.csr, CSROpType.set), - CSRRC -> List(InstrI, FuType.csr, CSROpType.clr), - CSRRWI -> List(InstrI, FuType.csr, CSROpType.wrti), - CSRRSI -> List(InstrI, FuType.csr, CSROpType.seti), - CSRRCI -> List(InstrI, FuType.csr, CSROpType.clri) - ) -} diff --git a/src/main/scala/noop/isa/RVZifencei.scala b/src/main/scala/noop/isa/RVZifencei.scala deleted file mode 100644 index 22fa15d7d795353eb745ee2ef472649797ea6b55..0000000000000000000000000000000000000000 --- a/src/main/scala/noop/isa/RVZifencei.scala +++ /dev/null @@ -1,12 +0,0 @@ -package noop - -import chisel3._ -import chisel3.util._ - -object RVZifenceiInstr extends HasInstrType { - def FENCEI = BitPat("b000000000000_00000_001_00000_0001111") - - val table = Array( - FENCEI -> List(InstrB, FuType.mou, MOUOpType.fencei) - ) -} diff --git a/src/main/scala/system/Coherence.scala b/src/main/scala/system/Coherence.scala deleted file mode 100644 index 0d5150467387bf9952daf20ef3382b9a177b6c92..0000000000000000000000000000000000000000 --- a/src/main/scala/system/Coherence.scala +++ /dev/null @@ -1,83 +0,0 @@ -package system - -import chisel3._ -import chisel3.util._ - -import utils._ -import bus.simplebus._ -import noop.HasNOOPParameter - -trait HasCoherenceParameter extends HasNOOPParameter { - val supportCoh = HasDcache -} - -class CoherenceManager extends Module with HasCoherenceParameter { - val io = IO(new Bundle { - val in = Flipped(new SimpleBusUC) - val out = new Bundle { - val mem = new SimpleBusUC - val coh = new SimpleBusUC - } - }) - - // state transition: - // write: s_idle -> s_memWriteResp -> s_idle - // read: s_idle -> s_probeResp -> (hit) s_probeForward -> s_idle - // +> (miss) s_memReadReq -> s_memReadResp -> s_idle - - val s_idle :: s_probeResp :: s_probeForward :: s_memReadReq :: s_memReadResp :: s_memWriteResp :: Nil = Enum(6) - val state = 
RegInit(s_idle) - val inflight = state =/= s_idle - - val thisReq = io.in.req - assert(!(thisReq.valid && !thisReq.bits.isRead() && !thisReq.bits.isWrite())) - - // when read, we should first probe dcache - val reqLatch = RegEnable(thisReq.bits, !inflight && thisReq.bits.isRead()) - io.out.coh match { case c => { - c.req.bits := thisReq.bits - c.req.bits.cmd := SimpleBusCmd.probe - c.resp.ready := true.B - }} - - io.out.mem.req.bits := thisReq.bits - // bind correct valid and ready signals - io.out.mem.req.valid := false.B - thisReq.ready := false.B - io.out.coh.req.valid := false.B - when (if (supportCoh) thisReq.bits.isWrite() else true.B) { - io.out.mem.req.valid := thisReq.valid && !inflight - thisReq.ready := io.out.mem.req.ready && !inflight - } .elsewhen (thisReq.bits.isRead()) { - io.out.coh.req.valid := thisReq.valid && !inflight - thisReq.ready := io.out.coh.req.ready && !inflight - } - - io.in.resp <> io.out.mem.resp - - switch (state) { - is (s_idle) { - when (thisReq.fire()) { - when (thisReq.bits.isRead()) { state := Mux(supportCoh.B, s_probeResp, s_memReadResp) } - .elsewhen (thisReq.bits.isWriteLast()) { state := s_memWriteResp } - } - } - is (s_probeResp) { - when (io.out.coh.resp.fire()) { - state := Mux(io.out.coh.resp.bits.isProbeHit(), s_probeForward, s_memReadReq) - } - } - is (s_probeForward) { - val thisResp = io.in.resp - thisResp <> io.out.coh.resp - when (thisResp.fire() && thisResp.bits.isReadLast()) { state := s_idle } - } - is (s_memReadReq) { - io.out.mem.req.bits := reqLatch - io.out.mem.req.valid := true.B - when (io.out.mem.req.fire()) { state := s_memReadResp } - } - is (s_memReadResp) { when (io.out.mem.resp.fire() && io.out.mem.resp.bits.isReadLast()) { state := s_idle } } - is (s_memWriteResp) { when (io.out.mem.resp.fire()) { state := s_idle } } - } -} diff --git a/src/main/scala/system/Prefetcher.scala b/src/main/scala/system/Prefetcher.scala deleted file mode 100644 index 
8d2f45b525ce52e38e3df9736083ac84038a8c3a..0000000000000000000000000000000000000000 --- a/src/main/scala/system/Prefetcher.scala +++ /dev/null @@ -1,47 +0,0 @@ -package system - -import noop.{NOOP, NOOPConfig, HasNOOPParameter, Cache, CacheConfig} -import bus.axi4.{AXI4, AXI4Lite} -import bus.simplebus._ -import utils._ - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils - -trait HasPrefetcherParameter extends HasNOOPParameter { - val supportPrefetch = HasDcache -} - -class Prefetcher extends Module with HasPrefetcherParameter { - val io = IO(new Bundle { - val in = Flipped(Decoupled(new SimpleBusReqBundle)) - val out = Decoupled(new SimpleBusReqBundle) - }) - val getNewReq = RegInit(false.B) - val prefetchReq = RegNext(io.in.bits) - prefetchReq.cmd := SimpleBusCmd.prefetch - prefetchReq.addr := io.in.bits.addr + XLEN.U - - val lastReqAddr = (RegEnable(io.in.bits.addr, io.in.fire())) - val thisReqAddr = io.in.bits.addr - val lineMask = Cat(Fill(AddrBits - 6, 1.U(1.W)), 0.U(6.W)) - val neqAddr = (thisReqAddr & lineMask) =/= (lastReqAddr & lineMask) - - when (!getNewReq) { - io.out.bits <> io.in.bits - io.out.valid := io.in.valid - io.in.ready := !io.in.valid || io.out.fire() - getNewReq := io.in.fire() && io.in.bits.isBurst() && neqAddr - }.otherwise { - io.out.bits <> prefetchReq - io.out.valid := true.B - io.in.ready := false.B - getNewReq := !io.out.fire() - } - - Debug() { - printf("%d: [Prefetcher]: in(%d,%d), out(%d,%d), in.bits.addr = %x\n", - GTimer(), io.in.valid, io.in.ready, io.out.valid, io.out.ready, io.in.bits.addr) - } -} diff --git a/src/main/scala/system/SoC.scala b/src/main/scala/system/SoC.scala index 18bf0978b1be9f569b8aed1b2b306e9bb88b4c3f..74409cac74f5f9a54e4bc37d3b6309ef1c94b6ab 100644 --- a/src/main/scala/system/SoC.scala +++ b/src/main/scala/system/SoC.scala @@ -8,10 +8,15 @@ import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink.{TLBuffer, 
TLFuzzer, TLIdentityNode, TLXbar} import utils.DebugIdentityNode import xiangshan.{HasXSParameter, XSCore} - +import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} +import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp, AddressSet} +import freechips.rocketchip.tilelink.{TLBundleParameters, TLCacheCork, TLBuffer, TLClientNode, TLIdentityNode, TLXbar, TLWidthWidget, TLFilter, TLToAXI4} +import freechips.rocketchip.devices.tilelink.{TLError, DevNullParams} +import freechips.rocketchip.amba.axi4.{AXI4ToTL, AXI4IdentityNode, AXI4UserYanker, AXI4Fragmenter, AXI4IdIndexer, AXI4Deinterleaver} case class SoCParameters ( + NumCores: Integer = 1, EnableILA: Boolean = false, HasL2Cache: Boolean = false, HasPrefetch: Boolean = false @@ -19,6 +24,7 @@ case class SoCParameters trait HasSoCParameter extends HasXSParameter{ val soc = top.Parameters.get.socParameters + val NumCores = soc.NumCores val EnableILA = soc.EnableILA val HasL2cache = soc.HasL2Cache val HasPrefetch = soc.HasPrefetch @@ -38,76 +44,129 @@ class DummyCore()(implicit p: Parameters) extends LazyModule { class XSSoc()(implicit p: Parameters) extends LazyModule with HasSoCParameter { + // CPU Cores + private val xs_core = Seq.fill(NumCores)(LazyModule(new XSCore())) + + // L1 to L2 network + // ------------------------------------------------- + private val l2_xbar = Seq.fill(NumCores)(TLXbar()) + + private val l2cache = Seq.fill(NumCores)(LazyModule(new InclusiveCache( + CacheParameters( + level = 2, + ways = L2NWays, + sets = L2NSets, + blockBytes = L2BlockSize, + beatBytes = L1BusWidth / 8, // beatBytes = l1BusDataWidth / 8 + cacheName = s"L2" + ), + InclusiveCacheMicroParameters( + writeBytes = 8 + ) + ))) + + // L2 to L3 network + // ------------------------------------------------- + private val l3_xbar = TLXbar() + + private val l3_banks = (0 until L3NBanks) map (i => + LazyModule(new InclusiveCache( + CacheParameters( + level = 3, + ways = L3NWays, + 
sets = L3NSets, + blockBytes = L3BlockSize, + beatBytes = L2BusWidth / 8, + cacheName = s"L3_$i" + ), + InclusiveCacheMicroParameters( + writeBytes = 8 + ) + ))) + + // L3 to memory network + // ------------------------------------------------- + private val memory_xbar = TLXbar() + private val mmioXbar = TLXbar() - private val xsCore = LazyModule(new XSCore()) + // only mem, dma and extDev are visible externally + val mem = Seq.fill(L3NBanks)(AXI4IdentityNode()) + val dma = AXI4IdentityNode() + val extDev = AXI4IdentityNode() + + // connections + // ------------------------------------------------- + for (i <- 0 until NumCores) { + l2_xbar(i) := TLBuffer() := DebugIdentityNode() := xs_core(i).dcache.clientNode + l2_xbar(i) := TLBuffer() := DebugIdentityNode() := xs_core(i).l1pluscache.clientNode + l2_xbar(i) := TLBuffer() := DebugIdentityNode() := xs_core(i).ptw.node + mmioXbar := TLBuffer() := DebugIdentityNode() := xs_core(i).uncache.clientNode + l2cache(i).node := TLBuffer() := DebugIdentityNode() := l2_xbar(i) + l3_xbar := TLBuffer() := DebugIdentityNode() := l2cache(i).node + } - // only mem and extDev visible externally - val mem = xsCore.mem - val extDev = TLIdentityNode() + // DMA should not go to MMIO + val mmioRange = AddressSet(base = 0x0000000000L, mask = 0x007fffffffL) + // AXI4ToTL needs a TLError device to route error requests, + // add one here to make it happy. 
+ val tlErrorParams = DevNullParams( + address = Seq(mmioRange), + maxAtomic = 8, + maxTransfer = 64) + val tlError = LazyModule(new TLError(params = tlErrorParams, beatBytes = L2BusWidth / 8)) + private val tlError_xbar = TLXbar() + tlError_xbar := + AXI4ToTL() := + AXI4UserYanker(Some(1)) := + AXI4Fragmenter() := + AXI4IdIndexer(1) := + dma + tlError.node := tlError_xbar + + l3_xbar := + TLBuffer() := + DebugIdentityNode() := + tlError_xbar + + def bankFilter(bank: Int) = AddressSet( + base = bank * L3BlockSize, + mask = ~BigInt((L3NBanks -1) * L3BlockSize)) + + for(i <- 0 until L3NBanks) { + val filter = TLFilter(TLFilter.mSelectIntersect(bankFilter(i))) + l3_banks(i).node := TLBuffer() := DebugIdentityNode() := filter := l3_xbar + } + + for(i <- 0 until L3NBanks) { + mem(i) := + AXI4UserYanker() := + TLToAXI4() := + TLWidthWidget(L3BusWidth / 8) := + TLCacheCork() := + l3_banks(i).node + } - private val mmioXbar = TLXbar() private val clint = LazyModule(new TLTimer( Seq(AddressSet(0x38000000L, 0x0000ffffL)), sim = !env.FPGAPlatform )) - mmioXbar := - TLBuffer() := - DebugIdentityNode() := - xsCore.mmio - - clint.node := - mmioXbar - - extDev := - mmioXbar + clint.node := mmioXbar + extDev := AXI4UserYanker() := TLToAXI4() := mmioXbar lazy val module = new LazyModuleImp(this){ val io = IO(new Bundle{ val meip = Input(Bool()) val ila = if(env.FPGAPlatform && EnableILA) Some(Output(new ILABundle)) else None }) - val mtipSync = clint.module.io.mtip - val msipSync = clint.module.io.msip - val meipSync = RegNext(RegNext(io.meip)) - ExcitingUtils.addSource(mtipSync, "mtip") - ExcitingUtils.addSource(msipSync, "msip") - ExcitingUtils.addSource(meipSync, "meip") + for (i <- 0 until NumCores) { + xs_core(i).module.io.externalInterrupt.mtip := clint.module.io.mtip + xs_core(i).module.io.externalInterrupt.msip := clint.module.io.msip + xs_core(i).module.io.externalInterrupt.meip := RegNext(RegNext(io.meip)) + } + // do not let dma AXI signals optimized out + 
chisel3.dontTouch(dma.out.head._1) + chisel3.dontTouch(extDev.out.head._1) } } - - -//class XSSoc extends Module with HasSoCParameter { -// val io = IO(new Bundle{ -// val mem = new TLCached(l1BusParams) -// val mmio = new TLCached(l1BusParams) -// val frontend = Flipped(new AXI4) //TODO: do we need it ? -// val meip = Input(Bool()) -// val ila = if (env.FPGAPlatform && EnableILA) Some(Output(new ILABundle)) else None -// }) -// -// val xsCore = Module(new XSCore) -// -// io.frontend <> DontCare -// -// io.mem <> xsCore.io.mem -// -// val addrSpace = List( -// (0x40000000L, 0x40000000L), // external devices -// (0x38000000L, 0x00010000L) // CLINT -// ) -// val mmioXbar = Module(new NaiveTL1toN(addrSpace, xsCore.io.mem.params)) -// mmioXbar.io.in <> xsCore.io.mmio -// -// val extDev = mmioXbar.io.out(0) -// val clint = Module(new AXI4Timer(sim = !env.FPGAPlatform)) -// clint.io.in <> AXI4ToAXI4Lite(MMIOTLToAXI4(mmioXbar.io.out(1))) -// -// io.mmio <> extDev -// -// val mtipSync = clint.io.extra.get.mtip -// val meipSync = RegNext(RegNext(io.meip)) -// ExcitingUtils.addSource(mtipSync, "mtip") -// ExcitingUtils.addSource(meipSync, "meip") -//} diff --git a/src/main/scala/top/Parameters.scala b/src/main/scala/top/Parameters.scala index b567ee166e95ddaa0e7cc9a4e2eb1e8ee6586014..f3a838e756c16b4f899c70f943155f778345bf1b 100644 --- a/src/main/scala/top/Parameters.scala +++ b/src/main/scala/top/Parameters.scala @@ -20,7 +20,7 @@ case class Parameters } object Parameters { - val defaultParameters = Parameters() + val dualCoreParameters = Parameters(socParameters = SoCParameters(NumCores = 2)) val simParameters = Parameters(envParameters = EnviromentParameters(FPGAPlatform = false)) // sim only, disable log val debugParameters = Parameters(envParameters = simParameters.envParameters.copy(EnableDebug = true)) // open log diff --git a/src/main/scala/top/TopMain.scala b/src/main/scala/top/TopMain.scala deleted file mode 100644 index 
1f986d802b044dec4509a45e0702847eb75f2531..0000000000000000000000000000000000000000 --- a/src/main/scala/top/TopMain.scala +++ /dev/null @@ -1,34 +0,0 @@ -//package top -// -//import system.XSSoc -//import device.{AXI4Flash, AXI4Timer, AXI4VGA} -//import gpu._ -//import chisel3._ -//import chisel3.stage.ChiselGeneratorAnnotation -// -//class Top extends Module { -// val io = IO(new Bundle{}) -// val xsSoc = Module(new XSSoc()) -// val timer = Module(new AXI4Timer) -// val vga = Module(new AXI4VGA) -// val flash = Module(new AXI4Flash) -//// val gpu = Module(new AXI4GPU) -// -// xsSoc.io := DontCare -// timer.io := DontCare -// vga.io := DontCare -// flash.io := DontCare -//// gpu.io := DontCare -// dontTouch(xsSoc.io) -// dontTouch(timer.io) -// dontTouch(vga.io) -// dontTouch(flash.io) -//// dontTouch(gpu.io) -//} -// -//object TopMain extends App { -// (new chisel3.stage.ChiselStage).execute( -// args, -// Seq(ChiselGeneratorAnnotation(() => new Top)) -// ) -//} diff --git a/src/main/scala/top/XiangShanStage.scala b/src/main/scala/top/XiangShanStage.scala index 765bf9503836b71bd09213bf38c9e99b23ecbe9b..8c61e2b671c9dde850240b3c2a620797287b4d72 100644 --- a/src/main/scala/top/XiangShanStage.scala +++ b/src/main/scala/top/XiangShanStage.scala @@ -55,11 +55,25 @@ object DisableAllPrintAnnotation extends HasShellOptions { ) } +case class RemoveAssertAnnotation() extends NoTargetAnnotation + +object RemoveAssertAnnotation extends HasShellOptions{ + val options = Seq( + new ShellOption[Unit]( + longOption = "remove-assert", + toAnnotationSeq = _ => Seq(RemoveAssertAnnotation()), + helpText = "All the 'assert' will be removed\n", + shortOption = None + ) + ) +} + trait XiangShanCli { this: Shell => parser.note("XiangShan Options") DisablePrintfAnnotation.addOptions(parser) EnablePrintfAnnotation.addOptions(parser) DisableAllPrintAnnotation.addOptions(parser) + RemoveAssertAnnotation.addOptions(parser) } class XiangShanStage extends chisel3.stage.ChiselStage { diff --git 
a/src/main/scala/utils/BitUtils.scala b/src/main/scala/utils/BitUtils.scala index 375dc038f65f8b1f1d59f1a85b4481b6e47b17da..2a1aef12bf58a9c5fbaeef90cbe760c7bac36351 100644 --- a/src/main/scala/utils/BitUtils.scala +++ b/src/main/scala/utils/BitUtils.scala @@ -62,13 +62,37 @@ object OneHot { object LowerMask { def apply(a: UInt, len: Int) = { - (0 until len).map(i => a >> i.U).reduce(_|_) + ParallelOR((0 until len).map(i => a >> i.U)) + } + def apply(a: UInt): UInt = { + apply(a, a.getWidth) + } +} + +object HigherMask { + def apply(a: UInt, len: Int) = { + Reverse(LowerMask(Reverse(a), len)) + } + def apply(a: UInt): UInt = { + apply(a, a.getWidth) + } +} + +object LowerMaskFromLowest { + def apply(a: UInt) = { + LowerMask(PriorityEncoderOH(a)) + } +} + +object HigherMaskFromHighest { + def apply(a: UInt) = { + Reverse(LowerMask(PriorityEncoderOH(Reverse(a)))) } } object LowestBit { def apply(a: UInt, len: Int) = { - Mux(a(0), 1.U(len.W), Reverse(((0 until len).map(i => Reverse(a(len - 1, 0)) >> i.U).reduce(_|_) + 1.U) >> 1.U)) + Mux(a(0), 1.U(len.W), Reverse((ParallelOR((0 until len).map(i => Reverse(a(len - 1, 0)) >> i.U)) + 1.U) >> 1.U)) } } @@ -82,9 +106,13 @@ object GenMask { // generate w/r mask def apply(high: Int, low: Int) = { require(high > low) - VecInit(List.fill(high+1)(true.B)).asUInt >> low << low + (VecInit(List.fill(high+1)(true.B)).asUInt >> low << low).asUInt() } def apply(pos: Int) = { - 1.U << pos + (1.U << pos).asUInt() } -} \ No newline at end of file +} + +object UIntToMask { + def apply(ptr: UInt) = UIntToOH(ptr) - 1.U +} diff --git a/src/main/scala/utils/Debug.scala b/src/main/scala/utils/Debug.scala deleted file mode 100644 index 9761cdfd5922e13842a2275ca179b76dc2071568..0000000000000000000000000000000000000000 --- a/src/main/scala/utils/Debug.scala +++ /dev/null @@ -1,15 +0,0 @@ -package utils - -import chisel3._ -import chisel3.util._ - -import noop.NOOPConfig - -object Debug { - def apply(flag: Boolean = NOOPConfig().EnableDebug, cond: 
Bool = true.B)(body: => Unit): Any = - if (flag) { when (cond && GTimer() > 100.U) { body } } -} - -object ShowType { - def apply[T: Manifest](t: T) = println(manifest[T]) -} diff --git a/src/main/scala/utils/ExtractVerilogModules.scala b/src/main/scala/utils/ExtractVerilogModules.scala new file mode 100644 index 0000000000000000000000000000000000000000..7f83868c131f44ad8bba07d0ab6cfebbed9ce9a8 --- /dev/null +++ b/src/main/scala/utils/ExtractVerilogModules.scala @@ -0,0 +1,216 @@ +package utils + +/* + https://github.com/Lingrui98/scalaTage/blob/vme/src/main/scala/getVerilogModules.scala + */ + +import scala.io.Source +import java.io._ +import scala.language.postfixOps +import sys.process._ +import sys._ + +class VerilogModuleExtractor { + // name + val modulePattern = "module ([\\w]+)\\(".r.unanchored + // type name + val subMoudlePattern = "([\\w]+) ([\\w]+) \\((?: //.*)*\\Z".r.unanchored + val endMoudleIOPattern = "\\);".r.unanchored + val endMoudlePattern = "endmodule".r.unanchored + + // (submoudle type, submoudle name) + type SubMoudleRecord = Tuple2[String, String] + + // (content, submodules) + type ModuleRecord = Tuple2[List[String], List[SubMoudleRecord]] + // name + type ModuleMap = Map[String, ModuleRecord] + + def getLines(s: scala.io.BufferedSource): Iterator[String] = s.getLines() + + def makeRecord(s: Iterator[String]): ModuleMap = { + val m: ModuleMap = Map() + // called before we see the first line of a module + def processModule(firstLine: String, it: Iterator[String]): ModuleRecord = { + val content: List[String] = List(firstLine) + val submodules: List[SubMoudleRecord] = List() + def iter(cont: List[String], subm: List[SubMoudleRecord]): ModuleRecord = + it.next() match { + case l: String => l match { + case endMoudlePattern() => (l :: cont, subm) + case subMoudlePattern(ty, name) => + // println(s"submoudle $ty $name") + iter(l :: cont, (ty, name) :: subm) + case _ => iter(l :: cont, subm) + } + case _ => println("Should not reach here"); 
(cont, subm) + } + val temp = iter(content, submodules) + (temp._1.reverse, temp._2) + } + def traverse(m: ModuleMap, it: Iterator[String]): ModuleMap = + if (it.hasNext) { + it.next() match { + case l: String => + // println(f"traversing $l") + l match { + case modulePattern(name) => + // println(f"get Module of name $name") + traverse(m ++ Map(name -> processModule(l, it)), it) + case _ => + println(f"line $l is not a module definition") + traverse(m, it) + } + case _ => traverse(m, it) + } + } + else m + + traverse(m, s) + } + + def makeRecordFromFile(file: String): ModuleMap = { + val bufSrc = Source.fromFile(file) + makeRecord(bufSrc.getLines()) + } + + def writeModuleToFile(name: String, record: ModuleRecord, dir: String) = { + val path = dir+name+".v" + val writer = new PrintWriter(new File(path)) + println(f"Writing module $name%20s to $path") + record._1.foreach(r => { + writer.write(f"$r\n") + }) + writer.close() + } + + // get moudle definition of specified name + def getModule(name: String, m: ModuleMap): ModuleRecord = { + m(name) + } + + def showModuleRecord(r: ModuleRecord) = { + val (content, submodules) = r + submodules.foreach { + case (t, n) => println(f"submoudle type: $t, submodule name: $n") + } + println("\nprinting module contents...") + content.foreach(println(_)) + } + + // We first get records of all the modules and its submodule record + // Then we choose a module as the root node to traverse its submodule + def processFromModule(name: String, map: ModuleMap, outPath: String, doneSet: Set[String] = Set(), top: Tuple2[String, Boolean]): Unit = { + def printSRAMs(sub: List[SubMoudleRecord]) = { + sub map { + case (ty, subn) if (ty contains "SRAM") => println(s"top module $name, sub module type $ty, name $subn") + case _ => + } + } + val (topName, isTop) = top + if (!map.contains(name)) { + println(s"${if (isTop) "chosen top" else s"submodule of ${topName},"} module $name does not exist!") + return + } + if (isTop) println(s"\nProcessing 
top module $name") + val r = map(name) + new File(outPath).mkdirs() // ensure the path exists + writeModuleToFile(name, r, outPath) + val submodules = r._2 + // printSRAMs(submodules) + // DFS + val subTypesSet = submodules map (m => m._1) toSet + val nowMap = map - name + val nowSet = doneSet ++ subTypesSet + subTypesSet.foreach { s => if (!doneSet.contains(s)) processFromModule(s, nowMap, outPath, nowSet, (if (isTop) name else topName, false)) } + } + + def getDate: String = { + val d = java.time.LocalDate.now + d.toString.toCharArray.filterNot(_ == '-').mkString + } + + def makePath(topModule: String, outDir: String , user: String = "glr"): String = { + (if (outDir.last == '/') + outDir + else + outDir+"/") + getDate + "-" + user + "-" + topModule + "/" + } + + + + def extract(src: String, topModule: String, outDir: String, user: String, mapp: Option[ModuleMap]): Unit = { + val useMap = mapp.getOrElse(makeRecordFromFile(src)) + val path = makePath(topModule, outDir, user) + processFromModule(topModule, useMap, path, top=(topModule, true)) + } + + def extract(src: String, topModules: List[String], outDir: String, user: String): Unit = { + // avoid repeat + val mapp = makeRecordFromFile(src) + topModules.foreach(n => extract(src, n, outDir, user, Some(mapp))) + } +} + +trait VMEArgParser { + type OptionMap = Map[String, Option[Any]] + + val usage = """ + Usage: sbt "run [OPTION...]" + -s, --source the verilog file generated by chisel, all in one file + default: $NOOP_HOME/build/XSSimTop.v + -h, --help print this help info + -o, --output the place you want to store your extracted verilog + default: $NOOP_HOME/build/extracted + -u, --usr your name, will be used to name the output folder + default: current user + -m, --modules the top modules you would like to extract verilog from + should always be the last argument + default: IFU + """ + + def parse(args: List[String]) = { + def nextOption(map: OptionMap, l: List[String]): OptionMap = { + def isSwitch(s : String)= 
(s(0) == '-') + l match { + case Nil => map + case ("--help" | "-h") :: tail => { + println(usage) + sys.exit() + map + } + case ("--source" | "-s") :: file :: tail => + nextOption(map ++ Map("source" -> Some(file)), tail) + case ("--output" | "-o") :: path :: tail => + nextOption(map ++ Map("output" -> Some(path)), tail) + case ("--usr" | "-u") :: name :: tail => + nextOption(map ++ Map("usr" -> Some(name)), tail) + // this should always be the last argument, since it is length variable + case ("--modules" | "-m") :: m :: tail => + map ++ Map("modules" -> Some(m :: tail)) + case s :: tail => { + if (isSwitch(s)) println(s"unexpected argument $s") + nextOption(map, tail) + } + } + } + nextOption(Map("source" -> None, "output" -> None, "usr" -> None, "modules" -> None), args) + } + + def wrapParams(args: Array[String]): (String, List[String], String, String) = { + val argL = args.toList + val paramMap = parse(argL) + (paramMap("source").map(_.asInstanceOf[String]).getOrElse(env("NOOP_HOME")+"/build/XSSimTop.v"), + paramMap("modules").map(_.asInstanceOf[List[String]]).getOrElse(List("IFU")), + paramMap("output").map(_.asInstanceOf[String]).getOrElse(env("NOOP_HOME")+"/build/extracted/"), + paramMap("usr").map(_.asInstanceOf[String]).getOrElse("whoami".!!.init)) + } +} + +object ExtractVerilogModules extends VMEArgParser { + def main(args: Array[String]): Unit = { + val vme = new VerilogModuleExtractor() + val (sourceFile, topModules, outTopDir, usr) = wrapParams(args) + vme.extract(sourceFile, topModules, outTopDir, usr) + } +} diff --git a/src/main/scala/utils/Lock.scala b/src/main/scala/utils/Lock.scala deleted file mode 100644 index ee6d6b1a359da1249c599b46d26fe117e3abe558..0000000000000000000000000000000000000000 --- a/src/main/scala/utils/Lock.scala +++ /dev/null @@ -1,43 +0,0 @@ -package utils - -import chisel3._ -import chisel3.util._ - -class LockBundle extends Bundle { - val lock = Input(Bool()) - val unlock = Input(Bool()) - val holding = Output(Bool()) -} 
- -class Lock(n: Int) extends Module { - val io = IO(new Bundle { - val bundle = Vec(n, new LockBundle) - }) - - val lock = RegInit(0.U(n.W)) - val lockReq = VecInit(io.bundle.map(_.lock)).asUInt - val unlockReq = VecInit(io.bundle.map(_.unlock)).asUInt - - val lockEmpty = lock === 0.U - val hasLockReq = lockReq =/= 0.U - val lockNext = 1.U << PriorityEncoder(lockReq) - when (lockEmpty && hasLockReq) { lock := lockNext } - - val hasUnlockReq = unlockReq =/= 0.U - assert(PopCount(unlockReq) <= 1.U, "only the lock holder can issue unlock request") - assert(!(lockEmpty && hasUnlockReq), "only the lock holder can issue unlock request") - assert((lock & lockReq) === 0.U, "can not issue lock request when holding the lock") - when (!lockEmpty && hasUnlockReq) { - assert(unlockReq === lock, "only the lock holder can issue unlock request") - lock := 0.U - } - - val holding = Mux(lockEmpty && hasLockReq, lockNext, lock) - io.bundle.map(_.holding).zip(holding.asBools).map{ case (l, r) => l := r } - assert(PopCount(io.bundle.map(_.holding)) <= 1.U, "there should be only one lock holder") - - Debug() { - when (lockEmpty && hasLockReq) { printf("%d: %d acquire lock\n", GTimer(), PriorityEncoder(lockNext)) } - when (!lockEmpty && hasUnlockReq) { printf("%d: %d release lock\n", GTimer(), PriorityEncoder(lock)) } - } -} diff --git a/src/main/scala/utils/LogUtils.scala b/src/main/scala/utils/LogUtils.scala index 4ef9be369bacb26d46e0bc3a256e1dba78cd604c..d5a9fb242f4b437a6694fc321cfb5d5d08a518a2 100644 --- a/src/main/scala/utils/LogUtils.scala +++ b/src/main/scala/utils/LogUtils.scala @@ -25,11 +25,11 @@ object XSLog { { val logEnable = WireInit(false.B) val logTimestamp = WireInit(0.U(64.W)) - ExcitingUtils.addSink(logEnable, "DISPLAY_LOG_ENABLE") - ExcitingUtils.addSink(logTimestamp, "logTimestamp") val enableDebug = Parameters.get.envParameters.EnableDebug && debugLevel != XSLogLevel.PERF val enablePerf = Parameters.get.envParameters.EnablePerfDebug && debugLevel == XSLogLevel.PERF 
if (enableDebug || enablePerf) { + ExcitingUtils.addSink(logEnable, "DISPLAY_LOG_ENABLE") + ExcitingUtils.addSink(logTimestamp, "logTimestamp") when (cond && logEnable) { val commonInfo = p"[$debugLevel][time=$logTimestamp] $MagicStr: " printf((if (prefix) commonInfo else p"") + pable) @@ -42,9 +42,9 @@ object XSLog { def displayLog: Bool = { val logEnable = WireInit(false.B) - ExcitingUtils.addSink(logEnable, "DISPLAY_LOG_ENABLE") val ret = WireInit(false.B) if(Parameters.get.envParameters.EnableDebug) { + ExcitingUtils.addSink(logEnable, "DISPLAY_LOG_ENABLE") ret := logEnable } ret diff --git a/src/main/scala/utils/ParallelMux.scala b/src/main/scala/utils/ParallelMux.scala index f4b9bf3af340e4eac9e91db6ae9c0d1056c0cff9..6aa6d953f40ad621c5d8378dc1280776cb22705a 100644 --- a/src/main/scala/utils/ParallelMux.scala +++ b/src/main/scala/utils/ParallelMux.scala @@ -5,6 +5,7 @@ import chisel3.util._ object ParallelOperation { def apply[T <: Data](xs: Seq[T], func: (T, T) => T): T = { + require(xs.nonEmpty) xs match { case Seq(a) => a case Seq(a, b) => func(a, b) @@ -37,4 +38,16 @@ object ParallelLookUp { def apply[T<:Data](key: UInt, mapping:Seq[(UInt,T)]): T = { ParallelMux(mapping.map(m => (m._1===key) -> m._2)) } +} + +object ParallelMax { + def apply[T <: Data](xs: Seq[T]): T = { + ParallelOperation(xs, (a: T, b:T) => Mux(a.asUInt() > b.asUInt(),a, b).asTypeOf(xs.head)) + } +} + +object ParallelMin { + def apply[T <: Data](xs: Seq[T]): T = { + ParallelOperation(xs, (a: T, b:T) => Mux(a.asUInt() < b.asUInt(),a, b).asTypeOf(xs.head)) + } } \ No newline at end of file diff --git a/src/main/scala/utils/Replacement.scala b/src/main/scala/utils/Replacement.scala index fddf62ba73d272caf212d44f6ca59096728a51eb..0a592a0052ec935918dc1911071099d8053dbb6f 100644 --- a/src/main/scala/utils/Replacement.scala +++ b/src/main/scala/utils/Replacement.scala @@ -76,6 +76,7 @@ class TrueLRU(n_ways: Int) { nextState.zipWithIndex.tail.foldLeft((nextState.head.apply(n_ways-1,1),0)) { case 
((pe,pi),(ce,ci)) => (Cat(ce.apply(n_ways-1,ci+1), pe), ci) }._1 } + def get_next_state(state: UInt, touch_ways: Seq[Valid[UInt]]): UInt = { touch_ways.foldLeft(state)((prev, touch_way) => Mux(touch_way.valid, get_next_state(prev, touch_way.bits), prev)) } @@ -162,3 +163,58 @@ class SeqPLRU(n_sets: Int, n_ways: Int) extends SeqReplacementPolicy { def way = plru_way } + +class SbufferLRU(n_ways: Int) { + + def nBits = n_ways * n_ways + private val state_reg = RegInit(0.U(nBits.W)) + def state_read = WireDefault(state_reg) + + + + // set the row touched with 1, column with 0 + def get_next_state(state: UInt, touch_ways: Seq[Valid[UInt]]): UInt = { + val nextState = Wire(Vec(n_ways, UInt(n_ways.W))) + val moreRecentVec = state.asTypeOf(Vec(n_ways, UInt(n_ways.W))) + val wayDecs = touch_ways.map( w => Mux(w.valid, UIntToOH(w.bits, n_ways), 0.U) ) + val wayDec = ParallelOR(wayDecs) + val wayUpd = (~wayDec).asUInt() + + nextState.zipWithIndex.foreach { case (e, i) => + e := Mux(wayDec(i), wayUpd, moreRecentVec(i) & wayUpd ) + } + nextState.asUInt() + } + + // update the stateRect + def access(touch_ways: Seq[Valid[UInt]]) { + when (ParallelOR(touch_ways.map(_.valid))) { + state_reg := get_next_state(state_reg, touch_ways) + } + } + + // get the index of the smallest value from a set of numbers + def get_min_value(xs: Seq[(UInt,UInt)]): (UInt,UInt)= { + xs match { + case Seq(a) => a + case Seq(a, b) => (Mux(a._1 + get_min_value(Seq(get_min_value(xs take xs.size/2), get_min_value(xs drop xs.size/2))) + } + } + + // get the way which is valid and has the least 1 + def get_replace_way(state: UInt, sbufferState:Seq[Bool]): UInt = { + val moreRecentVec = state.asTypeOf(Vec(n_ways, UInt(n_ways.W))) + val count = Wire(Vec(n_ways, UInt(log2Up(n_ways).W))) + for(i <- 0 until n_ways){ + count(i) := Mux(sbufferState(i), PopCount(moreRecentVec(i)), ((1< (1 << cb) - 1}.reduce(_+_) + def minVal = -(8 * (1 << TageCtrBits) + SCTableInfo.map{case (_,cb,_) => 1 << cb}.reduce(_+_)) + def 
sumCtrBits = max(log2Ceil(-minVal), log2Ceil(maxVal+1)) + 1 + val tageTaken = if (useSC) Bool() else UInt(0.W) + val scUsed = if (useSC) Bool() else UInt(0.W) + val scPred = if (useSC) Bool() else UInt(0.W) + // Suppose ctrbits of all tables are identical + val ctrs = if (useSC) Vec(SCNTables, SInt(SCCtrBits.W)) else Vec(SCNTables, SInt(0.W)) + val sumAbs = if (useSC) UInt(sumCtrBits.W) else UInt(0.W) +} + class TageMeta extends XSBundle with HasTageParameter { val provider = ValidUndirectioned(UInt(log2Ceil(TageNTables).W)) val altDiffers = Bool() val providerU = UInt(2.W) val providerCtr = UInt(3.W) val allocate = ValidUndirectioned(UInt(log2Ceil(TageNTables).W)) + val taken = Bool() + val scMeta = new SCMeta(EnableSC) } -class BranchPrediction extends XSBundle { - val redirect = Bool() - val taken = Bool() - val jmpIdx = UInt(log2Up(PredictWidth).W) - val hasNotTakenBrs = Bool() - val target = UInt(VAddrBits.W) - val saveHalfRVI = Bool() - val takenOnBr = Bool() +class BranchPrediction extends XSBundle with HasIFUConst { + // val redirect = Bool() + val takens = UInt(PredictWidth.W) + // val jmpIdx = UInt(log2Up(PredictWidth).W) + val brMask = UInt(PredictWidth.W) + val jalMask = UInt(PredictWidth.W) + val targets = Vec(PredictWidth, UInt(VAddrBits.W)) + + // marks the last 2 bytes of this fetch packet + // val endsAtTheEndOfFirstBank = Bool() + // val endsAtTheEndOfLastBank = Bool() + + // half RVI could only start at the end of a bank + val firstBankHasHalfRVI = Bool() + val lastBankHasHalfRVI = Bool() + + def lastHalfRVIMask = Mux(firstBankHasHalfRVI, UIntToOH((bankWidth-1).U), + Mux(lastBankHasHalfRVI, UIntToOH((PredictWidth-1).U), + 0.U(PredictWidth.W) + ) + ) + + def lastHalfRVIClearMask = ~lastHalfRVIMask + // is taken from half RVI + def lastHalfRVITaken = (takens & lastHalfRVIMask).orR + + def lastHalfRVIIdx = Mux(firstBankHasHalfRVI, (bankWidth-1).U, (PredictWidth-1).U) + // should not be used if not lastHalfRVITaken + def lastHalfRVITarget = 
Mux(firstBankHasHalfRVI, targets(bankWidth-1), targets(PredictWidth-1)) + + def realTakens = takens & lastHalfRVIClearMask + def realBrMask = brMask & lastHalfRVIClearMask + def realJalMask = jalMask & lastHalfRVIClearMask + + def brNotTakens = ~realTakens & realBrMask + def sawNotTakenBr = VecInit((0 until PredictWidth).map(i => + (if (i == 0) false.B else brNotTakens(i-1,0).orR))) + def hasNotTakenBrs = (brNotTakens & LowerMaskFromLowest(realTakens)).orR + def unmaskedJmpIdx = PriorityEncoder(takens) + def saveHalfRVI = (firstBankHasHalfRVI && (unmaskedJmpIdx === (bankWidth-1).U || !(takens.orR))) || + (lastBankHasHalfRVI && unmaskedJmpIdx === (PredictWidth-1).U) + // could get PredictWidth-1 when only the first bank is valid + def jmpIdx = PriorityEncoder(realTakens) + // only used when taken + def target = targets(jmpIdx) + def taken = realTakens.orR + def takenOnBr = taken && realBrMask(jmpIdx) } class BranchInfo extends XSBundle with HasBPUParameter { @@ -85,9 +142,10 @@ class BranchInfo extends XSBundle with HasBPUParameter { def fromUInt(x: UInt) = x.asTypeOf(this) } -class Predecode extends XSBundle { - val isFetchpcEqualFirstpc = Bool() +class Predecode extends XSBundle with HasIFUConst { + val hasLastHalfRVI = Bool() val mask = UInt((FetchWidth*2).W) + val lastHalf = UInt(nBanksInPacket.W) val pd = Vec(FetchWidth*2, (new PreDecodeInfo)) } @@ -127,8 +185,8 @@ class CtrlSignals extends XSBundle { val rfWen = Bool() val fpWen = Bool() val isXSTrap = Bool() - val noSpecExec = Bool() // This inst can not be speculated - val isBlocked = Bool() // This inst requires pipeline to be blocked + val noSpecExec = Bool() // wait forward + val blockBackward = Bool() // block backward val flushPipe = Bool() // This inst will flush all the pipe when commit, like exception but can commit val isRVF = Bool() val imm = UInt(XLEN.W) @@ -153,15 +211,8 @@ class PerfDebugInfo extends XSBundle { // Load / Store Index // -// When using unified lsroq, lsIdx serves as lsroqIdx, // 
while separated lq and sq is used, lsIdx consists of lqIdx, sqIdx and l/s type. -// All lsroqIdx will be replaced by new lsIdx in the future. trait HasLSIdx { this: HasXSParameter => - - // if(EnableUnifiedLSQ){ - // Unified LSQ - val lsroqIdx = UInt(LsroqIdxWidth.W) - // } else { // Separate LSQ val lqIdx = new LqPtr val sqIdx = new SqPtr @@ -208,29 +259,43 @@ class DebugBundle extends XSBundle{ class ExuInput extends XSBundle { val uop = new MicroOp - val src1, src2, src3 = UInt(XLEN.W) + val src1, src2, src3 = UInt((XLEN+1).W) } class ExuOutput extends XSBundle { val uop = new MicroOp - val data = UInt(XLEN.W) + val data = UInt((XLEN+1).W) + val fflags = new Fflags val redirectValid = Bool() val redirect = new Redirect val brUpdate = new BranchUpdateInfo val debug = new DebugBundle } -class ExuIO extends XSBundle { - val in = Flipped(DecoupledIO(new ExuInput)) - val redirect = Flipped(ValidIO(new Redirect)) - val out = DecoupledIO(new ExuOutput) - // for csr +class ExternalInterruptIO extends XSBundle { + val mtip = Input(Bool()) + val msip = Input(Bool()) + val meip = Input(Bool()) +} + +class CSRSpecialIO extends XSBundle { val exception = Flipped(ValidIO(new MicroOp)) - // for Lsu - val dmem = new SimpleBusUC - val mcommit = Input(UInt(3.W)) + val isInterrupt = Input(Bool()) + val memExceptionVAddr = Input(UInt(VAddrBits.W)) + val trapTarget = Output(UInt(VAddrBits.W)) + val externalInterrupt = new ExternalInterruptIO + val interrupt = Output(Bool()) } +//class ExuIO extends XSBundle { +// val in = Flipped(DecoupledIO(new ExuInput)) +// val redirect = Flipped(ValidIO(new Redirect)) +// val out = DecoupledIO(new ExuOutput) +// // for csr +// val csrOnly = new CSRSpecialIO +// val mcommit = Input(UInt(3.W)) +//} + class RoqCommit extends XSBundle { val uop = new MicroOp val isWalk = Bool() @@ -245,7 +310,7 @@ class FrontendToBackendIO extends XSBundle { // to backend end val cfVec = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) // from backend - val redirect = 
Flipped(ValidIO(new Redirect)) + val redirect = Flipped(ValidIO(UInt(VAddrBits.W))) val outOfOrderBrInfo = Flipped(ValidIO(new BranchUpdateInfo)) val inOrderBrInfo = Flipped(ValidIO(new BranchUpdateInfo)) } diff --git a/src/main/scala/xiangshan/XSCore.scala b/src/main/scala/xiangshan/XSCore.scala index af13662772e2347106289ee3ce5666ca5c910f1c..d1effcf0609d9ce2573dbc9b1b18fb55c19d178d 100644 --- a/src/main/scala/xiangshan/XSCore.scala +++ b/src/main/scala/xiangshan/XSCore.scala @@ -2,18 +2,21 @@ package xiangshan import chisel3._ import chisel3.util._ -import noop.{Cache, CacheConfig, HasExceptionNO, TLB, TLBConfig} import top.Parameters import xiangshan.backend._ import xiangshan.backend.dispatch.DispatchParameters import xiangshan.backend.exu.ExuParameters +import xiangshan.backend.exu.Exu._ import xiangshan.frontend._ import xiangshan.mem._ -import xiangshan.cache.{ICache, DCache, DCacheParameters, ICacheParameters, PTW, Uncache} +import xiangshan.backend.fu.HasExceptionNO +import xiangshan.cache.{ICache, DCache, L1plusCache, DCacheParameters, ICacheParameters, L1plusCacheParameters, PTW, Uncache} import chipsalliance.rocketchip.config -import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} -import freechips.rocketchip.tilelink.{TLBundleParameters, TLCacheCork, TLBuffer, TLClientNode, TLIdentityNode, TLXbar} +import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp, AddressSet} +import freechips.rocketchip.tilelink.{TLBundleParameters, TLCacheCork, TLBuffer, TLClientNode, TLIdentityNode, TLXbar, TLWidthWidget, TLFilter, TLToAXI4} +import freechips.rocketchip.devices.tilelink.{TLError, DevNullParams} import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} +import freechips.rocketchip.amba.axi4.{AXI4ToTL, AXI4IdentityNode, AXI4UserYanker, AXI4Fragmenter, AXI4IdIndexer, AXI4Deinterleaver} import utils._ case class XSCoreParameters @@ -28,13 +31,14 @@ case class XSCoreParameters AddrBits: Int = 64, 
VAddrBits: Int = 39, PAddrBits: Int = 40, - HasFPU: Boolean = false, + HasFPU: Boolean = true, FectchWidth: Int = 8, EnableBPU: Boolean = true, EnableBPD: Boolean = true, EnableRAS: Boolean = true, EnableLB: Boolean = false, EnableLoop: Boolean = false, + EnableSC: Boolean = false, HistoryLength: Int = 64, BtbSize: Int = 2048, JbtacSize: Int = 1024, @@ -47,37 +51,32 @@ case class XSCoreParameters DecodeWidth: Int = 6, RenameWidth: Int = 6, CommitWidth: Int = 6, - BrqSize: Int = 48, - IssQueSize: Int = 16, + BrqSize: Int = 32, + IssQueSize: Int = 12, NRPhyRegs: Int = 160, - NRIntReadPorts: Int = 8, + NRIntReadPorts: Int = 14, NRIntWritePorts: Int = 8, NRFpReadPorts: Int = 14, - NRFpWritePorts: Int = 8, - EnableUnifiedLSQ: Boolean = false, - LsroqSize: Int = 64, + NRFpWritePorts: Int = 8, LoadQueueSize: Int = 64, StoreQueueSize: Int = 48, RoqSize: Int = 192, dpParams: DispatchParameters = DispatchParameters( DqEnqWidth = 4, - IntDqSize = 96, - FpDqSize = 96, - LsDqSize = 64, + IntDqSize = 128, + FpDqSize = 128, + LsDqSize = 96, IntDqDeqWidth = 4, FpDqDeqWidth = 4, - LsDqDeqWidth = 4, - IntDqReplayWidth = 4, - FpDqReplayWidth = 4, - LsDqReplayWidth = 4 + LsDqDeqWidth = 4 ), exuParameters: ExuParameters = ExuParameters( JmpCnt = 1, AluCnt = 4, MulCnt = 0, MduCnt = 2, - FmacCnt = 0, - FmiscCnt = 0, + FmacCnt = 4, + FmiscCnt = 2, FmiscDivSqrtCnt = 0, LduCnt = 2, StuCnt = 2 @@ -89,7 +88,8 @@ case class XSCoreParameters TlbEntrySize: Int = 32, TlbL2EntrySize: Int = 256, // or 512 PtwL1EntrySize: Int = 16, - PtwL2EntrySize: Int = 256 + PtwL2EntrySize: Int = 256, + NumPerfCounters: Int = 16 ) trait HasXSParameter { @@ -118,6 +118,7 @@ trait HasXSParameter { val EnableRAS = core.EnableRAS val EnableLB = core.EnableLB val EnableLoop = core.EnableLoop + val EnableSC = core.EnableSC val HistoryLength = core.HistoryLength val BtbSize = core.BtbSize // val BtbWays = 4 @@ -141,14 +142,9 @@ trait HasXSParameter { val NRPhyRegs = core.NRPhyRegs val PhyRegIdxWidth = log2Up(NRPhyRegs) 
val RoqSize = core.RoqSize - val EnableUnifiedLSQ = core.EnableUnifiedLSQ - val LsroqSize = core.LsroqSize // 64 - val InnerLsroqIdxWidth = log2Up(LsroqSize) - val LsroqIdxWidth = InnerLsroqIdxWidth + 1 val LoadQueueSize = core.LoadQueueSize val StoreQueueSize = core.StoreQueueSize val dpParams = core.dpParams - val ReplayWidth = dpParams.IntDqReplayWidth + dpParams.FpDqReplayWidth + dpParams.LsDqReplayWidth val exuParameters = core.exuParameters val NRIntReadPorts = core.NRIntReadPorts val NRIntWritePorts = core.NRIntWritePorts @@ -164,13 +160,18 @@ trait HasXSParameter { val TlbL2EntrySize = core.TlbL2EntrySize val PtwL1EntrySize = core.PtwL1EntrySize val PtwL2EntrySize = core.PtwL2EntrySize - - val l1BusDataWidth = 256 + val NumPerfCounters = core.NumPerfCounters val icacheParameters = ICacheParameters( + nMissEntries = 2 + ) + + val l1plusCacheParameters = L1plusCacheParameters( + tagECC = Some("secded"), + dataECC = Some("secded"), + nMissEntries = 8 ) - val LRSCCycles = 100 val dcacheParameters = DCacheParameters( tagECC = Some("secded"), dataECC = Some("secded"), @@ -178,20 +179,48 @@ trait HasXSParameter { nLoadMissEntries = 8, nStoreMissEntries = 8 ) + + val LRSCCycles = 100 + + + // cache hierarchy configurations + val l1BusDataWidth = 256 + + // L2 configurations + val L1BusWidth = 256 + val L2Size = 512 * 1024 // 512KB + val L2BlockSize = 64 + val L2NWays = 8 + val L2NSets = L2Size / L2BlockSize / L2NWays + + // L3 configurations + val L2BusWidth = 256 + val L3Size = 4 * 1024 * 1024 // 4MB + val L3BlockSize = 64 + val L3NBanks = 4 + val L3NWays = 8 + val L3NSets = L3Size / L3BlockSize / L3NBanks / L3NWays + + // on chip network configurations + val L3BusWidth = 256 } trait HasXSLog { this: RawModule => implicit val moduleName: String = this.name } -abstract class XSModule extends Module +abstract class XSModule extends MultiIOModule with HasXSParameter with HasExceptionNO with HasXSLog +{ + def io: Record +} //remove this trait after impl module logic 
-trait NeedImpl { this: Module => +trait NeedImpl { this: RawModule => override protected def IO[T <: Data](iodef: T): T = { + println(s"[Warn]: (${this.name}) please reomve 'NeedImpl' after implement this module") val io = chisel3.experimental.IO(iodef) io <> DontCare io @@ -212,8 +241,8 @@ object AddressSpace extends HasXSParameter { // (start, size) // address out of MMIO will be considered as DRAM def mmio = List( - (0x30000000L, 0x10000000L), // internal devices, such as CLINT and PLIC - (0x40000000L, 0x40000000L) // external devices + (0x00000000L, 0x40000000L), // internal devices, such as CLINT and PLIC + (0x40000000L, 0x40000000L) // external devices ) def isMMIO(addr: UInt): Bool = mmio.map(range => { @@ -225,70 +254,158 @@ object AddressSpace extends HasXSParameter { -class XSCore()(implicit p: config.Parameters) extends LazyModule { +class XSCore()(implicit p: config.Parameters) extends LazyModule with HasXSParameter { + // outer facing nodes val dcache = LazyModule(new DCache()) val uncache = LazyModule(new Uncache()) - val icache = LazyModule(new ICache()) + val l1pluscache = LazyModule(new L1plusCache()) val ptw = LazyModule(new PTW()) - val mem = TLIdentityNode() - val mmio = uncache.clientNode - - // TODO: refactor these params - private val l2 = LazyModule(new InclusiveCache( - CacheParameters( - level = 2, - ways = 4, - sets = 512 * 1024 / (64 * 4), - blockBytes = 64, - beatBytes = 32 // beatBytes = l1BusDataWidth / 8 - ), - InclusiveCacheMicroParameters( - writeBytes = 8 - ) - )) - - private val xbar = TLXbar() - - xbar := TLBuffer() := DebugIdentityNode() := dcache.clientNode - xbar := TLBuffer() := DebugIdentityNode() := icache.clientNode - xbar := TLBuffer() := DebugIdentityNode() := ptw.node - - l2.node := xbar - - mem := TLBuffer() := TLCacheCork() := TLBuffer() := l2.node - lazy val module = new XSCoreImp(this) } -class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) with HasXSParameter { - - val front = Module(new Frontend) - val 
backend = Module(new Backend) - val mem = Module(new Memend) +class XSCoreImp(outer: XSCore) extends LazyModuleImp(outer) + with HasXSParameter + with HasExeBlockHelper +{ + val io = IO(new Bundle { + val externalInterrupt = new ExternalInterruptIO + }) + + println(s"FPGAPlatform:${env.FPGAPlatform} EnableDebug:${env.EnableDebug}") + + // to fast wake up fp, mem rs + val intBlockFastWakeUpFp = intExuConfigs.filter(fpFastFilter) + val intBlockSlowWakeUpFp = intExuConfigs.filter(fpSlowFilter) + val intBlockFastWakeUpInt = intExuConfigs.filter(intFastFilter) + val intBlockSlowWakeUpInt = intExuConfigs.filter(intSlowFilter) + + val fpBlockFastWakeUpFp = fpExuConfigs.filter(fpFastFilter) + val fpBlockSlowWakeUpFp = fpExuConfigs.filter(fpSlowFilter) + val fpBlockFastWakeUpInt = fpExuConfigs.filter(intFastFilter) + val fpBlockSlowWakeUpInt = fpExuConfigs.filter(intSlowFilter) + + val frontend = Module(new Frontend) + val ctrlBlock = Module(new CtrlBlock) + val integerBlock = Module(new IntegerBlock( + fastWakeUpIn = fpBlockFastWakeUpInt, + slowWakeUpIn = fpBlockSlowWakeUpInt ++ loadExuConfigs, + fastFpOut = intBlockFastWakeUpFp, + slowFpOut = intBlockSlowWakeUpFp, + fastIntOut = intBlockFastWakeUpInt, + slowIntOut = intBlockSlowWakeUpInt + )) + val floatBlock = Module(new FloatBlock( + fastWakeUpIn = intBlockFastWakeUpFp, + slowWakeUpIn = intBlockSlowWakeUpFp ++ loadExuConfigs, + fastFpOut = fpBlockFastWakeUpFp, + slowFpOut = fpBlockSlowWakeUpFp, + fastIntOut = fpBlockFastWakeUpInt, + slowIntOut = fpBlockSlowWakeUpInt + )) + val memBlock = Module(new MemBlock( + fastWakeUpIn = intBlockFastWakeUpInt ++ intBlockFastWakeUpFp ++ fpBlockFastWakeUpInt ++ fpBlockFastWakeUpFp, + slowWakeUpIn = intBlockSlowWakeUpInt ++ intBlockSlowWakeUpFp ++ fpBlockSlowWakeUpInt ++ fpBlockSlowWakeUpFp, + fastFpOut = Seq(), + slowFpOut = loadExuConfigs, + fastIntOut = Seq(), + slowIntOut = loadExuConfigs + )) val dcache = outer.dcache.module val uncache = outer.uncache.module - val icache = 
outer.icache.module + val l1pluscache = outer.l1pluscache.module val ptw = outer.ptw.module - - // TODO: connect this - - front.io.backend <> backend.io.frontend - front.io.icacheResp <> icache.io.resp - front.io.icacheToTlb <> icache.io.tlb - icache.io.req <> front.io.icacheReq - icache.io.flush <> front.io.icacheFlush - mem.io.backend <> backend.io.mem - - ptw.io.tlb(0) <> mem.io.ptw - ptw.io.tlb(1) <> front.io.ptw - - dcache.io.lsu.load <> mem.io.loadUnitToDcacheVec - dcache.io.lsu.lsroq <> mem.io.loadMiss - dcache.io.lsu.atomics <> mem.io.atomics - dcache.io.lsu.store <> mem.io.sbufferToDcache - uncache.io.lsroq <> mem.io.uncache + val icache = Module(new ICache) + + frontend.io.backend <> ctrlBlock.io.frontend + frontend.io.icacheResp <> icache.io.resp + frontend.io.icacheToTlb <> icache.io.tlb + icache.io.req <> frontend.io.icacheReq + icache.io.flush <> frontend.io.icacheFlush + frontend.io.sfence <> integerBlock.io.fenceio.sfence + frontend.io.tlbCsr <> integerBlock.io.csrio.tlb + + icache.io.mem_acquire <> l1pluscache.io.req + l1pluscache.io.resp <> icache.io.mem_grant + l1pluscache.io.flush := icache.io.l1plusflush + icache.io.fencei := integerBlock.io.fenceio.fencei + + ctrlBlock.io.fromIntBlock <> integerBlock.io.toCtrlBlock + ctrlBlock.io.fromFpBlock <> floatBlock.io.toCtrlBlock + ctrlBlock.io.fromLsBlock <> memBlock.io.toCtrlBlock + ctrlBlock.io.toIntBlock <> integerBlock.io.fromCtrlBlock + ctrlBlock.io.toFpBlock <> floatBlock.io.fromCtrlBlock + ctrlBlock.io.toLsBlock <> memBlock.io.fromCtrlBlock + + integerBlock.io.wakeUpIn.fastUops <> floatBlock.io.wakeUpIntOut.fastUops + integerBlock.io.wakeUpIn.fast <> floatBlock.io.wakeUpIntOut.fast + integerBlock.io.wakeUpIn.slow <> floatBlock.io.wakeUpIntOut.slow ++ memBlock.io.wakeUpIntOut.slow + + floatBlock.io.wakeUpIn.fastUops <> integerBlock.io.wakeUpFpOut.fastUops + floatBlock.io.wakeUpIn.fast <> integerBlock.io.wakeUpFpOut.fast + floatBlock.io.wakeUpIn.slow <> integerBlock.io.wakeUpFpOut.slow ++ 
memBlock.io.wakeUpFpOut.slow + + + integerBlock.io.wakeUpIntOut.fast.map(_.ready := true.B) + integerBlock.io.wakeUpIntOut.slow.map(_.ready := true.B) + floatBlock.io.wakeUpFpOut.fast.map(_.ready := true.B) + floatBlock.io.wakeUpFpOut.slow.map(_.ready := true.B) + + val wakeUpMem = Seq( + integerBlock.io.wakeUpIntOut, + integerBlock.io.wakeUpFpOut, + floatBlock.io.wakeUpIntOut, + floatBlock.io.wakeUpFpOut + ) + memBlock.io.wakeUpIn.fastUops <> wakeUpMem.flatMap(_.fastUops) + memBlock.io.wakeUpIn.fast <> wakeUpMem.flatMap(w => w.fast.map(f => { + val raw = WireInit(f) + raw + })) + memBlock.io.wakeUpIn.slow <> wakeUpMem.flatMap(w => w.slow.map(s => { + val raw = WireInit(s) + raw + })) + + integerBlock.io.csrio.fflags <> ctrlBlock.io.roqio.toCSR.fflags + integerBlock.io.csrio.dirty_fs <> ctrlBlock.io.roqio.toCSR.dirty_fs + integerBlock.io.csrio.exception <> ctrlBlock.io.roqio.exception + integerBlock.io.csrio.isInterrupt <> ctrlBlock.io.roqio.isInterrupt + integerBlock.io.csrio.trapTarget <> ctrlBlock.io.roqio.toCSR.trapTarget + integerBlock.io.csrio.interrupt <> ctrlBlock.io.roqio.toCSR.intrBitSet + integerBlock.io.csrio.memExceptionVAddr <> memBlock.io.lsqio.exceptionAddr.vaddr + integerBlock.io.csrio.externalInterrupt <> io.externalInterrupt + integerBlock.io.csrio.tlb <> memBlock.io.tlbCsr + integerBlock.io.fenceio.sfence <> memBlock.io.sfence + integerBlock.io.fenceio.sbuffer <> memBlock.io.fenceToSbuffer + + floatBlock.io.frm <> integerBlock.io.csrio.frm + + memBlock.io.lsqio.commits <> ctrlBlock.io.roqio.commits + memBlock.io.lsqio.roqDeqPtr <> ctrlBlock.io.roqio.roqDeqPtr + memBlock.io.lsqio.exceptionAddr.lsIdx.lqIdx := ctrlBlock.io.roqio.exception.bits.lqIdx + memBlock.io.lsqio.exceptionAddr.lsIdx.sqIdx := ctrlBlock.io.roqio.exception.bits.sqIdx + memBlock.io.lsqio.exceptionAddr.isStore := CommitType.lsInstIsStore(ctrlBlock.io.roqio.exception.bits.ctrl.commitType) + + ptw.io.tlb(0) <> memBlock.io.ptw + ptw.io.tlb(1) <> frontend.io.ptw + ptw.io.sfence <> 
integerBlock.io.fenceio.sfence + ptw.io.csr <> integerBlock.io.csrio.tlb + + dcache.io.lsu.load <> memBlock.io.dcache.loadUnitToDcacheVec + dcache.io.lsu.lsq <> memBlock.io.dcache.loadMiss + dcache.io.lsu.atomics <> memBlock.io.dcache.atomics + dcache.io.lsu.store <> memBlock.io.dcache.sbufferToDcache + uncache.io.lsq <> memBlock.io.dcache.uncache + + if (!env.FPGAPlatform) { + val debugIntReg, debugFpReg = WireInit(VecInit(Seq.fill(32)(0.U(XLEN.W)))) + ExcitingUtils.addSink(debugIntReg, "DEBUG_INT_ARCH_REG", ExcitingUtils.Debug) + ExcitingUtils.addSink(debugFpReg, "DEBUG_FP_ARCH_REG", ExcitingUtils.Debug) + val debugArchReg = WireInit(VecInit(debugIntReg ++ debugFpReg)) + ExcitingUtils.addSource(debugArchReg, "difftestRegs", ExcitingUtils.Debug) + } } diff --git a/src/main/scala/xiangshan/backend/Backend.scala b/src/main/scala/xiangshan/backend/Backend.scala deleted file mode 100644 index 20c3c9b7a568b3d9cc010d0601d455b263f64ce3..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/Backend.scala +++ /dev/null @@ -1,302 +0,0 @@ -package xiangshan.backend - -import chisel3._ -import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import xiangshan._ -import xiangshan.backend.decode.{DecodeBuffer, DecodeStage} -import xiangshan.backend.rename.Rename -import xiangshan.backend.brq.Brq -import xiangshan.backend.dispatch.Dispatch -import xiangshan.backend.exu._ -import xiangshan.backend.fu.FunctionUnit -import xiangshan.backend.issue.{IssueQueue, ReservationStation} -import xiangshan.backend.regfile.{Regfile, RfWritePort} -import xiangshan.backend.roq.Roq -import xiangshan.mem._ -import utils._ - -/** Backend Pipeline: - * Decode -> Rename -> Dispatch-1 -> Dispatch-2 -> Issue -> Exe - */ -class Backend extends XSModule - with NeedImpl { - val io = IO(new Bundle { - val frontend = Flipped(new FrontendToBackendIO) - val mem = Flipped(new MemToBackendIO) - }) - val timer = GTimer() - - val aluExeUnits 
=Array.tabulate(exuParameters.AluCnt)(_ => Module(new AluExeUnit)) - val jmpExeUnit = Module(new JmpExeUnit) - val mulExeUnits = Array.tabulate(exuParameters.MulCnt)(_ => Module(new MulExeUnit)) - val mduExeUnits = Array.tabulate(exuParameters.MduCnt)(_ => Module(new MulDivExeUnit)) - // val fmacExeUnits = Array.tabulate(exuParameters.FmacCnt)(_ => Module(new Fmac)) - // val fmiscExeUnits = Array.tabulate(exuParameters.FmiscCnt)(_ => Module(new Fmisc)) - // val fmiscDivSqrtExeUnits = Array.tabulate(exuParameters.FmiscDivSqrtCnt)(_ => Module(new FmiscDivSqrt)) - val exeUnits = jmpExeUnit +: (aluExeUnits ++ mulExeUnits ++ mduExeUnits) - exeUnits.foreach(exe => { - exe.io.exception := DontCare - exe.io.dmem := DontCare - exe.io.mcommit := DontCare - }) - - val decode = Module(new DecodeStage) - val brq = Module(new Brq) - val decBuf = Module(new DecodeBuffer) - val rename = Module(new Rename) - val dispatch = Module(new Dispatch) - val roq = Module(new Roq) - val intRf = Module(new Regfile( - numReadPorts = NRIntReadPorts, - numWirtePorts = NRIntWritePorts, - hasZero = true - )) - val fpRf = Module(new Regfile( - numReadPorts = NRFpReadPorts, - numWirtePorts = NRFpWritePorts, - hasZero = false - )) - val memRf = Module(new Regfile( - numReadPorts = 2*exuParameters.StuCnt + exuParameters.LduCnt, - numWirtePorts = NRIntWritePorts, - hasZero = true, - isMemRf = true - )) - - // backend redirect, flush pipeline - val redirect = Mux( - roq.io.redirect.valid, - roq.io.redirect, - Mux( - brq.io.redirect.valid, - brq.io.redirect, - io.mem.replayAll - ) - ) - - io.frontend.redirect := redirect - io.frontend.redirect.valid := redirect.valid && !redirect.bits.isReplay - - val memConfigs = - Seq.fill(exuParameters.LduCnt)(Exu.ldExeUnitCfg) ++ - Seq.fill(exuParameters.StuCnt)(Exu.stExeUnitCfg) - - val exuConfigs = exeUnits.map(_.config) ++ memConfigs - - val exeWbReqs = exeUnits.map(_.io.out) ++ io.mem.ldout ++ io.mem.stout - - def needWakeup(cfg: ExuConfig): Boolean = - 
(cfg.readIntRf && cfg.writeIntRf) || (cfg.readFpRf && cfg.writeFpRf) - - def needData(a: ExuConfig, b: ExuConfig): Boolean = - (a.readIntRf && b.writeIntRf) || (a.readFpRf && b.writeFpRf) - - val reservedStations = exeUnits. - zipWithIndex. - map({ case (exu, i) => - - val cfg = exu.config - - val wakeUpDateVec = exuConfigs.zip(exeWbReqs).filter(x => needData(cfg, x._1)).map(_._2) - val bypassCnt = exuConfigs.count(c => c.enableBypass && needData(cfg, c)) - - println(s"exu:${cfg.name} wakeupCnt:${wakeUpDateVec.length} bypassCnt:$bypassCnt") - - val rs = Module(new ReservationStation( - cfg, wakeUpDateVec.length, bypassCnt, cfg.enableBypass, false - )) - rs.io.redirect <> redirect - rs.io.numExist <> dispatch.io.numExist(i) - rs.io.enqCtrl <> dispatch.io.enqIQCtrl(i) - rs.io.enqData <> dispatch.io.enqIQData(i) - for( - (wakeUpPort, exuOut) <- - rs.io.wakeUpPorts.zip(wakeUpDateVec) - ){ - wakeUpPort.bits := exuOut.bits - wakeUpPort.valid := exuOut.valid - } - - exu.io.in <> rs.io.deq - exu.io.in.bits.uop.debugInfo.issueTime := timer - exu.io.redirect <> redirect - rs - }) - - for( rs <- reservedStations){ - rs.io.bypassUops <> reservedStations. - filter(x => x.enableBypass && needData(rs.exuCfg, x.exuCfg)). - map(_.io.selectedUop) - - val bypassDataVec = exuConfigs.zip(exeWbReqs). - filter(x => x._1.enableBypass && needData(rs.exuCfg, x._1)).map(_._2) - - for(i <- bypassDataVec.indices){ - rs.io.bypassData(i).valid := bypassDataVec(i).valid - rs.io.bypassData(i).bits := bypassDataVec(i).bits - } - } - - val issueQueues = exuConfigs. - zipWithIndex. - takeRight(exuParameters.LduCnt + exuParameters.StuCnt). - map({case (cfg, i) => - val wakeUpDateVec = exuConfigs.zip(exeWbReqs).filter(x => needData(cfg, x._1)).map(_._2) - val bypassUopVec = reservedStations. - filter(r => r.exuCfg.enableBypass && needData(cfg, r.exuCfg)).map(_.io.selectedUop) - val bypassDataVec = exuConfigs.zip(exeWbReqs). 
- filter(x => x._1.enableBypass && needData(cfg, x._1)).map(_._2) - - val iq = Module(new IssueQueue( - cfg, wakeUpDateVec.length, bypassUopVec.length - )) - println(s"exu:${cfg.name} wakeupCnt:${wakeUpDateVec.length} bypassCnt:${bypassUopVec.length}") - iq.io.redirect <> redirect - iq.io.tlbFeedback := io.mem.tlbFeedback(i - exuParameters.ExuCnt + exuParameters.LduCnt + exuParameters.StuCnt) - iq.io.enq <> dispatch.io.enqIQCtrl(i) - dispatch.io.numExist(i) := iq.io.numExist - for( - (wakeUpPort, exuOut) <- - iq.io.wakeUpPorts.zip(wakeUpDateVec) - ){ - wakeUpPort.bits := exuOut.bits - wakeUpPort.valid := exuOut.fire() // data after arbit - } - iq.io.bypassUops <> bypassUopVec - for(i <- bypassDataVec.indices){ - iq.io.bypassData(i).valid := bypassDataVec(i).valid - iq.io.bypassData(i).bits := bypassDataVec(i).bits - } - iq - }) - - io.mem.commits <> roq.io.commits - io.mem.roqDeqPtr := roq.io.roqDeqPtr - io.mem.ldin <> issueQueues.filter(_.exuCfg == Exu.ldExeUnitCfg).map(_.io.deq) - io.mem.ldin.map(_.bits.uop.debugInfo.issueTime := timer) - io.mem.stin <> issueQueues.filter(_.exuCfg == Exu.stExeUnitCfg).map(_.io.deq) - io.mem.stin.map(_.bits.uop.debugInfo.issueTime := timer) - jmpExeUnit.io.exception.valid := roq.io.redirect.valid && roq.io.redirect.bits.isException - jmpExeUnit.io.exception.bits := roq.io.exception - - io.frontend.outOfOrderBrInfo <> brq.io.outOfOrderBrInfo - io.frontend.inOrderBrInfo <> brq.io.inOrderBrInfo - - decode.io.in <> io.frontend.cfVec - brq.io.roqRedirect <> roq.io.redirect - brq.io.memRedirect <> io.mem.replayAll - brq.io.bcommit := roq.io.bcommit - brq.io.enqReqs <> decode.io.toBrq - for ((x, y) <- brq.io.exuRedirect.zip(exeUnits.filter(_.config.hasRedirect))) { - x.bits := y.io.out.bits - x.valid := y.io.out.fire() && y.io.out.bits.redirectValid - } - decode.io.brTags <> brq.io.brTags - decBuf.io.isWalking := Cat(roq.io.commits.map(c => c.valid && c.bits.isWalk)).orR // TODO: opt this - decBuf.io.redirect <> redirect - decBuf.io.in 
<> decode.io.out - - rename.io.redirect <> redirect - rename.io.roqCommits <> roq.io.commits - rename.io.in <> decBuf.io.out - rename.io.intRfReadAddr <> dispatch.io.readIntRf.map(_.addr) ++ dispatch.io.intMemRegAddr - rename.io.intPregRdy <> dispatch.io.intPregRdy ++ dispatch.io.intMemRegRdy - rename.io.fpRfReadAddr <> dispatch.io.readFpRf.map(_.addr) ++ dispatch.io.fpMemRegAddr - rename.io.fpPregRdy <> dispatch.io.fpPregRdy ++ dispatch.io.fpMemRegRdy - rename.io.replayPregReq <> dispatch.io.replayPregReq - dispatch.io.redirect <> redirect - dispatch.io.fromRename <> rename.io.out - dispatch.io.fromRename.foreach(_.bits.debugInfo.renameTime := timer) - - roq.io.memRedirect <> io.mem.replayAll - roq.io.brqRedirect <> brq.io.redirect - roq.io.dp1Req <> dispatch.io.toRoq - roq.io.dp1Req.foreach(_.bits.debugInfo.dispatchTime := timer) - dispatch.io.roqIdxs <> roq.io.roqIdxs - io.mem.dp1Req <> dispatch.io.toLsroq - dispatch.io.lsIdxs <> io.mem.lsIdxs - dispatch.io.dequeueRoqIndex.valid := roq.io.commitRoqIndex.valid || io.mem.oldestStore.valid - // store writeback must be after commit roqIdx - dispatch.io.dequeueRoqIndex.bits := Mux(io.mem.oldestStore.valid, io.mem.oldestStore.bits, roq.io.commitRoqIndex.bits) - - - intRf.io.readPorts <> dispatch.io.readIntRf - fpRf.io.readPorts <> dispatch.io.readFpRf ++ issueQueues.flatMap(_.io.readFpRf) - memRf.io.readPorts <> issueQueues.flatMap(_.io.readIntRf) - - io.mem.redirect <> redirect - - val wbu = Module(new Wbu(exuConfigs)) - wbu.io.in <> exeWbReqs - - val wbIntResults = wbu.io.toIntRf - val wbFpResults = wbu.io.toFpRf - - def exuOutToRfWrite(x: Valid[ExuOutput]): RfWritePort = { - val rfWrite = Wire(new RfWritePort) - rfWrite.wen := x.valid - rfWrite.addr := x.bits.uop.pdest - rfWrite.data := x.bits.data - rfWrite - } - val intRfWrite = wbIntResults.map(exuOutToRfWrite) - intRf.io.writePorts <> intRfWrite - memRf.io.writePorts <> intRfWrite - fpRf.io.writePorts <> wbFpResults.map(exuOutToRfWrite) - - 
rename.io.wbIntResults <> wbIntResults - rename.io.wbFpResults <> wbFpResults - - roq.io.exeWbResults.take(exeWbReqs.length).zip(wbu.io.toRoq).foreach(x => x._1 := x._2) - roq.io.exeWbResults.last := brq.io.out - roq.io.exeWbResults.foreach(_.bits.uop.debugInfo.writebackTime := timer) - - val commitTime = timer - val renameToCommit = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk, timer - c.bits.uop.debugInfo.renameTime, 0.U)).reduce(_ + _) - val dispatchToCommit = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk, timer - c.bits.uop.debugInfo.dispatchTime, 0.U)).reduce(_ + _) - val issueToCommit = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk, timer - c.bits.uop.debugInfo.issueTime, 0.U)).reduce(_ + _) - val writebackToCommit = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk, timer - c.bits.uop.debugInfo.writebackTime, 0.U)).reduce(_ + _) - val loadIssueToCommit = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk && c.bits.uop.ctrl.commitType === CommitType.LOAD, timer - c.bits.uop.debugInfo.issueTime, 0.U)).reduce(_ + _) - val loadIssueToWriteback = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk && c.bits.uop.ctrl.commitType === CommitType.LOAD, c.bits.uop.debugInfo.writebackTime - c.bits.uop.debugInfo.issueTime, 0.U)).reduce(_ + _) - val storeIssueToCommit = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk && c.bits.uop.ctrl.commitType === CommitType.STORE, timer - c.bits.uop.debugInfo.issueTime, 0.U)).reduce(_ + _) - val storeIssueToWriteback = roq.io.commits.map(c => Mux(c.valid && !c.bits.isWalk && c.bits.uop.ctrl.commitType === CommitType.STORE, c.bits.uop.debugInfo.writebackTime - c.bits.uop.debugInfo.issueTime, 0.U)).reduce(_ + _) - - XSPerf("renameToCommit", renameToCommit) - XSPerf("dispatchToCommit", dispatchToCommit) - XSPerf("issueToCommit", issueToCommit) - XSPerf("writebackToCommit", writebackToCommit) - XSPerf("loadIssueToCommit", loadIssueToCommit) - XSPerf("loadIssueToWriteback", loadIssueToWriteback) - 
XSPerf("storeIssueToCommit", storeIssueToCommit) - XSPerf("storeIssueToWriteback", storeIssueToWriteback) - - // TODO: Remove sink and source - val tmp = WireInit(0.U) - val sinks = Array[String]( - "DTLBFINISH", - "DTLBPF", - "DTLBENABLE", - "perfCntCondMdcacheLoss", - "perfCntCondMl2cacheLoss", - "perfCntCondMdcacheHit", - "lsuMMIO", - "perfCntCondMl2cacheHit", - "perfCntCondMl2cacheReq", - "mtip", - "perfCntCondMdcacheReq", - "meip" - ) - for (s <- sinks) { - BoringUtils.addSink(tmp, s) - } - - val debugIntReg, debugFpReg = WireInit(VecInit(Seq.fill(32)(0.U(XLEN.W)))) - BoringUtils.addSink(debugIntReg, "DEBUG_INT_ARCH_REG") - BoringUtils.addSink(debugFpReg, "DEBUG_FP_ARCH_REG") - val debugArchReg = WireInit(VecInit(debugIntReg ++ debugFpReg)) - if (!env.FPGAPlatform) { - BoringUtils.addSource(debugArchReg, "difftestRegs") - } - -} diff --git a/src/main/scala/xiangshan/backend/CtrlBlock.scala b/src/main/scala/xiangshan/backend/CtrlBlock.scala new file mode 100644 index 0000000000000000000000000000000000000000..bd1fb0a595351e3aa155b362f5ed682c976e0944 --- /dev/null +++ b/src/main/scala/xiangshan/backend/CtrlBlock.scala @@ -0,0 +1,171 @@ +package xiangshan.backend + +import chisel3._ +import chisel3.util._ +import utils._ +import xiangshan._ +import xiangshan.backend.decode.{DecodeBuffer, DecodeStage} +import xiangshan.backend.rename.{Rename, BusyTable} +import xiangshan.backend.brq.Brq +import xiangshan.backend.dispatch.Dispatch +import xiangshan.backend.exu._ +import xiangshan.backend.exu.Exu.exuConfigs +import xiangshan.backend.regfile.RfReadPort +import xiangshan.backend.roq.{Roq, RoqPtr, RoqCSRIO} + +class CtrlToIntBlockIO extends XSBundle { + val enqIqCtrl = Vec(exuParameters.IntExuCnt, DecoupledIO(new MicroOp)) + val enqIqData = Vec(exuParameters.IntExuCnt, Output(new ExuInput)) + val readRf = Vec(NRIntReadPorts, Flipped(new RfReadPort)) + val redirect = ValidIO(new Redirect) +} + +class CtrlToFpBlockIO extends XSBundle { + val enqIqCtrl = 
Vec(exuParameters.FpExuCnt, DecoupledIO(new MicroOp)) + val enqIqData = Vec(exuParameters.FpExuCnt, Output(new ExuInput)) + val readRf = Vec(NRFpReadPorts, Flipped(new RfReadPort)) + val redirect = ValidIO(new Redirect) +} + +class CtrlToLsBlockIO extends XSBundle { + val enqIqCtrl = Vec(exuParameters.LsExuCnt, DecoupledIO(new MicroOp)) + val enqIqData = Vec(exuParameters.LsExuCnt, Output(new ExuInput)) + val enqLsq = new Bundle() { + val canAccept = Input(Bool()) + val req = Vec(RenameWidth, ValidIO(new MicroOp)) + val resp = Vec(RenameWidth, Input(new LSIdx)) + } + val redirect = ValidIO(new Redirect) +} + +class CtrlBlock extends XSModule with HasCircularQueuePtrHelper { + val io = IO(new Bundle { + val frontend = Flipped(new FrontendToBackendIO) + val fromIntBlock = Flipped(new IntBlockToCtrlIO) + val fromFpBlock = Flipped(new FpBlockToCtrlIO) + val fromLsBlock = Flipped(new LsBlockToCtrlIO) + val toIntBlock = new CtrlToIntBlockIO + val toFpBlock = new CtrlToFpBlockIO + val toLsBlock = new CtrlToLsBlockIO + val roqio = new Bundle { + // to int block + val toCSR = new RoqCSRIO + val exception = ValidIO(new MicroOp) + val isInterrupt = Output(Bool()) + // to mem block + val commits = Vec(CommitWidth, ValidIO(new RoqCommit)) + val roqDeqPtr = Output(new RoqPtr) + } + }) + + val decode = Module(new DecodeStage) + val brq = Module(new Brq) + val decBuf = Module(new DecodeBuffer) + val rename = Module(new Rename) + val dispatch = Module(new Dispatch) + val intBusyTable = Module(new BusyTable(NRIntReadPorts, NRIntWritePorts)) + val fpBusyTable = Module(new BusyTable(NRFpReadPorts, NRFpWritePorts)) + + val roqWbSize = NRIntWritePorts + NRFpWritePorts + exuParameters.StuCnt + 1 + + val roq = Module(new Roq(roqWbSize)) + + // When replay and mis-prediction have the same roqIdx, + // mis-prediction should have higher priority, since mis-prediction flushes the load instruction. + // Thus, only when mis-prediction roqIdx is after replay roqIdx, replay should be valid. 
+ val brqIsAfterLsq = isAfter(brq.io.redirect.bits.roqIdx, io.fromLsBlock.replay.bits.roqIdx) + val redirectArb = Mux(io.fromLsBlock.replay.valid && (!brq.io.redirect.valid || brqIsAfterLsq), + io.fromLsBlock.replay.bits, brq.io.redirect.bits) + val redirectValid = roq.io.redirect.valid || brq.io.redirect.valid || io.fromLsBlock.replay.valid + val redirect = Mux(roq.io.redirect.valid, roq.io.redirect.bits, redirectArb) + + io.frontend.redirect.valid := redirectValid + io.frontend.redirect.bits := Mux(roq.io.redirect.valid, roq.io.redirect.bits.target, redirectArb.target) + io.frontend.outOfOrderBrInfo <> brq.io.outOfOrderBrInfo + io.frontend.inOrderBrInfo <> brq.io.inOrderBrInfo + + decode.io.in <> io.frontend.cfVec + decode.io.toBrq <> brq.io.enqReqs + decode.io.brTags <> brq.io.brTags + decode.io.out <> decBuf.io.in + + brq.io.roqRedirect <> roq.io.redirect + brq.io.memRedirect.valid := brq.io.redirect.valid || io.fromLsBlock.replay.valid + brq.io.memRedirect.bits <> redirectArb + brq.io.bcommit <> roq.io.bcommit + brq.io.enqReqs <> decode.io.toBrq + brq.io.exuRedirect <> io.fromIntBlock.exuRedirect + + decBuf.io.isWalking := roq.io.commits(0).valid && roq.io.commits(0).bits.isWalk + decBuf.io.redirect.valid <> redirectValid + decBuf.io.redirect.bits <> redirect + decBuf.io.out <> rename.io.in + + rename.io.redirect.valid <> redirectValid + rename.io.redirect.bits <> redirect + rename.io.roqCommits <> roq.io.commits + rename.io.out <> dispatch.io.fromRename + rename.io.renameBypass <> dispatch.io.renameBypass + + dispatch.io.redirect.valid <> redirectValid + dispatch.io.redirect.bits <> redirect + dispatch.io.enqRoq <> roq.io.enq + dispatch.io.enqLsq <> io.toLsBlock.enqLsq + dispatch.io.readIntRf <> io.toIntBlock.readRf + dispatch.io.readFpRf <> io.toFpBlock.readRf + dispatch.io.allocPregs.zipWithIndex.foreach { case (preg, i) => + intBusyTable.io.allocPregs(i).valid := preg.isInt + fpBusyTable.io.allocPregs(i).valid := preg.isFp + 
intBusyTable.io.allocPregs(i).bits := preg.preg + fpBusyTable.io.allocPregs(i).bits := preg.preg + } + dispatch.io.numExist <> io.fromIntBlock.numExist ++ io.fromFpBlock.numExist ++ io.fromLsBlock.numExist + dispatch.io.enqIQCtrl <> io.toIntBlock.enqIqCtrl ++ io.toFpBlock.enqIqCtrl ++ io.toLsBlock.enqIqCtrl + dispatch.io.enqIQData <> io.toIntBlock.enqIqData ++ io.toFpBlock.enqIqData ++ io.toLsBlock.enqIqData + + + val flush = redirectValid && (redirect.isException || redirect.isFlushPipe) + fpBusyTable.io.flush := flush + intBusyTable.io.flush := flush + for((wb, setPhyRegRdy) <- io.fromIntBlock.wbRegs.zip(intBusyTable.io.wbPregs)){ + setPhyRegRdy.valid := wb.valid && wb.bits.uop.ctrl.rfWen && (wb.bits.uop.ctrl.ldest =/= 0.U) + setPhyRegRdy.bits := wb.bits.uop.pdest + } + for((wb, setPhyRegRdy) <- io.fromFpBlock.wbRegs.zip(fpBusyTable.io.wbPregs)){ + setPhyRegRdy.valid := wb.valid && wb.bits.uop.ctrl.fpWen + setPhyRegRdy.bits := wb.bits.uop.pdest + } + intBusyTable.io.rfReadAddr <> dispatch.io.readIntRf.map(_.addr) + intBusyTable.io.pregRdy <> dispatch.io.intPregRdy + fpBusyTable.io.rfReadAddr <> dispatch.io.readFpRf.map(_.addr) + fpBusyTable.io.pregRdy <> dispatch.io.fpPregRdy + + roq.io.memRedirect := DontCare + roq.io.memRedirect.valid := false.B + roq.io.brqRedirect.valid := brq.io.redirect.valid || io.fromLsBlock.replay.valid + roq.io.brqRedirect.bits <> redirectArb + roq.io.exeWbResults.take(roqWbSize-1).zip( + io.fromIntBlock.wbRegs ++ io.fromFpBlock.wbRegs ++ io.fromLsBlock.stOut + ).foreach{ + case(x, y) => + x.bits := y.bits + x.valid := y.valid && !y.bits.redirectValid + } + roq.io.exeWbResults.last := brq.io.out + + io.toIntBlock.redirect.valid := redirectValid + io.toIntBlock.redirect.bits := redirect + io.toFpBlock.redirect.valid := redirectValid + io.toFpBlock.redirect.bits := redirect + io.toLsBlock.redirect.valid := redirectValid + io.toLsBlock.redirect.bits := redirect + + // roq to int block + io.roqio.toCSR <> roq.io.csr + 
io.roqio.exception.valid := roq.io.redirect.valid && roq.io.redirect.bits.isException + io.roqio.exception.bits := roq.io.exception + io.roqio.isInterrupt := roq.io.redirect.bits.isFlushPipe + // roq to mem block + io.roqio.roqDeqPtr := roq.io.roqDeqPtr + io.roqio.commits := roq.io.commits +} diff --git a/src/main/scala/xiangshan/backend/FloatBlock.scala b/src/main/scala/xiangshan/backend/FloatBlock.scala new file mode 100644 index 0000000000000000000000000000000000000000..18fe704157376d3211ba557ae807ffa809211d62 --- /dev/null +++ b/src/main/scala/xiangshan/backend/FloatBlock.scala @@ -0,0 +1,162 @@ +package xiangshan.backend + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import xiangshan.backend.regfile.Regfile +import xiangshan.backend.exu._ +import xiangshan.backend.issue.{ReservationStationCtrl, ReservationStationData} + + +class FpBlockToCtrlIO extends XSBundle { + val wbRegs = Vec(NRFpWritePorts, ValidIO(new ExuOutput)) + val numExist = Vec(exuParameters.FpExuCnt, Output(UInt(log2Ceil(IssQueSize).W))) +} + +class FloatBlock +( + fastWakeUpIn: Seq[ExuConfig], + slowWakeUpIn: Seq[ExuConfig], + fastFpOut: Seq[ExuConfig], + slowFpOut: Seq[ExuConfig], + fastIntOut: Seq[ExuConfig], + slowIntOut: Seq[ExuConfig] +) extends XSModule with HasExeBlockHelper { + val io = IO(new Bundle { + val fromCtrlBlock = Flipped(new CtrlToFpBlockIO) + val toCtrlBlock = new FpBlockToCtrlIO + + val wakeUpIn = new WakeUpBundle(fastWakeUpIn.size, slowWakeUpIn.size) + val wakeUpFpOut = Flipped(new WakeUpBundle(fastFpOut.size, slowFpOut.size)) + val wakeUpIntOut = Flipped(new WakeUpBundle(fastIntOut.size, slowIntOut.size)) + + // from csr + val frm = Input(UInt(3.W)) + }) + + val redirect = io.fromCtrlBlock.redirect + + val fpRf = Module(new Regfile( + numReadPorts = NRFpReadPorts, + numWirtePorts = NRFpWritePorts, + hasZero = false, + len = XLEN + 1 + )) + + val fmacExeUnits = Array.tabulate(exuParameters.FmacCnt)(_ => Module(new FmacExeUnit)) + val fmiscExeUnits = 
Array.tabulate(exuParameters.FmiscCnt)(_ => Module(new FmiscExeUnit)) + + fmacExeUnits.foreach(_.frm := io.frm) + fmiscExeUnits.foreach(_.frm := io.frm) + + val exeUnits = fmacExeUnits ++ fmiscExeUnits + + def needWakeup(cfg: ExuConfig): Boolean = + (cfg.readIntRf && cfg.writeIntRf) || (cfg.readFpRf && cfg.writeFpRf) + + def needData(a: ExuConfig, b: ExuConfig): Boolean = + (a.readIntRf && b.writeIntRf) || (a.readFpRf && b.writeFpRf) + + val reservedStations = exeUnits.map(_.config).zipWithIndex.map({ case (cfg, i) => + var certainLatency = -1 + if (cfg.hasCertainLatency) { + certainLatency = cfg.latency.latencyVal.get + } + + val readFpRf = cfg.readFpRf + + val inBlockWbData = exeUnits.filter(e => e.config.hasCertainLatency && readFpRf).map(_.io.toFp.bits.data) + val writeBackData = inBlockWbData ++ io.wakeUpIn.fast.map(_.bits.data) + val wakeupCnt = writeBackData.length + + val inBlockListenPorts = exeUnits.filter(e => e.config.hasUncertainlatency && readFpRf).map(_.io.toFp) + val extraListenPorts = inBlockListenPorts ++ io.wakeUpIn.slow + val extraListenPortsCnt = extraListenPorts.length + + println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} " + + s"extraListenPorts: ${extraListenPortsCnt} " + + s"delay:${certainLatency}" + ) + + val rsCtrl = Module(new ReservationStationCtrl(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = false)) + val rsData = Module(new ReservationStationData(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = false)) + + rsCtrl.io.data <> rsData.io.ctrl + rsCtrl.io.redirect <> redirect // TODO: remove it + rsCtrl.io.numExist <> io.toCtrlBlock.numExist(i) + rsCtrl.io.enqCtrl <> io.fromCtrlBlock.enqIqCtrl(i) + rsData.io.enqData <> io.fromCtrlBlock.enqIqData(i) + rsData.io.redirect <> redirect + + rsData.io.writeBackedData <> writeBackData + for ((x, y) <- rsData.io.extraListenPorts.zip(extraListenPorts)) { + x.valid := y.fire() + x.bits := y.bits + } + + exeUnits(i).io.redirect <> 
redirect + exeUnits(i).io.fromFp <> rsData.io.deq + rsData.io.feedback := DontCare + + rsCtrl.suggestName(s"rsc_${cfg.name}") + rsData.suggestName(s"rsd_${cfg.name}") + + rsData + }) + + for(rs <- reservedStations){ + val inBlockUops = reservedStations.filter(x => + x.exuCfg.hasCertainLatency && x.exuCfg.writeFpRf + ).map(x => { + val raw = WireInit(x.io.selectedUop) + raw.valid := x.io.selectedUop.valid && raw.bits.ctrl.fpWen + raw + }) + rs.io.broadcastedUops <> inBlockUops ++ io.wakeUpIn.fastUops + } + + io.wakeUpFpOut.fastUops <> reservedStations.filter( + rs => fpFastFilter(rs.exuCfg) + ).map(_.io.selectedUop).map(fpValid) + + io.wakeUpFpOut.fast <> exeUnits.filter( + x => fpFastFilter(x.config) + ).map(_.io.toFp) + + io.wakeUpFpOut.slow <> exeUnits.filter( + x => fpSlowFilter(x.config) + ).map(_.io.toFp) + + io.wakeUpIntOut.fastUops <> reservedStations.filter( + rs => intFastFilter(rs.exuCfg) + ).map(_.io.selectedUop).map(intValid) + + io.wakeUpIntOut.fast <> exeUnits.filter( + x => intFastFilter(x.config) + ).map(_.io.toInt) + + io.wakeUpIntOut.slow <> exeUnits.filter( + x => intSlowFilter(x.config) + ).map(_.io.toInt) + + + // read fp rf from ctrl block + fpRf.io.readPorts <> io.fromCtrlBlock.readRf + // write fp rf arbiter + val fpWbArbiter = Module(new Wb( + (exeUnits.map(_.config) ++ fastWakeUpIn ++ slowWakeUpIn).map(_.wbFpPriority), + NRFpWritePorts + )) + fpWbArbiter.io.in <> exeUnits.map(_.io.toFp) ++ io.wakeUpIn.fast ++ io.wakeUpIn.slow + + // set busytable and update roq + io.toCtrlBlock.wbRegs <> fpWbArbiter.io.out + + fpRf.io.writePorts.zip(fpWbArbiter.io.out).foreach{ + case (rf, wb) => + rf.wen := wb.valid && wb.bits.uop.ctrl.fpWen + rf.addr := wb.bits.uop.pdest + rf.data := wb.bits.data + } + +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/IntegerBlock.scala b/src/main/scala/xiangshan/backend/IntegerBlock.scala new file mode 100644 index 
0000000000000000000000000000000000000000..7bc754683469792e2da0545c64a0c6ab19515696 --- /dev/null +++ b/src/main/scala/xiangshan/backend/IntegerBlock.scala @@ -0,0 +1,228 @@ +package xiangshan.backend + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import xiangshan.backend.exu.Exu.{jumpExeUnitCfg, ldExeUnitCfg, stExeUnitCfg} +import xiangshan.backend.exu.{AluExeUnit, ExuConfig, JumpExeUnit, MulDivExeUnit, Wb} +import xiangshan.backend.fu.FenceToSbuffer +import xiangshan.backend.issue.{ReservationStationCtrl, ReservationStationData} +import xiangshan.backend.regfile.Regfile +import xiangshan.backend.fu.fpu.Fflags + +class WakeUpBundle(numFast: Int, numSlow: Int) extends XSBundle { + val fastUops = Vec(numFast, Flipped(ValidIO(new MicroOp))) + val fast = Vec(numFast, Flipped(DecoupledIO(new ExuOutput))) //one cycle later than fastUops + val slow = Vec(numSlow, Flipped(DecoupledIO(new ExuOutput))) + + override def cloneType = (new WakeUpBundle(numFast, numSlow)).asInstanceOf[this.type] + +} + +class IntBlockToCtrlIO extends XSBundle { + // write back regfile signals after arbiter + // used to update busytable and roq state + val wbRegs = Vec(NRIntWritePorts, ValidIO(new ExuOutput)) + // write back to brq + val exuRedirect = Vec(exuParameters.AluCnt+exuParameters.JmpCnt, ValidIO(new ExuOutput)) + val numExist = Vec(exuParameters.IntExuCnt, Output(UInt(log2Ceil(IssQueSize).W))) +} + +trait HasExeBlockHelper { + def fpFastFilter(cfg: ExuConfig): Boolean = { + cfg.hasCertainLatency && cfg.writeFpRf + } + def fpSlowFilter(cfg: ExuConfig): Boolean = { + cfg.hasUncertainlatency && cfg.writeFpRf + } + def intFastFilter(cfg: ExuConfig): Boolean = { + cfg.hasCertainLatency && cfg.writeIntRf + } + def intSlowFilter(cfg: ExuConfig): Boolean = { + cfg.hasUncertainlatency && cfg.writeIntRf + } + def fpValid(x: ValidIO[MicroOp]): ValidIO[MicroOp] = { + val uop = WireInit(x) + uop.valid := x.valid && x.bits.ctrl.fpWen + uop + } + def intValid(x: ValidIO[MicroOp]): 
ValidIO[MicroOp] = { + val uop = WireInit(x) + uop.valid := x.valid && x.bits.ctrl.rfWen + uop + } +} + +class IntegerBlock +( + fastWakeUpIn: Seq[ExuConfig], + slowWakeUpIn: Seq[ExuConfig], + fastFpOut: Seq[ExuConfig], + slowFpOut: Seq[ExuConfig], + fastIntOut: Seq[ExuConfig], + slowIntOut: Seq[ExuConfig] +) extends XSModule with HasExeBlockHelper +{ + val io = IO(new Bundle { + val fromCtrlBlock = Flipped(new CtrlToIntBlockIO) + val toCtrlBlock = new IntBlockToCtrlIO + + val wakeUpIn = new WakeUpBundle(fastWakeUpIn.size, slowWakeUpIn.size) + val wakeUpFpOut = Flipped(new WakeUpBundle(fastFpOut.size, slowFpOut.size)) + val wakeUpIntOut = Flipped(new WakeUpBundle(fastIntOut.size, slowIntOut.size)) + + val csrio = new Bundle { + val fflags = Input(new Fflags) // from roq + val dirty_fs = Input(Bool()) // from roq + val frm = Output(UInt(3.W)) // to float + val exception = Flipped(ValidIO(new MicroOp)) // from roq + val isInterrupt = Input(Bool()) // from roq + val trapTarget = Output(UInt(VAddrBits.W)) // to roq + val interrupt = Output(Bool()) // to roq + val memExceptionVAddr = Input(UInt(VAddrBits.W)) // from lsq + val externalInterrupt = new ExternalInterruptIO // from outside + val tlb = Output(new TlbCsrBundle) // from tlb + } + val fenceio = new Bundle { + val sfence = Output(new SfenceBundle) // to front,mem + val fencei = Output(Bool()) // to icache + val sbuffer = new FenceToSbuffer // to mem + } + }) + + val redirect = io.fromCtrlBlock.redirect + + val intRf = Module(new Regfile( + numReadPorts = NRIntReadPorts, + numWirtePorts = NRIntWritePorts, + hasZero = true, + len = XLEN + )) + + val aluExeUnits = Array.tabulate(exuParameters.AluCnt)(_ => Module(new AluExeUnit)) + val jmpExeUnit = Module(new JumpExeUnit) + val mduExeUnits = Array.tabulate(exuParameters.MduCnt)(_ => Module(new MulDivExeUnit)) + + val exeUnits = jmpExeUnit +: (aluExeUnits ++ mduExeUnits) + + def needWakeup(cfg: ExuConfig): Boolean = + (cfg.readIntRf && cfg.writeIntRf) || (cfg.readFpRf 
&& cfg.writeFpRf) + + def needData(a: ExuConfig, b: ExuConfig): Boolean = + (a.readIntRf && b.writeIntRf) || (a.readFpRf && b.writeFpRf) + + val reservationStations = exeUnits.map(_.config).zipWithIndex.map({ case (cfg, i) => + var certainLatency = -1 + if (cfg.hasCertainLatency) { + certainLatency = cfg.latency.latencyVal.get + } + + val readIntRf = cfg.readIntRf + + val inBlockWbData = exeUnits.filter(e => e.config.hasCertainLatency && readIntRf).map(_.io.toInt.bits.data) + val writeBackData = inBlockWbData ++ io.wakeUpIn.fast.map(_.bits.data) + val wakeupCnt = writeBackData.length + + val inBlockListenPorts = exeUnits.filter(e => e.config.hasUncertainlatency && readIntRf).map(_.io.toInt) + val extraListenPorts = inBlockListenPorts ++ io.wakeUpIn.slow + val extraListenPortsCnt = extraListenPorts.length + + val feedback = (cfg == ldExeUnitCfg) || (cfg == stExeUnitCfg) + + println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} extraListenPorts: ${extraListenPortsCnt} delay:${certainLatency} feedback:${feedback}") + + // val rs = Module(new ReservationStationNew( + // cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback + // )) + val rsCtrl = Module(new ReservationStationCtrl(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback)) + val rsData = Module(new ReservationStationData(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback)) + + rsCtrl.io.data <> rsData.io.ctrl + rsCtrl.io.redirect <> redirect // TODO: remove it + rsCtrl.io.numExist <> io.toCtrlBlock.numExist(i) + rsCtrl.io.enqCtrl <> io.fromCtrlBlock.enqIqCtrl(i) + rsData.io.enqData <> io.fromCtrlBlock.enqIqData(i) + rsData.io.redirect <> redirect + + rsData.io.writeBackedData <> writeBackData + for ((x, y) <- rsData.io.extraListenPorts.zip(extraListenPorts)) { + x.valid := y.fire() + x.bits := y.bits + } + + exeUnits(i).io.redirect <> redirect + exeUnits(i).io.fromInt <> rsData.io.deq + rsData.io.feedback := 
DontCare + + rsCtrl.suggestName(s"rsc_${cfg.name}") + rsData.suggestName(s"rsd_${cfg.name}") + + rsData + }) + + for(rs <- reservationStations){ + val inBlockUops = reservationStations.filter(x => + x.exuCfg.hasCertainLatency && x.exuCfg.writeIntRf + ).map(x => { + val raw = WireInit(x.io.selectedUop) + raw.valid := x.io.selectedUop.valid && raw.bits.ctrl.rfWen + raw + }) + rs.io.broadcastedUops <> inBlockUops ++ io.wakeUpIn.fastUops + } + + io.wakeUpFpOut.fastUops <> reservationStations.filter( + rs => fpFastFilter(rs.exuCfg) + ).map(_.io.selectedUop).map(fpValid) + + io.wakeUpFpOut.fast <> exeUnits.filter( + x => fpFastFilter(x.config) + ).map(_.io.toFp) + + io.wakeUpFpOut.slow <> exeUnits.filter( + x => fpSlowFilter(x.config) + ).map(_.io.toFp) + + io.wakeUpIntOut.fastUops <> reservationStations.filter( + rs => intFastFilter(rs.exuCfg) + ).map(_.io.selectedUop).map(intValid) + + io.wakeUpIntOut.fast <> exeUnits.filter( + x => intFastFilter(x.config) + ).map(_.io.toInt) + + io.wakeUpIntOut.slow <> exeUnits.filter( + x => intSlowFilter(x.config) + ).map(_.io.toInt) + + // send misprediction to brq + io.toCtrlBlock.exuRedirect.zip( + exeUnits.filter(_.config.hasRedirect).map(_.io.toInt) + ).foreach{ + case (x, y) => + x.valid := y.fire() && y.bits.redirectValid + x.bits := y.bits + } + + jmpExeUnit.csrio <> io.csrio + jmpExeUnit.fenceio <> io.fenceio + + // read int rf from ctrl block + intRf.io.readPorts <> io.fromCtrlBlock.readRf + // write int rf arbiter + val intWbArbiter = Module(new Wb( + (exeUnits.map(_.config) ++ fastWakeUpIn ++ slowWakeUpIn).map(_.wbIntPriority), + NRIntWritePorts + )) + intWbArbiter.io.in <> exeUnits.map(_.io.toInt) ++ io.wakeUpIn.fast ++ io.wakeUpIn.slow + + // set busytable and update roq + io.toCtrlBlock.wbRegs <> intWbArbiter.io.out + + intRf.io.writePorts.zip(intWbArbiter.io.out).foreach{ + case (rf, wb) => + rf.wen := wb.valid && wb.bits.uop.ctrl.rfWen + rf.addr := wb.bits.uop.pdest + rf.data := wb.bits.data + } +} \ No newline at 
end of file diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala new file mode 100644 index 0000000000000000000000000000000000000000..62a02b865151b6d286b4ee3803c7aa429f0fd7c1 --- /dev/null +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -0,0 +1,281 @@ +package xiangshan.backend + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import xiangshan.backend.exu.Exu.{loadExuConfigs, storeExuConfigs} +import xiangshan.backend.roq.RoqPtr +import xiangshan.backend.exu._ +import xiangshan.cache._ +import xiangshan.mem._ +import xiangshan.backend.fu.FenceToSbuffer +import xiangshan.backend.issue.{ReservationStationCtrl, ReservationStationData} +import xiangshan.backend.fu.FunctionUnit.{lduCfg, mouCfg, stuCfg} + +class LsBlockToCtrlIO extends XSBundle { + val stOut = Vec(exuParameters.StuCnt, ValidIO(new ExuOutput)) // write to roq + val numExist = Vec(exuParameters.LsExuCnt, Output(UInt(log2Ceil(IssQueSize).W))) + val replay = ValidIO(new Redirect) +} + +class MemBlockToDcacheIO extends XSBundle { + val loadUnitToDcacheVec = Vec(exuParameters.LduCnt, new DCacheLoadIO) + val loadMiss = new DCacheLineIO + val atomics = new DCacheWordIO + val sbufferToDcache = new DCacheLineIO + val uncache = new DCacheWordIO +} + +class MemBlock +( + fastWakeUpIn: Seq[ExuConfig], + slowWakeUpIn: Seq[ExuConfig], + fastFpOut: Seq[ExuConfig], + slowFpOut: Seq[ExuConfig], + fastIntOut: Seq[ExuConfig], + slowIntOut: Seq[ExuConfig] +) extends XSModule with HasExeBlockHelper { + + val io = IO(new Bundle { + val fromCtrlBlock = Flipped(new CtrlToLsBlockIO) + val toCtrlBlock = new LsBlockToCtrlIO + + val wakeUpIn = new WakeUpBundle(fastWakeUpIn.size, slowWakeUpIn.size) + val wakeUpFpOut = Flipped(new WakeUpBundle(fastFpOut.size, slowFpOut.size)) + val wakeUpIntOut = Flipped(new WakeUpBundle(fastIntOut.size, slowIntOut.size)) + + val ptw = new TlbPtwIO + // TODO: dcache should be inside MemBlock + val dcache = new 
MemBlockToDcacheIO + val sfence = Input(new SfenceBundle) + val tlbCsr = Input(new TlbCsrBundle) + val fenceToSbuffer = Flipped(new FenceToSbuffer) + + val lsqio = new Bundle { + val exceptionAddr = new ExceptionAddrIO // to csr + val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit))) // to lsq + val roqDeqPtr = Input(new RoqPtr) // to lsq + } + }) + + val redirect = io.fromCtrlBlock.redirect + + val loadUnits = Seq.fill(exuParameters.LduCnt)(Module(new LoadUnit)) + val storeUnits = Seq.fill(exuParameters.StuCnt)(Module(new StoreUnit)) + val exeUnits = loadUnits ++ storeUnits + + val atomicsUnit = Module(new AtomicsUnit) + + val loadWritebackOverride = Mux(atomicsUnit.io.out.valid, atomicsUnit.io.out.bits, loadUnits.head.io.ldout.bits) + val ldOut0 = Wire(Decoupled(new ExuOutput)) + ldOut0.valid := atomicsUnit.io.out.valid || loadUnits.head.io.ldout.valid + ldOut0.bits := loadWritebackOverride + atomicsUnit.io.out.ready := ldOut0.ready + loadUnits.head.io.ldout.ready := ldOut0.ready + + val exeWbReqs = ldOut0 +: loadUnits.tail.map(_.io.ldout) + + val reservationStations = (loadExuConfigs ++ storeExuConfigs).zipWithIndex.map({ case (cfg, i) => + var certainLatency = -1 + if (cfg.hasCertainLatency) { + certainLatency = cfg.latency.latencyVal.get + } + + val readIntRf = cfg.readIntRf + val readFpRf = cfg.readFpRf + + // load has uncertain latency, so only use external wake up data + val writeBackData = fastWakeUpIn.zip(io.wakeUpIn.fast) + .filter(x => (x._1.writeIntRf && readIntRf) || (x._1.writeFpRf && readFpRf)) + .map(_._2.bits.data) + val wakeupCnt = writeBackData.length + + val inBlockListenPorts = exeWbReqs + val extraListenPorts = inBlockListenPorts ++ + slowWakeUpIn.zip(io.wakeUpIn.slow) + .filter(x => (x._1.writeIntRf && readIntRf) || (x._1.writeFpRf && readFpRf)) + .map(_._2) + + val extraListenPortsCnt = extraListenPorts.length + + // if tlb miss, replay + val feedback = true + + println(s"${i}: exu:${cfg.name} wakeupCnt: ${wakeupCnt} 
extraListenPorts: ${extraListenPortsCnt} delay:${certainLatency} feedback:${feedback}") + + val rsCtrl = Module(new ReservationStationCtrl(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback)) + val rsData = Module(new ReservationStationData(cfg, wakeupCnt, extraListenPortsCnt, fixedDelay = certainLatency, feedback = feedback)) + + rsCtrl.io.data <> rsData.io.ctrl + rsCtrl.io.redirect <> redirect // TODO: remove it + rsCtrl.io.numExist <> io.toCtrlBlock.numExist(i) + rsCtrl.io.enqCtrl <> io.fromCtrlBlock.enqIqCtrl(i) + rsData.io.enqData <> io.fromCtrlBlock.enqIqData(i) + rsData.io.redirect <> redirect + + rsData.io.writeBackedData <> writeBackData + for ((x, y) <- rsData.io.extraListenPorts.zip(extraListenPorts)) { + x.valid := y.fire() + x.bits := y.bits + } + + // exeUnits(i).io.redirect <> redirect + // exeUnits(i).io.fromInt <> rsData.io.deq + rsData.io.feedback := DontCare + + rsCtrl.suggestName(s"rsc_${cfg.name}") + rsData.suggestName(s"rsd_${cfg.name}") + + rsData + }) + + for(rs <- reservationStations){ + rs.io.broadcastedUops <> fastWakeUpIn.zip(io.wakeUpIn.fastUops) + .filter(x => (x._1.writeIntRf && rs.exuCfg.readIntRf) || (x._1.writeFpRf && rs.exuCfg.readFpRf)) + .map(_._2) + } + + // TODO: make this better + io.wakeUpIn.fast.foreach(_.ready := true.B) + io.wakeUpIn.slow.foreach(_.ready := true.B) + + io.wakeUpFpOut.slow <> exeWbReqs.map(x => { + val raw = WireInit(x) + raw.valid := x.valid && x.bits.uop.ctrl.fpWen + raw + }) + + io.wakeUpIntOut.slow <> exeWbReqs.map(x => { + val raw = WireInit(x) + raw.valid := x.valid && x.bits.uop.ctrl.rfWen + raw + }) + + // load always ready + exeWbReqs.foreach(_.ready := true.B) + + val dtlb = Module(new TLB(Width = DTLBWidth, isDtlb = true)) + val lsq = Module(new LsqWrappper) + val sbuffer = Module(new NewSbuffer) + // if you wants to stress test dcache store, use FakeSbuffer + // val sbuffer = Module(new FakeSbuffer) + + // dtlb + io.ptw <> dtlb.io.ptw + dtlb.io.sfence <> 
io.sfence + dtlb.io.csr <> io.tlbCsr + + // LoadUnit + for (i <- 0 until exuParameters.LduCnt) { + loadUnits(i).io.redirect <> io.fromCtrlBlock.redirect + loadUnits(i).io.tlbFeedback <> reservationStations(i).io.feedback + loadUnits(i).io.dtlb <> dtlb.io.requestor(i) + // get input form dispatch + loadUnits(i).io.ldin <> reservationStations(i).io.deq + // dcache access + loadUnits(i).io.dcache <> io.dcache.loadUnitToDcacheVec(i) + // forward + loadUnits(i).io.lsq.forward <> lsq.io.forward(i) + loadUnits(i).io.sbuffer <> sbuffer.io.forward(i) + + // passdown to lsq + lsq.io.loadIn(i) <> loadUnits(i).io.lsq.loadIn + lsq.io.ldout(i) <> loadUnits(i).io.lsq.ldout + } + + // StoreUnit + for (i <- 0 until exuParameters.StuCnt) { + storeUnits(i).io.redirect <> io.fromCtrlBlock.redirect + storeUnits(i).io.tlbFeedback <> reservationStations(exuParameters.LduCnt + i).io.feedback + storeUnits(i).io.dtlb <> dtlb.io.requestor(exuParameters.LduCnt + i) + // get input form dispatch + storeUnits(i).io.stin <> reservationStations(exuParameters.LduCnt + i).io.deq + // passdown to lsq + storeUnits(i).io.lsq <> lsq.io.storeIn(i) + io.toCtrlBlock.stOut(i).valid := storeUnits(i).io.stout.valid + io.toCtrlBlock.stOut(i).bits := storeUnits(i).io.stout.bits + storeUnits(i).io.stout.ready := true.B + } + + // mmio store writeback will use store writeback port 0 + lsq.io.mmioStout.ready := false.B + when(lsq.io.mmioStout.valid && !storeUnits(0).io.stout.valid) { + io.toCtrlBlock.stOut(0).valid := true.B + lsq.io.mmioStout.ready := true.B + io.toCtrlBlock.stOut(0).bits := lsq.io.mmioStout.bits + } + + // Lsq + lsq.io.commits <> io.lsqio.commits + lsq.io.enq <> io.fromCtrlBlock.enqLsq + lsq.io.brqRedirect := io.fromCtrlBlock.redirect + lsq.io.roqDeqPtr := io.lsqio.roqDeqPtr + io.toCtrlBlock.replay <> lsq.io.rollback + lsq.io.dcache <> io.dcache.loadMiss + lsq.io.uncache <> io.dcache.uncache + + // LSQ to store buffer + lsq.io.sbuffer <> sbuffer.io.in + + // Sbuffer + sbuffer.io.dcache <> 
io.dcache.sbufferToDcache + + // flush sbuffer + val fenceFlush = io.fenceToSbuffer.flushSb + val atomicsFlush = atomicsUnit.io.flush_sbuffer.valid + io.fenceToSbuffer.sbIsEmpty := sbuffer.io.flush.empty + // if both of them try to flush sbuffer at the same time + // something must have gone wrong + assert(!(fenceFlush && atomicsFlush)) + sbuffer.io.flush.valid := fenceFlush || atomicsFlush + + // TODO: make 0/1 configurable (indices 2/3 below presumably are store RS 0/1, i.e. exuParameters.LduCnt + 0/1 with LduCnt == 2 -- confirm) + // AtomicsUnit + // AtomicsUnit will override other control signals, + // as atomics insts (LR/SC/AMO) will block the pipeline + val st0_atomics = reservationStations(2).io.deq.valid && reservationStations(2).io.deq.bits.uop.ctrl.fuType === FuType.mou + val st1_atomics = reservationStations(3).io.deq.valid && reservationStations(3).io.deq.bits.uop.ctrl.fuType === FuType.mou + // amo should always go through store issue queue 0 + assert(!st1_atomics) + + atomicsUnit.io.dtlb.resp.valid := false.B + atomicsUnit.io.dtlb.resp.bits := DontCare + atomicsUnit.io.dtlb.req.ready := dtlb.io.requestor(0).req.ready + + // dispatch 0 takes priority + atomicsUnit.io.in.valid := st0_atomics + atomicsUnit.io.in.bits := reservationStations(2).io.deq.bits + when (st0_atomics) { + reservationStations(2).io.deq.ready := atomicsUnit.io.in.ready // back-pressure the RS that drives st0_atomics / atomicsUnit.io.in.bits (was index 0, which is a load RS) + storeUnits(0).io.stin.valid := false.B + } + + when(atomicsUnit.io.dtlb.req.valid) { + dtlb.io.requestor(0) <> atomicsUnit.io.dtlb + // take load unit 0's tlb port + // make sure not to disturb loadUnit + assert(!loadUnits(0).io.dtlb.req.valid) + loadUnits(0).io.dtlb.resp.valid := false.B + } + + when(atomicsUnit.io.tlbFeedback.valid) { + assert(!storeUnits(0).io.tlbFeedback.valid) + atomicsUnit.io.tlbFeedback <> reservationStations(exuParameters.LduCnt + 0).io.feedback + } + + atomicsUnit.io.dcache <> io.dcache.atomics + atomicsUnit.io.flush_sbuffer.empty := sbuffer.io.flush.empty + + atomicsUnit.io.redirect <> io.fromCtrlBlock.redirect + + when(atomicsUnit.io.out.valid){ + // take load unit 0's write back port +
assert(!loadUnits(0).io.ldout.valid) + loadUnits(0).io.ldout.ready := false.B + } + + lsq.io.exceptionAddr.lsIdx := io.lsqio.exceptionAddr.lsIdx + lsq.io.exceptionAddr.isStore := io.lsqio.exceptionAddr.isStore + io.lsqio.exceptionAddr.vaddr := Mux(atomicsUnit.io.exceptionAddr.valid, atomicsUnit.io.exceptionAddr.bits, lsq.io.exceptionAddr.vaddr) + +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/brq/Brq.scala b/src/main/scala/xiangshan/backend/brq/Brq.scala index 31c00c029d7655040432e5a83c29250a7a125424..780faf9abca05b593b9c8aba0ea31caedfbd7e25 100644 --- a/src/main/scala/xiangshan/backend/brq/Brq.scala +++ b/src/main/scala/xiangshan/backend/brq/Brq.scala @@ -67,25 +67,22 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { val exuOut = new ExuOutput } - val s_idle :: s_wb :: s_commited :: Nil = - List.tabulate(3)(i => (1 << i).U(3.W).asTypeOf(new StateQueueEntry)) + val s_invalid :: s_idle :: s_wb :: s_commited :: Nil = + List.tabulate(4)(i => (1 << i).U(4.W).asTypeOf(new StateQueueEntry)) class StateQueueEntry extends Bundle{ val isCommit = Bool() val isWb = Bool() val isIdle = Bool() + val isInvalid = Bool() } val brCommitCnt = RegInit(0.U(BrTagWidth.W)) val brQueue = Mem(BrqSize, new BrqEntry) //Reg(Vec(BrqSize, new BrqEntry)) - val stateQueue = RegInit(VecInit(Seq.fill(BrqSize)(s_idle))) + val stateQueue = RegInit(VecInit(Seq.fill(BrqSize)(s_invalid))) val headPtr, tailPtr = RegInit(BrqPtr(false.B, 0.U)) -// def isEmpty(ptr1: BrqPtr, ptr2: BrqPtr): Bool = ptr1 === ptr2 -// def isFull(ptr1: BrqPtr, ptr2: BrqPtr): Bool = (ptr1.flag=/=ptr2.flag) && (ptr1.value===ptr2.value) - - // dequeue val headIdx = headPtr.value @@ -102,12 +99,8 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { commitIdx = 6 */ val headIdxOH = UIntToOH(headIdx) - val headIdxMaskHiVec = Wire(Vec(BrqSize, Bool())) - for(i <- headIdxMaskHiVec.indices){ - headIdxMaskHiVec(i) := { if(i==0) headIdxOH(i) else headIdxMaskHiVec(i-1) || headIdxOH(i) } - 
} - val headIdxMaskHi = headIdxMaskHiVec.asUInt() - val headIdxMaskLo = (~headIdxMaskHi).asUInt() + val headIdxMaskLo = headIdxOH - 1.U + val headIdxMaskHi = ~headIdxMaskLo val commitIdxHi = PriorityEncoder((~skipMask).asUInt() & headIdxMaskHi) val (commitIdxLo, findLo) = PriorityEncoderWithFlag((~skipMask).asUInt() & headIdxMaskLo) @@ -147,7 +140,8 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { XSDebug(p"headPtr:$headPtr tailPtr:$tailPtr\n") XSDebug("") stateQueue.reverse.map(s =>{ - XSDebug(false, s.isIdle, "-") + XSDebug(false, s.isInvalid, "-") + XSDebug(false, s.isIdle, "i") XSDebug(false, s.isWb, "w") XSDebug(false, s.isCommit, "c") }) @@ -159,15 +153,15 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { stateQueue(commitIdx) := s_commited } when(deqValid){ - stateQueue(headIdx) := s_idle + stateQueue(headIdx) := s_invalid } assert(!(commitIdx===headIdx && commitValid && deqValid), "Error: deq and commit a same entry!") headPtr := headPtrNext io.redirect.valid := commitValid && - commitIsMisPred && - !io.roqRedirect.valid && - !io.redirect.bits.roqIdx.needFlush(io.memRedirect) + commitIsMisPred //&& + // !io.roqRedirect.valid && + // !io.redirect.bits.roqIdx.needFlush(io.memRedirect) io.redirect.bits := commitEntry.exuOut.redirect io.out.valid := commitValid @@ -184,20 +178,21 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { ) // branch insts enq - var full = WireInit(isFull(headPtrNext, tailPtr)) - var tailPtrNext = WireInit(tailPtr) - for((enq, brTag) <- io.enqReqs.zip(io.brTags)){ - enq.ready := !full - brTag := tailPtrNext - when(enq.fire()){ - brQueue(tailPtrNext.value).npc := enq.bits.cf.brUpdate.pnpc - brQueue(tailPtrNext.value).ptrFlag := tailPtrNext.flag + val validEntries = distanceBetween(tailPtr, headPtr) + for(i <- 0 until DecodeWidth){ + val offset = if(i == 0) 0.U else PopCount(io.enqReqs.take(i).map(_.valid)) + val brTag = tailPtr + offset + val idx = brTag.value + io.enqReqs(i).ready := validEntries 
<= (BrqSize - (i + 1)).U + io.brTags(i) := brTag + when(io.enqReqs(i).fire()){ + brQueue(idx).npc := io.enqReqs(i).bits.cf.brUpdate.pnpc + brQueue(idx).ptrFlag := brTag.flag + stateQueue(idx) := s_idle } - - tailPtrNext = tailPtrNext + enq.fire() - full = isFull(tailPtrNext, headPtrNext) } - tailPtr := tailPtrNext + val enqCnt = PopCount(io.enqReqs.map(_.fire())) + tailPtr := tailPtr + enqCnt // exu write back for(exuWb <- io.exuRedirect){ @@ -218,24 +213,24 @@ class Brq extends XSModule with HasCircularQueuePtrHelper { when(io.roqRedirect.valid){ // exception - stateQueue.foreach(_ := s_idle) + stateQueue.foreach(_ := s_invalid) headPtr := BrqPtr(false.B, 0.U) tailPtr := BrqPtr(false.B, 0.U) brCommitCnt := 0.U - }.elsewhen(io.redirect.valid || io.memRedirect.valid){ + }.elsewhen(io.memRedirect.valid){ // misprediction or replay stateQueue.zipWithIndex.foreach({case(s, i) => + // replay should flush brTag val ptr = BrqPtr(brQueue(i).ptrFlag, i.U) - when( - (io.redirect.valid && ptr.needBrFlush(io.redirect.bits.brTag)) || - (s.isWb && brQueue(i).exuOut.uop.roqIdx.needFlush(io.memRedirect)) - ){ - s := s_idle + val replayMatch = io.memRedirect.bits.isReplay && ptr === io.memRedirect.bits.brTag + when(io.memRedirect.valid && (ptr.needBrFlush(io.memRedirect.bits.brTag) || replayMatch)){ + s := s_invalid } }) - when(io.redirect.valid){ // Only Br Mispred reset tailPtr, replay does not - tailPtr := io.redirect.bits.brTag + true.B + when(io.memRedirect.valid){ + tailPtr := io.memRedirect.bits.brTag + Mux(io.memRedirect.bits.isReplay, 0.U, 1.U) } + } diff --git a/src/main/scala/xiangshan/backend/decode/DecodeBuffer.scala b/src/main/scala/xiangshan/backend/decode/DecodeBuffer.scala index f1b40c12f9170ee98932c02e6064fec0834ee8cc..4bc1e3d03b24471d600f7343a2609e08c532d4bc 100644 --- a/src/main/scala/xiangshan/backend/decode/DecodeBuffer.scala +++ b/src/main/scala/xiangshan/backend/decode/DecodeBuffer.scala @@ -24,7 +24,7 @@ class DecodeBuffer extends XSModule { }) ) - val 
flush = io.redirect.valid && !io.redirect.bits.isReplay + val flush = io.redirect.valid// && !io.redirect.bits.isReplay for( i <- 0 until RenameWidth){ when(io.out(i).fire()){ @@ -40,17 +40,7 @@ class DecodeBuffer extends XSModule { val r = RegEnable(io.in(i).bits, io.in(i).fire()) io.in(i).ready := leftCanIn io.out(i).bits <> r - if(i > 0 ){ - io.out(i).valid := validVec(i) && - !flush && - Mux(r.ctrl.noSpecExec, - !ParallelOR(validVec.take(i)), - !ParallelOR(io.out.zip(validVec).take(i).map(x => x._2 && x._1.bits.ctrl.noSpecExec)) - ) && !io.isWalking - } else { - require( i == 0) - io.out(i).valid := validVec(i) && !flush && !io.isWalking - } + io.out(i).valid := validVec(i) && !flush && !io.isWalking } for(in <- io.in){ diff --git a/src/main/scala/xiangshan/backend/decode/DecodeHelper.scala b/src/main/scala/xiangshan/backend/decode/DecodeHelper.scala index 0c6ee0d8721a0db7003fb780cdf8bc9a76bfef0f..3797d818b7b120a3d1fa6da471f01a9cee6851ec 100644 --- a/src/main/scala/xiangshan/backend/decode/DecodeHelper.scala +++ b/src/main/scala/xiangshan/backend/decode/DecodeHelper.scala @@ -58,9 +58,6 @@ object Instructions extends HasInstrType with HasXSParameter { (if (HasMExtension) RVMInstr.table else Nil) ++ (if (HasCExtension) RVCInstr.table else Nil) ++ (if (HasFPU) RVFInstr.table ++ RVDInstr.table else Nil) -// Privileged.table ++ -// RVAInstr.table ++ -// RVZicsrInstr.table } object CInstructions extends HasInstrType with HasXSParameter { diff --git a/src/main/scala/xiangshan/backend/decode/DecodeStage.scala b/src/main/scala/xiangshan/backend/decode/DecodeStage.scala index 7072338a33aeca901e1ee26347d61eb9d62d7312..a6ac36a2119513d2b5fa2ef1671995d8337e5dee 100644 --- a/src/main/scala/xiangshan/backend/decode/DecodeStage.scala +++ b/src/main/scala/xiangshan/backend/decode/DecodeStage.scala @@ -41,14 +41,12 @@ class DecodeStage extends XSModule { decoderToDecBuffer(i).brTag := io.brTags(i) io.out(i).bits := decoderToDecBuffer(i) - val thisReady = io.out(i).ready && 
io.toBrq(i).ready val isMret = decoders(i).io.out.cf.instr === BitPat("b001100000010_00000_000_00000_1110011") val isSret = decoders(i).io.out.cf.instr === BitPat("b000100000010_00000_000_00000_1110011") - val thisBrqValid = io.in(i).valid && (!decoders(i).io.out.cf.brUpdate.pd.notCFI || isMret || isSret) && io.out(i).ready - val thisOutValid = io.in(i).valid && io.toBrq(i).ready - io.in(i).ready := { if (i == 0) thisReady else io.in(i-1).ready && thisReady } - io.out(i).valid := { if (i == 0) thisOutValid else io.in(i-1).ready && thisOutValid } - io.toBrq(i).valid := { if (i == 0) thisBrqValid else io.in(i-1).ready && thisBrqValid } + val thisBrqValid = !decoders(i).io.out.cf.brUpdate.pd.notCFI || isMret || isSret + io.in(i).ready := io.out(i).ready && io.toBrq(i).ready + io.out(i).valid := io.in(i).valid && io.toBrq(i).ready + io.toBrq(i).valid := io.in(i).valid && thisBrqValid && io.out(i).ready XSDebug(io.in(i).valid || io.out(i).valid || io.toBrq(i).valid, "i:%d In(%d %d) Out(%d %d) ToBrq(%d %d) pc:%x instr:%x\n", i.U, io.in(i).valid, io.in(i).ready, io.out(i).valid, io.out(i).ready, io.toBrq(i).valid, io.toBrq(i).ready, io.in(i).bits.pc, io.in(i).bits.instr) } diff --git a/src/main/scala/xiangshan/backend/decode/Decoder.scala b/src/main/scala/xiangshan/backend/decode/Decoder.scala index cae8a0ce925d7cc179e2cdd1aff6719720d2026b..dae71f3d6c87043cb6ff563e290ad09a456dc630 100644 --- a/src/main/scala/xiangshan/backend/decode/Decoder.scala +++ b/src/main/scala/xiangshan/backend/decode/Decoder.scala @@ -2,11 +2,10 @@ package xiangshan.backend.decode import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils import xiangshan._ import utils._ import xiangshan.backend._ -import xiangshan.backend.decode.isa.RVCInstr +import xiangshan.backend.decode.isa.{RVCInstr, RV32I_ALUInstr, RVFInstr, RVDInstr} import xiangshan.{CfCtrl, CtrlFlow} @@ -36,16 +35,19 @@ class Decoder extends XSModule with HasInstrType { io.out.ctrl.fuType := fuType val 
SrcTypeTable = List( - InstrI -> (SrcType.reg, SrcType.imm), - InstrFI -> (SrcType.reg, SrcType.imm), - InstrR -> (SrcType.reg, SrcType.reg), - InstrS -> (SrcType.reg, SrcType.reg), - InstrFS -> (SrcType.reg, SrcType.fp ), - InstrSA -> (SrcType.reg, SrcType.reg), - InstrB -> (SrcType.reg, SrcType.reg), - InstrU -> (SrcType.pc , SrcType.imm), - InstrJ -> (SrcType.pc , SrcType.imm), - InstrN -> (SrcType.pc , SrcType.imm) + InstrI -> (SrcType.reg, SrcType.imm), + InstrFI -> (SrcType.reg, SrcType.imm), + InstrR -> (SrcType.reg, SrcType.reg), + InstrFR -> (SrcType.fp, SrcType.fp ), + InstrS -> (SrcType.reg, SrcType.reg), + InstrFS -> (SrcType.reg, SrcType.fp ), + InstrSA -> (SrcType.reg, SrcType.reg), + InstrB -> (SrcType.reg, SrcType.reg), + InstrU -> (SrcType.pc , SrcType.imm), + InstrJ -> (SrcType.pc , SrcType.imm), + InstrN -> (SrcType.pc , SrcType.imm), + InstrGtoF -> (SrcType.reg, SrcType.imm), + InstrFtoG -> (SrcType.fp , SrcType.fp) ) val src1Type = LookupTree(instrType, SrcTypeTable.map(p => (p._1, p._2._1))) val src2Type = LookupTree(instrType, SrcTypeTable.map(p => (p._1, p._2._2))) @@ -77,6 +79,7 @@ class Decoder extends XSModule with HasInstrType { val rfSrc1 = Mux(isRVC, rvc_src1, rs) val rfSrc2 = Mux(isRVC, rvc_src2, rt) + val rfSrc3 = instr(31, 27) val rfDest = Mux(isRVC, rvc_dest, rd) // TODO: refactor decode logic @@ -85,6 +88,7 @@ class Decoder extends XSModule with HasInstrType { val fpWen = isfpWen(instrType) io.out.ctrl.lsrc1 := Mux(src1Type === SrcType.pc, 0.U, rfSrc1) io.out.ctrl.lsrc2 := Mux(src2Type === SrcType.imm, 0.U, rfSrc2) + io.out.ctrl.lsrc3 := rfSrc3 io.out.ctrl.rfWen := rfWen io.out.ctrl.fpWen := fpWen io.out.ctrl.ldest := Mux(fpWen || rfWen, rfDest, 0.U) @@ -128,11 +132,46 @@ class Decoder extends XSModule with HasInstrType { } } - io.out.ctrl.src1Type := Mux(instr(6,0) === "b0110111".U || instr(15, 13) === "b011".U && instr(1, 0) === "b01".U, SrcType.reg, src1Type) - io.out.ctrl.src2Type := src2Type - // val vmEnable = 
WireInit(false.B) - // BoringUtils.addSink(vmEnable, "DTLBENABLE") + + + def bitPatLookup(key: UInt, default: UInt, mapping: Seq[(BitPat, UInt)]) = { + mapping.foldLeft(default){case (d, (k, v)) => Mux(k === key, v, d)} + } + + io.out.ctrl.src1Type := bitPatLookup(instr, src1Type, Seq( + RV32I_ALUInstr.LUI -> SrcType.reg // FIX LUI + )) + io.out.ctrl.src2Type := bitPatLookup(instr, src2Type, Seq( + RVFInstr.FSQRT_S -> SrcType.imm, + RVFInstr.FCLASS_S -> SrcType.imm, + RVFInstr.FMV_X_W -> SrcType.imm, + RVFInstr.FCVT_W_S -> SrcType.imm, + RVFInstr.FCVT_WU_S -> SrcType.imm, + RVFInstr.FCVT_L_S -> SrcType.imm, + RVFInstr.FCVT_LU_S -> SrcType.imm, + + RVDInstr.FSQRT_D -> SrcType.imm, + RVDInstr.FCVT_S_D -> SrcType.imm, + RVDInstr.FCVT_D_S -> SrcType.imm, + RVDInstr.FCLASS_D -> SrcType.imm, + RVDInstr.FMV_X_D -> SrcType.imm, + RVDInstr.FCVT_W_D -> SrcType.imm, + RVDInstr.FCVT_WU_D -> SrcType.imm, + RVDInstr.FCVT_L_D -> SrcType.imm, + RVDInstr.FCVT_LU_D -> SrcType.imm + )) + io.out.ctrl.src3Type := bitPatLookup(instr, SrcType.imm, Seq( + RVFInstr.FMADD_S -> SrcType.fp, + RVFInstr.FNMADD_S -> SrcType.fp, + RVFInstr.FMSUB_S -> SrcType.fp, + RVFInstr.FNMSUB_S -> SrcType.fp, + + RVDInstr.FMADD_D -> SrcType.fp, + RVDInstr.FNMADD_D -> SrcType.fp, + RVDInstr.FMSUB_D -> SrcType.fp, + RVDInstr.FNMSUB_D -> SrcType.fp, + )) io.out.cf.exceptionVec.map(_ := false.B) io.out.cf.exceptionVec(illegalInstr) := instrType === InstrN @@ -143,14 +182,30 @@ class Decoder extends XSModule with HasInstrType { when(io.out.ctrl.isXSTrap){ io.out.ctrl.lsrc1 := 10.U // a0 } - io.out.ctrl.noSpecExec := io.out.ctrl.isXSTrap || io.out.ctrl.fuType===FuType.csr || io.out.ctrl.fuType===FuType.mou || io.out.ctrl.fuType===FuType.fence/*noSpecExec make it sent to alu0,for roq is empty*/ + + /*noSpecExec make it sent to alu0,for roq is empty*/ + io.out.ctrl.noSpecExec := io.out.ctrl.isXSTrap || + io.out.ctrl.fuType===FuType.csr || + io.out.ctrl.fuType===FuType.mou || + io.out.ctrl.fuType===FuType.fence + + // 
fflags zero csrrs rd csr + val isFrflags = BitPat("b000000000001_00000_010_?????_1110011") === io.in.instr + + io.out.ctrl.blockBackward := io.out.ctrl.isXSTrap || + (io.out.ctrl.fuType===FuType.csr && !isFrflags) || + io.out.ctrl.fuType===FuType.mou || + io.out.ctrl.fuType===FuType.fence + io.out.ctrl.flushPipe := io.out.ctrl.fuType===FuType.fence + io.out.ctrl.isRVF := instr(26, 25) === 0.U + XSDebug("in: instr=%x pc=%x excepVec=%b intrVec=%b crossPageIPFFix=%d\n", io.in.instr, io.in.pc, io.in.exceptionVec.asUInt, io.in.intrVec.asUInt, io.in.crossPageIPFFix) XSDebug("out: src1Type=%b src2Type=%b src3Type=%b lsrc1=%d lsrc2=%d lsrc3=%d ldest=%d fuType=%b fuOpType=%b\n", io.out.ctrl.src1Type, io.out.ctrl.src2Type, io.out.ctrl.src3Type, io.out.ctrl.lsrc1, io.out.ctrl.lsrc2, io.out.ctrl.lsrc3, io.out.ctrl.ldest, io.out.ctrl.fuType, io.out.ctrl.fuOpType) XSDebug("out: rfWen=%d fpWen=%d isXSTrap=%d noSpecExec=%d isBlocked=%d flushPipe=%d isRVF=%d imm=%x\n", - io.out.ctrl.rfWen, io.out.ctrl.fpWen, io.out.ctrl.isXSTrap, io.out.ctrl.noSpecExec, io.out.ctrl.isBlocked, io.out.ctrl.flushPipe, io.out.ctrl.isRVF, io.out.ctrl.imm) + io.out.ctrl.rfWen, io.out.ctrl.fpWen, io.out.ctrl.isXSTrap, io.out.ctrl.noSpecExec, io.out.ctrl.blockBackward, io.out.ctrl.flushPipe, io.out.ctrl.isRVF, io.out.ctrl.imm) } diff --git a/src/main/scala/xiangshan/backend/decode/isa/RVC.scala b/src/main/scala/xiangshan/backend/decode/isa/RVC.scala index 6331b2acda4a48a49f59c90dc09a2dceb195a76a..834975c91ef12b7e33f375f34e9e3c342c0980b0 100644 --- a/src/main/scala/xiangshan/backend/decode/isa/RVC.scala +++ b/src/main/scala/xiangshan/backend/decode/isa/RVC.scala @@ -57,13 +57,13 @@ object RVCInstr extends HasInstrType with HasRVCConst { // def C_XX = BitPat("b????????????????_???_?_10_987_65_432_10") def C_ILLEGAL = BitPat("b0000000000000000_000_0_00_000_00_000_00") def C_ADDI4SPN = BitPat("b????????????????_000_?_??_???_??_???_00") - // def C_FLD = BitPat("b????????????????_001_?_??_???_??_???_00") + def 
C_FLD = BitPat("b????????????????_001_?_??_???_??_???_00") // def C_LQ = BitPat("b????????????????_001_?_??_???_??_???_00") def C_LW = BitPat("b????????????????_010_?_??_???_??_???_00") // def C_FLW = BitPat("b????????????????_011_?_??_???_??_???_00") // RV32FC Only def C_LD = BitPat("b????????????????_011_?_??_???_??_???_00") // def C_LI = BitPat("b????????????????_100_?_??_???_??_???_00") //reserved - // def C_FSD = BitPat("b????????????????_101_?_??_???_??_???_00") + def C_FSD = BitPat("b????????????????_101_?_??_???_??_???_00") // def C_SQ = BitPat("b????????????????_101_?_??_???_??_???_00") def C_SW = BitPat("b????????????????_110_?_??_???_??_???_00") // def C_FSW = BitPat("b????????????????_111_?_??_???_??_???_00") // RV32FC Only @@ -97,7 +97,7 @@ object RVCInstr extends HasInstrType with HasRVCConst { //RVC 11 def C_SLLI = BitPat("b????????????????_000_?_??_???_??_???_10") // def C_SLLI64 = BitPat("b????????????????_000_0_??_???_00_000_10") - // def C_FLDSP = BitPat("b????????????????_001_?_??_???_??_???_10") + def C_FLDSP = BitPat("b????????????????_001_?_??_???_??_???_10") // def C_LQSP = BitPat("b????????????????_001_?_??_???_??_???_10") def C_LWSP = BitPat("b????????????????_010_?_??_???_??_???_10") // def C_FLWSP = BitPat("b????????????????_011_?_??_???_??_???_10") // RV32FC Only @@ -107,8 +107,8 @@ object RVCInstr extends HasInstrType with HasRVCConst { def C_EBREAK = BitPat("b????????????????_100_1_00_000_00_000_10") def C_JALR = BitPat("b????????????????_100_1_??_???_00_000_10") def C_ADD = BitPat("b????????????????_100_1_??_???_??_???_10") - // def C_FSDSP = BitPat("b????????????????_101_?_??_???_??_???_10") - // def C_SQSP = BitPat("b????????????????_101_?_??_???_??_???_10") + def C_FSDSP = BitPat("b????????????????_101_?_??_???_??_???_10") +// def C_SQSP = BitPat("b????????????????_101_?_??_???_??_???_10") def C_SWSP = BitPat("b????????????????_110_?_??_???_??_???_10") // def C_FSWSP = BitPat("b????????????????_111_?_??_???_??_???_10") // RV32FC 
Only def C_SDSP = BitPat("b????????????????_111_?_??_???_??_???_10") @@ -121,10 +121,10 @@ object RVCInstr extends HasInstrType with HasRVCConst { val table = Array( C_ILLEGAL -> List(InstrN, FuType.csr, CSROpType.jmp), C_ADDI4SPN -> List(InstrI, FuType.alu, ALUOpType.add), - // C_FLD -> List(InstrFI, FuType.ldu, LSUOpType.ld), + C_FLD -> List(InstrFI, FuType.ldu, LSUOpType.ld), C_LW -> List(InstrI, FuType.ldu, LSUOpType.lw), C_LD -> List(InstrI, FuType.ldu, LSUOpType.ld), - // C_FSD -> List(InstrFS, FuType.stu, LSUOpType.sd), + C_FSD -> List(InstrFS, FuType.stu, LSUOpType.sd), C_SW -> List(InstrS, FuType.stu, LSUOpType.sw), C_SD -> List(InstrS, FuType.stu, LSUOpType.sd), C_NOP -> List(InstrI, FuType.alu, ALUOpType.add), @@ -133,7 +133,7 @@ object RVCInstr extends HasInstrType with HasRVCConst { C_ADDIW -> List(InstrI, FuType.alu, ALUOpType.addw), C_LI -> List(InstrI, FuType.alu, ALUOpType.add), C_ADDI16SP -> List(InstrI, FuType.alu, ALUOpType.add), - C_LUI -> List(InstrU, FuType.alu, ALUOpType.add), + C_LUI -> List(InstrI, FuType.alu, ALUOpType.add), C_SRLI -> List(InstrI, FuType.alu, ALUOpType.srl), C_SRAI -> List(InstrI, FuType.alu, ALUOpType.sra), C_ANDI -> List(InstrI, FuType.alu, ALUOpType.and), @@ -147,7 +147,7 @@ object RVCInstr extends HasInstrType with HasRVCConst { C_BEQZ -> List(InstrB, FuType.alu, ALUOpType.beq), C_BNEZ -> List(InstrB, FuType.alu, ALUOpType.bne), C_SLLI -> List(InstrI, FuType.alu, ALUOpType.sll), - // C_FLDSP -> List(InstrI, FuType.alu, ALUOpType.add), + C_FLDSP -> List(InstrFI, FuType.ldu, LSUOpType.ld), C_LWSP -> List(InstrI, FuType.ldu, LSUOpType.lw), // C_FLWSP -> List(InstrI, FuType.alu, ALUOpType.add), C_LDSP -> List(InstrI, FuType.ldu, LSUOpType.ld), @@ -156,18 +156,18 @@ object RVCInstr extends HasInstrType with HasRVCConst { C_EBREAK -> List(InstrI, FuType.alu, ALUOpType.add), C_JALR -> List(InstrI, FuType.jmp, JumpOpType.jalr), C_ADD -> List(InstrR, FuType.alu, ALUOpType.add), - // C_FSDSP -> List(InstrI, FuType.alu, 
ALUOpType.add), + C_FSDSP -> List(InstrFS, FuType.stu, LSUOpType.sd), C_SWSP -> List(InstrS, FuType.stu, LSUOpType.sw), // C_FSWSP -> List(InstrI, FuType.alu, ALUOpType.add), C_SDSP -> List(InstrS, FuType.stu, LSUOpType.sd) ) - val cExtraTable = Array( + val cExtraTable = Array( C_ADDI4SPN -> List(ImmADD4SPN, REGx2, DtCare, REGrs2p), - // C_FLD -> List(ImmLD, REGrs1p, DtCare, REGrs2p), + C_FLD -> List(ImmLD, REGrs1p, DtCare, REGrs2p), C_LW -> List(ImmLW, REGrs1p, DtCare, REGrs2p), C_LD -> List(ImmLD, REGrs1p, DtCare, REGrs2p), - // C_FSD -> List(ImmSD, REGrs1p, REGrs2p, DtCare), + C_FSD -> List(ImmSD, REGrs1p, REGrs2p, DtCare), C_SW -> List(ImmSW, REGrs1p, REGrs2p, DtCare), C_SD -> List(ImmSD, REGrs1p, REGrs2p, DtCare), C_NOP -> List(ImmNone, DtCare, DtCare, DtCare), @@ -190,7 +190,7 @@ object RVCInstr extends HasInstrType with HasRVCConst { C_BEQZ -> List(ImmB, REGrs1p, DtCare, DtCare), // rd: x0 C_BNEZ -> List(ImmB, REGrs1p, DtCare, DtCare), // rd: x0 C_SLLI -> List(ImmLI, REGrd, DtCare, REGrd), - // C_FLDSP -> List(ImmLDSP, REGx2, DtCare, REGrd), + C_FLDSP -> List(ImmLDSP, REGx2, DtCare, REGrd), // C_LQSP -> List(), C_LWSP -> List(ImmLWSP, REGx2, DtCare, REGrd), C_LDSP -> List(ImmLDSP, REGx2, DtCare, REGrd), @@ -199,7 +199,7 @@ object RVCInstr extends HasInstrType with HasRVCConst { C_EBREAK -> List(ImmNone, DtCare, DtCare, DtCare), //not implemented C_JALR -> List(ImmNone, REGrs1, DtCare, REGx1), C_ADD -> List(ImmNone, REGrd, REGrs2, REGrd), - // C_FSDSP -> List(ImmSDSP, REGx2, REGrs2, DtCare), + C_FSDSP -> List(ImmSDSP, REGx2, REGrs2, DtCare), // C_SQSP -> List(), C_SWSP -> List(ImmSWSP, REGx2, REGrs2, DtCare), C_SDSP -> List(ImmSDSP, REGx2, REGrs2, DtCare) diff --git a/src/main/scala/xiangshan/backend/decode/isa/RVD.scala b/src/main/scala/xiangshan/backend/decode/isa/RVD.scala index f0283f36bd653f1b0addb5f44a05f1a5556e3468..b902d1e97c0fa6ac2373d64c296a6522d02666af 100644 --- a/src/main/scala/xiangshan/backend/decode/isa/RVD.scala +++ 
b/src/main/scala/xiangshan/backend/decode/isa/RVD.scala @@ -1,9 +1,11 @@ package xiangshan.backend.decode.isa import chisel3.util._ -import xiangshan.{FuType, HasXSParameter} +import xiangshan.HasXSParameter +import xiangshan.FuType._ import xiangshan.backend.decode._ import xiangshan.backend.LSUOpType +import xiangshan.backend.fu.fpu.FPUOpType._ object RVDInstr extends HasXSParameter with HasInstrType { @@ -41,54 +43,43 @@ object RVDInstr extends HasXSParameter with HasInstrType { def FNMADD_D = BitPat("b?????01??????????????????1001111") val table = Array( - FLD -> List(InstrFI, FuType.ldu, LSUOpType.ld), - FSD -> List(InstrFS, FuType.stu, LSUOpType.sd) - ) + FLD -> List(InstrFI, ldu, LSUOpType.ld), + FSD -> List(InstrFS, stu, LSUOpType.sd), + + // FR + FADD_D -> List(InstrFR, fmac, fadd), + FSUB_D -> List(InstrFR, fmac, fsub), + FMUL_D -> List(InstrFR, fmac, fmul), + FDIV_D -> List(InstrFR, fmisc, fdiv), + FMIN_D -> List(InstrFR, fmisc, fmin), + FMAX_D -> List(InstrFR, fmisc, fmax), + FSGNJ_D -> List(InstrFR, fmisc, fsgnj), + FSGNJN_D -> List(InstrFR, fmisc, fsgnjn), + FSGNJX_D -> List(InstrFR, fmisc, fsgnjx), + FSQRT_D -> List(InstrFR, fmisc, fsqrt), + FMADD_D -> List(InstrFR, fmac, fmadd), + FNMADD_D -> List(InstrFR, fmac, fnmadd), + FMSUB_D -> List(InstrFR, fmac, fmsub), + FNMSUB_D -> List(InstrFR, fmac, fnmsub), + FCVT_S_D -> List(InstrFR, fmisc, d2s), + FCVT_D_S -> List(InstrFR, fmisc, s2d), - // (isFp, src1Type, src2Type, src3Type, rfWen, fpWen, fuOpType, inputFunc, outputFunc) -// val table = Array( + // FtoG + FCLASS_D -> List(InstrFtoG, fmisc, fclass), + FMV_X_D -> List(InstrFtoG, fmisc, fmv_f2i), + FCVT_W_D -> List(InstrFtoG, fmisc, f2w), + FCVT_WU_D -> List(InstrFtoG, fmisc, f2wu), + FCVT_L_D -> List(InstrFtoG, fmisc, f2l), + FCVT_LU_D -> List(InstrFtoG, fmisc, f2lu), + FLE_D -> List(InstrFtoG, fmisc, fle), + FLT_D -> List(InstrFtoG, fmisc, flt), + FEQ_D -> List(InstrFtoG, fmisc, feq), -// FLD -> List(Y, reg, imm, imm, N, Y, LSUOpType.ld, in_raw, 
out_raw), -// C_FLD -> List(Y, reg, imm, imm, N, Y, LSUOpType.ld, in_raw, out_raw), -// C_FLDSP -> List(Y, reg, imm, imm, N, Y, LSUOpType.ld, in_raw, out_raw), -// FSD -> List(Y, reg, fp, imm, N, N, LSUOpType.sd, in_raw, out_raw), -// C_FSD -> List(Y, reg, fp, imm, N, N, LSUOpType.sd, in_raw, out_raw), -// C_FSDSP -> List(Y, reg, fp, imm, N, N, LSUOpType.sd, in_raw, out_raw), -// // fp fp -> fp -// FADD_D -> List(Y, fp, fp, imm, N, Y, fadd, in_raw, out_raw), -// FSUB_D -> List(Y, fp, fp, imm, N, Y, fsub, in_raw, out_raw), -// FMUL_D -> List(Y, fp, fp, imm, N, Y, fmul, in_raw, out_raw), -// FDIV_D -> List(Y, fp, fp, imm, N, Y, fdiv, in_raw, out_raw), -// FMIN_D -> List(Y, fp, fp, imm, N, Y, fmin, in_raw, out_raw), -// FMAX_D -> List(Y, fp, fp, imm, N, Y, fmax, in_raw, out_raw), -// FSGNJ_D -> List(Y, fp, fp, imm, N, Y, fsgnj, in_raw, out_raw), -// FSGNJN_D -> List(Y, fp, fp, imm, N, Y, fsgnjn, in_raw, out_raw), -// FSGNJX_D -> List(Y, fp, fp, imm, N, Y, fsgnjx, in_raw, out_raw), -// // fp -> fp -// FSQRT_D -> List(Y, fp, imm, imm, N, Y, fsqrt, in_raw, out_raw), -// FCVT_S_D -> List(Y, fp, imm, imm, N, Y, d2s, in_raw, out_box), -// FCVT_D_S -> List(Y, fp, imm, imm, N, Y, s2d, in_unbox, out_raw), -// // fp fp fp -> fp -// FMADD_D -> List(Y, fp, fp, fp, N, Y, fmadd, in_raw, out_raw), -// FNMADD_D -> List(Y, fp, fp, fp, N, Y, fnmadd, in_raw, out_raw), -// FMSUB_D -> List(Y, fp, fp, fp, N, Y, fmsub, in_raw, out_raw), -// FNMSUB_D -> List(Y, fp, fp, fp, N, Y, fnmsub, in_raw, out_raw), -// // fp -> gp -// FCLASS_D -> List(Y, fp, imm, imm, Y, N, fclass, in_raw, out_raw), -// FMV_X_D -> List(Y, fp, imm, imm, Y, N, fmv_f2i, in_raw, out_raw), -// FCVT_W_D -> List(Y, fp, imm, imm, Y, N, f2w, in_raw, out_sext), -// FCVT_WU_D -> List(Y, fp, imm, imm, Y, N, f2wu, in_raw, out_sext), -// FCVT_L_D -> List(Y, fp, imm, imm, Y, N, f2l, in_raw, out_raw), -// FCVT_LU_D -> List(Y, fp, imm, imm, Y, N, f2lu, in_raw, out_raw), -// // fp fp -> gp -// FLE_D -> List(Y, fp, fp, imm, Y, N, fle, 
in_raw, out_raw), -// FLT_D -> List(Y, fp, fp, imm, Y, N, flt, in_raw, out_raw), -// FEQ_D -> List(Y, fp, fp, imm, Y, N, feq, in_raw, out_raw), -// // gp -> fp -// FMV_D_X -> List(Y, reg, imm, imm, N, Y, fmv_i2f, in_raw, out_raw), -// FCVT_D_W -> List(Y, reg, imm, imm, N, Y, w2f, in_raw, out_raw), -// FCVT_D_WU -> List(Y, reg, imm, imm, N, Y, wu2f, in_raw, out_raw), -// FCVT_D_L -> List(Y, reg, imm, imm, N, Y, l2f, in_raw, out_raw), -// FCVT_D_LU -> List(Y, reg, imm, imm, N, Y, lu2f, in_raw, out_raw) -// ) -} \ No newline at end of file + // GtoF + FMV_D_X -> List(InstrGtoF, i2f, fmv_i2f), + FCVT_D_W -> List(InstrGtoF, i2f, w2f), + FCVT_D_WU -> List(InstrGtoF, i2f, wu2f), + FCVT_D_L -> List(InstrGtoF, i2f, l2f), + FCVT_D_LU -> List(InstrGtoF, i2f, lu2f) + ) +} diff --git a/src/main/scala/xiangshan/backend/decode/isa/RVF.scala b/src/main/scala/xiangshan/backend/decode/isa/RVF.scala index a153e5f4a3c644a5da1bf57909e228c22cb8fb90..45c7aa76f191f11165ad0c5d572af814bd844f42 100644 --- a/src/main/scala/xiangshan/backend/decode/isa/RVF.scala +++ b/src/main/scala/xiangshan/backend/decode/isa/RVF.scala @@ -2,8 +2,10 @@ package xiangshan.backend.decode.isa import chisel3.util._ import xiangshan.backend._ -import xiangshan.{FuType, HasXSParameter} +import xiangshan.HasXSParameter +import xiangshan.FuType._ import xiangshan.backend.decode._ +import xiangshan.backend.fu.fpu.FPUOpType._ object RVFInstr extends HasXSParameter with HasInstrType { @@ -39,48 +41,41 @@ object RVFInstr extends HasXSParameter with HasInstrType { def FNMADD_S = BitPat("b?????00??????????????????1001111") val table = Array( - FLW -> List(InstrFI, FuType.ldu, LSUOpType.flw), - FSW -> List(InstrFS, FuType.stu, LSUOpType.sw) - ) + FLW -> List(InstrFI, ldu, LSUOpType.flw), + FSW -> List(InstrFS, stu, LSUOpType.sw), + + // FR + FADD_S -> List(InstrFR, fmac, fadd), + FSUB_S -> List(InstrFR, fmac, fsub), + FMUL_S -> List(InstrFR, fmac, fmul), + FDIV_S -> List(InstrFR, fmisc, fdiv), + FMIN_S -> List(InstrFR, 
fmisc, fmin), + FMAX_S -> List(InstrFR, fmisc, fmax), + FSGNJ_S -> List(InstrFR, fmisc, fsgnj), + FSGNJN_S -> List(InstrFR, fmisc, fsgnjn), + FSGNJX_S -> List(InstrFR, fmisc, fsgnjx), + FSQRT_S -> List(InstrFR, fmisc, fsqrt), + FMADD_S -> List(InstrFR, fmac, fmadd), + FNMADD_S -> List(InstrFR, fmac, fnmadd), + FMSUB_S -> List(InstrFR, fmac, fmsub), + FNMSUB_S -> List(InstrFR, fmac, fnmsub), - // (isFp, src1Type, src2Type, src3Type, rfWen, fpWen, fuOpType, inputFunc, outputFunc) -// val DecodeDefault = List(N, imm, imm, imm, N, N, fadd, in_raw, out_raw) -// val table = Array( -// FLW -> List(Y, reg, imm, imm, N, Y, LSUOpType.flw, in_raw, out_raw), -// FSW -> List(Y, reg, fp, imm, N, N, LSUOpType.sw, in_raw, out_raw), -// // fp fp -> fp -// FADD_S -> List(Y, fp, fp, imm, N, Y, fadd, in_unbox, out_box), -// FSUB_S -> List(Y, fp, fp, imm, N, Y, fsub, in_unbox, out_box), -// FMUL_S -> List(Y, fp, fp, imm, N, Y, fmul, in_unbox, out_box), -// FDIV_S -> List(Y, fp, fp, imm, N, Y, fdiv, in_unbox, out_box), -// FMIN_S -> List(Y, fp, fp, imm, N, Y, fmin, in_unbox, out_box), -// FMAX_S -> List(Y, fp, fp, imm, N, Y, fmax, in_unbox, out_box), -// FSGNJ_S -> List(Y, fp, fp, imm, N, Y, fsgnj, in_unbox, out_box), -// FSGNJN_S -> List(Y, fp, fp, imm, N, Y, fsgnjn, in_unbox, out_box), -// FSGNJX_S -> List(Y, fp, fp, imm, N, Y, fsgnjx, in_unbox, out_box), -// // fp -> fp -// FSQRT_S -> List(Y, fp, imm, imm, N, Y, fsqrt, in_unbox, out_box), -// // fp fp fp -> fp -// FMADD_S -> List(Y, fp, fp, fp, N, Y, fmadd, in_unbox, out_box), -// FNMADD_S -> List(Y, fp, fp, fp, N, Y, fnmadd, in_unbox, out_box), -// FMSUB_S -> List(Y, fp, fp, fp, N, Y, fmsub, in_unbox, out_box), -// FNMSUB_S -> List(Y, fp, fp, fp, N, Y, fnmsub, in_unbox, out_box), -// // fp -> gp -// FCLASS_S -> List(Y, fp, imm, imm, Y, N, fclass, in_unbox, out_raw), -// FMV_X_W -> List(Y, fp, imm, imm, Y, N, fmv_f2i, in_raw, out_sext), -// FCVT_W_S -> List(Y, fp, imm, imm, Y, N, f2w, in_unbox, out_sext), -// FCVT_WU_S -> List(Y, fp, 
imm, imm, Y, N, f2wu, in_unbox, out_sext), -// FCVT_L_S -> List(Y, fp, imm, imm, Y, N, f2l, in_unbox, out_raw), -// FCVT_LU_S -> List(Y, fp, imm, imm, Y, N, f2lu, in_unbox, out_raw) , -// // fp fp -> gp -// FLE_S -> List(Y, fp, fp, imm, Y, N, fle, in_unbox, out_raw), -// FLT_S -> List(Y, fp, fp, imm, Y, N, flt, in_unbox, out_raw), -// FEQ_S -> List(Y, fp, fp, imm, Y, N, feq, in_unbox, out_raw), -// // gp -> fp -// FMV_W_X -> List(Y, reg, imm, imm, N, Y, fmv_i2f, in_raw, out_box), -// FCVT_S_W -> List(Y, reg, imm, imm, N, Y, w2f, in_raw, out_box), -// FCVT_S_WU -> List(Y, reg, imm, imm, N, Y, wu2f, in_raw, out_box), -// FCVT_S_L -> List(Y, reg, imm, imm, N, Y, l2f, in_raw, out_box), -// FCVT_S_LU -> List(Y, reg, imm, imm, N, Y, lu2f, in_raw, out_box) -// ) + // F -> G + FCLASS_S -> List(InstrFtoG, fmisc, fclass), + FMV_X_W -> List(InstrFtoG, fmisc, fmv_f2i), + FCVT_W_S -> List(InstrFtoG, fmisc, f2w), + FCVT_WU_S -> List(InstrFtoG, fmisc, f2wu), + FCVT_L_S -> List(InstrFtoG, fmisc, f2l), + FCVT_LU_S -> List(InstrFtoG, fmisc, f2lu), + FLE_S -> List(InstrFtoG, fmisc, fle), + FLT_S -> List(InstrFtoG, fmisc, flt), + FEQ_S -> List(InstrFtoG, fmisc, feq), + + // G -> F + FMV_W_X -> List(InstrGtoF, i2f, fmv_i2f), + FCVT_S_W -> List(InstrGtoF, i2f, w2f), + FCVT_S_WU -> List(InstrGtoF, i2f, wu2f), + FCVT_S_L -> List(InstrGtoF, i2f, l2f), + FCVT_S_LU -> List(InstrGtoF, i2f, lu2f) + ) } diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala index 0af5ff62315cc6bc34d4c744a0618a85952f130a..0e48173fb7de560c40b040695787cc3cd36e7869 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch.scala @@ -7,6 +7,7 @@ import utils._ import xiangshan.backend.regfile.RfReadPort import chisel3.ExcitingUtils._ import xiangshan.backend.roq.RoqPtr +import xiangshan.backend.rename.RenameBypassInfo case class DispatchParameters ( @@ -16,10 +17,7 @@ case 
class DispatchParameters LsDqSize: Int, IntDqDeqWidth: Int, FpDqDeqWidth: Int, - LsDqDeqWidth: Int, - IntDqReplayWidth: Int, - FpDqReplayWidth: Int, - LsDqReplayWidth: Int + LsDqDeqWidth: Int ) class Dispatch extends XSModule { @@ -28,112 +26,92 @@ class Dispatch extends XSModule { val redirect = Flipped(ValidIO(new Redirect)) // from rename val fromRename = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) + val renameBypass = Input(new RenameBypassInfo) + // to busytable: set pdest to busy (not ready) when they are dispatched + val allocPregs = Vec(RenameWidth, Output(new ReplayPregReq)) // enq Roq - val toRoq = Vec(RenameWidth, DecoupledIO(new MicroOp)) - // get RoqIdx - val roqIdxs = Input(Vec(RenameWidth, new RoqPtr)) - // enq Lsroq - val toLsroq = Vec(RenameWidth, DecoupledIO(new MicroOp)) - // get LsIdx - val lsIdxs = Input(Vec(RenameWidth, new LSIdx)) - val dequeueRoqIndex = Input(Valid(new RoqPtr)) + val enqRoq = new Bundle { + val canAccept = Input(Bool()) + val isEmpty = Input(Bool()) + val extraWalk = Vec(RenameWidth, Output(Bool())) + val req = Vec(RenameWidth, ValidIO(new MicroOp)) + val resp = Vec(RenameWidth, Input(new RoqPtr)) + } + // enq Lsq + val enqLsq = new Bundle() { + val canAccept = Input(Bool()) + val req = Vec(RenameWidth, ValidIO(new MicroOp)) + val resp = Vec(RenameWidth, Input(new LSIdx)) + } // read regfile val readIntRf = Vec(NRIntReadPorts, Flipped(new RfReadPort)) - val readFpRf = Vec(NRFpReadPorts - exuParameters.StuCnt, Flipped(new RfReadPort)) + val readFpRf = Vec(NRFpReadPorts, Flipped(new RfReadPort)) // read reg status (busy/ready) val intPregRdy = Vec(NRIntReadPorts, Input(Bool())) - val fpPregRdy = Vec(NRFpReadPorts - exuParameters.StuCnt, Input(Bool())) - // load + store reg status (busy/ready) - val intMemRegAddr = Vec(NRMemReadPorts, Output(UInt(PhyRegIdxWidth.W))) - val fpMemRegAddr = Vec(exuParameters.StuCnt, Output(UInt(PhyRegIdxWidth.W))) - val intMemRegRdy = Vec(NRMemReadPorts, Input(Bool())) - val fpMemRegRdy = 
Vec(exuParameters.StuCnt, Input(Bool())) - // replay: set preg status to not ready - val replayPregReq = Output(Vec(ReplayWidth, new ReplayPregReq)) + val fpPregRdy = Vec(NRFpReadPorts, Input(Bool())) // to reservation stations val numExist = Input(Vec(exuParameters.ExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.ExuCnt, DecoupledIO(new MicroOp)) - val enqIQData = Vec(exuParameters.ExuCnt - exuParameters.LsExuCnt, Output(new ExuInput)) + val enqIQData = Vec(exuParameters.ExuCnt, Output(new ExuInput)) }) val dispatch1 = Module(new Dispatch1) - val intDq = Module(new DispatchQueue(dpParams.IntDqSize, dpParams.DqEnqWidth, dpParams.IntDqDeqWidth, dpParams.IntDqReplayWidth)) - val fpDq = Module(new DispatchQueue(dpParams.FpDqSize, dpParams.DqEnqWidth, dpParams.FpDqDeqWidth, dpParams.FpDqReplayWidth)) - val lsDq = Module(new DispatchQueue(dpParams.LsDqSize, dpParams.DqEnqWidth, dpParams.LsDqDeqWidth, dpParams.LsDqReplayWidth)) + val intDq = Module(new DispatchQueue(dpParams.IntDqSize, dpParams.DqEnqWidth, dpParams.IntDqDeqWidth)) + val fpDq = Module(new DispatchQueue(dpParams.FpDqSize, dpParams.DqEnqWidth, dpParams.FpDqDeqWidth)) + val lsDq = Module(new DispatchQueue(dpParams.LsDqSize, dpParams.DqEnqWidth, dpParams.LsDqDeqWidth)) // pipeline between rename and dispatch // accepts all at once + val redirectValid = io.redirect.valid// && !io.redirect.bits.isReplay for (i <- 0 until RenameWidth) { - PipelineConnect(io.fromRename(i), dispatch1.io.fromRename(i), dispatch1.io.recv(i), false.B) + PipelineConnect(io.fromRename(i), dispatch1.io.fromRename(i), dispatch1.io.recv(i), redirectValid) } // dispatch 1: accept uops from rename and dispatch them to the three dispatch queues dispatch1.io.redirect <> io.redirect - dispatch1.io.toRoq <> io.toRoq - dispatch1.io.roqIdxs <> io.roqIdxs - dispatch1.io.toLsroq <> io.toLsroq - dispatch1.io.lsIdx <> io.lsIdxs + dispatch1.io.renameBypass := RegEnable(io.renameBypass, io.fromRename(0).valid && 
dispatch1.io.fromRename(0).ready) + dispatch1.io.enqRoq <> io.enqRoq + dispatch1.io.enqLsq <> io.enqLsq + dispatch1.io.toIntDqReady <> intDq.io.enqReady dispatch1.io.toIntDq <> intDq.io.enq + dispatch1.io.toFpDqReady <> fpDq.io.enqReady dispatch1.io.toFpDq <> fpDq.io.enq + dispatch1.io.toLsDqReady <> lsDq.io.enqReady dispatch1.io.toLsDq <> lsDq.io.enq + dispatch1.io.allocPregs <> io.allocPregs // dispatch queue: queue uops and dispatch them to different reservation stations or issue queues // it may cancel the uops intDq.io.redirect <> io.redirect - intDq.io.dequeueRoqIndex <> io.dequeueRoqIndex - intDq.io.replayPregReq.zipWithIndex.map { case(replay, i) => - io.replayPregReq(i) <> replay - } - intDq.io.otherWalkDone := !fpDq.io.inReplayWalk && !lsDq.io.inReplayWalk - fpDq.io.redirect <> io.redirect - fpDq.io.dequeueRoqIndex <> io.dequeueRoqIndex - fpDq.io.replayPregReq.zipWithIndex.map { case(replay, i) => - io.replayPregReq(i + dpParams.IntDqReplayWidth) <> replay - } - fpDq.io.otherWalkDone := !intDq.io.inReplayWalk && !lsDq.io.inReplayWalk - lsDq.io.redirect <> io.redirect - lsDq.io.dequeueRoqIndex <> io.dequeueRoqIndex - lsDq.io.replayPregReq.zipWithIndex.map { case(replay, i) => - io.replayPregReq(i + dpParams.IntDqReplayWidth + dpParams.FpDqReplayWidth) <> replay - } - lsDq.io.otherWalkDone := !intDq.io.inReplayWalk && !fpDq.io.inReplayWalk // Int dispatch queue to Int reservation stations val intDispatch = Module(new Dispatch2Int) intDispatch.io.fromDq <> intDq.io.deq - intDispatch.io.readRf <> io.readIntRf - intDispatch.io.regRdy := io.intPregRdy + intDispatch.io.readRf.zipWithIndex.map({case (r, i) => r <> io.readIntRf(i)}) + intDispatch.io.regRdy.zipWithIndex.map({case (r, i) => r <> io.intPregRdy(i)}) intDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := io.numExist(i)}) intDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(i)}) intDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(i)}) - 
// TODO: Fp dispatch queue to Fp reservation stations - if (exuParameters.FpExuCnt > 0) { - val fpDispatch = Module(new Dispatch2Fp) - fpDispatch.io.fromDq <> fpDq.io.deq - fpDispatch.io.readRf <> io.readFpRf - fpDispatch.io.regRdy <> io.fpPregRdy - fpDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := io.numExist(i + exuParameters.IntExuCnt)}) - fpDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(i + exuParameters.IntExuCnt)}) - fpDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(i + exuParameters.IntExuCnt)}) - } - else { - fpDq.io.deq <> DontCare - io.readFpRf <> DontCare - } - + // Fp dispatch queue to Fp reservation stations + val fpDispatch = Module(new Dispatch2Fp) + fpDispatch.io.fromDq <> fpDq.io.deq + fpDispatch.io.readRf.zipWithIndex.map({case (r, i) => r <> io.readFpRf(i)}) + fpDispatch.io.regRdy.zipWithIndex.map({case (r, i) => r <> io.fpPregRdy(i)}) + fpDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := io.numExist(i + exuParameters.IntExuCnt)}) + fpDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(i + exuParameters.IntExuCnt)}) + fpDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(i + exuParameters.IntExuCnt)}) + // Load/store dispatch queue to load/store issue queues val lsDispatch = Module(new Dispatch2Ls) lsDispatch.io.fromDq <> lsDq.io.deq - lsDispatch.io.intRegAddr <> io.intMemRegAddr - lsDispatch.io.fpRegAddr <> io.fpMemRegAddr - lsDispatch.io.intRegRdy <> io.intMemRegRdy - lsDispatch.io.fpRegRdy <> io.fpMemRegRdy + lsDispatch.io.readIntRf.zipWithIndex.map({case (r, i) => r <> io.readIntRf(i + 8)}) + lsDispatch.io.readFpRf.zipWithIndex.map({case (r, i) => r <> io.readFpRf(i + 12)}) + lsDispatch.io.intRegRdy.zipWithIndex.map({case (r, i) => r <> io.intPregRdy(i + 8)}) + lsDispatch.io.fpRegRdy.zipWithIndex.map({case (r, i) => r <> io.fpPregRdy(i + 12)}) lsDispatch.io.numExist.zipWithIndex.map({case (num, i) => num := 
io.numExist(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)}) lsDispatch.io.enqIQCtrl.zipWithIndex.map({case (enq, i) => enq <> io.enqIQCtrl(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)}) - - val inWalk = intDq.io.inReplayWalk || fpDq.io.inReplayWalk || lsDq.io.inReplayWalk - XSPerf("replayWalkCycle", inWalk) + lsDispatch.io.enqIQData.zipWithIndex.map({case (enq, i) => enq <> io.enqIQData(exuParameters.IntExuCnt + exuParameters.FpExuCnt + i)}) } diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala index 833ee8c88ca95f5577cef984c1aac62bb13eda94..4572bf12b83c1d3bc7e5812bf7f8a4ddce21672d 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch1.scala @@ -6,6 +6,7 @@ import chisel3.ExcitingUtils._ import xiangshan._ import utils._ import xiangshan.backend.roq.RoqPtr +import xiangshan.backend.rename.RenameBypassInfo // read rob and enqueue class Dispatch1 extends XSModule { @@ -13,28 +14,44 @@ class Dispatch1 extends XSModule { val redirect = Flipped(ValidIO(new Redirect)) // from rename val fromRename = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) + val renameBypass = Input(new RenameBypassInfo) val recv = Output(Vec(RenameWidth, Bool())) // enq Roq - val toRoq = Vec(RenameWidth, DecoupledIO(new MicroOp)) - // get RoqIdx - val roqIdxs = Input(Vec(RenameWidth, new RoqPtr)) - // enq Lsroq - val toLsroq = Vec(RenameWidth, DecoupledIO(new MicroOp)) - // get LsIdx - val lsIdx = Input(Vec(RenameWidth, new LSIdx)) + val enqRoq = new Bundle { + val canAccept = Input(Bool()) + val isEmpty = Input(Bool()) + // if set, Roq needs extra walk + val extraWalk = Vec(RenameWidth, Output(Bool())) + val req = Vec(RenameWidth, ValidIO(new MicroOp)) + val resp = Vec(RenameWidth, Input(new RoqPtr)) + } + // enq Lsq + val enqLsq = new Bundle() { + val canAccept = Input(Bool()) + val req = Vec(RenameWidth, ValidIO(new 
MicroOp)) + val resp = Vec(RenameWidth, Input(new LSIdx)) + } + val allocPregs = Vec(RenameWidth, Output(new ReplayPregReq)) // to dispatch queue - val toIntDq = Vec(dpParams.DqEnqWidth, DecoupledIO(new MicroOp)) - val toFpDq = Vec(dpParams.DqEnqWidth, DecoupledIO(new MicroOp)) - val toLsDq = Vec(dpParams.DqEnqWidth, DecoupledIO(new MicroOp)) + val toIntDqReady = Input(Bool()) + val toIntDq = Vec(dpParams.DqEnqWidth, ValidIO(new MicroOp)) + val toFpDqReady = Input(Bool()) + val toFpDq = Vec(dpParams.DqEnqWidth, ValidIO(new MicroOp)) + val toLsDqReady = Input(Bool()) + val toLsDq = Vec(dpParams.DqEnqWidth, ValidIO(new MicroOp)) }) + + /** * Part 1: choose the target dispatch queue and the corresponding write ports */ // valid bits for different dispatch queues - val isInt = WireInit(VecInit(io.fromRename.map(uop => FuType.isIntExu(uop.bits.ctrl.fuType)))) - val isFp = WireInit(VecInit(io.fromRename.map(uop => FuType.isFpExu (uop.bits.ctrl.fuType)))) - val isLs = WireInit(VecInit(io.fromRename.map(uop => FuType.isMemExu(uop.bits.ctrl.fuType)))) - val isStore = WireInit(VecInit(io.fromRename.map(uop => FuType.isStoreExu(uop.bits.ctrl.fuType)))) + val isInt = VecInit(io.fromRename.map(req => FuType.isIntExu(req.bits.ctrl.fuType))) + val isFp = VecInit(io.fromRename.map(req => FuType.isFpExu (req.bits.ctrl.fuType))) + val isLs = VecInit(io.fromRename.map(req => FuType.isMemExu(req.bits.ctrl.fuType))) + val isStore = VecInit(io.fromRename.map(req => FuType.isStoreExu(req.bits.ctrl.fuType))) + val isBlockBackward = VecInit(io.fromRename.map(_.bits.ctrl.blockBackward)) + val isNoSpecExec = VecInit(io.fromRename.map(_.bits.ctrl.noSpecExec)) // generate index mapping val intIndex = Module(new IndexMapping(RenameWidth, dpParams.DqEnqWidth, false)) @@ -50,129 +67,167 @@ class Dispatch1 extends XSModule { lsIndex.io.priority := DontCare /** - * Part 2: acquire ROQ (all) and LSROQ (load/store only) indexes + * Part 2: + * Update commitType, psrc1, psrc2, psrc3, old_pdest for the 
uops */ - val cancelled = WireInit(VecInit(Seq.fill(RenameWidth)(io.redirect.valid && !io.redirect.bits.isReplay))) + val updatedUop = Wire(Vec(RenameWidth, new MicroOp)) + val updatedCommitType = Wire(Vec(RenameWidth, CommitType())) + val updatedPsrc1 = Wire(Vec(RenameWidth, UInt(PhyRegIdxWidth.W))) + val updatedPsrc2 = Wire(Vec(RenameWidth, UInt(PhyRegIdxWidth.W))) + val updatedPsrc3 = Wire(Vec(RenameWidth, UInt(PhyRegIdxWidth.W))) + val updatedOldPdest = Wire(Vec(RenameWidth, UInt(PhyRegIdxWidth.W))) + + for (i <- 0 until RenameWidth) { + updatedCommitType(i) := Cat(isLs(i), isStore(i) | isFp(i)) + updatedPsrc1(i) := io.fromRename.take(i).map(_.bits.pdest) + .zip(if (i == 0) Seq() else io.renameBypass.lsrc1_bypass(i-1).asBools) + .foldLeft(io.fromRename(i).bits.psrc1) { + (z, next) => Mux(next._2, next._1, z) + } + updatedPsrc2(i) := io.fromRename.take(i).map(_.bits.pdest) + .zip(if (i == 0) Seq() else io.renameBypass.lsrc2_bypass(i-1).asBools) + .foldLeft(io.fromRename(i).bits.psrc2) { + (z, next) => Mux(next._2, next._1, z) + } + updatedPsrc3(i) := io.fromRename.take(i).map(_.bits.pdest) + .zip(if (i == 0) Seq() else io.renameBypass.lsrc3_bypass(i-1).asBools) + .foldLeft(io.fromRename(i).bits.psrc3) { + (z, next) => Mux(next._2, next._1, z) + } + updatedOldPdest(i) := io.fromRename.take(i).map(_.bits.pdest) + .zip(if (i == 0) Seq() else io.renameBypass.ldest_bypass(i-1).asBools) + .foldLeft(io.fromRename(i).bits.old_pdest) { + (z, next) => Mux(next._2, next._1, z) + } + + updatedUop(i) := io.fromRename(i).bits + // update bypass psrc1/psrc2/psrc3/old_pdest + updatedUop(i).psrc1 := updatedPsrc1(i) + updatedUop(i).psrc2 := updatedPsrc2(i) + updatedUop(i).psrc3 := updatedPsrc3(i) + updatedUop(i).old_pdest := updatedOldPdest(i) + // update commitType + updatedUop(i).ctrl.commitType := updatedCommitType(i) + } - val uopWithIndex = Wire(Vec(RenameWidth, new MicroOp)) - val roqIndexReg = Reg(Vec(RenameWidth, new RoqPtr)) - val roqIndexRegValid = 
RegInit(VecInit(Seq.fill(RenameWidth)(false.B))) - val roqIndexAcquired = WireInit(VecInit(Seq.tabulate(RenameWidth)(i => io.toRoq(i).ready || roqIndexRegValid(i)))) - val lsIndexReg = Reg(Vec(RenameWidth, new LSIdx)) - val lsIndexRegValid = RegInit(VecInit(Seq.fill(RenameWidth)(false.B))) - val lsroqIndexAcquired = WireInit(VecInit(Seq.tabulate(RenameWidth)(i => io.toLsroq(i).ready || lsIndexRegValid(i)))) + /** + * Part 3: + * acquire ROQ (all), LSQ (load/store only) and dispatch queue slots + * only set valid when all of them provide enough entries + */ + val redirectValid = io.redirect.valid// && !io.redirect.bits.isReplay + val allResourceReady = io.enqLsq.canAccept && io.enqRoq.canAccept && io.toIntDqReady && io.toFpDqReady && io.toLsDqReady + + // Instructions should enter dispatch queues in order. + // When RenameWidth > DqEnqWidth, it's possible that some instructions cannot enter dispatch queue + // because previous instructions cannot enter dispatch queue. + // The reason is that although ROB and LSQ have enough empty slots, dispatch queue has limited enqueue ports. + // Thus, for i >= dpParams.DqEnqWidth, we have to check whether its previous instructions (and the instruction itself) can enqueue. + // However, since, for instructions with indices less than dpParams.DqEnqWidth, + // they can always enter dispatch queue when ROB and LSQ are ready, we don't need to check whether they can enqueue.
+ // thisIsBlocked: this instruction is blocked by itself (based on noSpecExec) + // thisCanOut: this instruction can enqueue (based on resource) + // nextCanOut: next instructions can out (based on blockBackward and previous instructions) + // notBlockedByPrevious: previous instructions can enqueue + val thisIsBlocked = VecInit((0 until RenameWidth).map(i => { + // for i > 0, when Roq is empty but dispatch1 has valid instructions to enqueue, it's blocked + if (i > 0) isNoSpecExec(i) && (!io.enqRoq.isEmpty || Cat(io.fromRename.take(i).map(_.valid)).orR) + else isNoSpecExec(i) && !io.enqRoq.isEmpty + })) + val thisCanOut = VecInit((0 until RenameWidth).map(i => { + // For i in [0, DqEnqWidth), they can always enqueue when ROB and LSQ are ready + if (i < dpParams.DqEnqWidth) true.B + else Cat(Seq(intIndex, fpIndex, lsIndex).map(_.io.reverseMapping(i).valid)).orR + })) + val nextCanOut = VecInit((0 until RenameWidth).map(i => + (thisCanOut(i) && !isNoSpecExec(i) && !isBlockBackward(i)) || !io.fromRename(i).valid + )) + val notBlockedByPrevious = VecInit((0 until RenameWidth).map(i => + if (i == 0) true.B + else Cat((0 until i).map(j => nextCanOut(j))).andR + )) + + // this instruction can actually dequeue: 3 conditions + // (1) resources are ready + // (2) previous instructions are ready; (3) this instruction itself can enqueue and is not blocked + val thisCanActualOut = (0 until RenameWidth).map(i => allResourceReady && thisCanOut(i) && !thisIsBlocked(i) && notBlockedByPrevious(i)) + + // input for ROQ and LSQ + // note that LSQ needs roqIdx for (i <- 0 until RenameWidth) { - // input for ROQ and LSROQ - val commitType = Cat(isLs(i), isStore(i) | isFp(i)) - - io.toRoq(i).valid := io.fromRename(i).valid && !roqIndexRegValid(i) - io.toRoq(i).bits := io.fromRename(i).bits - io.toRoq(i).bits.ctrl.commitType := commitType - - io.toLsroq(i).valid := io.fromRename(i).valid && !lsIndexRegValid(i) && isLs(i) && io.fromRename(i).bits.ctrl.fuType =/= FuType.mou && roqIndexAcquired(i) && !cancelled(i) - io.toLsroq(i).bits :=
io.fromRename(i).bits - io.toLsroq(i).bits.ctrl.commitType := commitType - io.toLsroq(i).bits.roqIdx := Mux(roqIndexRegValid(i), roqIndexReg(i), io.roqIdxs(i)) - - // receive indexes from ROQ and LSROQ - when(io.toRoq(i).fire() && !io.recv(i)) { - roqIndexReg(i) := io.roqIdxs(i) - roqIndexRegValid(i) := true.B - }.elsewhen(io.recv(i)) { - roqIndexRegValid(i) := false.B - } - when(io.toLsroq(i).fire() && !io.recv(i)) { - lsIndexReg(i) := io.lsIdx(i) - lsIndexRegValid(i) := true.B - }.elsewhen(io.recv(i)) { - lsIndexRegValid(i) := false.B - } + io.enqRoq.extraWalk(i) := io.fromRename(i).valid && !thisCanActualOut(i) + io.enqRoq.req(i).valid := io.fromRename(i).valid && thisCanActualOut(i) + io.enqRoq.req(i).bits := updatedUop(i) - // append ROQ and LSROQ indexed to uop - uopWithIndex(i) := io.fromRename(i).bits - uopWithIndex(i).roqIdx := Mux(roqIndexRegValid(i), roqIndexReg(i), io.roqIdxs(i)) - if(EnableUnifiedLSQ){ - uopWithIndex(i).lsroqIdx := Mux(lsIndexRegValid(i), lsIndexReg(i), io.lsIdx(i)).lsroqIdx - XSDebug(io.toLsroq(i).fire(), p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} receives lsroq ${io.lsIdx(i).lsroqIdx}\n") - } else { - uopWithIndex(i).lqIdx := Mux(lsIndexRegValid(i), lsIndexReg(i), io.lsIdx(i)).lqIdx - uopWithIndex(i).sqIdx := Mux(lsIndexRegValid(i), lsIndexReg(i), io.lsIdx(i)).sqIdx - XSDebug(io.toLsroq(i).fire(), p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} receives lq ${io.lsIdx(i).lqIdx} sq ${io.lsIdx(i).sqIdx}\n") - } + val shouldEnqLsq = isLs(i) && io.fromRename(i).bits.ctrl.fuType =/= FuType.mou + io.enqLsq.req(i).valid := io.fromRename(i).valid && shouldEnqLsq && !redirectValid && thisCanActualOut(i) + io.enqLsq.req(i).bits := updatedUop(i) + io.enqLsq.req(i).bits.roqIdx := io.enqRoq.resp(i) - XSDebug(io.toRoq(i).fire(), p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} receives nroq ${io.roqIdxs(i)}\n") - if (i > 0) { - XSError(io.toRoq(i).fire() && !io.toRoq(i - 1).ready && io.toRoq(i - 1).valid, p"roq handshake not continuous 
$i") - } + XSDebug(io.enqLsq.req(i).valid, + p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} receives lq ${io.enqLsq.resp(i).lqIdx} sq ${io.enqLsq.resp(i).sqIdx}\n") + + XSDebug(io.enqRoq.req(i).valid, p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} receives nroq ${io.enqRoq.resp(i)}\n") } + /** - * Part 3: send uop (should not be cancelled) with correct indexes to dispatch queues + * Part 4: + * append ROQ and LSQ indexed to uop, and send them to dispatch queue */ - val orderedEnqueue = Wire(Vec(RenameWidth, Bool())) - val canEnqueue = Wire(Vec(RenameWidth, Bool())) - var prevCanEnqueue = true.B + val updateUopWithIndex = Wire(Vec(RenameWidth, new MicroOp)) for (i <- 0 until RenameWidth) { - orderedEnqueue(i) := prevCanEnqueue - canEnqueue(i) := !cancelled(i) && roqIndexAcquired(i) && (!isLs(i) || io.fromRename(i).bits.ctrl.fuType === FuType.mou || lsroqIndexAcquired(i)) - val enqReady = (io.toIntDq(intIndex.io.reverseMapping(i).bits).ready && intIndex.io.reverseMapping(i).valid) || - (io.toFpDq(fpIndex.io.reverseMapping(i).bits).ready && fpIndex.io.reverseMapping(i).valid) || - (io.toLsDq(lsIndex.io.reverseMapping(i).bits).ready && lsIndex.io.reverseMapping(i).valid) - prevCanEnqueue = prevCanEnqueue && (!io.fromRename(i).valid || (canEnqueue(i) && enqReady)) + updateUopWithIndex(i) := updatedUop(i) + updateUopWithIndex(i).roqIdx := io.enqRoq.resp(i) + updateUopWithIndex(i).lqIdx := io.enqLsq.resp(i).lqIdx + updateUopWithIndex(i).sqIdx := io.enqLsq.resp(i).sqIdx } + + // send uops with correct indexes to dispatch queues + // Note that if one of their previous instructions cannot enqueue, they should not enter dispatch queue. + // We use notBlockedByPrevious here since mapping(i).valid implies there's a valid instruction that can enqueue, + // thus we don't need to check thisCanOut. 
for (i <- 0 until dpParams.DqEnqWidth) { - io.toIntDq(i).bits := uopWithIndex(intIndex.io.mapping(i).bits) - io.toIntDq(i).valid := intIndex.io.mapping(i).valid && - canEnqueue(intIndex.io.mapping(i).bits) && - orderedEnqueue(intIndex.io.mapping(i).bits) - - io.toFpDq(i).bits := uopWithIndex(fpIndex.io.mapping(i).bits) - io.toFpDq(i).valid := fpIndex.io.mapping(i).valid && - canEnqueue(fpIndex.io.mapping(i).bits) && - orderedEnqueue(fpIndex.io.mapping(i).bits) - - io.toLsDq(i).bits := uopWithIndex(lsIndex.io.mapping(i).bits) - io.toLsDq(i).valid := lsIndex.io.mapping(i).valid && - canEnqueue(lsIndex.io.mapping(i).bits) && - orderedEnqueue(lsIndex.io.mapping(i).bits) - - // XSDebug(io.toIntDq(i).valid, p"pc 0x${Hexadecimal(io.toIntDq(i).bits.cf.pc)} int index $i\n") - // XSDebug(io.toFpDq(i).valid , p"pc 0x${Hexadecimal(io.toFpDq(i).bits.cf.pc )} fp index $i\n") - // XSDebug(io.toLsDq(i).valid , p"pc 0x${Hexadecimal(io.toLsDq(i).bits.cf.pc )} ls index $i\n") + io.toIntDq(i).bits := updateUopWithIndex(intIndex.io.mapping(i).bits) + io.toIntDq(i).valid := intIndex.io.mapping(i).valid && allResourceReady && + !thisIsBlocked(intIndex.io.mapping(i).bits) && notBlockedByPrevious(intIndex.io.mapping(i).bits) + + // NOTE: floating point instructions are not noSpecExec currently + // remove the comment markers /**/ below once fp instructions can be noSpecExec + io.toFpDq(i).bits := updateUopWithIndex(fpIndex.io.mapping(i).bits) + io.toFpDq(i).valid := fpIndex.io.mapping(i).valid && allResourceReady && + /*!thisIsBlocked(fpIndex.io.mapping(i).bits) && */notBlockedByPrevious(fpIndex.io.mapping(i).bits) + + io.toLsDq(i).bits := updateUopWithIndex(lsIndex.io.mapping(i).bits) + io.toLsDq(i).valid := lsIndex.io.mapping(i).valid && allResourceReady && + !thisIsBlocked(lsIndex.io.mapping(i).bits) && notBlockedByPrevious(lsIndex.io.mapping(i).bits) + + XSDebug(io.toIntDq(i).valid, p"pc 0x${Hexadecimal(io.toIntDq(i).bits.cf.pc)} int index $i\n") + XSDebug(io.toFpDq(i).valid , p"pc
0x${Hexadecimal(io.toFpDq(i).bits.cf.pc )} fp index $i\n") + XSDebug(io.toLsDq(i).valid , p"pc 0x${Hexadecimal(io.toLsDq(i).bits.cf.pc )} ls index $i\n") } /** - * Part 4: send response to rename when dispatch queue accepts the uop + * Part 5: send response to rename when dispatch queue accepts the uop */ val readyVector = (0 until RenameWidth).map(i => !io.fromRename(i).valid || io.recv(i)) for (i <- 0 until RenameWidth) { - val enqFire = (io.toIntDq(intIndex.io.reverseMapping(i).bits).fire() && intIndex.io.reverseMapping(i).valid) || - (io.toFpDq(fpIndex.io.reverseMapping(i).bits).fire() && fpIndex.io.reverseMapping(i).valid) || - (io.toLsDq(lsIndex.io.reverseMapping(i).bits).fire() && lsIndex.io.reverseMapping(i).valid) - io.recv(i) := enqFire || cancelled(i) + io.recv(i) := thisCanActualOut(i) io.fromRename(i).ready := Cat(readyVector).andR() - // TODO: add print method for lsIdx - if(EnableUnifiedLSQ){ - XSInfo(io.recv(i) && !cancelled(i), - p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} type(${isInt(i)}, ${isFp(i)}, ${isLs(i)}) " + - p"roq ${uopWithIndex(i).roqIdx} lsroq ${uopWithIndex(i).lsroqIdx} is accepted by dispatch queue " + - p"(${intIndex.io.reverseMapping(i).bits}, ${fpIndex.io.reverseMapping(i).bits}, ${lsIndex.io.reverseMapping(i).bits})\n") - }else{ - XSInfo(io.recv(i) && !cancelled(i), - p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} type(${isInt(i)}, ${isFp(i)}, ${isLs(i)}) " + - p"roq ${uopWithIndex(i).roqIdx} lq ${uopWithIndex(i).lqIdx} sq ${uopWithIndex(i).sqIdx}" + - p"(${intIndex.io.reverseMapping(i).bits}, ${fpIndex.io.reverseMapping(i).bits}, ${lsIndex.io.reverseMapping(i).bits})\n") - } + XSInfo(io.recv(i), + p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)}, type(${isInt(i)}, ${isFp(i)}, ${isLs(i)}), " + + p"roq ${updateUopWithIndex(i).roqIdx}, lq ${updateUopWithIndex(i).lqIdx}, sq ${updateUopWithIndex(i).sqIdx}, " + + p"(${intIndex.io.reverseMapping(i).bits}, ${fpIndex.io.reverseMapping(i).bits},
${lsIndex.io.reverseMapping(i).bits})\n" + ) - XSInfo(io.recv(i) && cancelled(i), - p"pc 0x${Hexadecimal(io.fromRename(i).bits.cf.pc)} with brTag ${io.fromRename(i).bits.brTag.value} cancelled\n") - XSDebug(io.fromRename(i).valid, "v:%d r:%d pc 0x%x of type %b is in %d-th slot\n", - io.fromRename(i).valid, io.fromRename(i).ready, io.fromRename(i).bits.cf.pc, io.fromRename(i).bits.ctrl.fuType, i.U) + io.allocPregs(i).isInt := io.fromRename(i).valid && io.fromRename(i).bits.ctrl.rfWen && (io.fromRename(i).bits.ctrl.ldest =/= 0.U) + io.allocPregs(i).isFp := io.fromRename(i).valid && io.fromRename(i).bits.ctrl.fpWen + io.allocPregs(i).preg := io.fromRename(i).bits.pdest } val renameFireCnt = PopCount(io.recv) - val enqFireCnt = PopCount(io.toIntDq.map(_.fire)) + PopCount(io.toFpDq.map(_.fire)) + PopCount(io.toLsDq.map(_.fire)) + val enqFireCnt = PopCount(io.toIntDq.map(_.valid && io.toIntDqReady)) + PopCount(io.toFpDq.map(_.valid && io.toFpDqReady)) + PopCount(io.toLsDq.map(_.valid && io.toLsDqReady)) XSError(enqFireCnt > renameFireCnt, "enqFireCnt should not be greater than renameFireCnt\n") XSPerf("utilization", PopCount(io.fromRename.map(_.valid))) diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala index 11924dab5dddd9a3db8a21a7eaf16a0020c43a3a..8f309f38abdce9102e9c708943ade62fcd1c9d56 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Fp.scala @@ -5,7 +5,7 @@ import chisel3.util._ import xiangshan._ import utils._ import xiangshan.backend.regfile.RfReadPort -import xiangshan.backend.exu._ +import xiangshan.backend.exu.Exu._ class Dispatch2Fp extends XSModule { val io = IO(new Bundle() { @@ -26,8 +26,8 @@ class Dispatch2Fp extends XSModule { val fmacPriority = PriorityGen((0 until exuParameters.FmacCnt).map(i => io.numExist(i))) val fmiscPriority = PriorityGen((0 until exuParameters.FmiscCnt).map(i => 
io.numExist(i+exuParameters.FmacCnt))) for (i <- 0 until dpParams.FpDqDeqWidth) { - fmacIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.fmacExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) - fmiscIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.fmiscExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + fmacIndexGen.io.validBits(i) := io.fromDq(i).valid && fmacExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + fmiscIndexGen.io.validBits(i) := io.fromDq(i).valid && fmiscExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) // XSDebug(io.fromDq(i).valid, // p"fp dp queue $i: ${Hexadecimal(io.fromDq(i).bits.cf.pc)} type ${Binary(io.fromDq(i).bits.ctrl.fuType)}\n") diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala index 59fb9571320ad814a638927b4ca9ff080b1da2c1..b11749483e5bc7ff0c65d072a062378a66e1d9ce 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Int.scala @@ -4,14 +4,15 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ +import xiangshan.backend.exu.Exu._ import xiangshan.backend.regfile.RfReadPort import xiangshan.backend.exu._ class Dispatch2Int extends XSModule { val io = IO(new Bundle() { val fromDq = Flipped(Vec(dpParams.IntDqDeqWidth, DecoupledIO(new MicroOp))) - val readRf = Vec(NRIntReadPorts, Flipped(new RfReadPort)) - val regRdy = Vec(NRIntReadPorts, Input(Bool())) + val readRf = Vec(NRIntReadPorts - NRMemReadPorts, Flipped(new RfReadPort)) + val regRdy = Vec(NRIntReadPorts - NRMemReadPorts, Input(Bool())) val numExist = Input(Vec(exuParameters.IntExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.IntExuCnt, DecoupledIO(new MicroOp)) val enqIQData = Vec(exuParameters.IntExuCnt, Output(new ExuInput)) @@ -27,9 +28,9 @@ class Dispatch2Int extends XSModule { val aluPriority = PriorityGen((0 until exuParameters.AluCnt).map(i => 
io.numExist(i+exuParameters.JmpCnt))) val mduPriority = PriorityGen((0 until exuParameters.MduCnt).map(i => io.numExist(i+exuParameters.JmpCnt+exuParameters.AluCnt))) for (i <- 0 until dpParams.IntDqDeqWidth) { - jmpIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.jmpExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) - aluIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.aluExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) - mduIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.mulDivExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + jmpIndexGen.io.validBits(i) := io.fromDq(i).valid && jumpExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + aluIndexGen.io.validBits(i) := io.fromDq(i).valid && aluExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + mduIndexGen.io.validBits(i) := io.fromDq(i).valid && mulDivExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) // XSDebug(io.fromDq(i).valid, // p"int dp queue $i: ${Hexadecimal(io.fromDq(i).bits.cf.pc)} type ${Binary(io.fromDq(i).bits.ctrl.fuType)}\n") } diff --git a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala index 7f522eb5768f6fd6b8b2ee0c73b7b1329611e53d..edbd406b90744cc1e98c8bf8013e1b6b00367e35 100644 --- a/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala +++ b/src/main/scala/xiangshan/backend/dispatch/Dispatch2Ls.scala @@ -5,17 +5,20 @@ import chisel3.util._ import xiangshan._ import utils._ import xiangshan.backend.regfile.RfReadPort -import xiangshan.backend.exu._ +import xiangshan.backend.exu.Exu._ class Dispatch2Ls extends XSModule { val io = IO(new Bundle() { val fromDq = Flipped(Vec(dpParams.LsDqDeqWidth, DecoupledIO(new MicroOp))) - val intRegAddr = Vec(NRMemReadPorts, Output(UInt(PhyRegIdxWidth.W))) - val fpRegAddr = Vec(exuParameters.StuCnt, Output(UInt(PhyRegIdxWidth.W))) + val readIntRf = Vec(NRMemReadPorts, Flipped(new RfReadPort)) + val readFpRf = Vec(exuParameters.StuCnt, Flipped(new 
RfReadPort)) + // val intRegAddr = Vec(NRMemReadPorts, Output(UInt(PhyRegIdxWidth.W))) + // val fpRegAddr = Vec(exuParameters.StuCnt, Output(UInt(PhyRegIdxWidth.W))) val intRegRdy = Vec(NRMemReadPorts, Input(Bool())) val fpRegRdy = Vec(exuParameters.StuCnt, Input(Bool())) val numExist = Input(Vec(exuParameters.LsExuCnt, UInt(log2Ceil(IssQueSize).W))) val enqIQCtrl = Vec(exuParameters.LsExuCnt, DecoupledIO(new MicroOp)) + val enqIQData = Vec(exuParameters.LsExuCnt, Output(new ExuInput)) }) /** @@ -26,8 +29,8 @@ class Dispatch2Ls extends XSModule { val loadPriority = PriorityGen((0 until exuParameters.LduCnt).map(i => io.numExist(i))) val storePriority = PriorityGen((0 until exuParameters.StuCnt).map(i => io.numExist(i+exuParameters.LduCnt))) for (i <- 0 until dpParams.LsDqDeqWidth) { - loadIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.ldExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) - storeIndexGen.io.validBits(i) := io.fromDq(i).valid && Exu.stExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + loadIndexGen.io.validBits(i) := io.fromDq(i).valid && ldExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) + storeIndexGen.io.validBits(i) := io.fromDq(i).valid && stExeUnitCfg.canAccept(io.fromDq(i).bits.ctrl.fuType) // XSDebug(io.fromDq(i).valid, // p"ls dp queue $i: ${Hexadecimal(io.fromDq(i).bits.cf.pc)} type ${Binary(io.fromDq(i).bits.ctrl.fuType)}\n") @@ -70,12 +73,12 @@ class Dispatch2Ls extends XSModule { val readPort = Seq(0, 1, 2, 4) for (i <- 0 until exuParameters.LsExuCnt) { if (i < exuParameters.LduCnt) { - io.intRegAddr(readPort(i)) := io.fromDq(indexVec(i)).bits.psrc1 + io.readIntRf(readPort(i)).addr := io.fromDq(indexVec(i)).bits.psrc1 } else { - io.fpRegAddr(i - exuParameters.LduCnt) := io.fromDq(indexVec(i)).bits.psrc2 - io.intRegAddr(readPort(i) ) := io.fromDq(indexVec(i)).bits.psrc1 - io.intRegAddr(readPort(i)+1) := io.fromDq(indexVec(i)).bits.psrc2 + io.readFpRf(i - exuParameters.LduCnt).addr := io.fromDq(indexVec(i)).bits.psrc2 + 
io.readIntRf(readPort(i) ).addr := io.fromDq(indexVec(i)).bits.psrc1 + io.readIntRf(readPort(i)+1).addr := io.fromDq(indexVec(i)).bits.psrc2 } } @@ -112,6 +115,34 @@ class Dispatch2Ls extends XSModule { p"pc 0x${Hexadecimal(io.fromDq(i).bits.cf.pc)} waits at Ls dispatch queue with index $i\n") } + /** + * Part 5: the second stage of dispatch 2 (send data to reservation station) + */ + val uopReg = Reg(Vec(exuParameters.LsExuCnt, new MicroOp)) + val dataValidRegDebug = Reg(Vec(exuParameters.LsExuCnt, Bool())) + for (i <- 0 until exuParameters.LsExuCnt) { + uopReg(i) := io.enqIQCtrl(i).bits + dataValidRegDebug(i) := io.enqIQCtrl(i).fire() + + io.enqIQData(i) := DontCare + // assert(uopReg(i).ctrl.src1Type =/= SrcType.pc) + io.enqIQData(i).src1 := io.readIntRf(readPort(i)).data + if (i >= exuParameters.LduCnt) { + io.enqIQData(i).src2 := Mux( + uopReg(i).ctrl.src2Type === SrcType.imm, + uopReg(i).ctrl.imm, + Mux(uopReg(i).ctrl.src2Type === SrcType.fp, + io.readFpRf(i - exuParameters.LduCnt).data, + io.readIntRf(readPort(i) + 1).data)) + } + + XSDebug(dataValidRegDebug(i), + p"pc 0x${Hexadecimal(uopReg(i).cf.pc)} reads operands from " + + p"(${readPort(i) }, ${uopReg(i).psrc1}, ${Hexadecimal(io.enqIQData(i).src1)}), " + + p"(${readPort(i)+1}, ${uopReg(i).psrc2}, ${Hexadecimal(io.enqIQData(i).src2)})\n") + } + XSPerf("utilization", PopCount(io.fromDq.map(_.valid))) XSPerf("waitInstr", PopCount(io.fromDq.map(r => r.valid && !r.ready))) + } diff --git a/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala b/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala index 77957b4f8db70957d7fcb6356521abd2952beb54..9a1be7f3477b9e4a32684eafc4717c534c31d5a7 100644 --- a/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala +++ b/src/main/scala/xiangshan/backend/dispatch/DispatchQueue.scala @@ -7,57 +7,43 @@ import xiangshan.backend.decode.SrcType import xiangshan._ import xiangshan.backend.roq.RoqPtr -class DispatchQueueIO(enqnum: Int, deqnum: Int, 
replayWidth: Int) extends XSBundle { - val enq = Vec(enqnum, Flipped(DecoupledIO(new MicroOp))) +class DispatchQueueIO(enqnum: Int, deqnum: Int) extends XSBundle { + val enq = Vec(enqnum, Flipped(ValidIO(new MicroOp))) + val enqReady = Output(Bool()) val deq = Vec(deqnum, DecoupledIO(new MicroOp)) - val dequeueRoqIndex = Input(Valid(new RoqPtr)) val redirect = Flipped(ValidIO(new Redirect)) - val replayPregReq = Output(Vec(replayWidth, new ReplayPregReq)) - val inReplayWalk = Output(Bool()) - val otherWalkDone = Input(Bool()) - override def cloneType: DispatchQueueIO.this.type = - new DispatchQueueIO(enqnum, deqnum, replayWidth).asInstanceOf[this.type] + new DispatchQueueIO(enqnum, deqnum).asInstanceOf[this.type] } // dispatch queue: accepts at most enqnum uops from dispatch1 and dispatches deqnum uops at every clock cycle -class DispatchQueue(size: Int, enqnum: Int, deqnum: Int, replayWidth: Int) extends XSModule with HasCircularQueuePtrHelper { - val io = IO(new DispatchQueueIO(enqnum, deqnum, replayWidth)) +class DispatchQueue(size: Int, enqnum: Int, deqnum: Int) extends XSModule with HasCircularQueuePtrHelper { + val io = IO(new DispatchQueueIO(enqnum, deqnum)) val indexWidth = log2Ceil(size) - val s_invalid :: s_valid :: s_dispatched :: Nil = Enum(3) + val s_invalid :: s_valid:: Nil = Enum(2) // queue data array val uopEntries = Mem(size, new MicroOp) val stateEntries = RegInit(VecInit(Seq.fill(size)(s_invalid))) + // head: first valid entry (dispatched entry) val headPtr = RegInit(0.U.asTypeOf(new CircularQueuePtr(size))) - // dispatch: first entry that has not been dispatched - val dispatchPtr = RegInit(0.U.asTypeOf(new CircularQueuePtr(size))) + val headPtrMask = UIntToMask(headPtr.value) // tail: first invalid entry (free entry) val tailPtr = RegInit(0.U.asTypeOf(new CircularQueuePtr(size))) + val tailPtrMask = UIntToMask(tailPtr.value) // TODO: make ptr a vector to reduce latency? 
- // commit: starting from head ptr - val commitIndex = (0 until CommitWidth).map(i => headPtr + i.U).map(_.value) - // deq: starting from dispatch ptr - val deqIndex = (0 until deqnum).map(i => dispatchPtr + i.U).map(_.value) + // deq: starting from head ptr + val deqIndex = (0 until deqnum).map(i => headPtr + i.U).map(_.value) // enq: starting from tail ptr val enqIndex = (0 until enqnum).map(i => tailPtr + i.U).map(_.value) - val validEntries = distanceBetween(tailPtr, headPtr) - val dispatchEntries = distanceBetween(tailPtr, dispatchPtr) - val commitEntries = validEntries - dispatchEntries - val emptyEntries = size.U - validEntries - - def rangeMask(start: CircularQueuePtr, end: CircularQueuePtr): UInt = { - val startMask = (1.U((size + 1).W) << start.value).asUInt - 1.U - val endMask = (1.U((size + 1).W) << end.value).asUInt - 1.U - val xorMask = startMask(size - 1, 0) ^ endMask(size - 1, 0) - Mux(start.flag === end.flag, xorMask, ~xorMask) - } - val dispatchedMask = rangeMask(headPtr, dispatchPtr) + val isTrueEmpty = ~Cat((0 until size).map(i => stateEntries(i) === s_valid)).orR + val canEnqueue = validEntries <= (size - enqnum).U + val canActualEnqueue = canEnqueue && !(io.redirect.valid /*&& !io.redirect.bits.isReplay*/) /** * Part 1: update states and uops when enqueue, dequeue, commit, redirect/replay @@ -72,8 +58,9 @@ class DispatchQueue(size: Int, enqnum: Int, deqnum: Int, replayWidth: Int) exten * (5) redirect (replay): from s_dispatched to s_valid (re-dispatch) */ // enqueue: from s_invalid to s_valid + io.enqReady := canEnqueue for (i <- 0 until enqnum) { - when (io.enq(i).fire()) { + when (io.enq(i).valid && canActualEnqueue) { uopEntries(enqIndex(i)) := io.enq(i).bits stateEntries(enqIndex(i)) := s_valid } @@ -81,36 +68,22 @@ class DispatchQueue(size: Int, enqnum: Int, deqnum: Int, replayWidth: Int) exten // dequeue: from s_valid to s_dispatched for (i <- 0 until deqnum) { - when (io.deq(i).fire()) { - stateEntries(deqIndex(i)) := s_dispatched + 
when (io.deq(i).fire() && !io.redirect.valid) { + stateEntries(deqIndex(i)) := s_invalid XSError(stateEntries(deqIndex(i)) =/= s_valid, "state of the dispatch entry is not s_valid\n") } } - // commit: from s_dispatched to s_invalid - val needDequeue = Wire(Vec(size, Bool())) - val deqRoqIdx = io.dequeueRoqIndex.bits - for (i <- 0 until size) { - needDequeue(i) := stateEntries(i) === s_dispatched && io.dequeueRoqIndex.valid && !isAfter(uopEntries(i).roqIdx, deqRoqIdx) && dispatchedMask(i) - when (needDequeue(i)) { - stateEntries(i) := s_invalid - } - - XSInfo(needDequeue(i), p"dispatched entry($i)(pc = ${Hexadecimal(uopEntries(i).cf.pc)}) " + - p"roqIndex 0x${Hexadecimal(uopEntries(i).roqIdx.asUInt)} " + - p"left dispatch queue with deqRoqIndex 0x${Hexadecimal(io.dequeueRoqIndex.bits.asUInt)}\n") - } - // redirect: cancel uops currently in the queue - val mispredictionValid = io.redirect.valid && io.redirect.bits.isMisPred + val mispredictionValid = io.redirect.valid //&& io.redirect.bits.isMisPred val exceptionValid = io.redirect.valid && io.redirect.bits.isException val flushPipeValid = io.redirect.valid && io.redirect.bits.isFlushPipe val roqNeedFlush = Wire(Vec(size, Bool())) val needCancel = Wire(Vec(size, Bool())) for (i <- 0 until size) { roqNeedFlush(i) := uopEntries(i.U).roqIdx.needFlush(io.redirect) - needCancel(i) := stateEntries(i) =/= s_invalid && ((roqNeedFlush(i) && mispredictionValid) || exceptionValid || flushPipeValid) && !needDequeue(i) + needCancel(i) := stateEntries(i) =/= s_invalid && ((roqNeedFlush(i) && mispredictionValid) || exceptionValid || flushPipeValid) when (needCancel(i)) { stateEntries(i) := s_invalid @@ -121,182 +94,78 @@ class DispatchQueue(size: Int, enqnum: Int, deqnum: Int, replayWidth: Int) exten p"cancelled with redirect roqIndex 0x${Hexadecimal(io.redirect.bits.roqIdx.asUInt)}\n") } - // replay: from s_dispatched to s_valid - val replayValid = io.redirect.valid && io.redirect.bits.isReplay - val needReplay = Wire(Vec(size, 
Bool())) - for (i <- 0 until size) { - needReplay(i) := roqNeedFlush(i) && stateEntries(i) === s_dispatched && replayValid - when (needReplay(i)) { - stateEntries(i) := s_valid - } - - XSInfo(needReplay(i), p"dispatched entry($i)(pc = ${Hexadecimal(uopEntries(i.U).cf.pc)}) " + - p"replayed with roqIndex ${io.redirect.bits.roqIdx}\n") - } - /** - * Part 2: walk - * - * Instead of keeping the walking distances, we keep the walking target position for simplicity. - * - * (1) replay: move dispatchPtr to the first needReplay entry - * (2) redirect (branch misprediction): move dispatchPtr, tailPtr to the first cancelled entry + * Part 2: update indices * + * tail: (1) enqueue; (2) redirect + * head: dequeue */ - // getFirstIndex: get the head index of consecutive ones - // note that it returns the position starting from either the leftmost or the rightmost - // 00000001 => 0 - // 00111000 => 3 - // 11000111 => 2 - // 10000000 => 1 - // 00000000 => 7 - // 11111111 => 7 - def getFirstMaskPosition(mask: Seq[Bool]) = { - Mux(mask(size - 1), - PriorityEncoder(mask.reverse.map(m => !m)), - PriorityEncoder(mask) - ) - } - - val maskedNeedReplay = Cat(needReplay.reverse) & dispatchedMask - val allCancel = Cat(needCancel).andR - val someReplay = Cat(maskedNeedReplay).orR - val allReplay = Cat(maskedNeedReplay).andR - XSDebug(replayValid, p"needReplay: ${Binary(Cat(needReplay))}\n") - XSDebug(replayValid, p"dispatchedMask: ${Binary(dispatchedMask)}\n") - XSDebug(replayValid, p"maskedNeedReplay: ${Binary(maskedNeedReplay)}\n") - // when nothing or everything is cancelled or replayed, the pointers remain unchanged - // if any uop is cancelled or replayed, the pointer should go to the first zero before all ones - // position: target index - // (1) if leftmost bits are ones, count continuous ones from leftmost (target position is the last one) - // (2) if leftmost bit is zero, count rightmost zero btis (target position is the first one) - // if all bits are one, we need to keep the 
index unchanged - // 00000000, 11111111: unchanged - // otherwise: firstMaskPosition - val cancelPosition = Mux(!Cat(needCancel).orR || allCancel, tailPtr.value, getFirstMaskPosition(needCancel)) - val replayPosition = Mux(!someReplay || allReplay, dispatchPtr.value, getFirstMaskPosition(maskedNeedReplay.asBools)) - XSDebug(replayValid, p"getFirstMaskPosition: ${getFirstMaskPosition(maskedNeedReplay.asBools)}\n") - assert(cancelPosition.getWidth == indexWidth) - assert(replayPosition.getWidth == indexWidth) - // If the highest bit is one, the direction flips. - // Otherwise, the direction keeps the same. - val tailCancelPtr = Wire(new CircularQueuePtr(size)) - tailCancelPtr.flag := Mux(needCancel(size - 1), ~tailPtr.flag, tailPtr.flag) - tailCancelPtr.value := Mux(needCancel(size - 1) && !allCancel, size.U - cancelPosition, cancelPosition) - // In case of branch mis-prediction: - // If mis-prediction happens after dispatchPtr, the pointer keeps the same as before. - // If dispatchPtr needs to be cancelled, reset dispatchPtr to tailPtr. - val dispatchCancelPtr = Mux(needCancel(dispatchPtr.value) || dispatchEntries === 0.U, tailCancelPtr, dispatchPtr) - // In case of replay, we need to walk back and recover preg states in the busy table. - // We keep track of the number of entries needed to be walked instead of target position to reduce overhead - // for 11111111, replayPosition is unuseful. 
We naively set Cnt to size.U - val dispatchReplayCnt = Mux(allReplay, size.U, Mux(maskedNeedReplay(size - 1), (dispatchPtr + replayPosition).value, (dispatchPtr - replayPosition).value)) - val dispatchReplayCntReg = RegInit(0.U) - // actually, if deqIndex points to head uops and they are replayed, there's no need for extraWalk - // however, to simplify logic, we simply let it do extra walk now - val needExtraReplayWalk = Cat((0 until deqnum).map(i => needReplay(deqIndex(i)))).orR - val needExtraReplayWalkReg = RegNext(needExtraReplayWalk && replayValid, false.B) - val inReplayWalk = dispatchReplayCntReg =/= 0.U || needExtraReplayWalkReg - val dispatchReplayStep = Mux(needExtraReplayWalkReg, 0.U, Mux(dispatchReplayCntReg > replayWidth.U, replayWidth.U, dispatchReplayCntReg)) - when (exceptionValid) { - dispatchReplayCntReg := 0.U - }.elsewhen (inReplayWalk && mispredictionValid && needCancel((dispatchPtr - 1.U).value)) { - val distance = distanceBetween(dispatchPtr, tailCancelPtr) - dispatchReplayCntReg := Mux(dispatchReplayCntReg > distance, dispatchReplayCntReg - distance, 0.U) - }.elsewhen (replayValid && someReplay) { - dispatchReplayCntReg := dispatchReplayCnt - dispatchReplayStep - }.elsewhen (!needExtraReplayWalkReg) { - dispatchReplayCntReg := dispatchReplayCntReg - dispatchReplayStep - } - - io.inReplayWalk := inReplayWalk - val replayIndex = (0 until replayWidth).map(i => (dispatchPtr - (i + 1).U).value) - for (i <- 0 until replayWidth) { - val index = Mux(needExtraReplayWalkReg, (if (i < deqnum) deqIndex(i) else 0.U), replayIndex(i)) - val shouldResetDest = inReplayWalk && stateEntries(index) === s_valid - io.replayPregReq(i).isInt := shouldResetDest && uopEntries(index).ctrl.rfWen && uopEntries(index).ctrl.ldest =/= 0.U - io.replayPregReq(i).isFp := shouldResetDest && uopEntries(index).ctrl.fpWen - io.replayPregReq(i).preg := uopEntries(index).pdest - - XSDebug(shouldResetDest, p"replay $i: " + - p"type (${uopEntries(index).ctrl.rfWen}, 
${uopEntries(index).ctrl.fpWen}) " + - p"pdest ${uopEntries(index).pdest} ldest ${uopEntries(index).ctrl.ldest}\n") - } - - /** - * Part 3: update indices - * - * tail: (1) enqueue; (2) walk in case of redirect - * dispatch: (1) dequeue; (2) walk in case of replay; (3) walk in case of redirect - * head: commit - */ - // enqueue - val numEnqTry = Mux(emptyEntries > enqnum.U, enqnum.U, emptyEntries) - val numEnq = PriorityEncoder(io.enq.map(!_.fire()) :+ true.B) - XSError(numEnq =/= 0.U && (mispredictionValid || exceptionValid), "should not enqueue when redirect\n") - tailPtr := Mux(exceptionValid, - 0.U.asTypeOf(new CircularQueuePtr(size)), - Mux(mispredictionValid, - tailCancelPtr, - tailPtr + numEnq) - ) // dequeue - val numDeqTry = Mux(dispatchEntries > deqnum.U, deqnum.U, dispatchEntries) + val numDeqTry = Mux(validEntries > deqnum.U, deqnum.U, validEntries) val numDeqFire = PriorityEncoder(io.deq.zipWithIndex.map{case (deq, i) => // For dequeue, the first entry should never be s_invalid // Otherwise, there should be a redirect and tail walks back // in this case, we set numDeq to 0 - !deq.fire() && (if (i == 0) true.B else stateEntries(deqIndex(i)) =/= s_dispatched) + !deq.fire() && (if (i == 0) true.B else stateEntries(deqIndex(i)) =/= s_invalid) } :+ true.B) val numDeq = Mux(numDeqTry > numDeqFire, numDeqFire, numDeqTry) - dispatchPtr := Mux(exceptionValid, + // agreement with reservation station: don't dequeue when redirect.valid + val headPtrNext = Mux(mispredictionValid, headPtr, headPtr + numDeq) + headPtr := Mux(exceptionValid, 0.U.asTypeOf(new CircularQueuePtr(size)), headPtrNext) + + // For branch mis-prediction or memory violation replay, + // we delay updating the indices for one clock cycle. + // For now, we simply use PopCount to count #instr cancelled. 
+ val lastCycleMisprediction = RegNext(io.redirect.valid && !(io.redirect.bits.isException || io.redirect.bits.isFlushPipe)) + // find the last one's position, starting from headPtr and searching backwards + val validBitVec = VecInit((0 until size).map(i => stateEntries(i) === s_valid)) + val loValidBitVec = Cat((0 until size).map(i => validBitVec(i) && headPtrMask(i))) + val hiValidBitVec = Cat((0 until size).map(i => validBitVec(i) && ~headPtrMask(i))) + val flippedFlag = loValidBitVec.orR + val lastOneIndex = size.U - PriorityEncoder(Mux(loValidBitVec.orR, loValidBitVec, hiValidBitVec)) + val walkedTailPtr = Wire(new CircularQueuePtr(size)) + walkedTailPtr.flag := flippedFlag ^ headPtr.flag + walkedTailPtr.value := lastOneIndex + + // enqueue + val numEnq = Mux(canActualEnqueue, PriorityEncoder(io.enq.map(!_.valid) :+ true.B), 0.U) + XSError(numEnq =/= 0.U && (mispredictionValid || exceptionValid), "should not enqueue when redirect\n") + tailPtr := Mux(exceptionValid, 0.U.asTypeOf(new CircularQueuePtr(size)), - Mux(mispredictionValid && (!inReplayWalk || needCancel((dispatchPtr - 1.U).value)), - dispatchCancelPtr, - Mux(inReplayWalk, dispatchPtr - dispatchReplayStep, dispatchPtr + numDeq)) + Mux(lastCycleMisprediction, + Mux(isTrueEmpty, headPtr, walkedTailPtr), + tailPtr + numEnq) ) - headPtr := Mux(exceptionValid, 0.U.asTypeOf(new CircularQueuePtr(size)), headPtr + PopCount(needDequeue)) /** - * Part 4: set output and input + * Part 3: set output and input */ - val allWalkDone = !inReplayWalk && io.otherWalkDone - val enqReadyBits = (1.U << numEnqTry).asUInt() - 1.U - for (i <- 0 until enqnum) { - io.enq(i).ready := enqReadyBits(i).asBool() && allWalkDone - } - + // TODO: remove this when replay moves to roq for (i <- 0 until deqnum) { io.deq(i).bits := uopEntries(deqIndex(i)) // do not dequeue when io.redirect valid because it may cause dispatchPtr work improperly - io.deq(i).valid := stateEntries(deqIndex(i)) === s_valid && !io.redirect.valid && allWalkDone 
+ io.deq(i).valid := stateEntries(deqIndex(i)) === s_valid && !lastCycleMisprediction// && !io.redirect.valid } // debug: dump dispatch queue states - XSDebug(p"head: $headPtr, tail: $tailPtr, dispatch: $dispatchPtr, " + - p"replayCnt: $dispatchReplayCntReg, needExtraReplayWalkReg: $needExtraReplayWalkReg\n") + XSDebug(p"head: $headPtr, tail: $tailPtr\n") XSDebug(p"state: ") stateEntries.reverse.foreach { s => XSDebug(false, s === s_invalid, "-") XSDebug(false, s === s_valid, "v") - XSDebug(false, s === s_dispatched, "d") } XSDebug(false, true.B, "\n") XSDebug(p"ptr: ") (0 until size).reverse.foreach { i => - val isPtr = i.U === headPtr.value || i.U === tailPtr.value || i.U === dispatchPtr.value + val isPtr = i.U === headPtr.value || i.U === tailPtr.value XSDebug(false, isPtr, "^") XSDebug(false, !isPtr, " ") } XSDebug(false, true.B, "\n") XSError(isAfter(headPtr, tailPtr), p"assert greaterOrEqualThan(tailPtr: $tailPtr, headPtr: $headPtr) failed\n") - XSError(isAfter(dispatchPtr, tailPtr) && !inReplayWalk, p"assert greaterOrEqualThan(tailPtr: $tailPtr, dispatchPtr: $dispatchPtr) failed\n") - XSError(isAfter(headPtr, dispatchPtr), p"assert greaterOrEqualThan(dispatchPtr: $dispatchPtr, headPtr: $headPtr) failed\n") - XSError(validEntries < dispatchEntries && !inReplayWalk, "validEntries should be less than dispatchEntries\n") XSPerf("utilization", PopCount(stateEntries.map(_ =/= s_invalid))) - XSPerf("replayInstr", PopCount(io.replayPregReq.map(replay => replay.isInt || replay.isFp))) } diff --git a/src/main/scala/xiangshan/backend/exu/AluExeUnit.scala b/src/main/scala/xiangshan/backend/exu/AluExeUnit.scala index 57198165a4dd3e2153bfa98153d1e453f4bba09a..bdea8cd7a07bc9090d8c8e26cd3e8754005e3d26 100644 --- a/src/main/scala/xiangshan/backend/exu/AluExeUnit.scala +++ b/src/main/scala/xiangshan/backend/exu/AluExeUnit.scala @@ -1,98 +1,32 @@ + package xiangshan.backend.exu import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils -import 
xiangshan._ -import xiangshan.FuType._ import utils._ -import xiangshan.backend._ -import xiangshan.backend.fu.FunctionUnit._ - - -class AluExeUnit extends Exu(Exu.aluExeUnitCfg) { - - val (iovalid, src1, src2, offset, func, pc, uop) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, - io.in.bits.uop.ctrl.imm, io.in.bits.uop.ctrl.fuOpType, SignExt(io.in.bits.uop.cf.pc, AddrBits), io.in.bits.uop) - - val redirectHit = uop.roqIdx.needFlush(io.redirect) - val valid = iovalid && !redirectHit - - val isAdderSub = (func =/= ALUOpType.add) && (func =/= ALUOpType.addw) - val adderRes = (src1 +& (src2 ^ Fill(XLEN, isAdderSub))) + isAdderSub - val xorRes = src1 ^ src2 - val sltu = !adderRes(XLEN) - val slt = xorRes(XLEN-1) ^ sltu - - val shsrc1 = LookupTreeDefault(func, src1, List( - ALUOpType.srlw -> ZeroExt(src1(31,0), 64), - ALUOpType.sraw -> SignExt(src1(31,0), 64) - )) - val shamt = Mux(ALUOpType.isWordOp(func), src2(4, 0), src2(5, 0)) - val res = LookupTreeDefault(func(3, 0), adderRes, List( - ALUOpType.sll -> ((shsrc1 << shamt)(XLEN-1, 0)), - ALUOpType.slt -> ZeroExt(slt, XLEN), - ALUOpType.sltu -> ZeroExt(sltu, XLEN), - ALUOpType.xor -> xorRes, - ALUOpType.srl -> (shsrc1 >> shamt), - ALUOpType.or -> (src1 | src2), - ALUOpType.and -> (src1 & src2), - ALUOpType.sra -> ((shsrc1.asSInt >> shamt).asUInt) - )) - val aluRes = Mux(ALUOpType.isWordOp(func), SignExt(res(31,0), 64), res) - - val branchOpTable = List( - ALUOpType.getBranchType(ALUOpType.beq) -> !xorRes.orR, - ALUOpType.getBranchType(ALUOpType.blt) -> slt, - ALUOpType.getBranchType(ALUOpType.bltu) -> sltu +import xiangshan.backend.exu.Exu.aluExeUnitCfg +import xiangshan.backend.fu.Alu + +class AluExeUnit extends Exu(aluExeUnitCfg) +{ + val alu = supportedFunctionUnits.collectFirst{ + case a: Alu => a + }.get + + io.toInt.bits.redirectValid := alu.redirectOutValid + io.toInt.bits.redirect := alu.redirectOut + io.toInt.bits.brUpdate := alu.brUpdate + + XSDebug(io.fromInt.valid || io.redirect.valid, + 
p"fromInt(${io.fromInt.valid} ${io.fromInt.ready}) toInt(${io.toInt.valid} ${io.toInt.ready})" + + p"Redirect:(${io.redirect.valid} ${io.redirect.bits.isException}${io.redirect.bits.isFlushPipe}${io.redirect.bits.isMisPred}${io.redirect.bits.isReplay}) roqIdx:${io.redirect.bits.roqIdx}\n", ) - - val isBranch = uop.cf.brUpdate.pd.isBr// ALUOpType.isBranch(func) - val isRVC = uop.cf.brUpdate.pd.isRVC//(io.in.bits.cf.instr(1,0) =/= "b11".U) - val taken = LookupTree(ALUOpType.getBranchType(func), branchOpTable) ^ ALUOpType.isBranchInvert(func) - val target = Mux(isBranch, pc + offset, adderRes)(VAddrBits-1,0) - val pcLatchSlot = Mux(isRVC, pc + 2.U, pc + 4.U) - - io.out.bits.redirectValid := io.out.valid && isBranch - io.out.bits.redirect.pc := uop.cf.pc - io.out.bits.redirect.target := Mux(!taken && isBranch, pcLatchSlot, target) - io.out.bits.redirect.brTag := uop.brTag - io.out.bits.redirect.isException := false.B - io.out.bits.redirect.isMisPred := DontCare // check this in brq - io.out.bits.redirect.isFlushPipe := false.B - io.out.bits.redirect.isReplay := false.B - io.out.bits.redirect.roqIdx := uop.roqIdx - - io.out.bits.brUpdate := uop.cf.brUpdate - // override brUpdate - io.out.bits.brUpdate.pc := uop.cf.pc - io.out.bits.brUpdate.target := Mux(!taken && isBranch, pcLatchSlot, target) - io.out.bits.brUpdate.brTarget := target - // io.out.bits.brUpdate.btbType := "b00".U - io.out.bits.brUpdate.taken := isBranch && taken - // io.out.bits.brUpdate.fetchIdx := uop.cf.brUpdate.fetchOffset >> 1.U //TODO: consider RVC - io.out.bits.brUpdate.brTag := uop.brTag - - io.in.ready := io.out.ready - io.out.valid := valid - io.out.bits.uop <> io.in.bits.uop - io.out.bits.data := aluRes - - XSDebug(io.in.valid || io.redirect.valid, - "In(%d %d) Out(%d %d) Redirect:(%d %d %d %d) brTag:f:%d v:%d\n", - io.in.valid, - io.in.ready, - io.out.valid, - io.out.ready, - io.redirect.valid, - io.redirect.bits.isException, - io.redirect.bits.isFlushPipe, - redirectHit, - 
io.redirect.bits.brTag.flag, - io.redirect.bits.brTag.value + XSDebug(io.fromInt.valid, + p"src1:${Hexadecimal(io.fromInt.bits.src1)} src2:${Hexadecimal(io.fromInt.bits.src2)} " + + p"src3:${Hexadecimal(io.fromInt.bits.src3)} func:${Binary(io.fromInt.bits.uop.ctrl.fuOpType)} " + + p"pc:${Hexadecimal(io.fromInt.bits.uop.cf.pc)} roqIdx:${io.fromInt.bits.uop.roqIdx}\n" + ) + XSDebug(io.toInt.valid, + p"res:${Hexadecimal(io.toInt.bits.data)}\n" ) - XSDebug(io.in.valid, "src1:%x src2:%x offset:%x func:%b pc:%x\n", - src1, src2, offset, func, pc) - XSDebug(io.out.valid, "res:%x aluRes:%x isRVC:%d isBranch:%d target:%x taken:%d\n", - io.out.bits.data, aluRes, isRVC, isBranch, target, taken) } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/exu/DivExeUnit.scala b/src/main/scala/xiangshan/backend/exu/DivExeUnit.scala deleted file mode 100644 index c9c46b8b7a49a4952f57dfda00116585e86edbc2..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/exu/DivExeUnit.scala +++ /dev/null @@ -1,64 +0,0 @@ -package xiangshan.backend.exu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ -import xiangshan.backend.fu.Divider -import xiangshan.backend.MDUOpType - -class DivExeUnit extends Exu(Exu.divExeUnitCfg) { - - val (src1, src2, uop, func) = - (io.in.bits.src1, io.in.bits.src2, io.in.bits.uop, io.in.bits.uop.ctrl.fuOpType) - - val divider = Module(new Divider(XLEN)) - - val isDiv = MDUOpType.isDiv(func) - val isDivSign = MDUOpType.isDivSign(func) - val isW = MDUOpType.isW(func) - val isH = MDUOpType.isH(func) - - val divInputFunc = (x: UInt) => Mux( - isW, - Mux(isDivSign, - SignExt(x(31,0), XLEN), - ZeroExt(x(31,0), XLEN) - ), - x - ) - - divider.io.redirect := io.redirect - divider.io.in.valid := io.in.valid - divider.io.in.bits.ctrl.uop := io.in.bits.uop - divider.io.in.bits.ctrl.sign := isDivSign - divider.io.in.bits.ctrl.isW := isW - divider.io.in.bits.ctrl.isHi := isH - divider.io.in.bits.src1 := 
divInputFunc(src1) - divider.io.in.bits.src2 := divInputFunc(src2) - divider.io.out.ready := io.out.ready - - io.in.ready := divider.io.in.ready - io.out.valid := divider.io.out.valid - io.out.bits.uop := divider.io.out.bits.uop - io.out.bits.data := divider.io.out.bits.data - io.out.bits.redirectValid := false.B - io.out.bits.redirect <> DontCare - io.dmem <> DontCare - io.out.bits.debug <> DontCare - - XSDebug(io.in.valid || io.redirect.valid, "In(%d %d) Out(%d %d) Redirect:(%d %d %d) brTag:%x\n", - io.in.valid, io.in.ready, - io.out.valid, io.out.ready, - io.redirect.valid, - io.redirect.bits.isException, - io.redirect.bits.isFlushPipe, - io.redirect.bits.brTag.value - ) - XSDebug(io.in.valid, p"src1: 0x${Hexadecimal(src1)} src2: 0x${Hexadecimal(src2)} func: ${Binary(func)} " + - p"pc: ${io.in.bits.uop.cf.pc} roqIdx: ${io.in.bits.uop.roqIdx}\n") - XSDebug(io.out.valid, p"Out(${io.out.valid} ${io.out.ready}) res: ${Hexadecimal(io.out.bits.data)} " + - p"func: ${Binary(io.out.bits.uop.ctrl.fuOpType)} pc: ${Hexadecimal(io.out.bits.uop.cf.pc)} roqIdx: ${io.out.bits.uop.roqIdx}\n" - ) - -} diff --git a/src/main/scala/xiangshan/backend/exu/Exu.scala b/src/main/scala/xiangshan/backend/exu/Exu.scala index 3e1b604eceb007339b1981326bea40f70b3cc0c8..0afc0abae4db3de65e1e5e8a0b4fe8c335a288ed 100644 --- a/src/main/scala/xiangshan/backend/exu/Exu.scala +++ b/src/main/scala/xiangshan/backend/exu/Exu.scala @@ -3,10 +3,8 @@ package xiangshan.backend.exu import chisel3._ import chisel3.util._ import xiangshan._ -import xiangshan.FuType._ -import xiangshan.backend.fu.FuConfig -import utils.ParallelOR import xiangshan.backend.fu.FunctionUnit._ +import xiangshan.backend.fu.{FuConfig, FuOutput, FunctionUnit, HasFuLatency, UncertainLatency} case class ExuParameters ( @@ -19,54 +17,211 @@ case class ExuParameters FmiscDivSqrtCnt: Int, LduCnt: Int, StuCnt: Int -){ +) { assert(JmpCnt == 1, "Only support 1 JmpUnit now!") + def IntExuCnt = AluCnt + MulCnt + MduCnt + JmpCnt + def FpExuCnt = 
FmacCnt + FmiscCnt + FmiscDivSqrtCnt + def LsExuCnt = LduCnt + StuCnt + def ExuCnt = IntExuCnt + FpExuCnt + LduCnt + StuCnt + def NRFuType = 9 + def FuOpWidth = 7 } case class ExuConfig ( name: String, - supportedFuncUnits: Array[FuConfig], - enableBypass: Boolean -){ - def max(in: Seq[Int]): Int = in.reduce((x, y) => if(x > y) x else y) - val intSrcCnt = max(supportedFuncUnits.map(_.numIntSrc)) - val fpSrcCnt = max(supportedFuncUnits.map(_.numFpSrc)) + fuConfigs: Seq[FuConfig], + wbIntPriority: Int, + wbFpPriority: Int +) { + def max(in: Seq[Int]): Int = in.reduce((x, y) => if (x > y) x else y) + + val intSrcCnt = max(fuConfigs.map(_.numIntSrc)) + val fpSrcCnt = max(fuConfigs.map(_.numFpSrc)) val readIntRf = intSrcCnt > 0 val readFpRf = fpSrcCnt > 0 - val writeIntRf = supportedFuncUnits.map(_.writeIntRf).reduce(_||_) - val writeFpRf = supportedFuncUnits.map(_.writeFpRf).reduce(_||_) - val hasRedirect = supportedFuncUnits.map(_.hasRedirect).reduce(_||_) + val writeIntRf = fuConfigs.map(_.writeIntRf).reduce(_ || _) + val writeFpRf = fuConfigs.map(_.writeFpRf).reduce(_ || _) + val hasRedirect = fuConfigs.map(_.hasRedirect).reduce(_ || _) + + val latency: HasFuLatency = { + val lats = fuConfigs.map(_.latency) + if (lats.exists(x => x.latencyVal.isEmpty)) { + UncertainLatency() + } else { + val x = lats.head + for (l <- lats.drop(1)) { + require(x.latencyVal.get == l.latencyVal.get) + } + x + } + } + val hasCertainLatency = latency.latencyVal.nonEmpty + val hasUncertainlatency = latency.latencyVal.isEmpty def canAccept(fuType: UInt): Bool = { - ParallelOR(supportedFuncUnits.map(_.fuType === fuType)) + Cat(fuConfigs.map(_.fuType === fuType)).orR() } } abstract class Exu(val config: ExuConfig) extends XSModule { - val io = IO(new ExuIO) - io.dmem <> DontCare - io.out.bits.brUpdate <> DontCare - io.out.bits.debug.isMMIO := false.B + + val supportedFunctionUnits = config.fuConfigs.map(_.fuGen).map(gen => Module(gen())) + + val fuSel = 
supportedFunctionUnits.zip(config.fuConfigs.map(_.fuSel)).map { + case (fu, sel) => sel(fu) + } + + val io = IO(new Bundle() { + val fromInt = if (config.readIntRf) Flipped(DecoupledIO(new ExuInput)) else null + val fromFp = if (config.readFpRf) Flipped(DecoupledIO(new ExuInput)) else null + val redirect = Flipped(ValidIO(new Redirect)) + val toInt = if (config.writeIntRf) DecoupledIO(new ExuOutput) else null + val toFp = if (config.writeFpRf) DecoupledIO(new ExuOutput) else null + }) + + for ((fuCfg, (fu, sel)) <- config.fuConfigs.zip(supportedFunctionUnits.zip(fuSel))) { + + val in = if (fuCfg.numIntSrc > 0) { + assert(fuCfg.numFpSrc == 0) + io.fromInt + } else { + assert(fuCfg.numFpSrc > 0) + io.fromFp + } + + val src1 = in.bits.src1 + val src2 = in.bits.src2 + val src3 = in.bits.src3 + + fu.io.in.valid := in.valid && sel && !in.bits.uop.roqIdx.needFlush(io.redirect) + fu.io.in.bits.uop := in.bits.uop + fu.io.in.bits.src.foreach(_ <> DontCare) + if (fuCfg.srcCnt > 0) { + fu.io.in.bits.src(0) := src1 + } + if (fuCfg.srcCnt > 1) { + fu.io.in.bits.src(1) := src2 + } + if (fuCfg.srcCnt > 2) { + fu.io.in.bits.src(2) := src3 + } + fu.io.redirectIn := io.redirect + } + + + val needArbiter = !(config.latency.latencyVal.nonEmpty && (config.latency.latencyVal.get == 0)) + + def writebackArb(in: Seq[DecoupledIO[FuOutput]], out: DecoupledIO[ExuOutput]): Arbiter[FuOutput] = { + if (needArbiter) { + val arb = Module(new Arbiter(new FuOutput, in.size)) + arb.io.in <> in + arb.io.out.ready := out.ready + out.bits.data := arb.io.out.bits.data + out.bits.uop := arb.io.out.bits.uop + out.valid := arb.io.out.valid + arb + } else { + in.foreach(_.ready := out.ready) + val sel = Mux1H(in.map(x => x.valid -> x)) + out.bits.data := sel.bits.data + out.bits.uop := sel.bits.uop + out.valid := sel.valid + null + } + } + + val intArb = if (config.writeIntRf) writebackArb( + supportedFunctionUnits.zip(config.fuConfigs).filter(x => !x._2.writeFpRf).map(_._1.io.out), + io.toInt + ) else null 
+ + val fpArb = if (config.writeFpRf) writebackArb( + supportedFunctionUnits.zip(config.fuConfigs).filter(x => x._2.writeFpRf).map(_._1.io.out), + io.toFp + ) else null + + val readIntFu = config.fuConfigs + .zip(supportedFunctionUnits.zip(fuSel)) + .filter(_._1.numIntSrc > 0) + .map(_._2) + + val readFpFu = config.fuConfigs + .zip(supportedFunctionUnits.zip(fuSel)) + .filter(_._1.numFpSrc > 0) + .map(_._2) + + def inReady(s: Seq[(FunctionUnit, Bool)]): Bool = { + if (s.size == 1) { + s.head._1.io.in.ready + } else { + if (needArbiter) { + Cat(s.map(x => x._1.io.in.ready && x._2)).orR() + } else { + Cat(s.map(x => x._1.io.in.ready)).andR() + } + } + } + + + if (config.readIntRf) { + io.fromInt.ready := inReady(readIntFu) + } + + if (config.readFpRf) { + io.fromFp.ready := inReady(readFpFu) + } + + def assignDontCares(out: ExuOutput) = { + out.brUpdate := DontCare + out.fflags := DontCare + out.debug <> DontCare + out.debug.isMMIO := false.B + out.redirect <> DontCare + out.redirectValid := false.B + } + + if (config.writeFpRf) { + assignDontCares(io.toFp.bits) + } + if (config.writeIntRf) { + assignDontCares(io.toInt.bits) + } } object Exu { - val jmpExeUnitCfg = ExuConfig("JmpExu", Array(jmpCfg, i2fCfg, csrCfg, fenceCfg), enableBypass = false) - val aluExeUnitCfg = ExuConfig("AluExu", Array(aluCfg), enableBypass = true) - val mulExeUnitCfg = ExuConfig("MulExu", Array(mulCfg), enableBypass = false) - val divExeUnitCfg = ExuConfig("DivExu", Array(divCfg), enableBypass = false) - val fenceExeUnitCfg = ExuConfig("FenceCfg", Array(fenceCfg), enableBypass = false) - val mulDivExeUnitCfg = ExuConfig("MulDivExu", Array(mulCfg, divCfg), enableBypass = false) - val mulDivFenceExeUnitCfg = ExuConfig("MulDivFenceExu", Array(mulCfg, divCfg, fenceCfg), enableBypass = false) - val ldExeUnitCfg = ExuConfig("LoadExu", Array(lduCfg), enableBypass = false) - val stExeUnitCfg =ExuConfig("StoreExu", Array(stuCfg, mouCfg), enableBypass = false) - val fmacExeUnitCfg = 
ExuConfig("FmacExu", Array(fmacCfg), enableBypass = false) - val fmiscExeUnitCfg = ExuConfig("FmiscExu", Array(fmiscCfg), enableBypass = false) - val fmiscDivExeUnitCfg = ExuConfig("FmiscDivExu", Array(fmiscCfg, fDivSqrtCfg), enableBypass = false) -} + + val aluExeUnitCfg = ExuConfig("AluExeUnit", Seq(aluCfg), 0, Int.MaxValue) + val jumpExeUnitCfg = ExuConfig("JmpExeUnit", Seq(jmpCfg, csrCfg, fenceCfg, i2fCfg), 2, Int.MaxValue) + val mulDivExeUnitCfg = ExuConfig("MulDivExeUnit", Seq(mulCfg, divCfg), 1, Int.MaxValue) + val fmacExeUnitCfg = ExuConfig("FmacExeUnit", Seq(fmacCfg), Int.MaxValue, 0) + val fmiscExeUnitCfg = ExuConfig( + "FmiscExeUnit", + Seq(fcmpCfg, fminCfg, fmvCfg, fsgnjCfg, f2iCfg, s2dCfg, d2sCfg, fdivSqrtCfg), + Int.MaxValue, 1 + ) + val ldExeUnitCfg = ExuConfig("LoadExu", Seq(lduCfg), wbIntPriority = 0, wbFpPriority = 0) + val stExeUnitCfg = ExuConfig("StoreExu", Seq(stuCfg, mouCfg), wbIntPriority = Int.MaxValue, wbFpPriority = Int.MaxValue) + + val loadExuConfigs = Seq.fill(exuParameters.LduCnt)(ldExeUnitCfg) + val storeExuConfigs = Seq.fill(exuParameters.StuCnt)(stExeUnitCfg) + + val intExuConfigs = jumpExeUnitCfg +: ( + Seq.fill(exuParameters.AluCnt)(aluExeUnitCfg) ++ + Seq.fill(exuParameters.MduCnt)(mulDivExeUnitCfg) + ) + + val fpExuConfigs = + Seq.fill(exuParameters.FmacCnt)(fmacExeUnitCfg) ++ + Seq.fill(exuParameters.FmiscCnt)(fmiscExeUnitCfg) + + val exuConfigs: Seq[ExuConfig] = intExuConfigs ++ fpExuConfigs + + +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/exu/FmacExeUnit.scala b/src/main/scala/xiangshan/backend/exu/FmacExeUnit.scala new file mode 100644 index 0000000000000000000000000000000000000000..87fd2094447c35a3b5713b0f8f9feb32daad75e9 --- /dev/null +++ b/src/main/scala/xiangshan/backend/exu/FmacExeUnit.scala @@ -0,0 +1,29 @@ +package xiangshan.backend.exu + +import chisel3._ +import chisel3.util._ +import xiangshan.backend.exu.Exu.fmacExeUnitCfg +import xiangshan.backend.fu.fpu._ +import 
xiangshan.backend.fu.fpu.fma.FMA + +class FmacExeUnit extends Exu(fmacExeUnitCfg) +{ + val frm = IO(Input(UInt(3.W))) + + val fma = supportedFunctionUnits.head.asInstanceOf[FMA] + + val input = io.fromFp.bits + val fmaOut = fma.io.out.bits + val isRVD = !io.fromFp.bits.uop.ctrl.isRVF + fma.io.in.bits.src := VecInit(Seq(input.src1, input.src2, input.src3).map( + src => Mux(isRVD, src, unboxF64ToF32(src)) + )) + val instr_rm = io.fromFp.bits.uop.cf.instr(14, 12) + fma.rm := Mux(instr_rm =/= 7.U, instr_rm, frm) + + fma.io.redirectIn := io.redirect + fma.io.out.ready := io.toFp.ready + + io.toFp.bits.data := Mux(fmaOut.uop.ctrl.isRVF, boxF32ToF64(fmaOut.data), fmaOut.data) + io.toFp.bits.fflags := fma.fflags +} diff --git a/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala b/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala new file mode 100644 index 0000000000000000000000000000000000000000..738bc7addb1ba3e487152be20e04aea15a2acedc --- /dev/null +++ b/src/main/scala/xiangshan/backend/exu/FmiscExeUnit.scala @@ -0,0 +1,61 @@ +package xiangshan.backend.exu + +import chisel3._ +import chisel3.util._ +import utils._ +import xiangshan.backend.exu.Exu.fmiscExeUnitCfg +import xiangshan.backend.fu.fpu.FPUOpType._ +import xiangshan.backend.fu.fpu._ + +class FmiscExeUnit extends Exu(fmiscExeUnitCfg) { + + val frm = IO(Input(UInt(3.W))) + + val fcmp :: fmin :: fmv :: fsgnj :: f2i :: f32toF64 :: f64toF32 :: fdivSqrt :: Nil = supportedFunctionUnits.map(fu => fu.asInstanceOf[FPUSubModule]) + val toFpUnits = Seq(fmin, fsgnj, f32toF64, f64toF32, fdivSqrt) + val toIntUnits = Seq(fcmp, fmv, f2i) + + assert(fpArb.io.in.length == toFpUnits.size) + assert(intArb.io.in.length == toIntUnits.size) + + val input = io.fromFp + val fuOp = input.bits.uop.ctrl.fuOpType + assert(fuOp.getWidth == 7) // when fuOp's WIDTH change, here must change too + val fu = fuOp.head(4) + val op = fuOp.tail(4) + val isRVF = input.bits.uop.ctrl.isRVF + val instr_rm = input.bits.uop.cf.instr(14, 12) + val 
(src1, src2) = (input.bits.src1, input.bits.src2) + + supportedFunctionUnits.foreach { module => + module.io.in.bits.src(0) := Mux( + (isRVF && fuOp =/= d2s && fuOp =/= fmv_f2i) || fuOp === s2d, + unboxF64ToF32(src1), + src1 + ) + module.io.in.bits.src(1) := Mux(isRVF, unboxF64ToF32(src2), src2) + module.asInstanceOf[FPUSubModule].rm := Mux(instr_rm =/= 7.U, instr_rm, frm) + } + + io.toFp.bits.fflags := MuxCase( + 0.U.asTypeOf(new Fflags), + toFpUnits.map(x => x.io.out.fire() -> x.fflags) + ) + val fpOutCtrl = io.toFp.bits.uop.ctrl + io.toFp.bits.data := Mux(fpOutCtrl.isRVF, + boxF32ToF64(fpArb.io.out.bits.data), + fpArb.io.out.bits.data + ) + val intOutCtrl = io.toInt.bits.uop.ctrl + io.toInt.bits.data := Mux( + (intOutCtrl.isRVF && intOutCtrl.fuOpType === fmv_f2i) || + intOutCtrl.fuOpType === f2w || + intOutCtrl.fuOpType === f2wu, + SignExt(intArb.io.out.bits.data(31, 0), XLEN), + intArb.io.out.bits.data + ) + io.toInt.bits.fflags := MuxCase( + 0.U.asTypeOf(new Fflags), + toIntUnits.map(x => x.io.out.fire() -> x.fflags) + ) +} diff --git a/src/main/scala/xiangshan/backend/exu/JmpExeUnit.scala b/src/main/scala/xiangshan/backend/exu/JmpExeUnit.scala deleted file mode 100644 index 1984513fa2a7854ea7f82d66b0dcb6e6916facdc..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/exu/JmpExeUnit.scala +++ /dev/null @@ -1,74 +0,0 @@ -package xiangshan.backend.exu - -import chisel3._ -import xiangshan.{ExuOutput, FuType} -import xiangshan.backend.fu.{CSR, Jump} -import xiangshan.backend.decode.isa._ -import utils._ - -class JmpExeUnit extends Exu(Exu.jmpExeUnitCfg) { - - val (valid, src1, src2, uop, fuType, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.uop, io.in.bits.uop.ctrl.fuType, io.in.bits.uop.ctrl.fuOpType) - - val jmp = Module(new Jump) - val csr = Module(new CSR) - val fence = Module(new FenceExeUnit) - - val isJmp = fuType === FuType.jmp - val isCsr = fuType === FuType.csr - val isFence = fuType === FuType.fence - - 
jmp.io.in.valid := io.in.valid && isJmp - jmp.io.in.bits := io.in.bits - jmp.io.out.ready := io.out.ready - jmp.io.exception <> DontCare - jmp.io.dmem <> DontCare - jmp.io.mcommit := DontCare - jmp.io.redirect := io.redirect - - csr.io.cfIn := io.in.bits.uop.cf - csr.io.fpu_csr := DontCare - csr.io.exception <> io.exception - csr.io.instrValid := DontCare - csr.io.out.ready := io.out.ready - csr.io.in.bits.src3 := DontCare - val csrOut = csr.access( - valid = io.in.valid && fuType === FuType.csr, - src1 = io.in.bits.src1, - src2 = io.in.bits.src2, - func = io.in.bits.uop.ctrl.fuOpType - ) - // val uop = io.in.bits.uop - val csrExuOut = Wire(new ExuOutput) - csrExuOut.uop := uop - csrExuOut.uop.cf := csr.io.cfOut - csrExuOut.uop.ctrl.flushPipe := csr.io.flushPipe - csrExuOut.data := csrOut - csrExuOut.redirectValid := csr.io.redirectValid - csrExuOut.redirect.brTag := uop.brTag - csrExuOut.redirect.isException := false.B - csrExuOut.redirect.isMisPred := false.B - csrExuOut.redirect.isFlushPipe := false.B - csrExuOut.redirect.isReplay := false.B - csrExuOut.redirect.roqIdx := uop.roqIdx - csrExuOut.redirect.target := csr.io.redirect.target - csrExuOut.redirect.pc := uop.cf.pc - csrExuOut.debug := DontCare - csrExuOut.brUpdate := DontCare - - fence.io.in.valid := valid && isFence - fence.io.in.bits := io.in.bits - fence.io.redirect <> DontCare // io.redirect // No need for fence is the first instr - fence.io.mcommit <> DontCare - fence.io.exception <> DontCare - fence.io.dmem <> DontCare - fence.io.out.ready := io.out.ready - - // NOTE: just one instr in this module at the same time - io.in.ready := jmp.io.in.ready && csr.io.in.ready && fence.io.in.ready - io.out.bits := Mux(jmp.io.out.valid, jmp.io.out.bits, Mux(csr.io.out.valid, csrExuOut, fence.io.out.bits)) - io.out.valid := jmp.io.out.valid || csr.io.out.valid || fence.io.out.valid - - XSDebug(io.in.valid, p"In(${io.in.valid} ${io.in.ready} ${jmp.io.in.ready}${csr.io.in.ready}${fence.io.in.ready}) 
pc:0x${Hexadecimal(io.in.bits.uop.cf.pc)} roqIdx:${io.in.bits.uop.roqIdx} fuType:b${Binary(io.in.bits.uop.ctrl.fuType)} fuOpType:b${Binary(io.in.bits.uop.ctrl.fuOpType)} isJmp:${isJmp} isCsr${isCsr} isFence:${isFence}\n") - XSDebug(io.out.valid, p"Out(${io.out.valid} ${io.out.ready} ${jmp.io.out.valid}${csr.io.out.valid}${fence.io.out.valid}) pc:0x${Hexadecimal(io.out.bits.uop.cf.pc)} roqIdx:${io.out.bits.uop.roqIdx} fuType:b${Binary(io.out.bits.uop.ctrl.fuType)} fuOpType:b${Binary(io.out.bits.uop.ctrl.fuOpType)}\n") -} \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/exu/JumpExeUnit.scala b/src/main/scala/xiangshan/backend/exu/JumpExeUnit.scala new file mode 100644 index 0000000000000000000000000000000000000000..101220f6c610b51c584df472bd132167b01ea888 --- /dev/null +++ b/src/main/scala/xiangshan/backend/exu/JumpExeUnit.scala @@ -0,0 +1,95 @@ +package xiangshan.backend.exu + + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import xiangshan.backend.exu.Exu.jumpExeUnitCfg +import xiangshan.backend.fu.fpu.FPUOpType.FU_I2F +import xiangshan.backend.fu.{CSR, Fence, FenceToSbuffer, FunctionUnit, Jump} +import xiangshan.backend.fu.fpu.{Fflags, IntToFloatSingleCycle, boxF32ToF64} + +class JumpExeUnit extends Exu(jumpExeUnitCfg) +{ + val csrio = IO(new Bundle { + val fflags = Input(new Fflags) + val dirty_fs = Input(Bool()) + val frm = Output(UInt(3.W)) + val exception = Flipped(ValidIO(new MicroOp)) + val isInterrupt = Input(Bool()) + val trapTarget = Output(UInt(VAddrBits.W)) + val interrupt = Output(Bool()) + val memExceptionVAddr = Input(UInt(VAddrBits.W)) + val externalInterrupt = new ExternalInterruptIO + val tlb = Output(new TlbCsrBundle) + }) + val fenceio = IO(new Bundle { + val sfence = Output(new SfenceBundle) + val fencei = Output(Bool()) + val sbuffer = new FenceToSbuffer + }) + + val jmp = supportedFunctionUnits.collectFirst{ + case j: Jump => j + }.get + val csr = supportedFunctionUnits.collectFirst{ + case c: CSR => c 
+ }.get + val fence = supportedFunctionUnits.collectFirst{ + case f: Fence => f + }.get + val i2f = supportedFunctionUnits.collectFirst { + case i: IntToFloatSingleCycle => i + }.get + + csr.csrio.perf <> DontCare + csr.csrio.fpu.fflags <> csrio.fflags + csr.csrio.fpu.isIllegal := false.B + csr.csrio.fpu.dirty_fs <> csrio.dirty_fs + csr.csrio.fpu.frm <> csrio.frm + csr.csrio.exception <> csrio.exception + csr.csrio.isInterrupt <> csrio.isInterrupt + csr.csrio.trapTarget <> csrio.trapTarget + csr.csrio.interrupt <> csrio.interrupt + csr.csrio.memExceptionVAddr <> csrio.memExceptionVAddr + csr.csrio.externalInterrupt <> csrio.externalInterrupt + csr.csrio.tlb <> csrio.tlb + + fenceio.sfence <> fence.sfence + fenceio.fencei <> fence.fencei + fenceio.sbuffer <> fence.toSbuffer + fence.io.out.ready := true.B + + val uop = io.fromInt.bits.uop + val instr_rm = uop.cf.instr(14, 12) + i2f.rm := Mux(instr_rm =/= 7.U, instr_rm, csr.csrio.fpu.frm) + + val isDouble = !uop.ctrl.isRVF + + when(i2f.io.in.valid){ + when(uop.ctrl.fuOpType.head(4)===s"b$FU_I2F".U){ + io.toFp.bits.data := Mux(isDouble, i2f.io.out.bits.data, boxF32ToF64(i2f.io.out.bits.data)) + io.toFp.bits.fflags := i2f.fflags + }.otherwise({ + // a mov.(s/d).x instruction + io.toFp.bits.data := Mux(isDouble, io.fromInt.bits.src1, boxF32ToF64(io.fromInt.bits.src1)) + io.toFp.bits.fflags := 0.U.asTypeOf(new Fflags) + }) + } + + when(csr.io.out.valid){ + io.toInt.bits.redirectValid := csr.csrio.redirectOut.valid + io.toInt.bits.redirect.brTag := uop.brTag + io.toInt.bits.redirect.isException := false.B + io.toInt.bits.redirect.isMisPred := false.B + io.toInt.bits.redirect.isFlushPipe := false.B + io.toInt.bits.redirect.isReplay := false.B + io.toInt.bits.redirect.roqIdx := uop.roqIdx + io.toInt.bits.redirect.target := csr.csrio.redirectOut.bits + io.toInt.bits.redirect.pc := uop.cf.pc + }.elsewhen(jmp.io.out.valid){ + io.toInt.bits.redirectValid := jmp.redirectOutValid + io.toInt.bits.redirect := jmp.redirectOut + 
io.toInt.bits.brUpdate := jmp.brUpdate + } +} diff --git a/src/main/scala/xiangshan/backend/exu/LsExeUnit.scala b/src/main/scala/xiangshan/backend/exu/LsExeUnit.scala deleted file mode 100644 index 75f7a43920a40819abad0062527428cc0d942586..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/exu/LsExeUnit.scala +++ /dev/null @@ -1,232 +0,0 @@ -//package xiangshan.backend.exu -// -//import chisel3._ -//import chisel3.util._ -//import chisel3.util.experimental.BoringUtils -//import xiangshan._ -//import utils._ -//import bus.simplebus._ -//import xiangshan.AddressSpace -//import xiangshan.backend._ -//import xiangshan.backend.brq.BrqPtr -//import fpu.boxF32ToF64 -// -// -//class StoreQueueEntry extends XSBundle{ -// val src1 = UInt(XLEN.W) -// val src2 = UInt(XLEN.W) -// val addr = UInt(XLEN.W) -// val src3 = UInt(XLEN.W) -// val wdata = UInt(XLEN.W) -// val func = UInt(6.W) -// val pc = UInt(VAddrBits.W) //for debug -// val brTag = new BrqPtr //FIXIT -//} -// -//// Multi-cycle LSU ported from NOOP -//class LsExeUnit extends Exu(Exu.lsuExeUnitCfg){ -// -// // store buffer -// val stqData = Reg(Vec(8, new StoreQueueEntry)) -// val stqValid = RegInit(VecInit(List.fill(8)(false.B))) -// val stqPtr = Reg(Vec(8, UInt(3.W))) -// val stqHead = RegInit(0.U(3.W)) -// val stqTail = stqPtr(0) -// val stqCommited = RegInit(0.U(3.W)) -// val stqFull = stqHead === 7.U //stq_valid.reduce(_.valid && _.valid) -// val emptySlot = PriorityMux(~stqValid.asUInt, VecInit(List.tabulate(8)(_.U))) -// -// // when retiringStore, block all input insts -// val isStoreIn = io.in.valid && LSUOpType.isStore(io.in.bits.uop.ctrl.fuOpType) -// val retiringStore = RegInit(false.B) -// val (validIn, src1In, src2In, src3In, funcIn) = (io.in.valid, io.in.bits.src1, io.in.bits.uop.ctrl.imm, io.in.bits.src2, io.in.bits.uop.ctrl.fuOpType) -// val (valid, src1, src2, wdata, func) = -// ( -// Mux(retiringStore, stqValid(stqTail), validIn && !isStoreIn), -// Mux(retiringStore, 
stqData(stqTail).src1, src1In), -// Mux(retiringStore, stqData(stqTail).src2, src2In), -// Mux(retiringStore, stqData(stqTail).src3, src3In), -// Mux(retiringStore, stqData(stqTail).func, funcIn) -// ) -// // assert(!(retiringStore && !stqValid(stqTail))) -// -// def genWmask(addr: UInt, sizeEncode: UInt): UInt = { -// LookupTree(sizeEncode, List( -// "b00".U -> 0x1.U, //0001 << addr(2:0) -// "b01".U -> 0x3.U, //0011 -// "b10".U -> 0xf.U, //1111 -// "b11".U -> 0xff.U //11111111 -// )) << addr(2, 0) -// } -// def genWdata(data: UInt, sizeEncode: UInt): UInt = { -// LookupTree(sizeEncode, List( -// "b00".U -> Fill(8, data(7, 0)), -// "b01".U -> Fill(4, data(15, 0)), -// "b10".U -> Fill(2, data(31, 0)), -// "b11".U -> data -// )) -// } -// -// val dmem = io.dmem -// val addr = src1 + src2 -// val addrLatch = RegNext(addr) -// val isStore = valid && LSUOpType.isStore(func) -// val partialLoad = !isStore && (func =/= LSUOpType.ld) -// -// val s_idle :: s_wait_resp :: s_partialLoad :: Nil = Enum(3) -// val state = RegInit(s_idle) -// -// switch (state) { -// is (s_idle) { when (dmem.req.fire()) { state := Mux(isStore, s_partialLoad, s_wait_resp) } } -// is (s_wait_resp) { when (dmem.resp.fire()) { state := Mux(partialLoad, s_partialLoad, s_idle) } } -// is (s_partialLoad) { state := s_idle } -// } -// -// val size = func(1,0) -// dmem.req.bits.apply(addr = addr, size = size, wdata = genWdata(wdata, size), -// wmask = genWmask(addr, size), cmd = Mux(isStore, SimpleBusCmd.write, SimpleBusCmd.read)) -// dmem.req.valid := valid && (state === s_idle) -// dmem.resp.ready := true.B -// -// XSDebug("state %x req.valid/ready %x/%x resp.valid/ready %x/%x addr %x size %x data %x mask %x cmd %x\n", -// state, dmem.req.valid, dmem.req.ready, dmem.resp.valid, dmem.resp.ready, -// addr, size, genWdata(wdata, size), genWmask(addr, size), Mux(isStore, SimpleBusCmd.write, SimpleBusCmd.read) -// ) -// -// val rdata = Wire(UInt(XLEN.W)) -// val rdataLatch = RegNext(rdata) -// val rdataSel = 
LookupTree(addrLatch(2, 0), List( -// "b000".U -> rdataLatch(63, 0), -// "b001".U -> rdataLatch(63, 8), -// "b010".U -> rdataLatch(63, 16), -// "b011".U -> rdataLatch(63, 24), -// "b100".U -> rdataLatch(63, 32), -// "b101".U -> rdataLatch(63, 40), -// "b110".U -> rdataLatch(63, 48), -// "b111".U -> rdataLatch(63, 56) -// )) -// val rdataPartialLoad = LookupTree(func, List( -// LSUOpType.lb -> SignExt(rdataSel(7, 0) , XLEN), -// LSUOpType.lh -> SignExt(rdataSel(15, 0), XLEN), -// LSUOpType.lw -> SignExt(rdataSel(31, 0), XLEN), -// LSUOpType.lbu -> ZeroExt(rdataSel(7, 0) , XLEN), -// LSUOpType.lhu -> ZeroExt(rdataSel(15, 0), XLEN), -// LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN), -// LSUOpType.flw -> boxF32ToF64(rdataSel(31,0)) -// )) -// -// // pop store queue if insts have been commited and dmem req fired successfully -// val storeFinish = retiringStore && dmem.resp.fire()//state === s_partialLoad -// val stqDequeue = storeFinish || !stqValid(stqTail) && stqHead > 0.U -// when(stqDequeue){ -// stqValid(stqTail) := false.B -// // update stq ptr -// for(i <- 1 until 8){ -// stqPtr(i-1) := stqPtr(i) -// } -// } -// -// // if store, add it to store queue -// val stqEnqueue = validIn && isStoreIn && !stqFull && !retiringStore && !io.redirect.valid && state === s_idle -// when(stqEnqueue){ -// stqPtr(stqHead - stqDequeue) := emptySlot -// stqData(emptySlot).src1 := src1In -// stqData(emptySlot).src2 := src2In -// stqData(emptySlot).addr := src1In + src2In -// stqData(emptySlot).src3 := genWdata(src3In, funcIn(1, 0)) -// stqData(emptySlot).pc := io.in.bits.uop.cf.pc -// stqData(emptySlot).func := funcIn -// stqData(emptySlot).brTag := io.in.bits.uop.brTag -// stqValid(emptySlot) := true.B -// } -// -// // if store insts have been commited, send dmem req -// // have to say it seems better to rebuild FSM instead of using such ugly wrapper -// val needRetireStore = stqCommited > 0.U && stqValid(stqTail) -// when( -// needRetireStore && !retiringStore && state === s_idle 
&& (!io.in.valid || isStoreIn) -// ){ -// retiringStore := true.B -// } -// when(dmem.resp.fire() && retiringStore){ -// retiringStore := false.B -// } -// -// // update stqTail, stqCommited -// stqCommited := stqCommited + io.mcommit - storeFinish -// stqHead := stqHead + stqEnqueue - stqDequeue -// -// // Store addr forward match -// // If match, get data from store queue -// val dataBackVec = Wire(Vec(XLEN/8, (UInt((XLEN/8).W)))) -// for(j <- (0 to (XLEN/8 - 1))){ -// dataBackVec(j) := dmem.resp.bits.rdata(8*(j+1)-1, 8*j) -// } -// -// for(i <- 0 until 8){ -// when(stqValid(stqPtr(i)) && i.U < stqHead){ -// when(addr(PAddrBits-1, log2Up(XLEN/8)) === stqData(stqPtr(i)).addr(PAddrBits-1, log2Up(XLEN/8))){ -// for(j <- (0 to (XLEN/8 - 1))){ -// when(genWmask(stqData(stqPtr(i)).addr, stqData(stqPtr(i)).func(1, 0))(j)){ -// dataBackVec(j) := stqData(stqPtr(i)).src3(8*(j+1)-1, 8*j) -// XSDebug("forwarding data from stq, addr %x stqpos %d bitpos %d data %x\n", addr, i.U, j.U, stqData(stqPtr(i)).src3(8*(j+1)-1, 8*j)) -// } -// } -// } -// XSDebug("sbuffer id %d ptr %d pc %x addr %x data %x func %x wmask %b\n", -// i.U, stqPtr(i), stqData(stqPtr(i)).pc, stqData(stqPtr(i)).src1 + stqData(stqPtr(i)).src2, stqData(stqPtr(i)).src3, stqData(stqPtr(i)).func, genWmask(stqData(stqPtr(i)).addr, stqData(stqPtr(i)).func(1, 0)) -// ) -// } -// } -// rdata := dataBackVec.asUInt -// -// val expRedirect = io.redirect.valid && io.redirect.bits.isException -// val brRedirect = io.redirect.valid && io.redirect.bits.isMisPred -// for(i <- 0 until 8){ -// when((i.U >= stqCommited && i.U < stqHead) && (expRedirect || brRedirect && stqData(stqPtr(i)).brTag.needBrFlush(io.redirect.bits.brTag) && stqValid(stqPtr(i)))){ -// stqValid(stqPtr(i)) := false.B -// } -// XSDebug("sptrtable: id %d ptr %d valid %d\n", i.U, stqPtr(i), stqValid(stqPtr(i))) -// } -// when(expRedirect){ -// //invalidate uncommited store -// //FIXME -// } -// -// io.in.ready := io.out.fire() -// -// val validLoad = 
RegInit(false.B) -// when(state =/= s_idle && !io.in.valid) { validLoad := false.B } -// when(state === s_idle && io.in.valid && !retiringStore && dmem.req.fire()) { validLoad := true.B } -// io.out.valid := (!isStoreIn && !retiringStore && validLoad && Mux(partialLoad, state === s_partialLoad, dmem.resp.fire() && (state === s_wait_resp)) || stqEnqueue) && io.in.valid -// io.out.bits.uop <> io.in.bits.uop -// io.out.bits.data := Mux(partialLoad, rdataPartialLoad, rdata) -// // io.out.bits.debug.isMMIO := AddressSpace.isMMIO(addr) && io.out.valid -// io.out.bits.debug.isMMIO := AddressSpace.isMMIO(addr) //for debug -// io.out.bits.redirect := DontCare -// io.out.bits.redirectValid := false.B -// -// when(io.out.fire()){ -// XSDebug("LSU fire: pc %x addr %x mmio %x isStoreIn %x retiringStore %x partialLoad %x dmem %x stqEnqueue %x state %x dmemres %x fwdres %x\n", -// io.in.bits.uop.cf.pc, -// addr, -// io.out.bits.debug.isMMIO, -// isStoreIn, -// retiringStore, -// partialLoad, -// dmem.resp.fire(), -// stqEnqueue, -// state, -// dmem.resp.bits.rdata, -// io.out.bits.data -// ) -// } -// -// // debug -// XSDebug("state: %d (valid, ready): in (%d,%d) out (%d,%d)\n", state, io.in.valid, io.in.ready, io.out.valid, io.out.ready) -// XSDebug("stqinfo: stqValid.asUInt %b stqHead %d stqTail %d stqCommited %d emptySlot %d\n", stqValid.asUInt, stqHead, stqTail, stqCommited, emptySlot) -// XSDebug(retiringStore, "retiringStore now...\n") -// XSInfo(io.dmem.req.fire() && io.dmem.req.bits.cmd =/= SimpleBusCmd.write, "[DMEM LOAD REQ] addr 0x%x wdata 0x%x size %d\n", dmem.req.bits.addr, dmem.req.bits.wdata, dmem.req.bits.size) -// XSInfo(io.dmem.req.fire() && io.dmem.req.bits.cmd === SimpleBusCmd.write, "[DMEM STORE REQ] addr 0x%x wdata 0x%x size %d\n", dmem.req.bits.addr, dmem.req.bits.wdata, dmem.req.bits.size) -// XSInfo(io.dmem.resp.fire(), "[DMEM RESP] data %x\n", rdata) -//} diff --git a/src/main/scala/xiangshan/backend/exu/MulDivExeUnit.scala 
b/src/main/scala/xiangshan/backend/exu/MulDivExeUnit.scala index f4044438ea3e270a751bcdfe41a81db3e033ea10..ff294a5d656a01a567a1f0589af05d96d431810a 100644 --- a/src/main/scala/xiangshan/backend/exu/MulDivExeUnit.scala +++ b/src/main/scala/xiangshan/backend/exu/MulDivExeUnit.scala @@ -5,105 +5,77 @@ import chisel3.util._ import xiangshan._ import utils._ import xiangshan.backend.MDUOpType -import xiangshan.backend.fu.FunctionUnit._ +import xiangshan.backend.exu.Exu.mulDivExeUnitCfg +import xiangshan.backend.fu.{AbstractDivider, ArrayMultiplier, FunctionUnit, Radix2Divider} -class MulDivFenceExeUnit extends Exu(Exu.mulDivFenceExeUnitCfg){ - val (src1, src2, uop, func) = - (io.in.bits.src1, io.in.bits.src2, io.in.bits.uop, io.in.bits.uop.ctrl.fuOpType) +class MulDivExeUnit extends Exu(mulDivExeUnitCfg) { + val func = io.fromInt.bits.uop.ctrl.fuOpType + val (src1, src2) = ( + io.fromInt.bits.src1(XLEN - 1, 0), + io.fromInt.bits.src2(XLEN - 1, 0) + ) - val isMul = MDUOpType.isMul(func) - val isDiv = MDUOpType.isDiv(func) - val isFence = MDUOpType.isFence(func) - - val mul = Module(new MulExeUnit) - val div = Module(new DivExeUnit) - val fence = Module(new FenceExeUnit) - - for(x <- Seq(mul.io, div.io, fence.io)){ - x.mcommit <> DontCare - x.exception <> DontCare - x.dmem <> DontCare - x.in.bits := io.in.bits - x.redirect := io.redirect - } - - mul.io.in.valid := io.in.valid && isMul - div.io.in.valid := io.in.valid && isDiv - fence.io.in.valid := io.in.valid && isFence - - io.in.ready := false.B - when (isMul) { io.in.ready := mul.io.in.ready } - when (isDiv) { io.in.ready := div.io.in.ready } - when (isFence) { io.in.ready := fence.io.in.ready } - - val arb = Module(new Arbiter(new ExuOutput, 3)) - - arb.io.in(0) <> mul.io.out - arb.io.in(1) <> div.io.out - arb.io.in(2) <> fence.io.out - - io.out <> arb.io.out + val mul = supportedFunctionUnits.collectFirst { + case m: ArrayMultiplier => m + }.get + + val div = supportedFunctionUnits.collectFirst { + case d: 
AbstractDivider => d + }.orNull + + // override inputs + val op = MDUOpType.getMulOp(func) + val signext = SignExt(_: UInt, XLEN + 1) + val zeroext = ZeroExt(_: UInt, XLEN + 1) + val mulInputFuncTable = List( + MDUOpType.mul -> (zeroext, zeroext), + MDUOpType.mulh -> (signext, signext), + MDUOpType.mulhsu -> (signext, zeroext), + MDUOpType.mulhu -> (zeroext, zeroext) + ) - XSDebug(io.in.valid || io.redirect.valid, "In(%d %d) Out(%d %d) Redirect:(%d %d %d) brTag:%x\n", - io.in.valid, io.in.ready, - io.out.valid, io.out.ready, - io.redirect.valid, - io.redirect.bits.isException, - io.redirect.bits.isFlushPipe, - io.redirect.bits.brTag.value + mul.io.in.bits.src(0) := LookupTree( + op, + mulInputFuncTable.map(p => (p._1(1, 0), p._2._1(src1))) ) - XSDebug(io.in.valid, "src1:%x src2:%x pc:%x fuType:%b fuOpType:%b roqIdx:%d (%d%d%d)\n", - src1, src2, io.in.bits.uop.cf.pc, io.in.bits.uop.ctrl.fuType, io.in.bits.uop.ctrl.fuOpType, - io.in.bits.uop.roqIdx.asUInt, isMul, isDiv, isFence) - XSDebug(io.out.valid, "Out(%d %d) res:%x pc:%x fuType:%b fuOpType:%b roqIdx:%d chosen:%d\n", - io.out.valid, io.out.ready, io.out.bits.data, io.out.bits.uop.cf.pc, io.in.bits.uop.ctrl.fuType, - io.in.bits.uop.ctrl.fuOpType, io.in.bits.uop.roqIdx.asUInt, arb.io.chosen + mul.io.in.bits.src(1) := LookupTree( + op, + mulInputFuncTable.map(p => (p._1(1, 0), p._2._2(src2))) ) -} - -class MulDivExeUnit extends Exu(Exu.mulDivExeUnitCfg){ - val (src1, src2, uop, func) = - (io.in.bits.src1, io.in.bits.src2, io.in.bits.uop, io.in.bits.uop.ctrl.fuOpType) - - val isMul = MDUOpType.isMul(func) - val isDiv = MDUOpType.isDiv(func) - - val mul = Module(new MulExeUnit) - val div = Module(new DivExeUnit) - - for(x <- Seq(mul.io, div.io)){ - x.mcommit <> DontCare - x.exception <> DontCare - x.dmem <> DontCare - x.in.bits := io.in.bits - x.redirect := io.redirect - } - - mul.io.in.valid := io.in.valid && isMul - div.io.in.valid := io.in.valid && isDiv - io.in.ready := false.B - when (isMul) { io.in.ready := 
mul.io.in.ready } - when (isDiv) { io.in.ready := div.io.in.ready } - - val arb = Module(new Arbiter(new ExuOutput, 2)) - - arb.io.in(0) <> mul.io.out - arb.io.in(1) <> div.io.out - - io.out <> arb.io.out - - XSDebug(io.in.valid, "In(%d %d) Out(%d %d) Redirect:(%d %d %d) brTag:%x\n", - io.in.valid, io.in.ready, - io.out.valid, io.out.ready, + val isW = MDUOpType.isW(func) + val isH = MDUOpType.isH(func) + mul.ctrl.isW := isW + mul.ctrl.isHi := isH + mul.ctrl.sign := DontCare + + val isDivSign = MDUOpType.isDivSign(func) + val divInputFunc = (x: UInt) => Mux( + isW, + Mux(isDivSign, + SignExt(x(31, 0), XLEN), + ZeroExt(x(31, 0), XLEN) + ), + x + ) + div.io.in.bits.src(0) := divInputFunc(src1) + div.io.in.bits.src(1) := divInputFunc(src2) + div.ctrl.isHi := isH + div.ctrl.isW := isW + div.ctrl.sign := isDivSign + + XSDebug(io.fromInt.valid, "In(%d %d) Out(%d %d) Redirect:(%d %d %d) brTag:%x\n", + io.fromInt.valid, io.fromInt.ready, + io.toInt.valid, io.toInt.ready, io.redirect.valid, io.redirect.bits.isException, io.redirect.bits.isFlushPipe, io.redirect.bits.brTag.value ) - XSDebug(io.in.valid, "src1:%x src2:%x pc:%x\n", src1, src2, io.in.bits.uop.cf.pc) - XSDebug(io.out.valid, "Out(%d %d) res:%x pc:%x\n", - io.out.valid, io.out.ready, io.out.bits.data, io.out.bits.uop.cf.pc + XSDebug(io.fromInt.valid, "src1:%x src2:%x pc:%x\n", src1, src2, io.fromInt.bits.uop.cf.pc) + XSDebug(io.toInt.valid, "Out(%d %d) res:%x pc:%x\n", + io.toInt.valid, io.toInt.ready, io.toInt.bits.data, io.toInt.bits.uop.cf.pc ) } + diff --git a/src/main/scala/xiangshan/backend/exu/MulExeUnit.scala b/src/main/scala/xiangshan/backend/exu/MulExeUnit.scala deleted file mode 100644 index c5a008640b5c3c752495ada8db4f48a47bc849df..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/exu/MulExeUnit.scala +++ /dev/null @@ -1,67 +0,0 @@ -package xiangshan.backend.exu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ -import 
xiangshan.backend.MDUOpType -import xiangshan.backend.fu.FunctionUnit._ -import xiangshan.backend.fu.ArrayMultiplier - - -class MulExeUnit extends Exu(Exu.mulExeUnitCfg){ - val (src1, src2, uop, func) = - (io.in.bits.src1, io.in.bits.src2, io.in.bits.uop, io.in.bits.uop.ctrl.fuOpType) - - val mul = Module(new ArrayMultiplier(XLEN+1)) - - val signext = SignExt(_: UInt, XLEN+1) - val zeroext = ZeroExt(_: UInt, XLEN+1) - val mulInputFuncTable = List( - MDUOpType.mul -> (zeroext, zeroext), - MDUOpType.mulh -> (signext, signext), - MDUOpType.mulhsu -> (signext, zeroext), - MDUOpType.mulhu -> (zeroext, zeroext) - ) - - val isW = MDUOpType.isW(func) - val isH = MDUOpType.isH(func) - val op = MDUOpType.getMulOp(func) - - mul.io.redirect := io.redirect - mul.io.in.bits.ctrl.uop := io.in.bits.uop - mul.io.in.bits.ctrl.sign := DontCare //Mul don't use this - mul.io.in.bits.ctrl.isW := isW - mul.io.in.bits.ctrl.isHi := isH - mul.io.in.bits.src1 := LookupTree( - op, - mulInputFuncTable.map(p => (p._1(1,0), p._2._1(src1))) - ) - mul.io.in.bits.src2 := LookupTree( - op, - mulInputFuncTable.map(p => (p._1(1,0), p._2._2(src2))) - ) - mul.io.in.valid := io.in.valid - mul.io.out.ready := io.out.ready - - io.in.ready := mul.io.in.ready - io.out.valid := mul.io.out.valid - io.out.bits.uop := mul.io.out.bits.uop - io.out.bits.data := mul.io.out.bits.data - io.out.bits.redirectValid := false.B - io.out.bits.redirect <> DontCare - - XSDebug(io.in.valid, "In(%d %d) Out(%d %d) Redirect:(%d %d %d) brTag:%x\n", - io.in.valid, io.in.ready, - io.out.valid, io.out.ready, - io.redirect.valid, - io.redirect.bits.isException, - io.redirect.bits.isFlushPipe, - io.redirect.bits.brTag.value - ) - XSDebug(io.in.valid, "src1:%x src2:%x pc:%x\n", src1, src2, io.in.bits.uop.cf.pc) - XSDebug(io.out.valid, "Out(%d %d) res:%x pc:%x\n", - io.out.valid, io.out.ready, io.out.bits.data, io.out.bits.uop.cf.pc - ) - XSDebug(io.redirect.valid, p"redirect: ${io.redirect.bits.brTag}\n") -} diff --git 
a/src/main/scala/xiangshan/backend/exu/Wb.scala b/src/main/scala/xiangshan/backend/exu/Wb.scala new file mode 100644 index 0000000000000000000000000000000000000000..a15762a2b563cd94e9df0892b125a7225a26ae0e --- /dev/null +++ b/src/main/scala/xiangshan/backend/exu/Wb.scala @@ -0,0 +1,71 @@ +package xiangshan.backend.exu + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import utils._ + + +class Wb(priorities: Seq[Int], numOut: Int) extends XSModule { + val io = IO(new Bundle() { + val in = Vec(priorities.size, Flipped(DecoupledIO(new ExuOutput))) + val out = Vec(numOut, ValidIO(new ExuOutput)) + }) + + +// def exuOutToRfReq(exuOut: DecoupledIO[ExuOutput]): DecoupledIO[ExuOutput] = { +// val req = WireInit(exuOut) +// req.valid := exuOut.valid && wen(exuOut.bits) +// exuOut.ready := Mux(req.valid, req.ready, true.B) +// req +// } + + val directConnect = io.in.zip(priorities).filter(x => x._2 == 0).map(_._1) + val mulReq = io.in.zip(priorities).filter(x => x._2 == 1).map(_._1) + val otherReq = io.in.zip(priorities).filter(x => x._2 > 1).map(_._1) + + val portUsed = directConnect.size + mulReq.size + require(portUsed <= numOut) + + io.out.take(directConnect.size).zip(directConnect).foreach{ + case (o, i) => + o.bits := i.bits + o.valid := i.valid + i.ready := true.B + } + + def splitN[T](in: Seq[T], n: Int): Seq[Option[Seq[T]]] = { + require(n > 0) + if(n == 1){ + Seq(Some(in)) + } else { + if(in.size < n ){ + Seq(Some(in)) ++ Seq.fill(n-1)(None) + } else { + val m = in.size / n + Some(in.take(m)) +: splitN(in.drop(m), n-1) + } + } + } + + if(mulReq.nonEmpty){ + val arbReq = splitN( + otherReq, + mulReq.size + ) + for(i <- mulReq.indices){ + val other = arbReq(i).getOrElse(Seq()) + val arb = Module(new Arbiter(new ExuOutput, 1+other.size)) + arb.io.in <> mulReq(i) +: other + val out = io.out(directConnect.size + i) + out.valid := arb.io.out.valid + out.bits := arb.io.out.bits + arb.io.out.ready := true.B + } + } + + if(portUsed < numOut){ + 
println(s"Warning: ${numOut - portUsed} ports are not used!") + io.out.drop(portUsed).foreach(_ <> DontCare) + } +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/exu/Wbu.scala b/src/main/scala/xiangshan/backend/exu/Wbu.scala deleted file mode 100644 index ad2d2428d458fc9405636970dade988ef9f89493..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/exu/Wbu.scala +++ /dev/null @@ -1,161 +0,0 @@ -package xiangshan.backend.exu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ - -class Wbu(exuConfigs: Array[ExuConfig]) extends XSModule{ - - val io = IO(new Bundle() { - val in = Vec(exuParameters.ExuCnt, Flipped(DecoupledIO(new ExuOutput))) - val toRoq = Vec(exuParameters.ExuCnt, ValidIO(new ExuOutput)) - val toIntRf = Vec(NRIntWritePorts, ValidIO(new ExuOutput)) - val toFpRf = Vec(NRFpWritePorts, ValidIO(new ExuOutput)) - }) - - require(io.in.length == exuConfigs.length) - - - def exuOutToRfReq - (exuOut: DecoupledIO[ExuOutput], fp: Boolean): DecoupledIO[ExuOutput] = { - val req = Wire(Decoupled(new ExuOutput)) - req.valid := exuOut.valid && { - if(fp) exuOut.bits.uop.ctrl.fpWen else exuOut.bits.uop.ctrl.rfWen - } - req.bits := exuOut.bits - req - } - - // ((ExuOutput, ExuConfig), index) => ((WbReq, ExuConfig), index) - val wbInt = io.in.zip(exuConfigs).zipWithIndex. - filter(_._1._2.writeIntRf).map(x => - ((exuOutToRfReq(x._1._1, fp = false), x._1._2), x._2)) - - val wbIntReq = wbInt.map(_._1) - - val wbFp = io.in.zip(exuConfigs).zipWithIndex. 
- filter(_._1._2.writeFpRf).map(x => - ((exuOutToRfReq(x._1._1, fp = true), x._1._2), x._2)) - - val wbFpReq = wbFp.map(_._1) - - for(i <- io.in.indices){ - val writeIntReqIdx = wbInt.map(_._2).indexOf(i) - val writeFpReqIdx = wbFp.map(_._2).indexOf(i) - val writeIntRf = writeIntReqIdx >= 0 - val writeFpRf = writeFpReqIdx >= 0 - - val iReq = if(writeIntRf) wbIntReq(writeIntReqIdx)._1 else null - val fReq = if(writeFpRf) wbFpReq(writeFpReqIdx)._1 else null - - if(writeIntRf && writeFpRf){ - io.in(i).ready := Mux(iReq.valid, - iReq.ready, - Mux(fReq.valid, - fReq.ready, - true.B - ) - ) - assert(!(iReq.valid && fReq.valid), s"Error: iReq and fReq valid at same time, idx=$i") - } else if(writeIntRf){ - io.in(i).ready := iReq.ready - } else if(writeFpRf){ - io.in(i).ready := fReq.ready - } else { - io.in(i).ready := true.B - } - - exuConfigs(i) match { - case Exu.aluExeUnitCfg => - io.toRoq(i).valid := io.in(i).fire() && !io.in(i).bits.redirectValid - case Exu.jmpExeUnitCfg => - io.toRoq(i).valid := io.in(i).fire() && !io.in(i).bits.redirectValid - case _ => - io.toRoq(i).valid := io.in(i).fire() - } - io.toRoq(i).bits := io.in(i).bits - } - - def directConnect(rfWrite: Valid[ExuOutput], wbReq: DecoupledIO[ExuOutput]) = { - rfWrite.bits := wbReq.bits - rfWrite.valid := wbReq.valid - wbReq.ready := true.B - } - - def splitN[T](in: Seq[T], n: Int): Seq[Option[Seq[T]]] = { - require(n > 0) - if(in.size < n) Seq(Some(in)) ++ Seq.fill(n-1)(None) - else { - val m = in.size/n - Some(in.take(m)) +: splitN(in.drop(m), n-1) - } - } - - if(wbIntReq.size <= NRIntWritePorts){ // write ports are enough - io.toIntRf. - take(wbIntReq.size). - zip(wbIntReq). 
- foreach(x => directConnect(x._1, x._2._1)) - - if(wbIntReq.size < NRIntWritePorts){ - println(s"Warrning: ${NRIntWritePorts-wbIntReq.size} int write ports are not used!") - io.toIntRf.drop(wbIntReq.size).foreach(_ <> DontCare) - } - } else { - val directReq = wbIntReq.filter(w => Seq(Exu.ldExeUnitCfg, Exu.aluExeUnitCfg).contains(w._2)) - val mulReq = wbIntReq.filter(w => Seq(Exu.mulExeUnitCfg, Exu.mulDivExeUnitCfg, Exu.mulDivFenceExeUnitCfg).contains(w._2)) - val otherReq = splitN( - wbIntReq.filterNot(w => Seq( - Exu.ldExeUnitCfg, Exu.aluExeUnitCfg, Exu.mulDivExeUnitCfg, Exu.mulExeUnitCfg, Exu.mulDivFenceExeUnitCfg - ).contains(w._2)), - mulReq.size - ) - require(directReq.size + mulReq.size <= NRIntWritePorts) - // alu && load: direct connect - io.toIntRf.take(directReq.size).zip(directReq).foreach(x => directConnect(x._1, x._2._1)) - for( i <- mulReq.indices){ - val arbiter = Module(new Arbiter(new ExuOutput, 1+otherReq(i).getOrElse(Seq()).size)) - arbiter.io.in <> (mulReq(i) +: otherReq(i).getOrElse(Seq())).map(_._1) - io.toIntRf.drop(directReq.size)(i) := arbiter.io.out - arbiter.io.out.ready := true.B - } - if(directReq.size + mulReq.size < NRIntWritePorts){ - println(s"Warrning: ${NRIntWritePorts-directReq.size-mulReq.size} int write ports are not used!") - io.toIntRf.drop(directReq.size + mulReq.size).foreach(_ <> DontCare) - } - } - - if(wbFpReq.size <= NRFpWritePorts){ - io.toFpRf. - take(wbFpReq.size). - zip(wbFpReq). 
- foreach(x => directConnect(x._1, x._2._1)) - - if(wbFpReq.size < NRFpWritePorts){ - println(s"Warrning: ${NRFpWritePorts-wbFpReq.size} fp write ports are not used!") - io.toFpRf.drop(wbFpReq.size).foreach(_ <> DontCare) - } - } else { - val directReq = wbFpReq.filter(w => Seq(Exu.ldExeUnitCfg, Exu.fmacExeUnitCfg).contains(w._2)) - val fmiscReq = wbFpReq.filter(w => Seq(Exu.fmiscExeUnitCfg, Exu.fmiscDivExeUnitCfg).contains(w._2)) - val otherReq = splitN( - wbFpReq.filterNot(w => Seq( - Exu.ldExeUnitCfg, Exu.fmacExeUnitCfg, Exu.fmiscExeUnitCfg, Exu.fmiscDivExeUnitCfg - ).contains(w._2)), - fmiscReq.size - ) - require(directReq.size + fmiscReq.size <= NRFpWritePorts) - io.toFpRf.take(directReq.size).zip(directReq).foreach(x => directConnect(x._1, x._2._1)) - for( i <- fmiscReq.indices){ - val arbiter = Module(new Arbiter(new ExuOutput, 1+otherReq(i).getOrElse(Seq()).size)) - arbiter.io.in <> (fmiscReq(i) +: otherReq(i).getOrElse(Seq())).map(_._1) - io.toFpRf.drop(directReq.size)(i) := arbiter.io.out - arbiter.io.out.ready := true.B - } - if(directReq.size + fmiscReq.size < NRFpWritePorts){ - println(s"Warrning: ${NRFpWritePorts-directReq.size-fmiscReq.size} fp write ports are not used!") - io.toFpRf.drop(directReq.size + fmiscReq.size).foreach(_ <> DontCare) - } - } -} diff --git a/src/main/scala/xiangshan/backend/fu/Alu.scala b/src/main/scala/xiangshan/backend/fu/Alu.scala index 44d567860f36eea65681be1a76453cbc77e60623..779b07487821c41c31acfd2d16c79c47423b2a6b 100644 --- a/src/main/scala/xiangshan/backend/fu/Alu.scala +++ b/src/main/scala/xiangshan/backend/fu/Alu.scala @@ -1,88 +1,80 @@ -//package xiangshan.backend.fu -// -//import chisel3._ -//import chisel3.util._ -//import xiangshan._ -//import utils._ -//import xiangshan.backend._ -// -//import xiangshan.backend.fu.FunctionUnit._ -// -//class Alu extends FunctionUnit(aluCfg) { -// val io = IO(new ExuIO) -// -// -// override def toString: String = "Alu" -// -// val (iovalid, src1, src2, offset, func, pc, uop) = 
(io.in.valid, io.in.bits.src1, io.in.bits.src2, -// io.in.bits.uop.ctrl.imm, io.in.bits.uop.ctrl.fuOpType, SignExt(io.in.bits.uop.cf.pc, AddrBits), io.in.bits.uop) -// -// val redirectHit = uop.brTag.needFlush(io.redirect) -// val valid = iovalid && !redirectHit -// -// val isAdderSub = (func =/= ALUOpType.add) && (func =/= ALUOpType.addw) && !ALUOpType.isJump(func) -// val adderRes = (src1 +& (src2 ^ Fill(XLEN, isAdderSub))) + isAdderSub -// val xorRes = src1 ^ src2 -// val sltu = !adderRes(XLEN) -// val slt = xorRes(XLEN-1) ^ sltu -// -// val shsrc1 = LookupTreeDefault(func, src1, List( -// ALUOpType.srlw -> ZeroExt(src1(31,0), 64), -// ALUOpType.sraw -> SignExt(src1(31,0), 64) -// )) -// val shamt = Mux(ALUOpType.isWordOp(func), src2(4, 0), src2(5, 0)) -// val res = LookupTreeDefault(func(3, 0), adderRes, List( -// ALUOpType.sll -> ((shsrc1 << shamt)(XLEN-1, 0)), -// ALUOpType.slt -> ZeroExt(slt, XLEN), -// ALUOpType.sltu -> ZeroExt(sltu, XLEN), -// ALUOpType.xor -> xorRes, -// ALUOpType.srl -> (shsrc1 >> shamt), -// ALUOpType.or -> (src1 | src2), -// ALUOpType.and -> (src1 & src2), -// ALUOpType.sra -> ((shsrc1.asSInt >> shamt).asUInt) -// )) -// val aluRes = Mux(ALUOpType.isWordOp(func), SignExt(res(31,0), 64), res) -// -// val branchOpTable = List( -// ALUOpType.getBranchType(ALUOpType.beq) -> !xorRes.orR, -// ALUOpType.getBranchType(ALUOpType.blt) -> slt, -// ALUOpType.getBranchType(ALUOpType.bltu) -> sltu -// ) -// -// val isBru = ALUOpType.isBru(func) -// // val isBranch = io.in.bits.uop.cf.isBr// ALUOpType.isBranch(func) -// val isBranch = ALUOpType.isBranch(func) -// val isJump = ALUOpType.isJump(func) -// val taken = LookupTree(ALUOpType.getBranchType(func), branchOpTable) ^ ALUOpType.isBranchInvert(func) -// val target = Mux(isBranch, pc + offset, adderRes)(VAddrBits-1,0) -// val isRVC = uop.cf.isRVC//(io.in.bits.cf.instr(1,0) =/= "b11".U) -// -// io.in.ready := io.out.ready -// val pcLatchSlot = Mux(isRVC, pc + 2.U, pc + 4.U) -// 
io.out.bits.redirectValid := io.out.valid && isBru//isBranch -// io.out.bits.redirect.target := Mux(!taken && isBranch, pcLatchSlot, target) -// io.out.bits.redirect.brTag := uop.brTag -// io.out.bits.redirect.isException := DontCare // false.B -// io.out.bits.redirect.roqIdx := uop.roqIdx -// -// io.out.valid := valid -// io.out.bits.uop <> io.in.bits.uop -// io.out.bits.data := Mux(isJump, pcLatchSlot, aluRes) -// -// XSDebug(io.in.valid, -// "In(%d %d) Out(%d %d) Redirect:(%d %d %d) brTag:f:%d v:%d\n", -// io.in.valid, -// io.in.ready, -// io.out.valid, -// io.out.ready, -// io.redirect.valid, -// io.redirect.bits.isException, -// redirectHit, -// io.redirect.bits.brTag.flag, -// io.redirect.bits.brTag.value -// ) -// XSDebug(io.in.valid, "src1:%x src2:%x offset:%x func:%b pc:%x\n", -// src1, src2, offset, func, pc) -// XSDebug(io.out.valid, "res:%x aluRes:%x isRVC:%d isBru:%d isBranch:%d isJump:%d target:%x taken:%d\n", -// io.out.bits.data, aluRes, isRVC, isBru, isBranch, isJump, target, taken) -//} +package xiangshan.backend.fu + +import chisel3._ +import chisel3.util._ +import utils.{LookupTree, LookupTreeDefault, SignExt, XSDebug, ZeroExt} +import xiangshan._ +import xiangshan.backend.ALUOpType + +class Alu extends FunctionUnit with HasRedirectOut { + + val (src1, src2, offset, func, pc, uop) = ( + io.in.bits.src(0), + io.in.bits.src(1), + io.in.bits.uop.ctrl.imm, + io.in.bits.uop.ctrl.fuOpType, + SignExt(io.in.bits.uop.cf.pc, AddrBits), + io.in.bits.uop + ) + + val redirectHit = uop.roqIdx.needFlush(io.redirectIn) + val valid = io.in.valid && !redirectHit + + val isAdderSub = (func =/= ALUOpType.add) && (func =/= ALUOpType.addw) + val adderRes = (src1 +& (src2 ^ Fill(XLEN, isAdderSub))) + isAdderSub + val xorRes = src1 ^ src2 + val sltu = !adderRes(XLEN) + val slt = xorRes(XLEN-1) ^ sltu + + val shsrc1 = LookupTreeDefault(func, src1, List( + ALUOpType.srlw -> ZeroExt(src1(31,0), 64), + ALUOpType.sraw -> SignExt(src1(31,0), 64) + )) + val shamt = 
Mux(ALUOpType.isWordOp(func), src2(4, 0), src2(5, 0)) + val res = LookupTreeDefault(func(3, 0), adderRes, List( + ALUOpType.sll -> ((shsrc1 << shamt)(XLEN-1, 0)), + ALUOpType.slt -> ZeroExt(slt, XLEN), + ALUOpType.sltu -> ZeroExt(sltu, XLEN), + ALUOpType.xor -> xorRes, + ALUOpType.srl -> (shsrc1 >> shamt), + ALUOpType.or -> (src1 | src2), + ALUOpType.and -> (src1 & src2), + ALUOpType.sra -> ((shsrc1.asSInt >> shamt).asUInt) + )) + val aluRes = Mux(ALUOpType.isWordOp(func), SignExt(res(31,0), 64), res) + + val branchOpTable = List( + ALUOpType.getBranchType(ALUOpType.beq) -> !xorRes.orR, + ALUOpType.getBranchType(ALUOpType.blt) -> slt, + ALUOpType.getBranchType(ALUOpType.bltu) -> sltu + ) + + val isBranch = uop.cf.brUpdate.pd.isBr// ALUOpType.isBranch(func) + val isRVC = uop.cf.brUpdate.pd.isRVC//(io.in.bits.cf.instr(1,0) =/= "b11".U) + val taken = LookupTree(ALUOpType.getBranchType(func), branchOpTable) ^ ALUOpType.isBranchInvert(func) + val target = Mux(isBranch, pc + offset, adderRes)(VAddrBits-1,0) + val snpc = Mux(isRVC, pc + 2.U, pc + 4.U) + + redirectOutValid := io.out.valid && isBranch + redirectOut.pc := uop.cf.pc + redirectOut.target := Mux(!taken && isBranch, snpc, target) + redirectOut.brTag := uop.brTag + redirectOut.isException := false.B + redirectOut.isMisPred := DontCare // check this in brq + redirectOut.isFlushPipe := false.B + redirectOut.isReplay := false.B + redirectOut.roqIdx := uop.roqIdx + + brUpdate := uop.cf.brUpdate + // override brUpdate + brUpdate.pc := uop.cf.pc + brUpdate.target := Mux(!taken && isBranch, snpc, target) + brUpdate.brTarget := target + brUpdate.taken := isBranch && taken + brUpdate.brTag := uop.brTag + + io.in.ready := io.out.ready + io.out.valid := valid + io.out.bits.uop <> io.in.bits.uop + io.out.bits.data := aluRes +} diff --git a/src/main/scala/xiangshan/backend/fu/CSR.scala b/src/main/scala/xiangshan/backend/fu/CSR.scala index 1cb4bd75de9a2e01e5164732eaf1e59f77677e64..21a73739c1465b0e9203d75ad8b8fb66ce85c34f 
100644 --- a/src/main/scala/xiangshan/backend/fu/CSR.scala +++ b/src/main/scala/xiangshan/backend/fu/CSR.scala @@ -1,17 +1,22 @@ package xiangshan.backend.fu import chisel3._ -import chisel3.ExcitingUtils.ConnectionType +import chisel3.ExcitingUtils.{ConnectionType, Debug} import chisel3.util._ -import chisel3.util.experimental.BoringUtils import fpu.Fflags -import noop.MMUIO import utils._ import xiangshan._ import xiangshan.backend._ -import xiangshan.backend.fu.FunctionUnit._ import utils.XSDebug +object debugId extends Function0[Integer] { + var x = 0 + def apply(): Integer = { + x = x + 1 + return x + } +} + trait HasCSRConst { // User Trap Setup val Ustatus = 0x000 @@ -53,11 +58,11 @@ trait HasCSRConst { // Supervisor Protection and Translation val Satp = 0x180 - // Machine Information Registers - val Mvendorid = 0xF11 - val Marchid = 0xF12 - val Mimpid = 0xF13 - val Mhartid = 0xF14 + // Machine Information Registers + val Mvendorid = 0xF11 + val Marchid = 0xF12 + val Mimpid = 0xF13 + val Mhartid = 0xF14 // Machine Trap Setup val Mstatus = 0x300 @@ -84,7 +89,7 @@ trait HasCSRConst { val PmpaddrBase = 0x3B0 // Machine Counter/Timers - // Currently, we uses perfcnt csr set instead of standard Machine Counter/Timers + // Currently, we uses perfcnt csr set instead of standard Machine Counter/Timers // 0xB80 - 0x89F are also used as perfcnt csr // Machine Counter Setup (not implemented) @@ -166,35 +171,44 @@ class FpuCsrIO extends XSBundle { val frm = Input(UInt(3.W)) } -class CSRIO extends FunctionUnitIO { - val cfIn = Input(new CtrlFlow) - val redirect = Output(new Redirect) - val redirectValid = Output(Bool()) - val fpu_csr = Flipped(new FpuCsrIO) - val cfOut = Output(new CtrlFlow) - // from rob - val exception = Flipped(ValidIO(new MicroOp)) - // for exception check - val instrValid = Input(Bool()) - val flushPipe = Output(Bool()) - // for differential testing -// val intrNO = Output(UInt(XLEN.W)) - val wenFix = Output(Bool()) -} - -class CSR extends 
FunctionUnit(csrCfg) with HasCSRConst{ - val io = IO(new CSRIO) - io.cfOut := io.cfIn +class PerfCounterIO extends XSBundle { + val value = Input(UInt(XLEN.W)) +} - val (valid, src1, src2, func) = (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.func) - def access(valid: Bool, src1: UInt, src2: UInt, func: UInt): UInt = { - this.valid := valid - this.src1 := src1 - this.src2 := src2 - this.func := func - io.out.bits - } +class CSR extends FunctionUnit with HasCSRConst +{ + val csrio = IO(new Bundle { + // output (for func === CSROpType.jmp) + val redirectOut = ValidIO(UInt(VAddrBits.W)) + val perf = Vec(NumPerfCounters, new PerfCounterIO) + // to FPU + val fpu = Flipped(new FpuCsrIO) + // from rob + val exception = Flipped(ValidIO(new MicroOp)) + val isInterrupt = Input(Bool()) + // to ROB + val trapTarget = Output(UInt(VAddrBits.W)) + val interrupt = Output(Bool()) + // from LSQ + val memExceptionVAddr = Input(UInt(VAddrBits.W)) + // from outside cpu,externalInterrupt + val externalInterrupt = new ExternalInterruptIO + // TLB + val tlb = Output(new TlbCsrBundle) + }) + + val cfIn = io.in.bits.uop.cf + val cfOut = Wire(new CtrlFlow) + cfOut := cfIn + val flushPipe = Wire(Bool()) + + val (valid, src1, src2, func) = ( + io.in.valid, + io.in.bits.src(0), + io.in.bits.uop.ctrl.imm, + io.in.bits.uop.ctrl.fuOpType + ) // CSR define @@ -257,14 +271,14 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val mipFixMask = GenMask(9) | GenMask(5) | GenMask(1) val mip = (mipWire.asUInt | mipReg).asTypeOf(new Interrupt) - def getMisaMxl(mxl: Int): UInt = {mxl.U << (XLEN-2)} - def getMisaExt(ext: Char): UInt = {1.U << (ext.toInt - 'a'.toInt)} + def getMisaMxl(mxl: Int): UInt = {mxl.U << (XLEN-2)}.asUInt() + def getMisaExt(ext: Char): UInt = {1.U << (ext.toInt - 'a'.toInt)}.asUInt() var extList = List('a', 's', 'i', 'u') if(HasMExtension){ extList = extList :+ 'm'} if(HasCExtension){ extList = extList :+ 'c'} if(HasFPU){ extList = extList ++ List('f', 'd')} - 
val misaInitVal = getMisaMxl(2) | extList.foldLeft(0.U)((sum, i) => sum | getMisaExt(i)) //"h8000000000141105".U - val misa = RegInit(UInt(XLEN.W), misaInitVal) + val misaInitVal = getMisaMxl(2) | extList.foldLeft(0.U)((sum, i) => sum | getMisaExt(i)) //"h8000000000141105".U + val misa = RegInit(UInt(XLEN.W), misaInitVal) // MXL = 2 | 0 | EXT = b 00 0000 0100 0001 0001 0000 0101 // (XLEN-1, XLEN-2) | |(25, 0) ZY XWVU TSRQ PONM LKJI HGFE DCBA @@ -300,12 +314,12 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ mstatusNew } - val mstatusMask = ~ZeroExt(( + val mstatusMask = (~ZeroExt(( GenMask(XLEN-2, 38) | GenMask(31, 23) | GenMask(10, 9) | GenMask(2) | GenMask(37) | // MBE GenMask(36) | // SBE GenMask(6) // UBE - ), 64) + ), 64)).asUInt() val medeleg = RegInit(UInt(XLEN.W), 0.U) val mideleg = RegInit(UInt(XLEN.W), 0.U) @@ -338,7 +352,7 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val sipMask = "h222".U & mideleg val satp = RegInit(0.U(XLEN.W)) // val satp = RegInit(UInt(XLEN.W), "h8000000000087fbe".U) // only use for tlb naive debug - val satpMask = "h80000fffffffffff".U // disable asid, mode can only be 8 / 0 + val satpMask = "h80000fffffffffff".U // disable asid, mode can only be 8 / 0 // val satp = RegInit(UInt(XLEN.W), 0.U) val sepc = RegInit(UInt(XLEN.W), 0.U) val scause = RegInit(UInt(XLEN.W), 0.U) @@ -347,11 +361,8 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val scounteren = RegInit(UInt(XLEN.W), 0.U) val tlbBundle = Wire(new TlbCsrBundle) - // val sfence = Wire(new SfenceBundle) tlbBundle.satp := satp.asTypeOf(new SatpStruct) - // sfence := 0.U.asTypeOf(new SfenceBundle) - BoringUtils.addSource(tlbBundle, "TLBCSRIO") - // BoringUtils.addSource(sfence, "SfenceBundle") // FIXME: move to MOU + csrio.tlb := tlbBundle // User-Level CSRs val uepc = Reg(UInt(XLEN.W)) @@ -401,11 +412,6 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ // val setLrAddr = WireInit(UInt(AddrBits.W), DontCare) //TODO : need check // 
val lr = RegInit(Bool(), false.B) // val lrAddr = RegInit(UInt(AddrBits.W), 0.U) -// BoringUtils.addSink(setLr, "set_lr") -// BoringUtils.addSink(setLrVal, "set_lr_val") -// BoringUtils.addSink(setLrAddr, "set_lr_addr") -// BoringUtils.addSource(lr, "lr") -// BoringUtils.addSource(lrAddr, "lr_addr") // // when(setLr){ // lr := setLrVal @@ -421,7 +427,7 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val perfCnts = List.fill(nrPerfCnts)(RegInit(0.U(XLEN.W))) val perfCntsLoMapping = (0 until nrPerfCnts).map(i => MaskedRegMap(0xb00 + i, perfCnts(i))) val perfCntsHiMapping = (0 until nrPerfCnts).map(i => MaskedRegMap(0xb80 + i, perfCnts(i)(63, 32))) - + println(s"CSR: hasPerfCnt:${hasPerfCnt}") // CSR reg map val mapping = Map( @@ -500,7 +506,7 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val addr = src2(11, 0) val rdata = Wire(UInt(XLEN.W)) - val csri = ZeroExt(io.cfIn.instr(19,15), XLEN) //unsigned imm for csri. [TODO] + val csri = ZeroExt(cfIn.instr(19,15), XLEN) //unsigned imm for csri. [TODO] val wdata = LookupTree(func, List( CSROpType.wrt -> src1, CSROpType.set -> (rdata | src1), @@ -511,15 +517,18 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ )) // satp wen check - val satpLegalMode = (wdata.asTypeOf(new SatpStruct).mode===0.U) || (wdata.asTypeOf(new SatpStruct).mode===8.U) + val satpLegalMode = (wdata.asTypeOf(new SatpStruct).mode===0.U) || (wdata.asTypeOf(new SatpStruct).mode===8.U) // general CSR wen check val wen = valid && func =/= CSROpType.jmp && (addr=/=Satp.U || satpLegalMode) - val permitted = csrAccessPermissionCheck(addr, false.B, priviledgeMode) + val permitted = csrAccessPermissionCheck(addr, false.B, priviledgeMode) // Writeable check is ingored. 
// Currently, write to illegal csr addr will be ignored MaskedRegMap.generate(mapping, addr, rdata, wen && permitted, wdata) - io.out.bits := rdata + io.out.bits.data := rdata + io.out.bits.uop := io.in.bits.uop + io.out.bits.uop.cf := cfOut + io.out.bits.uop.ctrl.flushPipe := flushPipe // Fix Mip/Sip write val fixMapping = Map( @@ -529,17 +538,17 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val rdataDummy = Wire(UInt(XLEN.W)) MaskedRegMap.generate(fixMapping, addr, rdataDummy, wen, wdata) - when(io.fpu_csr.fflags.asUInt() =/= 0.U){ - fcsr := fflags_wfn(io.fpu_csr.fflags.asUInt()) + when(csrio.fpu.fflags.asUInt() =/= 0.U){ + fcsr := fflags_wfn(csrio.fpu.fflags.asUInt()) } // set fs and sd in mstatus - when(csrw_dirty_fp_state || io.fpu_csr.dirty_fs){ + when(csrw_dirty_fp_state || csrio.fpu.dirty_fs){ val mstatusNew = WireInit(mstatus.asTypeOf(new MstatusStruct)) mstatusNew.fs := "b11".U mstatusNew.sd := true.B mstatus := mstatusNew.asUInt() } - io.fpu_csr.frm := fcsr.asTypeOf(new FcsrStruct).frm + csrio.fpu.frm := fcsr.asTypeOf(new FcsrStruct).frm // CSR inst decode val isEbreak = addr === privEbreak && func === CSROpType.jmp @@ -548,8 +557,8 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val isSret = addr === privSret && func === CSROpType.jmp val isUret = addr === privUret && func === CSROpType.jmp - XSDebug(wen, "csr write: pc %x addr %x rdata %x wdata %x func %x\n", io.cfIn.pc, addr, rdata, wdata, func) - XSDebug(wen, "pc %x mstatus %x mideleg %x medeleg %x mode %x\n", io.cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) + XSDebug(wen, "csr write: pc %x addr %x rdata %x wdata %x func %x\n", cfIn.pc, addr, rdata, wdata, func) + XSDebug(wen, "pc %x mstatus %x mideleg %x medeleg %x mode %x\n", cfIn.pc, mstatus, mideleg , medeleg, priviledgeMode) // Illegal priviledged operation list val illegalSModeSret = valid && isSret && priviledgeMode === ModeS && mstatusStruct.tsr.asBool @@ -588,43 +597,23 @@ class CSR extends 
FunctionUnit(csrCfg) with HasCSRConst{ tlbBundle.priv.imode := priviledgeMode tlbBundle.priv.dmode := Mux(mstatusStruct.mprv.asBool, mstatusStruct.mpp, priviledgeMode) - val hasInstrPageFault = io.exception.bits.cf.exceptionVec(instrPageFault) && io.exception.valid - val hasLoadPageFault = io.exception.bits.cf.exceptionVec(loadPageFault) && io.exception.valid - val hasStorePageFault = io.exception.bits.cf.exceptionVec(storePageFault) && io.exception.valid - val hasStoreAddrMisaligned = io.exception.bits.cf.exceptionVec(storeAddrMisaligned) && io.exception.valid - val hasLoadAddrMisaligned = io.exception.bits.cf.exceptionVec(loadAddrMisaligned) && io.exception.valid + val hasInstrPageFault = csrio.exception.bits.cf.exceptionVec(instrPageFault) && csrio.exception.valid + val hasLoadPageFault = csrio.exception.bits.cf.exceptionVec(loadPageFault) && csrio.exception.valid + val hasStorePageFault = csrio.exception.bits.cf.exceptionVec(storePageFault) && csrio.exception.valid + val hasStoreAddrMisaligned = csrio.exception.bits.cf.exceptionVec(storeAddrMisaligned) && csrio.exception.valid + val hasLoadAddrMisaligned = csrio.exception.bits.cf.exceptionVec(loadAddrMisaligned) && csrio.exception.valid // mtval write logic - val lsroqExceptionAddr = WireInit(0.U(VAddrBits.W)) - if(EnableUnifiedLSQ){ - ExcitingUtils.addSource(io.exception.bits.lsroqIdx, "EXECPTION_LSROQIDX") - ExcitingUtils.addSink(lsroqExceptionAddr, "EXECPTION_VADDR") - } else { - val lsIdx = WireInit(0.U.asTypeOf(new LSIdx())) - lsIdx.lqIdx := io.exception.bits.lqIdx - lsIdx.sqIdx := io.exception.bits.sqIdx - ExcitingUtils.addSource(lsIdx, "EXECPTION_LSROQIDX") - val lqExceptionAddr = WireInit(0.U(VAddrBits.W)) - val sqExceptionAddr = WireInit(0.U(VAddrBits.W)) - ExcitingUtils.addSink(lqExceptionAddr, "EXECPTION_LOAD_VADDR") - ExcitingUtils.addSink(sqExceptionAddr, "EXECPTION_STORE_VADDR") - lsroqExceptionAddr := Mux(CommitType.lsInstIsStore(io.exception.bits.ctrl.commitType), sqExceptionAddr, 
lqExceptionAddr) - } - - val atomExceptionAddr = WireInit(0.U(VAddrBits.W)) - val atomOverrideXtval = WireInit(false.B) - ExcitingUtils.addSink(atomExceptionAddr, "ATOM_EXECPTION_VADDR") - ExcitingUtils.addSink(atomOverrideXtval, "ATOM_OVERRIDE_XTVAL") - val memExceptionAddr = Mux(atomOverrideXtval, atomExceptionAddr, lsroqExceptionAddr) + val memExceptionAddr = SignExt(csrio.memExceptionVAddr, XLEN) when(hasInstrPageFault || hasLoadPageFault || hasStorePageFault){ val tval = Mux( hasInstrPageFault, Mux( - io.exception.bits.cf.crossPageIPFFix, - SignExt(io.exception.bits.cf.pc + 2.U, XLEN), - SignExt(io.exception.bits.cf.pc, XLEN) + csrio.exception.bits.cf.crossPageIPFFix, + SignExt(csrio.exception.bits.cf.pc + 2.U, XLEN), + SignExt(csrio.exception.bits.cf.pc, XLEN) ), - SignExt(memExceptionAddr, XLEN) + memExceptionAddr ) when(priviledgeMode === ModeM){ mtval := tval @@ -635,7 +624,7 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ when(hasLoadAddrMisaligned || hasStoreAddrMisaligned) { - mtval := SignExt(memExceptionAddr, XLEN) + mtval := memExceptionAddr } // Exception and Intr @@ -650,20 +639,14 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ intrVecEnable.zip(ideleg.asBools).map{case(x,y) => x := priviledgedEnableDetect(y)} val intrVec = mie(11,0) & mip.asUInt & intrVecEnable.asUInt val intrBitSet = intrVec.orR() - ExcitingUtils.addSource(intrBitSet, "intrBitSetIDU") + csrio.interrupt := intrBitSet val intrNO = IntPriority.foldRight(0.U)((i: Int, sum: UInt) => Mux(intrVec(i), i.U, sum)) - val raiseIntr = intrBitSet && io.exception.valid - XSDebug(raiseIntr, "interrupt: pc=0x%x, %d\n", io.exception.bits.cf.pc, intrNO) - - val mtip = WireInit(false.B) - val msip = WireInit(false.B) - val meip = WireInit(false.B) - ExcitingUtils.addSink(mtip, "mtip") - ExcitingUtils.addSink(msip, "msip") - ExcitingUtils.addSink(meip, "meip") - mipWire.t.m := mtip - mipWire.s.m := msip - mipWire.e.m := meip + val raiseIntr = intrBitSet && 
csrio.exception.valid && csrio.isInterrupt + XSDebug(raiseIntr, "interrupt: pc=0x%x, %d\n", csrio.exception.bits.cf.pc, intrNO) + + mipWire.t.m := csrio.externalInterrupt.mtip + mipWire.s.m := csrio.externalInterrupt.msip + mipWire.e.m := csrio.externalInterrupt.meip // exceptions val csrExceptionVec = Wire(Vec(16, Bool())) @@ -678,32 +661,44 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ csrExceptionVec(illegalInstr) := (isIllegalAddr || isIllegalAccess) && wen csrExceptionVec(loadPageFault) := hasLoadPageFault csrExceptionVec(storePageFault) := hasStorePageFault - val iduExceptionVec = io.cfIn.exceptionVec + val iduExceptionVec = cfIn.exceptionVec val exceptionVec = csrExceptionVec.asUInt() | iduExceptionVec.asUInt() - io.cfOut.exceptionVec.zipWithIndex.map{case (e, i) => e := exceptionVec(i) } - io.wenFix := DontCare + cfOut.exceptionVec.zipWithIndex.map{case (e, i) => e := exceptionVec(i) } - val raiseExceptionVec = io.exception.bits.cf.exceptionVec.asUInt() + val raiseExceptionVec = csrio.exception.bits.cf.exceptionVec.asUInt() val exceptionNO = ExcPriority.foldRight(0.U)((i: Int, sum: UInt) => Mux(raiseExceptionVec(i), i.U, sum)) val causeNO = (raiseIntr << (XLEN-1)).asUInt() | Mux(raiseIntr, intrNO, exceptionNO) - val difftestIntrNO = Mux(raiseIntr, causeNO, 0.U) - ExcitingUtils.addSource(difftestIntrNO, "difftestIntrNOfromCSR") - ExcitingUtils.addSource(causeNO, "difftestCausefromCSR") - - val raiseExceptionIntr = io.exception.valid + // if (!env.FPGAPlatform) { + val id = debugId() + val difftestIntrNO = Mux(raiseIntr, causeNO, 0.U) + ExcitingUtils.addSource(difftestIntrNO, s"difftestIntrNOfromCSR$id") + ExcitingUtils.addSource(causeNO, s"difftestCausefromCSR$id") + // } + + val raiseExceptionIntr = csrio.exception.valid val retTarget = Wire(UInt(VAddrBits.W)) - val trapTarget = Wire(UInt(VAddrBits.W)) - ExcitingUtils.addSource(trapTarget, "trapTarget") val resetSatp = addr === Satp.U && wen // write to satp will cause the pipeline be flushed 
- io.redirect := DontCare - io.redirectValid := valid && func === CSROpType.jmp && !isEcall - io.redirect.target := retTarget - io.flushPipe := resetSatp - - XSDebug(io.redirectValid, "redirect to %x, pc=%x\n", io.redirect.target, io.cfIn.pc) - - XSDebug(raiseExceptionIntr, "int/exc: pc %x int (%d):%x exc: (%d):%x\n",io.exception.bits.cf.pc, intrNO, io.exception.bits.cf.intrVec.asUInt, exceptionNO, raiseExceptionVec.asUInt) - XSDebug(raiseExceptionIntr, "pc %x mstatus %x mideleg %x medeleg %x mode %x\n", io.exception.bits.cf.pc, mstatus, mideleg, medeleg, priviledgeMode) + csrio.redirectOut.valid := valid && func === CSROpType.jmp && !isEcall + csrio.redirectOut.bits := retTarget + flushPipe := resetSatp + + XSDebug(csrio.redirectOut.valid, "redirect to %x, pc=%x\n", csrio.redirectOut.bits, cfIn.pc) + + XSDebug(raiseExceptionIntr, "int/exc: pc %x int (%d):%x exc: (%d):%x\n", + csrio.exception.bits.cf.pc, + intrNO, + csrio.exception.bits.cf.intrVec.asUInt, + exceptionNO, + raiseExceptionVec.asUInt + ) + XSDebug(raiseExceptionIntr, + "pc %x mstatus %x mideleg %x medeleg %x mode %x\n", + csrio.exception.bits.cf.pc, + mstatus, + mideleg, + medeleg, + priviledgeMode + ) // Branch control @@ -712,7 +707,7 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ val delegS = (deleg(causeNO(3,0))) && (priviledgeMode < ModeM) val tvalWen = !(hasInstrPageFault || hasLoadPageFault || hasStorePageFault || hasLoadAddrMisaligned || hasStoreAddrMisaligned) || raiseIntr // TODO: need check - trapTarget := Mux(delegS, stvec, mtvec)(VAddrBits-1, 0) + csrio.trapTarget := Mux(delegS, stvec, mtvec)(VAddrBits-1, 0) retTarget := DontCare // val illegalEret = TODO @@ -759,7 +754,7 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ when (delegS) { scause := causeNO - sepc := SignExt(io.exception.bits.cf.pc, XLEN) + sepc := SignExt(csrio.exception.bits.cf.pc, XLEN) mstatusNew.spp := priviledgeMode mstatusNew.pie.s := mstatusOld.ie.s mstatusNew.ie.s := false.B @@ -768,7 +763,7 @@ 
class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ // trapTarget := stvec(VAddrBits-1. 0) }.otherwise { mcause := causeNO - mepc := SignExt(io.exception.bits.cf.pc, XLEN) + mepc := SignExt(csrio.exception.bits.cf.pc, XLEN) mstatusNew.mpp := priviledgeMode mstatusNew.pie.m := mstatusOld.ie.m mstatusNew.ie.m := false.B @@ -784,11 +779,32 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ io.out.valid := valid - XSDebug(io.redirectValid, "Rediret %x raiseExcepIntr:%d isSret:%d retTarget:%x sepc:%x delegs:%d deleg:%x cfInpc:%x valid:%d instrValid:%x \n", - io.redirect.target, raiseExceptionIntr, isSret, retTarget, sepc, delegS, deleg, io.cfIn.pc, valid, io.instrValid) - XSDebug(raiseExceptionIntr && delegS, "Red(%d, %x) raiseExcepIntr:%d isSret:%d retTarget:%x sepc:%x delegs:%d deleg:%x cfInpc:%x valid:%d instrValid:%x \n", - io.redirectValid, io.redirect.target, raiseExceptionIntr, isSret, retTarget, sepc, delegS, deleg, io.cfIn.pc, valid, io.instrValid) - XSDebug(raiseExceptionIntr && delegS, "sepc is writen!!! pc:%x\n", io.cfIn.pc) + XSDebug(csrio.redirectOut.valid, + "Rediret %x raiseExcepIntr:%d isSret:%d retTarget:%x sepc:%x delegs:%d deleg:%x cfInpc:%x valid:%d\n", + csrio.redirectOut.bits, + raiseExceptionIntr, + isSret, + retTarget, + sepc, + delegS, + deleg, + cfIn.pc, + valid + ) + XSDebug(raiseExceptionIntr && delegS, + "Red(%d, %x) raiseExcepIntr:%d isSret:%d retTarget:%x sepc:%x delegs:%d deleg:%x cfInpc:%x valid:%d\n", + csrio.redirectOut.valid, + csrio.redirectOut.bits, + raiseExceptionIntr, + isSret, + retTarget, + sepc, + delegS, + deleg, + cfIn.pc, + valid + ) + XSDebug(raiseExceptionIntr && delegS, "sepc is writen!!! 
pc:%x\n", cfIn.pc) // perfcnt @@ -815,16 +831,18 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ "DTlbMissCnt1"-> (0xb20, "perfCntDtlbMissCnt1" ), "DTlbMissCnt2"-> (0xb21, "perfCntDtlbMissCnt2" ), "DTlbMissCnt3"-> (0xb22, "perfCntDtlbMissCnt3" ), - "PtwReqCnt" -> (0xb23, "perfCntPtwReqCnt" ), - "PtwCycleCnt" -> (0xb24, "perfCntPtwCycleCnt" ), - "PtwL2TlbHit" -> (0xb25, "perfCntPtwL2TlbHit" ), - "ITlbReqCnt0" -> (0xb28, "perfCntItlbReqCnt0" ), - "ITlbMissCnt0"-> (0xb29, "perfCntItlbMissCnt0" ), - "PtwReqCnt" -> (0xb2a, "perfCntPtwReqCnt" ), - "PtwCycleCnt" -> (0xb2b, "perfCntPtwCycleCnt" ), - "PtwL2TlbHit" -> (0xb2c, "perfCntPtwL2TlbHit" ), - "ICacheReq" -> (0xb2d, "perfCntIcacheReqCnt" ), - "ICacheMiss" -> (0xb2e, "perfCntIcacheMissCnt" ) + "ITlbReqCnt0" -> (0xb23, "perfCntItlbReqCnt0" ), + "ITlbMissCnt0"-> (0xb24, "perfCntItlbMissCnt0" ), + "PtwReqCnt" -> (0xb25, "perfCntPtwReqCnt" ), + "PtwCycleCnt" -> (0xb26, "perfCntPtwCycleCnt" ), + "PtwL2TlbHit" -> (0xb27, "perfCntPtwL2TlbHit" ), + "ICacheReq" -> (0xb28, "perfCntIcacheReqCnt" ), + "ICacheMiss" -> (0xb29, "perfCntIcacheMissCnt" )//, + // "FetchFromICache" -> (0xb2a, "CntFetchFromICache"), + // "FetchFromLoopBuffer" -> (0xb2b, "CntFetchFromLoopBuffer"), + // "ExitLoop1" -> (0xb2c, "CntExitLoop1"), + // "ExitLoop2" -> (0xb2d, "CntExitLoop2"), + // "ExitLoop3" -> (0xb2e, "CntExitLoop3") // "Custom1" -> (0xb1b, "Custom1" ), // "Custom2" -> (0xb1c, "Custom2" ), // "Custom3" -> (0xb1d, "Custom3" ), @@ -868,33 +886,24 @@ class CSR extends FunctionUnit(csrCfg) with HasCSRConst{ } } - // for differential testing -// BoringUtils.addSource(RegNext(priviledgeMode), "difftestMode") -// BoringUtils.addSource(RegNext(mstatus), "difftestMstatus") -// BoringUtils.addSource(RegNext(mstatus & sstatusRmask), "difftestSstatus") -// BoringUtils.addSource(RegNext(mepc), "difftestMepc") -// BoringUtils.addSource(RegNext(sepc), "difftestSepc") -// BoringUtils.addSource(RegNext(mcause), "difftestMcause") -// 
BoringUtils.addSource(RegNext(scause), "difftestScause") - BoringUtils.addSource(priviledgeMode, "difftestMode") - BoringUtils.addSource(mstatus, "difftestMstatus") - BoringUtils.addSource(mstatus & sstatusRmask, "difftestSstatus") - BoringUtils.addSource(mepc, "difftestMepc") - BoringUtils.addSource(sepc, "difftestSepc") - BoringUtils.addSource(mtval, "difftestMtval") - BoringUtils.addSource(stval, "difftestStval") - BoringUtils.addSource(mtvec, "difftestMtvec") - BoringUtils.addSource(stvec, "difftestStvec") - BoringUtils.addSource(mcause, "difftestMcause") - BoringUtils.addSource(scause, "difftestScause") - BoringUtils.addSource(satp, "difftestSatp") - BoringUtils.addSource(mipReg, "difftestMip") - BoringUtils.addSource(mie, "difftestMie") - BoringUtils.addSource(mscratch, "difftestMscratch") - BoringUtils.addSource(sscratch, "difftestSscratch") - BoringUtils.addSource(mideleg, "difftestMideleg") - BoringUtils.addSource(medeleg, "difftestMedeleg") + ExcitingUtils.addSource(priviledgeMode, "difftestMode", Debug) + ExcitingUtils.addSource(mstatus, "difftestMstatus", Debug) + ExcitingUtils.addSource(mstatus & sstatusRmask, "difftestSstatus", Debug) + ExcitingUtils.addSource(mepc, "difftestMepc", Debug) + ExcitingUtils.addSource(sepc, "difftestSepc", Debug) + ExcitingUtils.addSource(mtval, "difftestMtval", Debug) + ExcitingUtils.addSource(stval, "difftestStval", Debug) + ExcitingUtils.addSource(mtvec, "difftestMtvec", Debug) + ExcitingUtils.addSource(stvec, "difftestStvec", Debug) + ExcitingUtils.addSource(mcause, "difftestMcause", Debug) + ExcitingUtils.addSource(scause, "difftestScause", Debug) + ExcitingUtils.addSource(satp, "difftestSatp", Debug) + ExcitingUtils.addSource(mipReg, "difftestMip", Debug) + ExcitingUtils.addSource(mie, "difftestMie", Debug) + ExcitingUtils.addSource(mscratch, "difftestMscratch", Debug) + ExcitingUtils.addSource(sscratch, "difftestSscratch", Debug) + ExcitingUtils.addSource(mideleg, "difftestMideleg", Debug) + 
ExcitingUtils.addSource(medeleg, "difftestMedeleg", Debug) } else { -// BoringUtils.addSource(readWithScala(perfCntList("Minstret")._1), "ilaInstrCnt") } } diff --git a/src/main/scala/xiangshan/backend/fu/FDivSqrt.scala b/src/main/scala/xiangshan/backend/fu/FDivSqrt.scala deleted file mode 100644 index 673decc877d39e8974b7910541aa3c4d0a6d6e03..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/fu/FDivSqrt.scala +++ /dev/null @@ -1,14 +0,0 @@ -package xiangshan.backend.fu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ -import xiangshan.backend._ - -import xiangshan.backend.fu.FunctionUnit._ - -class FDivSqrt extends FunctionUnit(fDivSqrtCfg){ - val io = IO(new Bundle() {}) - override def toString: String = "FDivSqrt" -} diff --git a/src/main/scala/xiangshan/backend/fu/Fence.scala b/src/main/scala/xiangshan/backend/fu/Fence.scala index 8947b54150e2c934675ff663e29429114966b022..5230e3429737d4a4b7693216fade2fc6ea571d8f 100644 --- a/src/main/scala/xiangshan/backend/fu/Fence.scala +++ b/src/main/scala/xiangshan/backend/fu/Fence.scala @@ -1,28 +1,40 @@ -package xiangshan.backend.exu +package xiangshan.backend.fu import chisel3._ import chisel3.util._ import xiangshan._ import utils._ -import chisel3.util.experimental.BoringUtils - import xiangshan.backend.FenceOpType -class FenceExeUnit extends Exu(Exu.fenceExeUnitCfg) { - val (valid, src1, src2, uop, func, lsrc1, lsrc2) = - (io.in.valid, io.in.bits.src1, io.in.bits.src2, io.in.bits.uop, io.in.bits.uop.ctrl.fuOpType, io.in.bits.uop.ctrl.lsrc1, io.in.bits.uop.ctrl.lsrc2) +class FenceToSbuffer extends XSBundle { + val flushSb = Output(Bool()) + val sbIsEmpty = Input(Bool()) +} + +// class Fence extends FunctionUnit(FuConfig( + // /*FuType.fence, 1, 0, writeIntRf = false, writeFpRf = false, hasRedirect = false,*/ latency = UncertainLatency() +// )){ +class Fence extends FunctionUnit{ // TODO: check it + + val sfence = IO(Output(new SfenceBundle)) + val fencei = 
IO(Output(Bool())) + val toSbuffer = IO(new FenceToSbuffer) + + val (valid, src1, uop, func, lsrc1, lsrc2) = ( + io.in.valid, + io.in.bits.src(0), + io.in.bits.uop, + io.in.bits.uop.ctrl.fuOpType, + io.in.bits.uop.ctrl.lsrc1, + io.in.bits.uop.ctrl.lsrc2 + ) val s_sb :: s_tlb :: s_icache :: s_none :: Nil = Enum(4) val state = RegInit(s_sb) - val sfence = WireInit(0.U.asTypeOf(new SfenceBundle)) - val sbuffer = WireInit(false.B) - val fencei = WireInit(false.B) - val sbEmpty = WireInit(false.B) - BoringUtils.addSource(sbuffer, "FenceUnitSbufferFlush") - BoringUtils.addSource(sfence, "SfenceBundle") - BoringUtils.addSource(fencei, "FenceI") - BoringUtils.addSink(sbEmpty, "SBufferEmpty") + val sbuffer = toSbuffer.flushSb + val sbEmpty = toSbuffer.sbIsEmpty + // NOTE: icache & tlb & sbuffer must receive flush signal at any time sbuffer := valid && state === s_sb && !sbEmpty fencei := (state === s_icache && sbEmpty) || (state === s_sb && valid && sbEmpty && func === FenceOpType.fencei) @@ -34,16 +46,13 @@ class FenceExeUnit extends Exu(Exu.fenceExeUnitCfg) { when (state === s_sb && valid && func === FenceOpType.fencei && !sbEmpty) { state := s_icache } when (state === s_sb && valid && func === FenceOpType.sfence && !sbEmpty) { state := s_tlb } when (state === s_sb && valid && func === FenceOpType.fence && !sbEmpty) { state := s_none } - when (state =/= s_sb && sbEmpty) { state := s_sb } + when (state =/= s_sb && sbEmpty) { state := s_sb } assert(!(io.out.valid && io.out.bits.uop.ctrl.rfWen)) io.in.ready := state === s_sb io.out.valid := (state =/= s_sb && sbEmpty) || (state === s_sb && sbEmpty && valid) io.out.bits.data := DontCare io.out.bits.uop := Mux(state === s_sb, uop, RegEnable(uop, io.in.fire())) - io.out.bits.redirect <> DontCare - io.out.bits.redirectValid := false.B - io.out.bits.debug <> DontCare assert(!(valid || state =/= s_sb) || io.out.ready) // NOTE: fence instr must be the first(only one) instr, so io.out.ready must be true diff --git 
a/src/main/scala/xiangshan/backend/fu/Fmac.scala b/src/main/scala/xiangshan/backend/fu/Fmac.scala deleted file mode 100644 index 25c864ae99506d57ec65d36437c3f6e3d3ec33d3..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/fu/Fmac.scala +++ /dev/null @@ -1,14 +0,0 @@ -package xiangshan.backend.fu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ -import xiangshan.backend._ - -import xiangshan.backend.fu.FunctionUnit._ - -class Fmac extends FunctionUnit(fmacCfg){ - val io = IO(new Bundle() {}) - override def toString: String = "Fmac" -} diff --git a/src/main/scala/xiangshan/backend/fu/Fmisc.scala b/src/main/scala/xiangshan/backend/fu/Fmisc.scala deleted file mode 100644 index 22ec4b9575d11403a1f86f216c93c32db38b3fad..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/fu/Fmisc.scala +++ /dev/null @@ -1,14 +0,0 @@ -package xiangshan.backend.fu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ -import xiangshan.backend._ - -import xiangshan.backend.fu.FunctionUnit._ - -class Fmisc extends FunctionUnit(fmiscCfg){ - val io = IO(new Bundle() {}) - override def toString: String = "Fmisc" -} diff --git a/src/main/scala/xiangshan/backend/fu/FunctionUnit.scala b/src/main/scala/xiangshan/backend/fu/FunctionUnit.scala index 899feee3f32137515ea36b5ffdbf31da97ac363d..3d95dbdea0243ae148f68c5e77879cd467e037df 100644 --- a/src/main/scala/xiangshan/backend/fu/FunctionUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/FunctionUnit.scala @@ -2,78 +2,302 @@ package xiangshan.backend.fu import chisel3._ import chisel3.util._ - import xiangshan._ -import utils._ - -import FunctionUnit._ +import xiangshan.backend.MDUOpType +import xiangshan.backend.fu.fpu.FPUOpType.{FU_D2S, FU_DIVSQRT, FU_F2I, FU_FCMP, FU_FMV, FU_S2D} +import xiangshan.backend.fu.fpu.divsqrt.DivSqrt +import xiangshan.backend.fu.fpu._ +import xiangshan.backend.fu.fpu.fma.FMA /* XiangShan Function Unit A Exu 
can have one or more function units */ +trait HasFuLatency { + val latencyVal: Option[Int] +} + +case class CertainLatency(value: Int) extends HasFuLatency { + override val latencyVal: Option[Int] = Some(value) +} + +case class UncertainLatency() extends HasFuLatency { + override val latencyVal: Option[Int] = None +} + + case class FuConfig ( + fuGen: () => FunctionUnit, + fuSel: FunctionUnit => Bool, fuType: UInt, numIntSrc: Int, numFpSrc: Int, writeIntRf: Boolean, writeFpRf: Boolean, - hasRedirect: Boolean -) - -class FunctionUnitIO extends XSBundle { - val in = Flipped(Decoupled(new Bundle { - val src1 = Output(UInt(XLEN.W)) - val src2 = Output(UInt(XLEN.W)) - val src3 = Output(UInt(XLEN.W)) - val func = Output(FuOpType()) + hasRedirect: Boolean, + latency: HasFuLatency = CertainLatency(0) +) { + def srcCnt: Int = math.max(numIntSrc, numFpSrc) +} + + +class FuOutput extends XSBundle { + val data = UInt(XLEN.W) + val uop = new MicroOp +} + + +class FunctionUnitIO(len: Int) extends XSBundle { + val in = Flipped(DecoupledIO(new Bundle() { + val src = Vec(3, UInt(len.W)) + val uop = new MicroOp })) - val out = Decoupled(Output(UInt(XLEN.W))) + + val out = DecoupledIO(new FuOutput) + + val redirectIn = Flipped(ValidIO(new Redirect)) + + override def cloneType: FunctionUnitIO.this.type = + new FunctionUnitIO(len).asInstanceOf[this.type] +} + +abstract class FunctionUnit(len: Int = 64) extends XSModule { + + val io = IO(new FunctionUnitIO(len)) + +} + +trait HasPipelineReg { + this: FunctionUnit => + + def latency: Int + + require(latency > 0) + + val validVec = io.in.valid +: Array.fill(latency)(RegInit(false.B)) + val rdyVec = Array.fill(latency)(Wire(Bool())) :+ io.out.ready + val uopVec = io.in.bits.uop +: Array.fill(latency)(Reg(new MicroOp)) + + + val flushVec = uopVec.zip(validVec).map(x => x._2 && x._1.roqIdx.needFlush(io.redirectIn)) + + for (i <- 0 until latency) { + rdyVec(i) := !validVec(i + 1) || rdyVec(i + 1) + } + + for (i <- 1 to latency) { + 
when(flushVec(i - 1) || rdyVec(i) && !validVec(i - 1)) { + validVec(i) := false.B + }.elsewhen(rdyVec(i - 1) && validVec(i - 1) && !flushVec(i - 1)) { + validVec(i) := validVec(i - 1) + uopVec(i) := uopVec(i - 1) + } + } + + io.in.ready := rdyVec(0) + io.out.valid := validVec.last && !flushVec.last + io.out.bits.uop := uopVec.last + + def PipelineReg[TT <: Data](i: Int)(next: TT) = RegEnable( + next, + enable = validVec(i - 1) && rdyVec(i - 1) && !flushVec(i - 1) + ) + + def S1Reg[TT <: Data](next: TT): TT = PipelineReg[TT](1)(next) + + def S2Reg[TT <: Data](next: TT): TT = PipelineReg[TT](2)(next) + + def S3Reg[TT <: Data](next: TT): TT = PipelineReg[TT](3)(next) + + def S4Reg[TT <: Data](next: TT): TT = PipelineReg[TT](4)(next) + + def S5Reg[TT <: Data](next: TT): TT = PipelineReg[TT](5)(next) } -abstract class FunctionUnit(cfg: FuConfig) extends XSModule +object FunctionUnit extends HasXSParameter { + + def divider = new SRT4Divider(XLEN) + + def multiplier = new ArrayMultiplier(XLEN + 1, Seq(0, 2)) + + def alu = new Alu + + def jmp = new Jump + + def fence = new Fence + + def csr = new CSR + + def i2f = new IntToFloatSingleCycle + + def fmac = new FMA + + def fcmp = new FCMP + + def fmv = new FMV(XLEN) + + def f2i = new FloatToInt + + def f32toF64 = new F32toF64 + + def f64toF32 = new F64toF32 + + def fdivSqrt = new DivSqrt + + def fmiscSel(fu: String)(x: FunctionUnit): Bool = { + x.io.in.bits.uop.ctrl.fuOpType.head(4) === s"b$fu".U + } + + val aluCfg = FuConfig( + fuGen = alu _, + fuSel = _ => true.B, + fuType = FuType.alu, + numIntSrc = 2, + numFpSrc = 0, + writeIntRf = true, + writeFpRf = false, + hasRedirect = true + ) + + val jmpCfg = FuConfig( + fuGen = jmp _, + fuSel = (x: FunctionUnit) => x.io.in.bits.uop.ctrl.fuType === FuType.jmp, + fuType = FuType.jmp, + numIntSrc = 1, + numFpSrc = 0, + writeIntRf = true, + writeFpRf = false, + hasRedirect = true + ) + + val fenceCfg = FuConfig( + fuGen = fence _, + fuSel = (x: FunctionUnit) => 
x.io.in.bits.uop.ctrl.fuType === FuType.fence, + FuType.fence, 1, 0, writeIntRf = false, writeFpRf = false, hasRedirect = false, + UncertainLatency() // TODO: need rewrite latency structure, not just this value + ) + + val csrCfg = FuConfig( + fuGen = csr _, + fuSel = (x: FunctionUnit) => x.io.in.bits.uop.ctrl.fuType === FuType.csr, + fuType = FuType.csr, + numIntSrc = 1, + numFpSrc = 0, + writeIntRf = true, + writeFpRf = false, + hasRedirect = false + ) + + val i2fCfg = FuConfig( + fuGen = i2f _, + fuSel = (x: FunctionUnit) => x.io.in.bits.uop.ctrl.fuType === FuType.i2f, + FuType.i2f, + numIntSrc = 1, + numFpSrc = 0, + writeIntRf = false, + writeFpRf = true, + hasRedirect = false, + CertainLatency(0) + ) -object FunctionUnit { + val divCfg = FuConfig( + fuGen = divider _, + fuSel = (x: FunctionUnit) => MDUOpType.isDiv(x.io.in.bits.uop.ctrl.fuOpType), + FuType.div, + 2, + 0, + writeIntRf = true, + writeFpRf = false, + hasRedirect = false, + UncertainLatency() + ) - val csrCfg = - FuConfig(FuType.csr, 1, 0, writeIntRf = true, writeFpRf = false, hasRedirect = false) + val mulCfg = FuConfig( + fuGen = multiplier _, + fuSel = (x: FunctionUnit) => MDUOpType.isMul(x.io.in.bits.uop.ctrl.fuOpType), + FuType.mul, + 2, + 0, + writeIntRf = true, + writeFpRf = false, + hasRedirect = false, + CertainLatency(3) + ) - val jmpCfg = - FuConfig(FuType.jmp, 1, 0, writeIntRf = true, writeFpRf = false, hasRedirect = true) + val fmacCfg = FuConfig( + fuGen = fmac _, + fuSel = _ => true.B, + FuType.fmac, 0, 3, writeIntRf = false, writeFpRf = true, hasRedirect = false, CertainLatency(5) + ) - val i2fCfg = - FuConfig(FuType.i2f, 1, 0, writeIntRf = false, writeFpRf = true, hasRedirect = false) + val fcmpCfg = FuConfig( + fuGen = fcmp _, + fuSel = (x: FunctionUnit) => fmiscSel(FU_FCMP)(x) && x.io.in.bits.uop.ctrl.rfWen, + FuType.fmisc, 0, 2, writeIntRf = true, writeFpRf = false, hasRedirect = false, CertainLatency(2) + ) - val aluCfg = - FuConfig(FuType.alu, 2, 0, writeIntRf = true, 
writeFpRf = false, hasRedirect = true) + val fminCfg = FuConfig( + fuGen = fcmp _, + fuSel = (x: FunctionUnit) => fmiscSel(FU_FCMP)(x) && x.io.in.bits.uop.ctrl.fpWen, + FuType.fmisc, 0, 2, writeIntRf = false, writeFpRf = true, hasRedirect = false, CertainLatency(2) + ) - val mulCfg = - FuConfig(FuType.mul, 2, 0, writeIntRf = true, writeFpRf = false, hasRedirect = false) + val fsgnjCfg = FuConfig( + fuGen = fmv _, + fuSel = (x: FunctionUnit) => fmiscSel(FU_FMV)(x) && x.io.in.bits.uop.ctrl.fpWen, + FuType.fmisc, 0, 2, writeIntRf = false, writeFpRf = true, hasRedirect = false, CertainLatency(1) + ) - val divCfg = - FuConfig(FuType.div, 2, 0, writeIntRf = true, writeFpRf = false, hasRedirect = false) + val fmvCfg = FuConfig( + fuGen = fmv _, + fuSel = (x: FunctionUnit) => fmiscSel(FU_FMV)(x) && x.io.in.bits.uop.ctrl.rfWen, + FuType.fmisc, 0, 2, writeIntRf = true, writeFpRf = false, hasRedirect = false, CertainLatency(1) + ) - val fenceCfg = - FuConfig(FuType.fence, 2, 0, writeIntRf = false, writeFpRf = false, hasRedirect = false/*NOTE: need redirect but when commit*/) + val f2iCfg = FuConfig( + fuGen = f2i _, + fuSel = fmiscSel(FU_F2I), + FuType.fmisc, 0, 1, writeIntRf = true, writeFpRf = false, hasRedirect = false, CertainLatency(2) + ) - val lduCfg = - FuConfig(FuType.ldu, 1, 0, writeIntRf = true, writeFpRf = true, hasRedirect = false) + val s2dCfg = FuConfig( + fuGen = f32toF64 _, + fuSel = fmiscSel(FU_S2D), + FuType.fmisc, 0, 1, writeIntRf = false, writeFpRf = true, hasRedirect = false, CertainLatency(2) + ) - val stuCfg = - FuConfig(FuType.stu, 2, 1, writeIntRf = false, writeFpRf = false, hasRedirect = false) + val d2sCfg = FuConfig( + fuGen = f64toF32 _, + fuSel = fmiscSel(FU_D2S), + FuType.fmisc, 0, 1, writeIntRf = false, writeFpRf = true, hasRedirect = false, CertainLatency(2) + ) - // use ldu's write back port, so set writeIntRf to false - val mouCfg = - FuConfig(FuType.mou, 2, 0, writeIntRf = false, writeFpRf = false, hasRedirect = false) + val fdivSqrtCfg = 
FuConfig( + fuGen = fdivSqrt _, + fuSel = fmiscSel(FU_DIVSQRT), + FuType.fDivSqrt, 0, 2, writeIntRf = false, writeFpRf = true, hasRedirect = false, UncertainLatency() + ) - val fmacCfg = - FuConfig(FuType.fmac, 0, 3, writeIntRf = false, writeFpRf = true, hasRedirect = false) + val lduCfg = FuConfig( + null, // DontCare + null, + FuType.ldu, 1, 0, writeIntRf = true, writeFpRf = true, hasRedirect = false, + UncertainLatency() + ) - val fmiscCfg = - FuConfig(FuType.fmisc, 0, 2, writeIntRf = false, writeFpRf = true, hasRedirect = false) + val stuCfg = FuConfig( + null, + null, + FuType.stu, 2, 1, writeIntRf = false, writeFpRf = false, hasRedirect = false, + UncertainLatency() + ) - val fDivSqrtCfg = - FuConfig(FuType.fDivSqrt, 0, 2, writeIntRf = false, writeFpRf = true, hasRedirect = false) + val mouCfg = FuConfig( + null, + null, + FuType.mou, 2, 0, writeIntRf = false, writeFpRf = false, hasRedirect = false, + UncertainLatency() + ) } diff --git a/src/main/scala/xiangshan/backend/fu/I2f.scala b/src/main/scala/xiangshan/backend/fu/I2f.scala deleted file mode 100644 index 25d020fc1bb6d2ed03d9734ba00c89d6c6e82eb5..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/fu/I2f.scala +++ /dev/null @@ -1,12 +0,0 @@ -package xiangshan.backend.fu - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import utils._ -import xiangshan.backend._ -import xiangshan.backend.fu.FunctionUnit._ - -class I2f extends FunctionUnit(i2fCfg){ - val io = IO(new Bundle() {}) -} diff --git a/src/main/scala/xiangshan/backend/fu/Jump.scala b/src/main/scala/xiangshan/backend/fu/Jump.scala index ff98e319df14e2447042ff8071c0a12d99ec4083..75dcf983179a97c3c53dcd2917196a52893f83d7 100644 --- a/src/main/scala/xiangshan/backend/fu/Jump.scala +++ b/src/main/scala/xiangshan/backend/fu/Jump.scala @@ -8,59 +8,64 @@ import xiangshan.backend._ import xiangshan.backend.fu.FunctionUnit._ import xiangshan.backend.decode.isa._ -class Jump extends FunctionUnit(jmpCfg){ - val io = 
IO(new ExuIO) +trait HasRedirectOut { this: RawModule => + val redirectOutValid = IO(Output(Bool())) + val redirectOut = IO(Output(new Redirect)) + val brUpdate = IO(Output(new BranchUpdateInfo)) +} + +class Jump extends FunctionUnit with HasRedirectOut { - val (iovalid, src1, offset, func, pc, uop) = (io.in.valid, io.in.bits.src1, io.in.bits.uop.ctrl.imm, io.in.bits.uop.ctrl.fuOpType, SignExt(io.in.bits.uop.cf.pc, AddrBits), io.in.bits.uop) + val (src1, offset, func, pc, uop) = ( + io.in.bits.src(0), + io.in.bits.uop.ctrl.imm, + io.in.bits.uop.ctrl.fuOpType, + SignExt(io.in.bits.uop.cf.pc, AddrBits), + io.in.bits.uop + ) - val redirectHit = uop.roqIdx.needFlush(io.redirect) - val valid = iovalid && !redirectHit + val redirectHit = uop.roqIdx.needFlush(io.redirectIn) + val valid = io.in.valid && !redirectHit val isRVC = uop.cf.brUpdate.pd.isRVC - val pcDelaySlot = Mux(isRVC, pc + 2.U, pc + 4.U) + val snpc = Mux(isRVC, pc + 2.U, pc + 4.U) val target = src1 + offset // NOTE: src1 is (pc/rf(rs1)), src2 is (offset) - io.out.bits.redirectValid := valid - io.out.bits.redirect.pc := uop.cf.pc - io.out.bits.redirect.target := target - io.out.bits.redirect.brTag := uop.brTag - io.out.bits.redirect.isException := false.B - io.out.bits.redirect.isFlushPipe := false.B - io.out.bits.redirect.isMisPred := DontCare // check this in brq - io.out.bits.redirect.isReplay := false.B - io.out.bits.redirect.roqIdx := uop.roqIdx + redirectOutValid := valid + redirectOut.pc := uop.cf.pc + redirectOut.target := target + redirectOut.brTag := uop.brTag + redirectOut.isException := false.B + redirectOut.isFlushPipe := false.B + redirectOut.isMisPred := DontCare // check this in brq + redirectOut.isReplay := false.B + redirectOut.roqIdx := uop.roqIdx - io.out.bits.brUpdate := uop.cf.brUpdate - io.out.bits.brUpdate.pc := uop.cf.pc - io.out.bits.brUpdate.target := target - io.out.bits.brUpdate.brTarget := target // DontCare - // io.out.bits.brUpdate.btbType := LookupTree(func, 
RV32I_BRUInstr.bruFuncTobtbTypeTable) - io.out.bits.brUpdate.taken := true.B - // io.out.bits.brUpdate.fetchIdx := uop.cf.brUpdate.fetchOffset >> 1.U //TODO: consider RVC - io.out.bits.brUpdate.brTag := uop.brTag + brUpdate := uop.cf.brUpdate + brUpdate.pc := uop.cf.pc + brUpdate.target := target + brUpdate.brTarget := target // DontCare + brUpdate.taken := true.B // Output - val res = pcDelaySlot + val res = snpc io.in.ready := io.out.ready - io.out.valid := valid // TODO: CSR/MOU/FMV may need change it + io.out.valid := valid io.out.bits.uop <> io.in.bits.uop io.out.bits.data := res - io.dmem <> DontCare - io.out.bits.debug <> DontCare - // NOTE: the debug info is for one-cycle exec, if FMV needs multi-cycle, may needs change it XSDebug(io.in.valid, "In(%d %d) Out(%d %d) Redirect:(%d %d %d %d) brTag:%x\n", io.in.valid, io.in.ready, io.out.valid, io.out.ready, - io.redirect.valid, - io.redirect.bits.isException, - io.redirect.bits.isFlushPipe, + io.redirectIn.valid, + io.redirectIn.bits.isException, + io.redirectIn.bits.isFlushPipe, redirectHit, - io.redirect.bits.brTag.value + io.redirectIn.bits.brTag.value ) XSDebug(io.in.valid, "src1:%x offset:%x func:%b type:JUMP pc:%x res:%x\n", src1, offset, func, pc, res) } diff --git a/src/main/scala/xiangshan/backend/fu/Multiplier.scala b/src/main/scala/xiangshan/backend/fu/Multiplier.scala index 8f8cdbb01b153a2a0dc4babe145bfaa322032ff0..83a0c33ed2b59af7c1c049ac39930cc5235223b0 100644 --- a/src/main/scala/xiangshan/backend/fu/Multiplier.scala +++ b/src/main/scala/xiangshan/backend/fu/Multiplier.scala @@ -4,95 +4,171 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ -import xiangshan.backend._ -import xiangshan.backend.fu.FunctionUnit._ +import xiangshan.backend.fu.fpu.util.{C22, C32, C53} class MulDivCtrl extends Bundle{ - val uop = new MicroOp val sign = Bool() val isW = Bool() val isHi = Bool() // return hi bits of result ? 
} -class MulDivOutput extends XSBundle { - val data = UInt(XLEN.W) - val uop = new MicroOp +class AbstractMultiplier(len: Int) extends FunctionUnit( + len +){ + val ctrl = IO(Input(new MulDivCtrl)) } -class MulDivIO(val len: Int) extends XSBundle { - val in = Flipped(DecoupledIO(new Bundle() { - val src1, src2 = UInt(len.W) - val ctrl = new MulDivCtrl - })) - val out = DecoupledIO(new MulDivOutput) - val redirect = Flipped(ValidIO(new Redirect)) -} - -abstract class Multiplier -( - val len: Int, - val latency: Int = 3 -) extends FunctionUnit(mulCfg) { - val io = IO(new MulDivIO(len)) -} +class NaiveMultiplier(len: Int, val latency: Int) + extends AbstractMultiplier(len) + with HasPipelineReg +{ -trait HasPipelineReg { this: ArrayMultiplier => + val (src1, src2) = (io.in.bits.src(0), io.in.bits.src(1)) - val validVec = io.in.valid +: Array.fill(latency)(RegInit(false.B)) - val rdyVec = Array.fill(latency)(Wire(Bool())) :+ io.out.ready - val ctrlVec = io.in.bits.ctrl +: Array.fill(latency)(Reg(new MulDivCtrl)) - val flushVec = ctrlVec.zip(validVec).map(x => x._2 && x._1.uop.roqIdx.needFlush(io.redirect)) + val mulRes = src1.asSInt() * src2.asSInt() - for(i <- 0 until latency){ - rdyVec(i) := !validVec(i+1) || rdyVec(i+1) - } + var dataVec = Seq(mulRes.asUInt()) + var ctrlVec = Seq(ctrl) for(i <- 1 to latency){ - when(flushVec(i-1) || rdyVec(i) && !validVec(i-1)){ - validVec(i) := false.B - }.elsewhen(rdyVec(i-1) && validVec(i-1) && !flushVec(i-1)){ - validVec(i) := validVec(i-1) - ctrlVec(i) := ctrlVec(i-1) - } + dataVec = dataVec :+ PipelineReg(i)(dataVec(i-1)) + ctrlVec = ctrlVec :+ PipelineReg(i)(ctrlVec(i-1)) } - io.in.ready := rdyVec(0) - io.out.valid := validVec.last && !flushVec.last - io.out.bits.uop := ctrlVec.last.uop - - def PipelineReg[T<:Data](i: Int)(next: T) = RegEnable( - next, - enable = validVec(i-1) && rdyVec(i-1) && !flushVec(i-1) - ) + val xlen = io.out.bits.data.getWidth + val res = Mux(ctrlVec.last.isHi, dataVec.last(2*xlen-1, xlen), 
dataVec.last(xlen-1,0)) + io.out.bits.data := Mux(ctrlVec.last.isW, SignExt(res(31,0),xlen), res) - def S1Reg[T<:Data](next: T):T = PipelineReg[T](1)(next) - def S2Reg[T<:Data](next: T):T = PipelineReg[T](2)(next) - def S3Reg[T<:Data](next: T):T = PipelineReg[T](3)(next) - def S4Reg[T<:Data](next: T):T = PipelineReg[T](4)(next) - def S5Reg[T<:Data](next: T):T = PipelineReg[T](5)(next) + XSDebug(p"validVec:${Binary(Cat(validVec))} flushVec:${Binary(Cat(flushVec))}\n") } -class ArrayMultiplier -( - len: Int, - latency: Int = 3, - realArray: Boolean = false -) extends Multiplier(len, latency) with HasPipelineReg { +class ArrayMultiplier(len: Int, doReg: Seq[Int]) extends AbstractMultiplier(len) with HasPipelineReg { + + override def latency = doReg.size + + val doRegSorted = doReg.sortWith(_ < _) + println(doRegSorted) + + val (a, b) = (io.in.bits.src(0), io.in.bits.src(1)) + + val b_sext, bx2, neg_b, neg_bx2 = Wire(UInt((len+1).W)) + b_sext := SignExt(b, len+1) + bx2 := b_sext << 1 + neg_b := (~b_sext).asUInt() + neg_bx2 := neg_b << 1 + + val columns: Array[Seq[Bool]] = Array.fill(2*len)(Seq()) + + var last_x = WireInit(0.U(3.W)) + for(i <- Range(0, len, 2)){ + val x = if(i==0) Cat(a(1,0), 0.U(1.W)) else if(i+1==len) SignExt(a(i, i-1), 3) else a(i+1, i-1) + val pp_temp = MuxLookup(x, 0.U, Seq( + 1.U -> b_sext, + 2.U -> b_sext, + 3.U -> bx2, + 4.U -> neg_bx2, + 5.U -> neg_b, + 6.U -> neg_b + )) + val s = pp_temp(len) + val t = MuxLookup(last_x, 0.U(2.W), Seq( + 4.U -> 2.U(2.W), + 5.U -> 1.U(2.W), + 6.U -> 1.U(2.W) + )) + last_x = x + val (pp, weight) = i match { + case 0 => + (Cat(~s, s, s, pp_temp), 0) + case n if (n==len-1) || (n==len-2) => + (Cat(~s, pp_temp, t), i-2) + case _ => + (Cat(1.U(1.W), ~s, pp_temp, t), i-2) + } + for(j <- columns.indices){ + if(j >= weight && j < (weight + pp.getWidth)){ + columns(j) = columns(j) :+ pp(j-weight) + } + } + } - val mulRes = io.in.bits.src1.asSInt() * io.in.bits.src2.asSInt() + def addOneColumn(col: Seq[Bool], cin: 
Seq[Bool]): (Seq[Bool], Seq[Bool], Seq[Bool]) = { + var sum = Seq[Bool]() + var cout1 = Seq[Bool]() + var cout2 = Seq[Bool]() + col.size match { + case 1 => // do nothing + sum = col ++ cin + case 2 => + val c22 = Module(new C22) + c22.io.in := col + sum = c22.io.out(0).asBool() +: cin + cout2 = Seq(c22.io.out(1).asBool()) + case 3 => + val c32 = Module(new C32) + c32.io.in := col + sum = c32.io.out(0).asBool() +: cin + cout2 = Seq(c32.io.out(1).asBool()) + case 4 => + val c53 = Module(new C53) + for((x, y) <- c53.io.in.take(4) zip col){ + x := y + } + c53.io.in.last := (if(cin.nonEmpty) cin.head else 0.U) + sum = Seq(c53.io.out(0).asBool()) ++ (if(cin.nonEmpty) cin.drop(1) else Nil) + cout1 = Seq(c53.io.out(1).asBool()) + cout2 = Seq(c53.io.out(2).asBool()) + case n => + val cin_1 = if(cin.nonEmpty) Seq(cin.head) else Nil + val cin_2 = if(cin.nonEmpty) cin.drop(1) else Nil + val (s_1, c_1_1, c_1_2) = addOneColumn(col take 4, cin_1) + val (s_2, c_2_1, c_2_2) = addOneColumn(col drop 4, cin_2) + sum = s_1 ++ s_2 + cout1 = c_1_1 ++ c_2_1 + cout2 = c_1_2 ++ c_2_2 + } + (sum, cout1, cout2) + } - var dataVec = Seq(mulRes.asUInt()) + def max(in: Iterable[Int]): Int = in.reduce((a, b) => if(a>b) a else b) + def addAll(cols: Array[Seq[Bool]], depth: Int): (UInt, UInt) = { + if(max(cols.map(_.size)) <= 2){ + val sum = Cat(cols.map(_(0)).reverse) + var k = 0 + while(cols(k).size == 1) k = k+1 + val carry = Cat(cols.drop(k).map(_(1)).reverse) + (sum, Cat(carry, 0.U(k.W))) + } else { + val columns_next = Array.fill(2*len)(Seq[Bool]()) + var cout1, cout2 = Seq[Bool]() + for( i <- cols.indices){ + val (s, c1, c2) = addOneColumn(cols(i), cout1) + columns_next(i) = s ++ cout2 + cout1 = c1 + cout2 = c2 + } + + val needReg = doRegSorted.contains(depth) + val toNextLayer = if(needReg) + columns_next.map(_.map(PipelineReg(doRegSorted.indexOf(depth) + 1)(_))) + else + columns_next + + addAll(toNextLayer, depth+1) + } + } + val (sum, carry) = addAll(cols = columns, depth = 0) + val 
result = sum + carry + + var ctrlVec = Seq(ctrl) for(i <- 1 to latency){ - dataVec = dataVec :+ PipelineReg(i)(dataVec(i-1)) + ctrlVec = ctrlVec :+ PipelineReg(i)(ctrlVec(i-1)) } - val xlen = io.out.bits.data.getWidth - val res = Mux(ctrlVec.last.isHi, dataVec.last(2*xlen-1, xlen), dataVec.last(xlen-1,0)) - io.out.bits.data := Mux(ctrlVec.last.isW, SignExt(res(31,0),xlen), res) + val res = Mux(ctrlVec.last.isHi, result(2*xlen-1, xlen), result(xlen-1,0)) - XSDebug(p"validVec:${Binary(Cat(validVec))} flushVec:${Binary(Cat(flushVec))}\n")(this.name) + io.out.bits.data := Mux(ctrlVec.last.isW, SignExt(res(31,0),xlen), res) - // printf(p"t=${GTimer()} in: v${io.in.valid} r:${io.in.ready}\n") - // printf(p"t=${GTimer()} out: v:${io.out.valid} r:${io.out.ready} vec:${Binary(Cat(validVec))}\n") + XSDebug(p"validVec:${Binary(Cat(validVec))} flushVec:${Binary(Cat(flushVec))}\n") } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/fu/Divider.scala b/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala similarity index 80% rename from src/main/scala/xiangshan/backend/fu/Divider.scala rename to src/main/scala/xiangshan/backend/fu/Radix2Divider.scala index 5a3a1401440ff02e893bc998e5893df0e5f98caf..67fd4a6fe156d0d3b6a5a784c730146ed4207ee1 100644 --- a/src/main/scala/xiangshan/backend/fu/Divider.scala +++ b/src/main/scala/xiangshan/backend/fu/Radix2Divider.scala @@ -4,12 +4,13 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ -import xiangshan.backend._ -import xiangshan.backend.fu.FunctionUnit._ +abstract class AbstractDivider(len: Int) extends FunctionUnit(len){ + val ctrl = IO(Input(new MulDivCtrl)) + val sign = ctrl.sign +} -class Divider(len: Int) extends FunctionUnit(divCfg) { - val io = IO(new MulDivIO(len)) +class Radix2Divider(len: Int) extends AbstractDivider(len) { def abs(a: UInt, sign: Bool): (Bool, UInt) = { val s = a(len - 1) && sign @@ -20,7 +21,7 @@ class Divider(len: Int) extends FunctionUnit(divCfg) { val 
state = RegInit(s_idle) val newReq = (state === s_idle) && io.in.fire() - val (a, b) = (io.in.bits.src1, io.in.bits.src2) + val (a, b) = (io.in.bits.src(0), io.in.bits.src(1)) val divBy0 = b === 0.U(len.W) val divBy0Reg = RegEnable(divBy0, newReq) @@ -28,13 +29,16 @@ class Divider(len: Int) extends FunctionUnit(divCfg) { val hi = shiftReg(len * 2, len) val lo = shiftReg(len - 1, 0) - val (aSign, aVal) = abs(a, io.in.bits.ctrl.sign) - val (bSign, bVal) = abs(b, io.in.bits.ctrl.sign) + val uop = io.in.bits.uop + + val (aSign, aVal) = abs(a, sign) + val (bSign, bVal) = abs(b, sign) val aSignReg = RegEnable(aSign, newReq) val qSignReg = RegEnable((aSign ^ bSign) && !divBy0, newReq) val bReg = RegEnable(bVal, newReq) val aValx2Reg = RegEnable(Cat(aVal, "b0".U), newReq) - val ctrlReg = RegEnable(io.in.bits.ctrl, newReq) + val ctrlReg = RegEnable(ctrl, newReq) + val uopReg = RegEnable(uop, newReq) val cnt = Counter(len) when (newReq) { @@ -67,7 +71,8 @@ class Divider(len: Int) extends FunctionUnit(divCfg) { } } - when(state=/=s_idle && ctrlReg.uop.roqIdx.needFlush(io.redirect)){ + val kill = state=/=s_idle && uopReg.roqIdx.needFlush(io.redirectIn) + when(kill){ state := s_idle } @@ -78,8 +83,8 @@ class Divider(len: Int) extends FunctionUnit(divCfg) { val xlen = io.out.bits.data.getWidth val res = Mux(ctrlReg.isHi, resR, resQ) io.out.bits.data := Mux(ctrlReg.isW, SignExt(res(31,0),xlen), res) - io.out.bits.uop := ctrlReg.uop + io.out.bits.uop := uopReg - io.out.valid := state === s_finish + io.out.valid := state === s_finish && !kill io.in.ready := state === s_idle } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala b/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala new file mode 100644 index 0000000000000000000000000000000000000000..df947e72a5896782831bf85772cb01924ed3a350 --- /dev/null +++ b/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala @@ -0,0 +1,230 @@ +package xiangshan.backend.fu + +import chisel3._ +import 
chisel3.util._ +import utils.SignExt +import xiangshan.backend.fu.fpu.util.CSA3_2 + +/** A Radix-4 SRT Integer Divider + * + * 2 ~ (5 + (len+3)/2) cycles are needed for each division. + */ +class SRT4Divider(len: Int) extends AbstractDivider(len) { + + val s_idle :: s_lzd :: s_normlize :: s_recurrence :: s_recovery :: s_finish :: Nil = Enum(6) + val state = RegInit(s_idle) + val newReq = (state === s_idle) && io.in.fire() + val cnt_next = Wire(UInt(log2Up((len+3)/2).W)) + val cnt = RegEnable(cnt_next, state===s_normlize || state===s_recurrence) + val rec_enough = cnt_next === 0.U + + def abs(a: UInt, sign: Bool): (Bool, UInt) = { + val s = a(len - 1) && sign + (s, Mux(s, -a, a)) + } + val (a, b) = (io.in.bits.src(0), io.in.bits.src(1)) + val uop = io.in.bits.uop + val (aSign, aVal) = abs(a, sign) + val (bSign, bVal) = abs(b, sign) + val aSignReg = RegEnable(aSign, newReq) + val qSignReg = RegEnable(aSign ^ bSign, newReq) + val uopReg = RegEnable(uop, newReq) + val ctrlReg = RegEnable(ctrl, newReq) + val divZero = b === 0.U + val divZeroReg = RegEnable(divZero, newReq) + + val kill = state=/=s_idle && uopReg.roqIdx.needFlush(io.redirectIn) + + switch(state){ + is(s_idle){ + when(io.in.fire()){ state := Mux(divZero, s_finish, s_lzd) } + } + is(s_lzd){ // leading zero detection + state := s_normlize + } + is(s_normlize){ // shift a/b + state := s_recurrence + } + is(s_recurrence){ // (ws[j+1], wc[j+1]) = 4(ws[j],wc[j]) - q(j+1)*d + when(rec_enough){ state := s_recovery } + } + is(s_recovery){ // if rem < 0, rem = rem + d + state := s_finish + } + is(s_finish){ + when(io.out.fire()){ state := s_idle } + } + } + when(kill){ + state := s_idle + } + + /** Calculate abs(a)/abs(b) by recurrence + * + * ws, wc: partial remainder in carry-save form, + * in recurrence steps, ws/wc = 4ws[j]/4wc[j]; + * in recovery step, ws/wc = ws[j]/wc[j]; + * in final step, ws = abs(a)/abs(b). 
+ * + * d: normlized divisor(1/2<=d<1) + * + * wLen = 3 integer bits + (len+1) frac bits + */ + def wLen = 3 + len + 1 + val ws, wc = Reg(UInt(wLen.W)) + val ws_next, wc_next = Wire(UInt(wLen.W)) + val d = Reg(UInt(wLen.W)) + + val aLeadingZeros = RegEnable( + next = PriorityEncoder(ws(len-1, 0).asBools().reverse), + enable = state===s_lzd + ) + val bLeadingZeros = RegEnable( + next = PriorityEncoder(d(len-1, 0).asBools().reverse), + enable = state===s_lzd + ) + val diff = Cat(0.U(1.W), bLeadingZeros).asSInt() - Cat(0.U(1.W), aLeadingZeros).asSInt() + val isNegDiff = diff(diff.getWidth - 1) + val quotientBits = Mux(isNegDiff, 0.U, diff.asUInt()) + val qBitsIsOdd = quotientBits(0) + val recoveryShift = RegEnable(len.U - bLeadingZeros, state===s_normlize) + val a_shifted, b_shifted = Wire(UInt(len.W)) + a_shifted := Mux(isNegDiff, + ws(len-1, 0) << bLeadingZeros, + ws(len-1, 0) << aLeadingZeros + ) + b_shifted := d(len-1, 0) << bLeadingZeros + + val rem_temp = ws + wc + val rem_fixed = Mux(rem_temp(wLen-1), rem_temp + d, rem_temp) + val rem_abs = (rem_fixed << recoveryShift)(2*len, len+1) + + when(newReq){ + ws := Cat(0.U(4.W), Mux(divZero, a, aVal)) + wc := 0.U + d := Cat(0.U(4.W), bVal) + }.elsewhen(state === s_normlize){ + d := Cat(0.U(3.W), b_shifted, 0.U(1.W)) + ws := Mux(qBitsIsOdd, a_shifted, a_shifted << 1) + }.elsewhen(state === s_recurrence){ + ws := Mux(rec_enough, ws_next, ws_next << 2) + wc := Mux(rec_enough, wc_next, wc_next << 2) + }.elsewhen(state === s_recovery){ + ws := rem_abs + } + + cnt_next := Mux(state === s_normlize, (quotientBits + 3.U) >> 1, cnt - 1.U) + + /** Quotient selection + * + * the quotient selection table use truncated 7-bit remainder + * and 3-bit divisor + */ + val sel_0 :: sel_d :: sel_dx2 :: sel_neg_d :: sel_neg_dx2 :: Nil = Enum(5) + val dx2, neg_d, neg_dx2 = Wire(UInt(wLen.W)) + dx2 := d << 1 + neg_d := (~d).asUInt() // add '1' in carry-save adder later + neg_dx2 := neg_d << 1 + + val q_sel = Wire(UInt(3.W)) + val wc_adj = 
MuxLookup(q_sel, 0.U(2.W), Seq( + sel_d -> 1.U(2.W), + sel_dx2 -> 2.U(2.W) + )) + + val w_truncated = (ws(wLen-1, wLen-1-6) + wc(wLen-1, wLen-1-6)).asSInt() + val d_truncated = d(len-1, len-3) + + val qSelTable = Array( + Array(12, 4, -4, -13), + Array(14, 4, -6, -15), + Array(15, 4, -6, -16), + Array(16, 4, -6, -18), + Array(18, 6, -8, -20), + Array(20, 6, -8, -20), + Array(20, 8, -8, -22), + Array(24, 8, -8, -24) + ) + + // ge(x): w_truncated >= x + var ge = Map[Int, Bool]() + for(row <- qSelTable){ + for(k <- row){ + if(!ge.contains(k)) ge = ge + (k -> (w_truncated >= k.S(7.W))) + } + } + q_sel := MuxLookup(d_truncated, sel_0, + qSelTable.map(x => + MuxCase(sel_neg_dx2, Seq( + ge(x(0)) -> sel_dx2, + ge(x(1)) -> sel_d, + ge(x(2)) -> sel_0, + ge(x(3)) -> sel_neg_d + )) + ).zipWithIndex.map({case(v, i) => i.U -> v}) + ) + + /** Calculate (ws[j+1],wc[j+1]) by a [3-2]carry-save adder + * + * (ws[j+1], wc[j+1]) = 4(ws[j],wc[j]) - q(j+1)*d + */ + val csa = Module(new CSA3_2(wLen)) + csa.io.in(0) := ws + csa.io.in(1) := Cat(wc(wLen-1, 2), wc_adj) + csa.io.in(2) := MuxLookup(q_sel, 0.U, Seq( + sel_d -> neg_d, + sel_dx2 -> neg_dx2, + sel_neg_d -> d, + sel_neg_dx2 -> dx2 + )) + ws_next := csa.io.out(0) + wc_next := csa.io.out(1) << 1 + + // On the fly quotient conversion + val q, qm = Reg(UInt(len.W)) + when(newReq){ + q := 0.U + qm := 0.U + }.elsewhen(state === s_recurrence){ + val qMap = Seq( + sel_0 -> (q, 0), + sel_d -> (q, 1), + sel_dx2 -> (q, 2), + sel_neg_d -> (qm, 3), + sel_neg_dx2 -> (qm, 2) + ) + q := MuxLookup(q_sel, 0.U, + qMap.map(m => m._1 -> Cat(m._2._1(len-3, 0), m._2._2.U(2.W))) + ) + val qmMap = Seq( + sel_0 -> (qm, 3), + sel_d -> (q, 0), + sel_dx2 -> (q, 1), + sel_neg_d -> (qm, 2), + sel_neg_dx2 -> (qm, 1) + ) + qm := MuxLookup(q_sel, 0.U, + qmMap.map(m => m._1 -> Cat(m._2._1(len-3, 0), m._2._2.U(2.W))) + ) + }.elsewhen(state === s_recovery){ + q := Mux(rem_temp(wLen-1), qm, q) + } + + + val remainder = Mux(aSignReg, -ws(len-1, 0), ws(len-1, 0)) + val 
quotient = Mux(qSignReg, -q, q) + + val res = Mux(ctrlReg.isHi, + Mux(divZeroReg, ws(len-1, 0), remainder), + Mux(divZeroReg, Fill(len, 1.U(1.W)), quotient) + ) + + io.in.ready := state===s_idle + io.out.valid := state===s_finish && !kill + io.out.bits.data := Mux(ctrlReg.isW, + SignExt(res(31, 0), len), + res + ) + io.out.bits.uop := uopReg + +} diff --git a/src/main/scala/fpu/Classify.scala b/src/main/scala/xiangshan/backend/fu/fpu/Classify.scala similarity index 98% rename from src/main/scala/fpu/Classify.scala rename to src/main/scala/xiangshan/backend/fu/fpu/Classify.scala index 45353000f4fe66c4a41b2523341b5a3488336448..b27c47e9605e3f78d0b8fd7dd2df7ac23c1143b6 100644 --- a/src/main/scala/fpu/Classify.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/Classify.scala @@ -1,4 +1,4 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ diff --git a/src/main/scala/fpu/F32toF64.scala b/src/main/scala/xiangshan/backend/fu/fpu/F32toF64.scala similarity index 75% rename from src/main/scala/fpu/F32toF64.scala rename to src/main/scala/xiangshan/backend/fu/fpu/F32toF64.scala index 454dddabdc4830c91104b922669b930bc350ca09..0bd4b21b0de16b8c94df0c65c04f0457e423b995 100644 --- a/src/main/scala/fpu/F32toF64.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/F32toF64.scala @@ -1,12 +1,16 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ +import xiangshan.FuType +import xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} +import xiangshan.backend.fu.FunctionUnit._ -class F32toF64 extends FPUSubModule with HasPipelineReg { - def latency: Int = 2 +class F32toF64 extends FPUPipelineModule { - val a = io.in.bits.a + override def latency: Int = FunctionUnit.s2dCfg.latency.latencyVal.get + + val a = io.in.bits.src(0) val f32 = Float32(a) val classify = Module(new Classify(Float32.expWidth, Float32.mantWidth)) @@ -56,10 +60,10 @@ class F32toF64 extends FPUSubModule with HasPipelineReg { ) val result = 
Mux(s1_isNaN, Float64.defaultNaN, commonResult) - io.out.bits.result := S2Reg(result) - io.out.bits.fflags.invalid := S2Reg(s1_isSNaN) - io.out.bits.fflags.overflow := false.B - io.out.bits.fflags.underflow := false.B - io.out.bits.fflags.infinite := false.B - io.out.bits.fflags.inexact := false.B + io.out.bits.data := S2Reg(result) + fflags.invalid := S2Reg(s1_isSNaN) + fflags.overflow := false.B + fflags.underflow := false.B + fflags.infinite := false.B + fflags.inexact := false.B } diff --git a/src/main/scala/fpu/F64toF32.scala b/src/main/scala/xiangshan/backend/fu/fpu/F64toF32.scala similarity index 77% rename from src/main/scala/fpu/F64toF32.scala rename to src/main/scala/xiangshan/backend/fu/fpu/F64toF32.scala index e2eb4b094043d29eb2bc11bd6dec86912a95c410..079fd3baeff07223de61c683e4e128ecb3a87a92 100644 --- a/src/main/scala/fpu/F64toF32.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/F64toF32.scala @@ -1,15 +1,18 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ -import fpu.util.ShiftRightJam +import xiangshan.FuType +import xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} +import xiangshan.backend.fu.fpu.util.ShiftRightJam + +class F64toF32 extends FPUPipelineModule { + + override def latency = FunctionUnit.d2sCfg.latency.latencyVal.get -class F64toF32 extends FPUSubModule with HasPipelineReg { - def latency: Int = 2 def SEXP_WIDTH = Float64.expWidth + 2 - val rm = io.in.bits.rm - val a = io.in.bits.a + val a = io.in.bits.src(0) val classify = Module(new Classify(Float64.expWidth, Float64.mantWidth)) classify.io.in := a @@ -67,11 +70,11 @@ class F64toF32 extends FPUSubModule with HasPipelineReg { ) ) - io.out.bits.result := S2Reg(result) - io.out.bits.fflags.invalid := S2Reg(s1_isSNaN) - io.out.bits.fflags.overflow := S2Reg(overflow) - io.out.bits.fflags.underflow := S2Reg(underflow) - io.out.bits.fflags.infinite := false.B - io.out.bits.fflags.inexact := S2Reg(inexact) + io.out.bits.data := 
S2Reg(result) + fflags.invalid := S2Reg(s1_isSNaN) + fflags.overflow := S2Reg(overflow) + fflags.underflow := S2Reg(underflow) + fflags.infinite := false.B + fflags.inexact := S2Reg(inexact) } diff --git a/src/main/scala/fpu/FCMP.scala b/src/main/scala/xiangshan/backend/fu/fpu/FCMP.scala similarity index 70% rename from src/main/scala/fpu/FCMP.scala rename to src/main/scala/xiangshan/backend/fu/fpu/FCMP.scala index d90dd9829b80a1d729927b8d5bdafe5b8dd0d18e..5d5d5c6f17f4cf7beed7ec2768ce7fc7d7760e52 100644 --- a/src/main/scala/fpu/FCMP.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FCMP.scala @@ -1,13 +1,16 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ +import xiangshan.FuType +import xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} +import xiangshan.backend.fu.FunctionUnit._ -class FCMP extends FPUSubModule with HasPipelineReg{ - def latency = 2 +class FCMP extends FPUPipelineModule { - val isDouble = io.in.bits.isDouble - val src = Seq(io.in.bits.a, io.in.bits.b).map(x => Mux(isDouble, x, extF32ToF64(x))) + override def latency = FunctionUnit.fcmpCfg.latency.latencyVal.get + + val src = io.in.bits.src.map(x => Mux(isDouble, x, extF32ToF64(x))) val sign = src.map(_(63)) val aSign = sign(0) @@ -20,8 +23,8 @@ class FCMP extends FPUSubModule with HasPipelineReg{ val srcIsSNaN = classify.map(_.isSNaN) val isDoubleReg = S1Reg(isDouble) - val opReg = S1Reg(io.in.bits.op) - val srcReg = Seq(io.in.bits.a, io.in.bits.b).map(S1Reg) + val opReg = S1Reg(op) + val srcReg = io.in.bits.src.map(S1Reg) val (aSignReg, bSignReg) = (S1Reg(sign(0)), S1Reg(sign(1))) val hasNaNReg = S1Reg(srcIsNaN(0) || srcIsNaN(1)) @@ -50,10 +53,10 @@ class FCMP extends FPUSubModule with HasPipelineReg{ val min = Mux(bothNaNReg, defaultNaN, Mux(sel_a && !aIsNaNReg, srcReg(0), srcReg(1))) val max = Mux(bothNaNReg, defaultNaN, Mux(!sel_a && !aIsNaNReg, srcReg(0), srcReg(1))) - io.out.bits.fflags.inexact := false.B - io.out.bits.fflags.underflow := 
false.B - io.out.bits.fflags.overflow := false.B - io.out.bits.fflags.infinite := false.B - io.out.bits.fflags.invalid := S2Reg(invalid) - io.out.bits.result := S2Reg(Mux(opReg===0.U, min, Mux(opReg===1.U, max, fcmpResult))) + fflags.inexact := false.B + fflags.underflow := false.B + fflags.overflow := false.B + fflags.infinite := false.B + fflags.invalid := S2Reg(invalid) + io.out.bits.data := S2Reg(Mux(opReg===0.U, min, Mux(opReg===1.U, max, fcmpResult))) } \ No newline at end of file diff --git a/src/main/scala/fpu/FMV.scala b/src/main/scala/xiangshan/backend/fu/fpu/FMV.scala similarity index 68% rename from src/main/scala/fpu/FMV.scala rename to src/main/scala/xiangshan/backend/fu/fpu/FMV.scala index 06e53475f0db285d9f1fe765d370f6b0a2b76578..b51878f22fd24a05afbebc5186ea69c28e54b512 100644 --- a/src/main/scala/fpu/FMV.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/FMV.scala @@ -1,15 +1,15 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ +import xiangshan.FuType +import xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} -class FMV(XLEN: Int) extends FPUSubModule with HasPipelineReg { +class FMV(XLEN: Int) extends FPUPipelineModule { - def latency = 1 + override def latency = FunctionUnit.fmvCfg.latency.latencyVal.get - val isDouble = io.in.bits.isDouble - val op = io.in.bits.op - val src = Seq(io.in.bits.a, io.in.bits.b).map(x => + val src = io.in.bits.src.map(x => Mux(isDouble || op(2,1)==="b00".U, x, extF32ToF64(x)) ) val aSign = Mux(op(2,1)==="b00".U && !isDouble, src(0)(31), src(0)(63)) @@ -39,12 +39,12 @@ class FMV(XLEN: Int) extends FPUSubModule with HasPipelineReg { val result = Mux(op === "b010".U, classifyResult, Mux(isDouble, - Cat(resSign, io.in.bits.a(62, 0)), - Cat(resSign, io.in.bits.a(30 ,0)) + Cat(resSign, io.in.bits.src(0)(62, 0)), + Cat(resSign, io.in.bits.src(0)(30 ,0)) ) ) val resultReg = S1Reg(result) - io.out.bits.result := resultReg - io.out.bits.fflags := 0.U.asTypeOf(new Fflags) + 
io.out.bits.data := resultReg + fflags := 0.U.asTypeOf(new Fflags) } diff --git a/src/main/scala/xiangshan/backend/fu/fpu/FPUSubModule.scala b/src/main/scala/xiangshan/backend/fu/fpu/FPUSubModule.scala new file mode 100644 index 0000000000000000000000000000000000000000..bd5ed61d528978fbd5cc3e93ceede2d8249237df --- /dev/null +++ b/src/main/scala/xiangshan/backend/fu/fpu/FPUSubModule.scala @@ -0,0 +1,48 @@ +package xiangshan.backend.fu.fpu + +import chisel3._ +import chisel3.util._ +import xiangshan.backend.fu.{FuConfig, FunctionUnit, HasPipelineReg} + + +class FPUSubModuleInput extends Bundle{ + val op = UInt(3.W) + val isDouble = Bool() + val a, b, c = UInt(64.W) + val rm = UInt(3.W) +} + +class FPUSubModuleOutput extends Bundle{ + val fflags = new Fflags + val result = UInt(64.W) +} + +class FPUSubModuleIO extends Bundle{ + val in = Flipped(DecoupledIO(new FPUSubModuleInput)) + val out = DecoupledIO(new FPUSubModuleOutput) +} + +trait HasUIntToSIntHelper { + implicit class UIntToSIntHelper(x: UInt){ + def toSInt: SInt = Cat(0.U(1.W), x).asSInt() + } +} + +trait HasFPUSigs { this: FPUSubModule => + val op = io.in.bits.uop.ctrl.fuOpType(2, 0) + // 'op' must change with fuOpType + require(io.in.bits.uop.ctrl.fuOpType.getWidth == 7) + val isDouble = !io.in.bits.uop.ctrl.isRVF +} + +abstract class FPUSubModule extends FunctionUnit + with HasUIntToSIntHelper + with HasFPUSigs +{ + val rm = IO(Input(UInt(3.W))) + val fflags = IO(Output(new Fflags)) +} + +abstract class FPUPipelineModule + extends FPUSubModule + with HasPipelineReg \ No newline at end of file diff --git a/src/main/scala/fpu/FloatToInt.scala b/src/main/scala/xiangshan/backend/fu/fpu/FloatToInt.scala similarity index 80% rename from src/main/scala/fpu/FloatToInt.scala rename to src/main/scala/xiangshan/backend/fu/fpu/FloatToInt.scala index 2ee0c860e80c4182476ea475b39f39687462b915..e90161bc1b4284939a37aa9612fa642e975583ae 100644 --- a/src/main/scala/fpu/FloatToInt.scala +++ 
b/src/main/scala/xiangshan/backend/fu/fpu/FloatToInt.scala @@ -1,27 +1,26 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ -import fpu.util.{ORTree, ShiftRightJam} +import xiangshan.FuType +import xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} +import xiangshan.backend.fu.fpu.util.{ORTree, ShiftRightJam} //def f2w:UInt = FpuOp("011", "000") //def f2wu:UInt = FpuOp("011", "001") //def f2l:UInt = FpuOp("011", "010") //def f2lu:UInt = FpuOp("011", "011") -class FloatToInt extends FPUSubModule with HasPipelineReg { +class FloatToInt extends FPUPipelineModule { - def latency = 2 + override def latency = FunctionUnit.f2iCfg.latency.latencyVal.get def SEXP_WIDTH = Float64.expWidth + 2 /** Stage 1: Shift Operand */ - val op = io.in.bits.op - val rm = io.in.bits.rm - val isDouble = io.in.bits.isDouble - val a = Mux(isDouble, io.in.bits.a, extF32ToF64(io.in.bits.a)) + val a = Mux(isDouble, io.in.bits.src(0), extF32ToF64(io.in.bits.src(0))) val f64 = Float64(a) val cls = Module(new Classify(Float64.expWidth, Float64.mantWidth)) @@ -97,10 +96,10 @@ class FloatToInt extends FPUSubModule with HasPipelineReg { /** Assign Outputs */ - io.out.bits.result := s2_result - io.out.bits.fflags.invalid := s2_invalid - io.out.bits.fflags.overflow := false.B - io.out.bits.fflags.underflow := false.B - io.out.bits.fflags.infinite := false.B - io.out.bits.fflags.inexact := s2_inexact + io.out.bits.data := s2_result + fflags.invalid := s2_invalid + fflags.overflow := false.B + fflags.underflow := false.B + fflags.infinite := false.B + fflags.inexact := s2_inexact } \ No newline at end of file diff --git a/src/main/scala/xiangshan/backend/fu/fpu/IntToFloat.scala b/src/main/scala/xiangshan/backend/fu/fpu/IntToFloat.scala new file mode 100644 index 0000000000000000000000000000000000000000..f3ea4b90369d94a383fd5e184c3234c27c3d9295 --- /dev/null +++ b/src/main/scala/xiangshan/backend/fu/fpu/IntToFloat.scala @@ -0,0 +1,79 @@ +//package 
xiangshan.backend.fu.fpu +// +//import chisel3._ +//import chisel3.util._ +//import xiangshan.FuType +//import xiangshan.backend.fu.{CertainLatency, FuConfig} +//import xiangshan.backend.fu.fpu.util.ORTree +// +//class IntToFloat extends FPUPipelineModule( +// FuConfig(FuType.i2f, 1, 0, writeIntRf = false, writeFpRf = true, hasRedirect = false, CertainLatency(2)) +//) { +// /** Stage 1: Count leading zeros and shift +// */ +// +// val a = io.in.bits.src(0) +// val aNeg = (~a).asUInt() +// val aComp = aNeg + 1.U +// val aSign = Mux(op(0), false.B, Mux(op(1), a(63), a(31))) +// +// val leadingZerosComp = PriorityEncoder(Mux(op(1), aComp, aComp(31, 0)).asBools().reverse) +// val leadingZerosNeg = PriorityEncoder(Mux(op(1), aNeg, aNeg(31, 0)).asBools().reverse) +// val leadingZerosPos = PriorityEncoder(Mux(op(1), a, a(31,0)).asBools().reverse) +// +// val aVal = Mux(aSign, Mux(op(1), aComp, aComp(31, 0)), Mux(op(1), a, a(31, 0))) +// val leadingZeros = Mux(aSign, leadingZerosNeg, leadingZerosPos) +// +// // exp = xlen - 1 - leadingZeros + bias +// val expUnrounded = S1Reg( +// Mux(isDouble, +// (64 - 1 + Float64.expBiasInt).U - leadingZeros, +// (64 - 1 + Float32.expBiasInt).U - leadingZeros +// ) +// ) +// val leadingZeroHasError = S1Reg(aSign && (leadingZerosComp=/=leadingZerosNeg)) +// val rmReg = S1Reg(rm) +// val opReg = S1Reg(op) +// val isDoubleReg = S1Reg(isDouble) +// val aIsZeroReg = S1Reg(a===0.U) +// val aSignReg = S1Reg(aSign) +// val aShifted = S1Reg((aVal << leadingZeros)(63, 0)) +// +// /** Stage 2: Rounding +// */ +// val aShiftedFix = Mux(leadingZeroHasError, aShifted(63, 1), aShifted(62, 0)) +// val mantD = aShiftedFix(62, 62-51) +// val mantS = aShiftedFix(62, 62-22) +// +// val g = Mux(isDoubleReg, aShiftedFix(62-52), aShiftedFix(62-23)) +// val r = Mux(isDoubleReg, aShiftedFix(62-53), aShiftedFix(62-24)) +// val s = Mux(isDoubleReg, ORTree(aShiftedFix(62-54, 0)), ORTree(aShiftedFix(62-25, 0))) +// +// val roudingUnit = Module(new 
RoundingUnit(Float64.mantWidth)) +// roudingUnit.io.in.rm := rmReg +// roudingUnit.io.in.mant := Mux(isDoubleReg, mantD, mantS) +// roudingUnit.io.in.sign := aSignReg +// roudingUnit.io.in.guard := g +// roudingUnit.io.in.round := r +// roudingUnit.io.in.sticky := s +// +// val mantRounded = roudingUnit.io.out.mantRounded +// val expRounded = Mux(isDoubleReg, +// expUnrounded + roudingUnit.io.out.mantCout, +// expUnrounded + mantRounded(Float32.mantWidth) +// ) + leadingZeroHasError +// +// val resS = Cat( +// aSignReg, +// expRounded(Float32.expWidth-1, 0), +// mantRounded(Float32.mantWidth-1, 0) +// ) +// val resD = Cat(aSignReg, expRounded, mantRounded) +// +// io.out.bits.data := S2Reg(Mux(aIsZeroReg, 0.U, Mux(isDoubleReg, resD, resS))) +// fflags.inexact := S2Reg(roudingUnit.io.out.inexact) +// fflags.underflow := false.B +// fflags.overflow := false.B +// fflags.infinite := false.B +// fflags.invalid := false.B +//} diff --git a/src/main/scala/fpu/IntToFloat.scala b/src/main/scala/xiangshan/backend/fu/fpu/IntToFloatSingleCycle.scala similarity index 62% rename from src/main/scala/fpu/IntToFloat.scala rename to src/main/scala/xiangshan/backend/fu/fpu/IntToFloatSingleCycle.scala index af63994ebd73ad0a553cde408e1581fa3e705ba5..c7ee781af2cc9700686bd7cd54f541c0da5163d5 100644 --- a/src/main/scala/fpu/IntToFloat.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/IntToFloatSingleCycle.scala @@ -1,18 +1,14 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ -import fpu.util.ORTree +import xiangshan.FuType +import xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} +import xiangshan.backend.fu.fpu.util.ORTree -class IntToFloat extends FPUSubModule with HasPipelineReg { - def latency = 2 +class IntToFloatSingleCycle extends FPUSubModule { - /** Stage 1: Count leading zeros and shift - */ - - val op = io.in.bits.op - val isDouble = io.in.bits.isDouble - val a = io.in.bits.a + val a = io.in.bits.src(0) val aNeg = 
(~a).asUInt() val aComp = aNeg + 1.U val aSign = Mux(op(0), false.B, Mux(op(1), a(63), a(31))) @@ -25,19 +21,18 @@ class IntToFloat extends FPUSubModule with HasPipelineReg { val leadingZeros = Mux(aSign, leadingZerosNeg, leadingZerosPos) // exp = xlen - 1 - leadingZeros + bias - val expUnrounded = S1Reg( - Mux(isDouble, - (64 - 1 + Float64.expBiasInt).U - leadingZeros, - (64 - 1 + Float32.expBiasInt).U - leadingZeros - ) + val expUnrounded = Mux(isDouble, + (64 - 1 + Float64.expBiasInt).U - leadingZeros, + (64 - 1 + Float32.expBiasInt).U - leadingZeros ) - val leadingZeroHasError = S1Reg(aSign && (leadingZerosComp=/=leadingZerosNeg)) - val rmReg = S1Reg(io.in.bits.rm) - val opReg = S1Reg(op) - val isDoubleReg = S1Reg(isDouble) - val aIsZeroReg = S1Reg(a===0.U) - val aSignReg = S1Reg(aSign) - val aShifted = S1Reg((aVal << leadingZeros)(63, 0)) + + val leadingZeroHasError = aSign && (leadingZerosComp=/=leadingZerosNeg) + val rmReg = rm + val opReg = op + val isDoubleReg = isDouble + val aIsZeroReg = a===0.U + val aSignReg = aSign + val aShifted = (aVal << leadingZeros)(63, 0) /** Stage 2: Rounding */ @@ -70,10 +65,14 @@ class IntToFloat extends FPUSubModule with HasPipelineReg { ) val resD = Cat(aSignReg, expRounded, mantRounded) - io.out.bits.result := S2Reg(Mux(aIsZeroReg, 0.U, Mux(isDoubleReg, resD, resS))) - io.out.bits.fflags.inexact := S2Reg(roudingUnit.io.out.inexact) - io.out.bits.fflags.underflow := false.B - io.out.bits.fflags.overflow := false.B - io.out.bits.fflags.infinite := false.B - io.out.bits.fflags.invalid := false.B + io.in.ready := io.out.ready + io.out.valid := io.in.valid + io.out.bits.uop := io.in.bits.uop + io.out.bits.data := Mux(aIsZeroReg, 0.U, Mux(isDoubleReg, resD, resS)) + fflags.inexact := roudingUnit.io.out.inexact + fflags.underflow := false.B + fflags.overflow := false.B + fflags.infinite := false.B + fflags.invalid := false.B + } diff --git a/src/main/scala/fpu/README.md b/src/main/scala/xiangshan/backend/fu/fpu/README.md 
similarity index 100% rename from src/main/scala/fpu/README.md rename to src/main/scala/xiangshan/backend/fu/fpu/README.md diff --git a/src/main/scala/fpu/RoundingUnit.scala b/src/main/scala/xiangshan/backend/fu/fpu/RoundingUnit.scala similarity index 97% rename from src/main/scala/fpu/RoundingUnit.scala rename to src/main/scala/xiangshan/backend/fu/fpu/RoundingUnit.scala index 1f1534ed931dbc384562c8d1fc72a32394cdb1a6..6e2da25ddb4c1959bbeb4556d62fd769de71584e 100644 --- a/src/main/scala/fpu/RoundingUnit.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/RoundingUnit.scala @@ -1,9 +1,9 @@ -package fpu +package xiangshan.backend.fu.fpu import chisel3._ import chisel3.util._ -import fpu.RoudingMode._ -import fpu.util.ORTree +import xiangshan.backend.fu.fpu.RoudingMode._ +import xiangshan.backend.fu.fpu.util.ORTree class RoundingUnit(mantWidth: Int) extends Module{ val io = IO(new Bundle() { diff --git a/src/main/scala/fpu/divsqrt/DivSqrt.scala b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/DivSqrt.scala similarity index 79% rename from src/main/scala/fpu/divsqrt/DivSqrt.scala rename to src/main/scala/xiangshan/backend/fu/fpu/divsqrt/DivSqrt.scala index de5de63f29e0e2958570ec0a61eb5230124da80f..d8c6cb40056aa1703cc5ac6c58adf79a8a1562a8 100644 --- a/src/main/scala/fpu/divsqrt/DivSqrt.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/DivSqrt.scala @@ -1,10 +1,11 @@ -package fpu.divsqrt +package xiangshan.backend.fu.fpu.divsqrt -import fpu._ +import xiangshan.backend.fu.fpu._ import chisel3._ import chisel3.util._ -import fpu.util.{FPUDebug, ORTree, ShiftRightJam} - +import xiangshan.FuType +import xiangshan.backend.fu.{FuConfig, FunctionUnit, UncertainLatency} +import xiangshan.backend.fu.fpu.util.{FPUDebug, ORTree, ShiftRightJam} class DivSqrt extends FPUSubModule { @@ -16,16 +17,17 @@ class DivSqrt extends FPUSubModule { val s_idle :: s_norm :: s_start :: s_compute :: s_round:: s_finish :: Nil = Enum(6) val state = RegInit(s_idle) - val rm = io.in.bits.rm 
+ + val uopReg = RegEnable(io.in.bits.uop, io.in.fire()) + val kill = state=/=s_idle && uopReg.roqIdx.needFlush(io.redirectIn) val rmReg = RegEnable(rm, io.in.fire()) - val isDiv = !io.in.bits.op(0) + val isDiv = !op(0) val isDivReg = RegEnable(isDiv, io.in.fire()) - val isDouble = io.in.bits.isDouble val isDoubleReg = RegEnable(isDouble, io.in.fire()) val (a, b) = ( - Mux(isDouble, io.in.bits.a, extF32ToF64(io.in.bits.a)), - Mux(isDouble, io.in.bits.b, extF32ToF64(io.in.bits.b)) + Mux(isDouble, io.in.bits.src(0), extF32ToF64(io.in.bits.src(0))), + Mux(isDouble, io.in.bits.src(1), extF32ToF64(io.in.bits.src(1))) ) @@ -236,9 +238,12 @@ class DivSqrt extends FPUSubModule { state := s_finish } is(s_finish){ - state := s_idle + when(io.out.fire()){ + state := s_idle + } } } + when(kill){ state := s_idle } switch(state){ is(s_idle){ @@ -281,8 +286,8 @@ class DivSqrt extends FPUSubModule { ) io.in.ready := (state === s_idle) && io.out.ready - io.out.valid := state === s_finish - io.out.bits.result := Mux(specialCaseHappenReg, + io.out.valid := (state === s_finish) && !kill + io.out.bits.data := Mux(specialCaseHappenReg, specialResult, Mux(overflowReg, Mux(isDoubleReg, @@ -292,40 +297,41 @@ class DivSqrt extends FPUSubModule { commonResult ) ) - - io.out.bits.fflags.invalid := Mux(isDivReg, divInvalidReg, sqrtInvalidReg) - io.out.bits.fflags.underflow := !specialCaseHappenReg && underflowReg - io.out.bits.fflags.overflow := !specialCaseHappenReg && overflowReg - io.out.bits.fflags.infinite := Mux(isDivReg, divInfReg, false.B) - io.out.bits.fflags.inexact := !specialCaseHappenReg && (inexactReg || overflowReg || underflowReg) - - FPUDebug() { - // printf(p"$cnt in:${Hexadecimal(io.in.bits.src0)} \n") - when(io.in.fire()) { - printf(p"[In.fire] " + - p"a:${Hexadecimal(io.in.bits.a)} aexp:${aExp.asSInt()} amant:${Hexadecimal(aMant)} " + - p"b:${Hexadecimal(io.in.bits.b)} bexp:${bExp.asSInt()} bmant:${Hexadecimal(bMant)}\n") - } -// when(state === s_norm) { -// 
printf(p"[norm] lz:$aMantLez\n") + io.out.bits.uop := uopReg + + fflags.invalid := Mux(isDivReg, divInvalidReg, sqrtInvalidReg) + fflags.underflow := !specialCaseHappenReg && underflowReg + fflags.overflow := !specialCaseHappenReg && overflowReg + fflags.infinite := Mux(isDivReg, divInfReg, false.B) + fflags.inexact := !specialCaseHappenReg && (inexactReg || overflowReg || underflowReg) + +// FPUDebug() { +// // printf(p"$cnt in:${Hexadecimal(io.in.bits.src0)} \n") +// when(io.in.fire()) { +// printf(p"[In.fire] " + +// p"a:${Hexadecimal(io.in.bits.a)} aexp:${aExp.asSInt()} amant:${Hexadecimal(aMant)} " + +// p"b:${Hexadecimal(io.in.bits.b)} bexp:${bExp.asSInt()} bmant:${Hexadecimal(bMant)}\n") // } - when(state === s_compute){ -// when(sqrt.io.out.fire()){ -// printf(p"[compute] ") -// } - } - when(state === s_start) { - printf(p"[start] sign:$resSignReg mant:${Hexadecimal(aMantReg)} exp:${aExpReg.asSInt()}\n") - } - when(state === s_round){ - printf(p"[round] exp before round:${aExpReg} g:$gReg r:$rReg s:$sReg mant:${Hexadecimal(aMantReg)}\n" + - p"[round] mantRounded:${Hexadecimal(mantRounded)}\n") - } - when(io.out.valid) { - printf(p"[Out.valid] " + - p"invalid:$sqrtInvalidReg result:${Hexadecimal(commonResult)}\n" + - p"output:${Hexadecimal(io.out.bits.result)} " + - p"exp:${aExpReg.asSInt()} \n") - } - } +//// when(state === s_norm) { +//// printf(p"[norm] lz:$aMantLez\n") +//// } +// when(state === s_compute){ +//// when(sqrt.io.out.fire()){ +//// printf(p"[compute] ") +//// } +// } +// when(state === s_start) { +// printf(p"[start] sign:$resSignReg mant:${Hexadecimal(aMantReg)} exp:${aExpReg.asSInt()}\n") +// } +// when(state === s_round){ +// printf(p"[round] exp before round:${aExpReg} g:$gReg r:$rReg s:$sReg mant:${Hexadecimal(aMantReg)}\n" + +// p"[round] mantRounded:${Hexadecimal(mantRounded)}\n") +// } +// when(io.out.valid) { +// printf(p"[Out.valid] " + +// p"invalid:$sqrtInvalidReg result:${Hexadecimal(commonResult)}\n" + +// 
p"output:${Hexadecimal(io.out.bits.result)} " + +// p"exp:${aExpReg.asSInt()} \n") +// } +// } } diff --git a/src/main/scala/fpu/divsqrt/MantDivSqrt.scala b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/MantDivSqrt.scala similarity index 96% rename from src/main/scala/fpu/divsqrt/MantDivSqrt.scala rename to src/main/scala/xiangshan/backend/fu/fpu/divsqrt/MantDivSqrt.scala index 500eb5df9365cef6039470c413e844bb949fe56d..589d14a8116f126ae843245a5af736239021d584 100644 --- a/src/main/scala/fpu/divsqrt/MantDivSqrt.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/MantDivSqrt.scala @@ -1,9 +1,9 @@ -package fpu.divsqrt +package xiangshan.backend.fu.fpu.divsqrt import chisel3._ import chisel3.util._ -import fpu.util._ -import fpu.util.FPUDebug +import xiangshan.backend.fu.fpu.util._ +import xiangshan.backend.fu.fpu.util.FPUDebug class MantDivSqrt(len: Int) extends Module{ val io = IO(new Bundle() { diff --git a/src/main/scala/fpu/divsqrt/OnTheFlyConv.scala b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/OnTheFlyConv.scala similarity index 95% rename from src/main/scala/fpu/divsqrt/OnTheFlyConv.scala rename to src/main/scala/xiangshan/backend/fu/fpu/divsqrt/OnTheFlyConv.scala index 115a1d22acc76955ae014dc8c2722c3b950c8b0c..c7af9194a0cd8ab208b6913066cffe0c7c5210e2 100644 --- a/src/main/scala/fpu/divsqrt/OnTheFlyConv.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/OnTheFlyConv.scala @@ -1,10 +1,10 @@ -package fpu.divsqrt +package xiangshan.backend.fu.fpu.divsqrt import chisel3._ import chisel3.util._ import utils._ -import fpu._ -import fpu.util.FPUDebug +import xiangshan.backend.fu.fpu._ +import xiangshan.backend.fu.fpu.util.FPUDebug class OnTheFlyConv(len: Int) extends Module { val io = IO(new Bundle() { diff --git a/src/main/scala/fpu/divsqrt/SrtTable.scala b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/SrtTable.scala similarity index 92% rename from src/main/scala/fpu/divsqrt/SrtTable.scala rename to 
src/main/scala/xiangshan/backend/fu/fpu/divsqrt/SrtTable.scala index 66f178dab7a05e3bb8f27eaaf62bf2f5100e6dc5..5c3a9615efbd1a3e41d1c434571baa9e62724cd9 100644 --- a/src/main/scala/fpu/divsqrt/SrtTable.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/divsqrt/SrtTable.scala @@ -1,10 +1,10 @@ -package fpu.divsqrt +package xiangshan.backend.fu.fpu.divsqrt import chisel3._ import chisel3.util._ import utils._ -import fpu._ +import xiangshan.backend.fu.fpu._ class SrtTable extends Module { val io = IO(new Bundle() { diff --git a/src/main/scala/fpu/fma/ArrayMultiplier.scala b/src/main/scala/xiangshan/backend/fu/fpu/fma/ArrayMultiplier.scala similarity index 98% rename from src/main/scala/fpu/fma/ArrayMultiplier.scala rename to src/main/scala/xiangshan/backend/fu/fpu/fma/ArrayMultiplier.scala index 0e1926b586c48522fd526507a2536050e1e6fe95..0f61837c96694ba130432e6120bb4c1164246b8c 100644 --- a/src/main/scala/fpu/fma/ArrayMultiplier.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/fma/ArrayMultiplier.scala @@ -1,8 +1,8 @@ -package fpu.fma +package xiangshan.backend.fu.fpu.fma import chisel3._ import chisel3.util._ -import fpu.util._ +import xiangshan.backend.fu.fpu.util._ import utils.SignExt class ArrayMultiplier(len: Int, regDepth: Int = 0, realArraryMult: Boolean = false) extends Module { diff --git a/src/main/scala/fpu/fma/FMA.scala b/src/main/scala/xiangshan/backend/fu/fpu/fma/FMA.scala similarity index 86% rename from src/main/scala/fpu/fma/FMA.scala rename to src/main/scala/xiangshan/backend/fu/fpu/fma/FMA.scala index f79486f5c495837f7d56f28c0839c08c160d1f11..cd4f071f839fffff119115e33ee6738d323b12e5 100644 --- a/src/main/scala/fpu/fma/FMA.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/fma/FMA.scala @@ -1,13 +1,16 @@ -package fpu.fma +package xiangshan.backend.fu.fpu.fma import chisel3._ import chisel3.util._ -import fpu._ -import fpu.util.{CSA3_2, FPUDebug, ORTree, ShiftLeftJam, ShiftRightJam} +import xiangshan.FuType +import 
xiangshan.backend.fu.{CertainLatency, FuConfig, FunctionUnit} +import xiangshan.backend.fu.fpu._ +import xiangshan.backend.fu.fpu.util.{CSA3_2, FPUDebug, ORTree, ShiftLeftJam, ShiftRightJam} -class FMA extends FPUSubModule with HasPipelineReg { - def latency = 5 +class FMA extends FPUPipelineModule { + + override def latency = FunctionUnit.fmacCfg.latency.latencyVal.get def UseRealArraryMult = false @@ -21,12 +24,9 @@ class FMA extends FPUSubModule with HasPipelineReg { * Stage 1: Decode Operands *****************************************************************/ - val rm = io.in.bits.rm - val isDouble = io.in.bits.isDouble - val op = io.in.bits.op - val rs0 = io.in.bits.a - val rs1 = io.in.bits.b - val rs2 = io.in.bits.c + val rs0 = io.in.bits.src(0) + val rs1 = io.in.bits.src(1) + val rs2 = io.in.bits.src(2) val zero = 0.U(Float64.getWidth.W) val one = Mux(isDouble, Cat(0.U(1.W), Float64.expBiasInt.U(Float64.expWidth.W), 0.U(Float64.mantWidth.W)), @@ -151,11 +151,11 @@ class FMA extends FPUSubModule with HasPipelineReg { val s1_discardAMant = S1Reg(aIsZero || expDiff > (ADD_WIDTH+3).S) val s1_invalid = S1Reg(invalid) - FPUDebug(){ - when(valids(1) && ready){ - printf(p"[s1] prodExp+56:${s1_prodExpAdj} aExp:${s1_aExpRaw} diff:${s1_expDiff}\n") - } - } +// FPUDebug(){ +// when(valids(1) && ready){ +// printf(p"[s1] prodExp+56:${s1_prodExpAdj} aExp:${s1_aExpRaw} diff:${s1_expDiff}\n") +// } +// } /****************************************************************** @@ -188,11 +188,11 @@ class FMA extends FPUSubModule with HasPipelineReg { val s2_effSub = S2Reg(effSub) - FPUDebug(){ - when(valids(1) && ready){ - printf(p"[s2] discardAMant:${s1_discardAMant} discardProd:${s1_discardProdMant} \n") - } - } +// FPUDebug(){ +// when(valids(1) && ready){ +// printf(p"[s2] discardAMant:${s1_discardAMant} discardProd:${s1_discardProdMant} \n") +// } +// } /****************************************************************** * Stage 3: A + Prod => adder result @@ -284,14 +284,14 
@@ class FMA extends FPUSubModule with HasPipelineReg { val s4_expPostNorm = S4Reg(expPostNorm) val s4_invalid = S4Reg(s3_invalid) - FPUDebug(){ - when(valids(3) && ready){ - printf(p"[s4] expPreNorm:${s3_expPreNorm} normShift:${s3_normShift} expPostNorm:${expPostNorm} " + - p"denormShift:${denormShift}" + - p"" + - p" \n") - } - } +// FPUDebug(){ +// when(valids(3) && ready){ +// printf(p"[s4] expPreNorm:${s3_expPreNorm} normShift:${s3_normShift} expPostNorm:${expPostNorm} " + +// p"denormShift:${denormShift}" + +// p"" + +// p" \n") +// } +// } /****************************************************************** * Stage 5: Rounding @@ -342,11 +342,11 @@ class FMA extends FPUSubModule with HasPipelineReg { val s5_inexact = S5Reg(inexact) val s5_ovSetInf = S5Reg(s4_ovSetInf) - FPUDebug(){ - when(valids(4) && ready){ - printf(p"[s5] expPostNorm:${s4_expPostNorm} expRounded:${expRounded}\n") - } - } +// FPUDebug(){ +// when(valids(4) && ready){ +// printf(p"[s5] expPostNorm:${s4_expPostNorm} expRounded:${expRounded}\n") +// } +// } /****************************************************************** * Assign Outputs @@ -375,22 +375,22 @@ class FMA extends FPUSubModule with HasPipelineReg { ) ) - io.out.bits.result := result - io.out.bits.fflags.invalid := s5_invalid - io.out.bits.fflags.inexact := s5_inexact - io.out.bits.fflags.overflow := s5_overflow - io.out.bits.fflags.underflow := s5_underflow - io.out.bits.fflags.infinite := false.B - - FPUDebug(){ - //printf(p"v0:${valids(0)} v1:${valids(1)} v2:${valids(2)} v3:${valids(3)} v4:${valids(4)} v5:${valids(5)}\n") - when(io.in.fire()){ - printf(p"[in] a:${Hexadecimal(a)} b:${Hexadecimal(b)} c:${Hexadecimal(c)}\n") - } - when(io.out.fire()){ - printf(p"[out] res:${Hexadecimal(io.out.bits.result)}\n") - } - } + io.out.bits.data := result + fflags.invalid := s5_invalid + fflags.inexact := s5_inexact + fflags.overflow := s5_overflow + fflags.underflow := s5_underflow + fflags.infinite := false.B + +// FPUDebug(){ +// 
//printf(p"v0:${valids(0)} v1:${valids(1)} v2:${valids(2)} v3:${valids(3)} v4:${valids(4)} v5:${valids(5)}\n") +// when(io.in.fire()){ +// printf(p"[in] a:${Hexadecimal(a)} b:${Hexadecimal(b)} c:${Hexadecimal(c)}\n") +// } +// when(io.out.fire()){ +// printf(p"[out] res:${Hexadecimal(io.out.bits.result)}\n") +// } +// } } diff --git a/src/main/scala/fpu/fma/LZA.scala b/src/main/scala/xiangshan/backend/fu/fpu/fma/LZA.scala similarity index 98% rename from src/main/scala/fpu/fma/LZA.scala rename to src/main/scala/xiangshan/backend/fu/fpu/fma/LZA.scala index bf5056b2bfba52411f0f753c3fa70a55e475d814..fac97bed3726b439cdc47c237c54cf0d7c436f92 100644 --- a/src/main/scala/fpu/fma/LZA.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/fma/LZA.scala @@ -1,4 +1,4 @@ -package fpu.fma +package xiangshan.backend.fu.fpu.fma import chisel3._ import chisel3.util._ diff --git a/src/main/scala/xiangshan/backend/fu/fpu/package.scala b/src/main/scala/xiangshan/backend/fu/fpu/package.scala new file mode 100644 index 0000000000000000000000000000000000000000..80b436325da2ee4a3d033aa81ac483d9453ba205 --- /dev/null +++ b/src/main/scala/xiangshan/backend/fu/fpu/package.scala @@ -0,0 +1,143 @@ +package xiangshan.backend.fu.fpu + +import chisel3._ +import chisel3.util._ + +object FPUOpType { + def funcWidth = 6 + def FpuOp(fu: String, op: String): UInt = ("b" + fu + op).U(funcWidth.W) + + def FU_FMAC = "000" + def FU_FCMP = "001" + def FU_FMV = "010" + def FU_F2I = "011" + def FU_I2F = "100" + def FU_S2D = "101" + def FU_D2S = "110" + def FU_DIVSQRT = "111" + + // FMA + def fadd:UInt = FpuOp(FU_FMAC, "000") + def fsub:UInt = FpuOp(FU_FMAC, "001") + def fmadd:UInt = FpuOp(FU_FMAC, "100") + def fmsub:UInt = FpuOp(FU_FMAC, "101") + def fnmsub:UInt = FpuOp(FU_FMAC, "110") + def fnmadd:UInt = FpuOp(FU_FMAC, "111") + def fmul:UInt = FpuOp(FU_FMAC, "010") + + // FCMP + def fmin:UInt = FpuOp(FU_FCMP, "000") + def fmax:UInt = FpuOp(FU_FCMP, "001") + def fle:UInt = FpuOp(FU_FCMP, "010") + def flt:UInt 
= FpuOp(FU_FCMP, "011") + def feq:UInt = FpuOp(FU_FCMP, "100") + + // FMV + def fmv_f2i:UInt= FpuOp(FU_FMV, "000") + def fmv_i2f:UInt= FpuOp(FU_FMV, "001") + def fclass:UInt = FpuOp(FU_FMV, "010") + def fsgnj:UInt = FpuOp(FU_FMV, "110") + def fsgnjn:UInt = FpuOp(FU_FMV, "101") + def fsgnjx:UInt = FpuOp(FU_FMV, "100") + + // FloatToInt + def f2w:UInt = FpuOp(FU_F2I, "000") + def f2wu:UInt = FpuOp(FU_F2I, "001") + def f2l:UInt = FpuOp(FU_F2I, "010") + def f2lu:UInt = FpuOp(FU_F2I, "011") + + // IntToFloat + def w2f:UInt = FpuOp(FU_I2F, "000") + def wu2f:UInt = FpuOp(FU_I2F, "001") + def l2f:UInt = FpuOp(FU_I2F, "010") + def lu2f:UInt = FpuOp(FU_I2F, "011") + + // FloatToFloat + def s2d:UInt = FpuOp(FU_S2D, "000") + def d2s:UInt = FpuOp(FU_D2S, "000") + + // Div/Sqrt + def fdiv:UInt = FpuOp(FU_DIVSQRT, "000") + def fsqrt:UInt = FpuOp(FU_DIVSQRT, "001") +} + +object FPUIOFunc { + def in_raw = 0.U(1.W) + def in_unbox = 1.U(1.W) + + def out_raw = 0.U(2.W) + def out_box = 1.U(2.W) + def out_sext = 2.U(2.W) + def out_zext = 3.U(2.W) + + def apply(inputFunc: UInt, outputFunc:UInt) = Cat(inputFunc, outputFunc) +} + +class Fflags extends Bundle { + val invalid = Bool() // 4 + val infinite = Bool() // 3 + val overflow = Bool() // 2 + val underflow = Bool() // 1 + val inexact = Bool() // 0 +} + +object RoudingMode { + val RNE = "b000".U(3.W) + val RTZ = "b001".U(3.W) + val RDN = "b010".U(3.W) + val RUP = "b011".U(3.W) + val RMM = "b100".U(3.W) +} + +class FloatPoint(val expWidth: Int, val mantWidth:Int) extends Bundle{ + val sign = Bool() + val exp = UInt(expWidth.W) + val mant = UInt(mantWidth.W) + def defaultNaN: UInt = Cat(0.U(1.W), Fill(expWidth+1,1.U(1.W)), Fill(mantWidth-1,0.U(1.W))) + def posInf: UInt = Cat(0.U(1.W), Fill(expWidth, 1.U(1.W)), 0.U(mantWidth.W)) + def negInf: UInt = Cat(1.U(1.W), posInf.tail(1)) + def maxNorm: UInt = Cat(0.U(1.W), Fill(expWidth-1, 1.U(1.W)), 0.U(1.W), Fill(mantWidth, 1.U(1.W))) + def expBias: UInt = Fill(expWidth-1, 1.U(1.W)) + def 
expBiasInt: Int = (1 << (expWidth-1)) - 1 + def mantExt: UInt = Cat(exp=/=0.U, mant) + def apply(x: UInt): FloatPoint = x.asTypeOf(new FloatPoint(expWidth, mantWidth)) +} + +object Float32 extends FloatPoint(8, 23) +object Float64 extends FloatPoint(11, 52) + + +object expOverflow { + def apply(sexp: SInt, expWidth: Int): Bool = + sexp >= Cat(0.U(1.W), Fill(expWidth, 1.U(1.W))).asSInt() + + def apply(uexp: UInt, expWidth: Int): Bool = + expOverflow(Cat(0.U(1.W), uexp).asSInt(), expWidth) +} + +object boxF32ToF64 { + def apply(x: UInt): UInt = Cat(Fill(32, 1.U(1.W)), x(31, 0)) +} + +object unboxF64ToF32 { + def apply(x: UInt): UInt = + Mux(x(63, 32)===Fill(32, 1.U(1.W)), x(31, 0), Float32.defaultNaN) +} + +object extF32ToF64 { + def apply(x: UInt): UInt = { + val f32 = Float32(x) + Cat( + f32.sign, + Mux(f32.exp === 0.U, + 0.U(Float64.expWidth.W), + Mux((~f32.exp).asUInt() === 0.U, + Cat("b111".U(3.W), f32.exp), + Cat("b0111".U(4.W) + f32.exp.head(1), f32.exp.tail(1)) + ) + ), + Cat(f32.mant, 0.U((Float64.mantWidth - Float32.mantWidth).W)) + ) + } +} + diff --git a/src/main/scala/fpu/util/CarrySaveAdder.scala b/src/main/scala/xiangshan/backend/fu/fpu/util/CarrySaveAdder.scala similarity index 97% rename from src/main/scala/fpu/util/CarrySaveAdder.scala rename to src/main/scala/xiangshan/backend/fu/fpu/util/CarrySaveAdder.scala index 5ec4ac2692e25cd716d2e9006bcb053c38c3d048..a6e7020ca82fd6187658147358355b5ac6798b55 100644 --- a/src/main/scala/fpu/util/CarrySaveAdder.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/util/CarrySaveAdder.scala @@ -1,4 +1,4 @@ -package fpu.util +package xiangshan.backend.fu.fpu.util import chisel3._ import chisel3.util._ diff --git a/src/main/scala/fpu/util/FPUDebug.scala b/src/main/scala/xiangshan/backend/fu/fpu/util/FPUDebug.scala similarity index 83% rename from src/main/scala/fpu/util/FPUDebug.scala rename to src/main/scala/xiangshan/backend/fu/fpu/util/FPUDebug.scala index 
6b8dfa5bb38ffe0e9778b38f9914bb233f28277d..a5cc302be383fb7453b3ab3498d95d9c1bbda877 100644 --- a/src/main/scala/fpu/util/FPUDebug.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/util/FPUDebug.scala @@ -1,4 +1,4 @@ -package fpu.util +package xiangshan.backend.fu.fpu.util import chisel3._ diff --git a/src/main/scala/fpu/util/ORTree.scala b/src/main/scala/xiangshan/backend/fu/fpu/util/ORTree.scala similarity index 87% rename from src/main/scala/fpu/util/ORTree.scala rename to src/main/scala/xiangshan/backend/fu/fpu/util/ORTree.scala index 2d40b0ba0f574a4450e0e93a7d51bccb70b77179..08c54c26a620810b9d61544e5c124906950c0406 100644 --- a/src/main/scala/fpu/util/ORTree.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/util/ORTree.scala @@ -1,4 +1,4 @@ -package fpu.util +package xiangshan.backend.fu.fpu.util import chisel3._ diff --git a/src/main/scala/fpu/util/ShiftLeftJam.scala b/src/main/scala/xiangshan/backend/fu/fpu/util/ShiftLeftJam.scala similarity index 91% rename from src/main/scala/fpu/util/ShiftLeftJam.scala rename to src/main/scala/xiangshan/backend/fu/fpu/util/ShiftLeftJam.scala index f0043c246b9d2b2b24f2676dfda5d041ddf784e6..b7aafb7a5f84a46b3eb31ba6f9fe5a158b313ff8 100644 --- a/src/main/scala/fpu/util/ShiftLeftJam.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/util/ShiftLeftJam.scala @@ -1,4 +1,4 @@ -package fpu.util +package xiangshan.backend.fu.fpu.util import chisel3._ import chisel3.util._ diff --git a/src/main/scala/fpu/util/ShiftRightJam.scala b/src/main/scala/xiangshan/backend/fu/fpu/util/ShiftRightJam.scala similarity index 94% rename from src/main/scala/fpu/util/ShiftRightJam.scala rename to src/main/scala/xiangshan/backend/fu/fpu/util/ShiftRightJam.scala index 610c5e36caa7270edfeb15614878691ee0f623fc..2ed38fb54ddec8adc2a7fc6b4d5729e6d64bd80f 100644 --- a/src/main/scala/fpu/util/ShiftRightJam.scala +++ b/src/main/scala/xiangshan/backend/fu/fpu/util/ShiftRightJam.scala @@ -1,4 +1,4 @@ -package fpu.util +package xiangshan.backend.fu.fpu.util 
import chisel3._ import chisel3.util._ diff --git a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala b/src/main/scala/xiangshan/backend/issue/IssueQueue.scala deleted file mode 100644 index b07340401e9d1b36e204cd28cfc8fe797a79e3eb..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/issue/IssueQueue.scala +++ /dev/null @@ -1,304 +0,0 @@ -package xiangshan.backend.issue - -import chisel3.{util, _} -import chisel3.util._ -import utils.{ParallelMux, ParallelOR, PriorityEncoderWithFlag, XSDebug, XSInfo, XSPerf} -import xiangshan._ -import xiangshan.backend.exu.{Exu, ExuConfig} -import xiangshan.backend.regfile.RfReadPort - -class IssueQueue -( - val exuCfg: ExuConfig, - val wakeupCnt: Int, - val bypassCnt: Int = 0 -) extends XSModule with HasIQConst { - val io = IO(new Bundle() { - val redirect = Flipped(ValidIO(new Redirect)) - val enq = Flipped(DecoupledIO(new MicroOp)) - val readIntRf = Vec(exuCfg.intSrcCnt, Flipped(new RfReadPort)) - val readFpRf = Vec(exuCfg.fpSrcCnt, Flipped(new RfReadPort)) - val deq = DecoupledIO(new ExuInput) - val wakeUpPorts = Vec(wakeupCnt, Flipped(ValidIO(new ExuOutput))) - val bypassUops = Vec(bypassCnt, Flipped(ValidIO(new MicroOp))) - val bypassData = Vec(bypassCnt, Flipped(ValidIO(new ExuOutput))) - val numExist = Output(UInt(iqIdxWidth.W)) - // tlb hit, inst can deq - val tlbFeedback = Flipped(ValidIO(new TlbFeedback)) - }) - - def qsize: Int = IssQueSize - def idxWidth = log2Up(qsize) - def replayDelay = 16 - - require(isPow2(qsize)) - - val tlbHit = io.tlbFeedback.valid && io.tlbFeedback.bits.hit - val tlbMiss = io.tlbFeedback.valid && !io.tlbFeedback.bits.hit - - XSDebug(io.tlbFeedback.valid, - "tlb feedback: hit: %d roqIdx: %d\n", - io.tlbFeedback.bits.hit, - io.tlbFeedback.bits.roqIdx.asUInt - ) - /* - invalid --[enq]--> valid --[deq]--> wait --[tlbHit]--> invalid - wait --[replay]--> replay --[cnt]--> valid - */ - val s_invalid :: s_valid :: s_wait :: s_replay :: Nil = Enum(4) - - val idxQueue 
= RegInit(VecInit((0 until qsize).map(_.U(idxWidth.W)))) - val stateQueue = RegInit(VecInit(Seq.fill(qsize)(s_invalid))) - - val readyVec = Wire(Vec(qsize, Bool())) - val uopQueue = Reg(Vec(qsize, new MicroOp)) - val cntQueue = Reg(Vec(qsize, UInt(log2Up(replayDelay).W))) - - val tailPtr = RegInit(0.U((idxWidth+1).W)) - - // real deq - - /* - example: realDeqIdx = 2 | realDeqIdx=0 - moveMask = 11111100 | moveMask=11111111 - */ - - val (firstBubble, findBubble) = PriorityEncoderWithFlag(stateQueue.map(_ === s_invalid)) - val realDeqIdx = firstBubble - val realDeqValid = (firstBubble < tailPtr) && findBubble - val moveMask = { - (Fill(qsize, 1.U(1.W)) << realDeqIdx)(qsize-1, 0) - } & Fill(qsize, realDeqValid) - - for(i <- 0 until qsize-1){ - when(moveMask(i)){ - idxQueue(i) := idxQueue(i+1) - stateQueue(i) := stateQueue(i+1) - } - } - when(realDeqValid){ - idxQueue.last := idxQueue(realDeqIdx) - stateQueue.last := s_invalid - } - - - // wake up - def getSrcSeq(uop: MicroOp): Seq[UInt] = Seq(uop.psrc1, uop.psrc2, uop.psrc3) - def getSrcTypeSeq(uop: MicroOp): Seq[UInt] = Seq( - uop.ctrl.src1Type, uop.ctrl.src2Type, uop.ctrl.src3Type - ) - def getSrcStateSeq(uop: MicroOp): Seq[UInt] = Seq( - uop.src1State, uop.src2State, uop.src3State - ) - - def writeBackHit(src: UInt, srcType: UInt, wbUop: (Bool, MicroOp)): Bool = { - val (v, uop) = wbUop - val isSameType = - (SrcType.isReg(srcType) && uop.ctrl.rfWen && src =/= 0.U) || (SrcType.isFp(srcType) && uop.ctrl.fpWen) - - v && isSameType && (src===uop.pdest) - } - - //TODO: opt this, do bypass select in 'select' stage not 'issue' stage - val bypassData = RegNext(io.bypassData) - def doBypass(src: UInt, srcType: UInt): (Bool, UInt) = { - val hitVec = bypassData.map(p => (p.valid, p.bits.uop)). 
- map(wbUop => writeBackHit(src, srcType, wbUop)) - val data = ParallelMux(hitVec.zip(bypassData.map(_.bits.data))) - (ParallelOR(hitVec).asBool(), data) - } - - def wakeUp(uop: MicroOp): MicroOp = { - def getNewSrcState(i: Int): UInt = { - val src = getSrcSeq(uop)(i) - val srcType = getSrcTypeSeq(uop)(i) - val srcState = getSrcStateSeq(uop)(i) - val hitVec = ( - io.wakeUpPorts.map(w => (w.valid, w.bits.uop)) ++ - io.bypassUops.map(p => (p.valid, p.bits)) - ).map(wbUop => writeBackHit(src, srcType, wbUop)) - val hit = ParallelOR(hitVec).asBool() - Mux(hit, SrcState.rdy, srcState) - } - val new_uop = WireInit(uop) - new_uop.src1State := getNewSrcState(0) - if(exuCfg==Exu.stExeUnitCfg) new_uop.src2State := getNewSrcState(1) - new_uop - } - - def uopIsRdy(uop: MicroOp): Bool = { - def srcIsRdy(srcType: UInt, srcState: UInt): Bool = { - SrcType.isPcImm(srcType) || srcState===SrcState.rdy - } - exuCfg match { - case Exu.ldExeUnitCfg => - srcIsRdy(uop.ctrl.src1Type, uop.src1State) - case Exu.stExeUnitCfg => - srcIsRdy(uop.ctrl.src1Type, uop.src1State) && srcIsRdy(uop.ctrl.src2Type, uop.src2State) - } - } - - - // 1. wake up - for(i <- 0 until qsize){ - uopQueue(i) := wakeUp(uopQueue(i)) - } - - // 2. 
select - for(i <- 0 until qsize){ - readyVec(i) := uopIsRdy(uopQueue(i)) - } - - val selectedIdxRegOH = Wire(UInt(qsize.W)) - val selectMask = WireInit(VecInit( - (0 until qsize).map(i => - (stateQueue(i)===s_valid) && readyVec(idxQueue(i)) && !(selectedIdxRegOH(i) && io.deq.fire()) - ) - )) - val (selectedIdxWire, sel) = PriorityEncoderWithFlag(selectMask) - val selReg = RegNext(sel) - val selectedIdxReg = RegNext(selectedIdxWire - moveMask(selectedIdxWire)) - selectedIdxRegOH := UIntToOH(selectedIdxReg) - XSDebug( - p"selMaskWire:${Binary(selectMask.asUInt())} selected:$selectedIdxWire" + - p" moveMask:${Binary(moveMask)} selectedIdxReg:$selectedIdxReg\n" - ) - - - // read regfile - val selectedUop = uopQueue(idxQueue(selectedIdxWire)) - - exuCfg match { - case Exu.ldExeUnitCfg => - io.readIntRf(0).addr := selectedUop.psrc1 // base - XSDebug(p"src1 read addr: ${io.readIntRf(0).addr}\n") - case Exu.stExeUnitCfg => - io.readIntRf(0).addr := selectedUop.psrc1 // base - io.readIntRf(1).addr := selectedUop.psrc2 // store data (int) - io.readFpRf(0).addr := selectedUop.psrc2 // store data (fp) - XSDebug( - p"src1 read addr: ${io.readIntRf(0).addr} src2 read addr: ${io.readIntRf(1).addr}\n" - ) - case _ => - require(requirement = false, "Error: IssueQueue only support ldu and stu!") - } - - // (fake) deq to Load/Store unit - io.deq.valid := (stateQueue(selectedIdxReg)===s_valid) && selReg - io.deq.bits.uop := uopQueue(idxQueue(selectedIdxReg)) - - val src1Bypass = doBypass(io.deq.bits.uop.psrc1, io.deq.bits.uop.ctrl.src1Type) - io.deq.bits.src1 := Mux(src1Bypass._1, src1Bypass._2, io.readIntRf(0).data) - if(exuCfg == Exu.stExeUnitCfg){ - val src2Bypass = doBypass(io.deq.bits.uop.psrc2, io.deq.bits.uop.ctrl.src2Type) - io.deq.bits.src2 := Mux(src2Bypass._1, - src2Bypass._2, - Mux(SrcType.isReg(io.deq.bits.uop.ctrl.src2Type), - io.readIntRf(1).data, - io.readFpRf(0).data - ) - ) - } else { - io.deq.bits.src2 := DontCare - } - io.deq.bits.src3 := DontCare - - 
when(io.deq.fire()){ - stateQueue(selectedIdxReg - moveMask(selectedIdxReg)) := s_wait - assert(stateQueue(selectedIdxReg) === s_valid, "Dequeue a invalid entry to lsu!") - } - -// assert(!(tailPtr===0.U && tlbHit), "Error: queue is empty but tlbHit is true!") - - val tailAfterRealDeq = tailPtr - moveMask(tailPtr.tail(1)) - val isFull = tailAfterRealDeq.head(1).asBool() // tailPtr===qsize.U - - // enq - io.enq.ready := !isFull && !io.redirect.valid - when(io.enq.fire()){ - stateQueue(tailAfterRealDeq.tail(1)) := s_valid - val uopQIdx = idxQueue(tailPtr.tail(1)) - val new_uop = wakeUp(io.enq.bits) - uopQueue(uopQIdx) := new_uop - } - - tailPtr := tailAfterRealDeq + io.enq.fire() - - XSDebug( - realDeqValid, - p"realDeqIdx:$realDeqIdx\n" - ) - - XSDebug("State Dump: ") - stateQueue.reverse.foreach(s =>{ - XSDebug(false, s===s_invalid, "-") - XSDebug(false, s===s_valid, "v") - XSDebug(false, s===s_wait, "w") - XSDebug(false, s===s_replay, "p") - }) - XSDebug(false, true.B, "\n") - - XSDebug("State Dump: ") - idxQueue.reverse.foreach(id =>{ - XSDebug(false, true.B, p"$id") - }) - XSDebug(false, true.B, "\n") - - XSDebug("State Dump: ") - for(i <- readyVec.indices.reverse){ - val r = readyVec(idxQueue(i)) - XSDebug(false, r, p"r") - XSDebug(false, !r, p"-") - } - XSDebug(false, true.B, "\n") - -// assert(!(tlbMiss && realDeqValid), "Error: realDeqValid should be false when replay valid!") - for(i <- 0 until qsize){ - val uopQIdx = idxQueue(i) - val uop = uopQueue(uopQIdx) - val cnt = cntQueue(uopQIdx) - val nextIdx = i.U - moveMask(i) - //TODO: support replay - val roqIdxMatch = uop.roqIdx.asUInt === io.tlbFeedback.bits.roqIdx.asUInt - val notEmpty = stateQueue(i)=/=s_invalid - val replayThis = (stateQueue(i)===s_wait) && tlbMiss && roqIdxMatch - val tlbHitThis = notEmpty && tlbHit && roqIdxMatch - val flushThis = notEmpty && uop.roqIdx.needFlush(io.redirect) - - when(replayThis){ - stateQueue(nextIdx) := s_replay - cnt := (replayDelay-1).U - } - 
when(stateQueue(i)===s_replay){ - when(cnt === 0.U){ - stateQueue(nextIdx) := s_valid - }.otherwise({ - cnt := cnt - 1.U - }) - } - when(flushThis || tlbHitThis){ - stateQueue(nextIdx) := s_invalid - } - } - - - // assign outputs - io.numExist := Mux(isFull, (qsize-1).U, PopCount(stateQueue.map(_ =/= s_invalid))) - - // Debug sigs - XSInfo( - io.enq.fire(), - p"enq fire: pc:${Hexadecimal(io.enq.bits.cf.pc)} roqIdx:${io.enq.bits.roqIdx} " + - p"src1: ${io.enq.bits.psrc1} src2:${io.enq.bits.psrc2} pdst:${io.enq.bits.pdest}\n" - ) - XSInfo( - io.deq.fire(), - p"deq fire: pc:${Hexadecimal(io.deq.bits.uop.cf.pc)} roqIdx:${io.deq.bits.uop.roqIdx} " + - p"src1: ${io.deq.bits.uop.psrc1} data: ${Hexadecimal(io.deq.bits.src1)} " + - p"src2: ${io.deq.bits.uop.psrc2} data: ${Hexadecimal(io.deq.bits.src2)} " + - p"imm : ${Hexadecimal(io.deq.bits.uop.ctrl.imm)}\npdest: ${io.deq.bits.uop.pdest}\n" - ) - XSDebug(p"tailPtr:$tailPtr tailAfterDeq:$tailAfterRealDeq tlbHit:$tlbHit\n") - - XSPerf("utilization", tailPtr) -} diff --git a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala b/src/main/scala/xiangshan/backend/issue/ReservationStation.scala deleted file mode 100644 index 317a7dc41a389d8e5d2c3b785693cd2e86b31b0f..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/backend/issue/ReservationStation.scala +++ /dev/null @@ -1,440 +0,0 @@ -package xiangshan.backend.issue - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import xiangshan.backend.exu.{Exu, ExuConfig} -import xiangshan.backend.rename.FreeListPtr -import utils._ -import xiangshan.backend.fu.FunctionUnit._ -import xiangshan.backend.regfile.RfReadPort - - -trait HasIQConst extends HasXSParameter{ - val iqSize = IssQueSize - val iqIdxWidth = log2Up(iqSize) -} - -object OneCycleFire { - def apply(fire: Bool) = { - val valid = RegInit(false.B) - when (valid) { valid := false.B } - when (fire) { valid := true.B } - valid - } -} - -class ReservationStation -( - val exuCfg: 
ExuConfig, - val wakeupCnt: Int, - val bypassCnt: Int = 0, - val enableBypass: Boolean = false, - val fifo: Boolean = false -) extends XSModule with HasIQConst { - - val src2Use = true - val src3Use = (exuCfg.intSrcCnt > 2) || (exuCfg.fpSrcCnt > 2) - val src2Listen = true - val src3Listen = (exuCfg.intSrcCnt > 2) || (exuCfg.fpSrcCnt > 2) - - val io = IO(new Bundle() { - // flush Issue Queue - val redirect = Flipped(ValidIO(new Redirect)) - - // enq Ctrl sigs at dispatch-2 - val enqCtrl = Flipped(DecoupledIO(new MicroOp)) - // enq Data at next cycle (regfile has 1 cycle latency) - val enqData = Input(new ExuInput) - - // broadcast selected uop to other issue queues which has bypasses - val selectedUop = if(enableBypass) ValidIO(new MicroOp) else null - - // send to exu - val deq = DecoupledIO(new ExuInput) - - // listen to write back bus - val wakeUpPorts = Vec(wakeupCnt, Flipped(ValidIO(new ExuOutput))) - - // use bypass uops to speculative wake-up - val bypassUops = Vec(bypassCnt, Flipped(ValidIO(new MicroOp))) - val bypassData = Vec(bypassCnt, Flipped(ValidIO(new ExuOutput))) - - // to Dispatch - val numExist = Output(UInt(iqIdxWidth.W)) - }) - - val srcAllNum = 3 - val srcUseNum = 1 + (if(src2Use) 1 else 0) + (if(src3Use) 1 else 0)// when src2Use is false, then src3Use must be false - val srcListenNum = 1 + (if(src2Listen) 1 else 0) + (if(src3Listen) 1 else 0) // when src2Listen is false, then src3Listen must be false - // when use is false, Listen must be false - require(!(!src2Use && src2Listen)) - require(!(!src3Use && src3Listen)) - require(!(!src2Use && src3Use)) - require(!(!src2Listen && src3Listen)) - - // Issue Queue - // val issQue = IndexableMem(iqSize, new ExuInput, mem = false, init = None) - val issQue = Mem(iqSize, new ExuInput) - // val issQue = Reg(Vec(iqSize, new ExuInput)) - val validQue = RegInit(VecInit(Seq.fill(iqSize)(false.B))) - val idQue = RegInit(VecInit((0 until iqSize).map(_.U(iqIdxWidth.W)))) - val idValidQue = VecInit((0 until 
iqSize).map(i => validQue(idQue(i)))).asUInt - val tailAll = RegInit(0.U((iqIdxWidth+1).W)) - val tail = tailAll(iqIdxWidth-1, 0) - val full = tailAll(iqIdxWidth) - - // alias failed, turn to independent storage(Reg) - val psrc = VecInit(List.tabulate(iqSize)(i => VecInit(List(issQue(i.U).uop.psrc1, issQue(i.U).uop.psrc2, issQue(i.U).uop.psrc3)))) // NOTE: indexed by IssQue's idx - val srcRdyVec = Reg(Vec(iqSize, Vec(srcAllNum, Bool()))) // NOTE: indexed by IssQue's idx - val srcData = Reg(Vec(iqSize, Vec(srcAllNum, UInt(XLEN.W)))) // NOTE: indexed by IssQue's idx - val srcRdy = VecInit(srcRdyVec.map(a => if(src3Listen) { if(src2Listen) a(0)&&a(1)&&a(2) else a(0)&&a(2) } else { if(src2Listen) a(0)&&a(1) else a(0) }))// NOTE: indexed by IssQue's idx - val srcIdRdy = VecInit((0 until iqSize).map(i => srcRdy(idQue(i)))).asUInt // NOTE: indexed by IdQue's idx - val srcType = List.tabulate(iqSize)(i => List(issQue(i).uop.ctrl.src1Type, issQue(i).uop.ctrl.src2Type, issQue(i).uop.ctrl.src3Type)) // NOTE: indexed by IssQue's idx - - // val srcDataWire = Wire(srcData) - val srcDataWire = Wire(Vec(iqSize, Vec(srcAllNum, UInt(XLEN.W)))) // NOTE: indexed by IssQue's idx - srcDataWire := srcData - srcData := srcDataWire - - // there are three stages - // |-------------|--------------------|--------------| - // |Enq:get state|Deq: select/get data| fire stage | - // |-------------|--------------------|--------------| - - //----------------------------------------- - // Enqueue - //----------------------------------------- - val enqRedHit = Wire(Bool()) - val enqFire = io.enqCtrl.fire() && !enqRedHit - val deqFire = io.deq.fire() - val popOne = Wire(Bool()) - io.enqCtrl.ready := !full || popOne - val enqSelIq = Wire(UInt(iqIdxWidth.W)) - val enqSrcRdy = List(Mux(SrcType.isPcImm(io.enqCtrl.bits.ctrl.src1Type), true.B, io.enqCtrl.bits.src1State === SrcState.rdy), - Mux(SrcType.isPcImm(io.enqCtrl.bits.ctrl.src2Type), true.B, io.enqCtrl.bits.src2State === SrcState.rdy), - 
Mux(SrcType.isPcImm(io.enqCtrl.bits.ctrl.src3Type), true.B, io.enqCtrl.bits.src3State === SrcState.rdy)) - - // state enq - when (enqFire) { - issQue(enqSelIq).uop := io.enqCtrl.bits - validQue(enqSelIq) := true.B - assert(!validQue(enqSelIq) || popOne/* && idQue(deqSel)===enqSelIq*/) - - srcRdyVec(enqSelIq)(0) := enqSrcRdy(0) - if(src2Listen) { srcRdyVec(enqSelIq)(1) := enqSrcRdy(1) } - if(src3Listen) { srcRdyVec(enqSelIq)(2) := enqSrcRdy(2) } - } - - // data enq - val enqSelIqNext = RegEnable(enqSelIq, enqFire) - // val enqSelIqNext = RegNext(enqSelIq) - val enqFireNext = RegInit(false.B) - when (enqFireNext) { enqFireNext := false.B } - when (enqFire) { enqFireNext := true.B } - - val enqDataVec = List(io.enqData.src1, io.enqData.src2, io.enqData.src3) - when (enqFireNext) { - for(i <- 0 until srcUseNum) { - srcDataWire(enqSelIqNext)(i) := enqDataVec(i) - } - } - - //----------------------------------------- - // tail - //----------------------------------------- - val tailInc = enqFire - val tailDec = popOne - val tailKeep = tailInc === tailDec - val tailAdd = tailAll + 1.U - val tailSub = tailAll - 1.U - tailAll := Mux(tailKeep, tailAll, Mux(tailInc, tailAdd, tailSub)) - // Select to Dequeue - val deqSel = if (fifo) 0.U else PriorityEncoder(idValidQue & srcIdRdy) //may not need idx, just need oneHot, idx by IdQue's idx - val deqSelIq = idQue(deqSel) - val deqSelOH = PriorityEncoderOH(idValidQue & srcIdRdy) - val has1Rdy = if (fifo) idValidQue(deqSel) && srcIdRdy(deqSel) else ParallelOR((validQue.asUInt & srcRdy.asUInt).asBools).asBool() - - //----------------------------------------- - // idQue Move - //----------------------------------------- - def UIntToMHP(in: UInt) = { - // UInt to Multi-Hot plus 1: 1.U -> "11".U; 2.U(2.W) -> "0111".U; 3.U(3.W) -> "00001111".W - val a = Seq.fill(in.getWidth)(2).product - val s = (1 << (a-1)).S - Reverse((s(a-1,0).asSInt >> in)(a-1,0).asUInt) - } - def UIntToMH(in: UInt) = { - val a = Seq.fill(in.getWidth)(2).product - val 
s = (1 << (a-1)).S - Reverse((s(a-1,0).asSInt >> in)(a-1,0).asUInt) ^ UIntToOH(in) - } - def PriorityDot(in: UInt) = { - // "1100".U -> "0111".U; "1010".U -> "0011".U; "0000".U -> "0000".U - val a = Array.fill(iqSize)(1) - for(i <- 1 until in.getWidth) { - a(i) = a(i-1)*2 + 1 - } - Mux(in===0.U, 0.U(in.getWidth.W), PriorityMux(in, a.map(_.U(in.getWidth.W)))) - } - val tailDot = Mux(full, VecInit(Seq.fill(iqSize)(true.B)).asUInt, UIntToMHP(tail)) - val tailDot2 = Mux(full, VecInit(Seq.fill(iqSize)(true.B)).asUInt, UIntToMH(tail)) - val selDot = UIntToMHP(deqSel) // FIXIT: PriorityEncoder -> UIntToMHP means long latency - val nonValid = ~(idValidQue | ~tailDot2) - val popSel = PriorityEncoder(nonValid) // Note: idxed by IDque's index - val popDot = PriorityDot(nonValid) - val isPop = ParallelOR(nonValid.asBools).asBool() - val moveDot = Mux(isPop, tailDot ^ popDot, tailDot ^ selDot) - - assert(!(popOne&&moveDot(0))) - when (popOne) { - for(i <- 1 until iqSize) { - when (moveDot(i)) { idQue(i-1) := idQue(i) } - } - val ptr_tmp = Mux(full, VecInit(Seq.fill(iqIdxWidth)(true.B)).asUInt, tail) - idQue(ptr_tmp) := idQue(Mux(isPop, popSel, deqSel)) - } - assert(ParallelAND(List.tabulate(iqSize)(i => ParallelOR(List.tabulate(iqSize)(j => i.U === idQue(j))))).asBool) - - //----------------------------------------- - // Redirect - //----------------------------------------- - // redirect enq - enqRedHit := io.redirect.valid && io.enqCtrl.bits.roqIdx.needFlush(io.redirect) - - // redirect issQue - val redHitVec = List.tabulate(iqSize)(i => issQue(i).uop.roqIdx.needFlush(io.redirect)) - for (i <- validQue.indices) { - when (redHitVec(i) && validQue(i)) { - validQue(i) := false.B - } - } - // reditect deq(issToExu) - val redIdHitVec = List.tabulate(iqSize)(i => issQue(idQue(i)).uop.roqIdx.needFlush(io.redirect)) - val selIsRed = ParallelOR((deqSelOH & VecInit(redIdHitVec).asUInt).asBools).asBool - - //----------------------------------------- - // Dequeue (or to Issue Stage) - 
//----------------------------------------- - val issueToExu = Reg(new ExuInput) - val issueToExuValid = RegInit(false.B) - val deqFlushHit = issueToExu.uop.roqIdx.needFlush(io.redirect) - val deqCanIn = !issueToExuValid || io.deq.ready || deqFlushHit - - val toIssFire = deqCanIn && has1Rdy && !isPop && !selIsRed - popOne := deqCanIn && (has1Rdy || isPop) // send a empty or valid term to issueStage - - when (toIssFire) { - issueToExu := issQue(deqSelIq) - issueToExuValid := true.B - validQue(deqSelIq) := enqFire && enqSelIq===deqSelIq - assert(validQue(deqSelIq)) - issueToExu.src1 := srcDataWire(deqSelIq)(0) - if (src2Use) { issueToExu.src2 := srcDataWire(deqSelIq)(1) } else { issueToExu.src2 := DontCare } - if (src3Use) { issueToExu.src3 := srcDataWire(deqSelIq)(2) } else { issueToExu.src3 := DontCare } - } - when ((deqFire || deqFlushHit) && !toIssFire) { - issueToExuValid := false.B - } - - io.deq.valid := issueToExuValid && !deqFlushHit - io.deq.bits := issueToExu - - enqSelIq := Mux(full, - Mux(isPop, - idQue(popSel), - deqSelIq - ), - idQue(tail) - ) // Note: direct by IQue's idx, different from deqSel - - io.numExist := Mux(tailAll === iqSize.U, (iqSize-1).U, tailAll) - - //----------------------------------------- - // Issue with No Delay - //----------------------------------------- - // when enq is ready && no other rdy && no pop && fireStage is ready && no flush - // send out directly without store the data - val enqAlreadyRdy = if(src3Listen) { if(src2Listen) enqSrcRdy(0)&&enqSrcRdy(1)&&enqSrcRdy(2) else enqSrcRdy(0)&&enqSrcRdy(2) } else { if(src2Listen) enqSrcRdy(0)&&enqSrcRdy(1) else enqSrcRdy(0) } - val enqALRdyNext = OneCycleFire(enqAlreadyRdy && enqFire) - val enqSendFlushHit = issQue(enqSelIqNext).uop.roqIdx.needFlush(io.redirect) - val enqSendEnable = if(fifo) { RegNext(tailAll===0.U) && enqALRdyNext && (!issueToExuValid || deqFlushHit) && (enqSelIqNext === deqSelIq) && !isPop && !enqSendFlushHit/* && has1Rdy*//* && io.deq.ready*/ } else { 
enqALRdyNext && (!issueToExuValid || deqFlushHit) && (enqSelIqNext === deqSelIq) && !isPop && !enqSendFlushHit/* && has1Rdy*//* && io.deq.ready*/ } // FIXME: has1Rdy has combination loop - when (enqSendEnable) { - io.deq.valid := true.B - io.deq.bits := issQue(enqSelIqNext) - io.deq.bits.src1 := enqDataVec(0) - if (src2Use) { io.deq.bits.src2 := enqDataVec(1) } - if (src3Use) { io.deq.bits.src3 := enqDataVec(2) } - issueToExuValid := false.B - when (!io.deq.ready) { // if Func Unit is not ready, store it to FireStage - issueToExuValid := true.B - } - } - - //----------------------------------------- - // Wakeup and Bypass - //----------------------------------------- - val cdbValid = io.wakeUpPorts.map(_.valid) - val cdbData = io.wakeUpPorts.map(_.bits.data) - val cdbPdest = io.wakeUpPorts.map(_.bits.uop.pdest) - val cdbrfWen = io.wakeUpPorts.map(_.bits.uop.ctrl.rfWen) - val cdbfpWen = io.wakeUpPorts.map(_.bits.uop.ctrl.fpWen) - - for(i <- idQue.indices) { // Should be IssQue.indices but Mem() does not support - for(j <- 0 until srcListenNum) { - val hitVec = cdbValid.indices.map(k => psrc(i)(j) === cdbPdest(k) && cdbValid(k) && (srcType(i)(j)===SrcType.reg && cdbrfWen(k) && cdbPdest(k) =/= 0.U || srcType(i)(j)===SrcType.fp && cdbfpWen(k))) - val hit = ParallelOR(hitVec).asBool - val data = ParallelMux(hitVec zip cdbData) - when (validQue(i) && !srcRdyVec(i)(j) && hit) { - srcDataWire(i)(j) := data - srcRdyVec(i)(j) := true.B - } - // XSDebug(validQue(i) && !srcRdyVec(i)(j) && hit, "WakeUp: Sel:%d Src:(%d|%d) Rdy:%d Hit:%d HitVec:%b Data:%x\n", i.U, j.U, psrc(i)(j), srcRdyVec(i)(j), hit, VecInit(hitVec).asUInt, data) - for (k <- cdbValid.indices) { - XSDebug(validQue(i) && !srcRdyVec(i)(j) && hit && hitVec(k), - "WakeUpHit: IQIdx:%d Src%d:%d Ports:%d Data:%x Pc:%x RoqIdx:%x\n", - i.U, j.U, psrc(i)(j), k.U, cdbData(k), io.wakeUpPorts(k).bits.uop.cf.pc, io.wakeUpPorts(k).bits.uop.roqIdx.asUInt) - } - } - } - - val bpPdest = io.bypassUops.map(_.bits.pdest) - val 
bpValid = io.bypassUops.map(_.valid) - val bpData = io.bypassData.map(_.bits.data) - val bprfWen = io.bypassUops.map(_.bits.ctrl.rfWen) - val bpfpWen = io.bypassUops.map(_.bits.ctrl.fpWen) - - for (i <- idQue.indices) { // Should be IssQue.indices but Mem() does not support - for (j <- 0 until srcListenNum) { - val hitVec = bpValid.indices.map(k => psrc(i)(j) === bpPdest(k) && bpValid(k) && (srcType(i)(j)===SrcType.reg && bprfWen(k) && bpPdest(k) =/= 0.U || srcType(i)(j)===SrcType.fp && bpfpWen(k))) - val hitVecNext = hitVec.map(RegNext(_)) - val hit = ParallelOR(hitVec).asBool - when (validQue(i) && !srcRdyVec(i)(j) && hit) { - srcRdyVec(i)(j) := true.B - } - when (RegNext(validQue(i) && !srcRdyVec(i)(j) && hit)) { - srcDataWire(i)(j) := PriorityMux(hitVecNext zip bpData) - } - // XSDebug(validQue(i) && !srcRdyVec(i)(j) && hit, "BypassCtrl: Sel:%d Src:(%d|%d) Rdy:%d Hit:%d HitVec:%b\n", i.U, j.U, psrc(i)(j), srcRdyVec(i)(j), hit, VecInit(hitVec).asUInt) - for (k <- bpValid.indices) { - XSDebug(validQue(i) && !srcRdyVec(i)(j) && hit && hitVec(k), - "BypassCtrlHit: IQIdx:%d Src%d:%d Ports:%d Pc:%x RoqIdx:%x\n", - i.U, j.U, psrc(i)(j), k.U, io.bypassUops(k).bits.cf.pc, io.bypassUops(k).bits.roqIdx.asUInt) - } - // XSDebug(RegNext(validQue(i) && !srcRdyVec(i)(j) && hit), "BypassData: Sel:%d Src:(%d|%d) HitVecNext:%b Data:%x (for last cycle's Ctrl)\n", i.U, j.U, psrc(i)(j), VecInit(hitVecNext).asUInt, ParallelMux(hitVecNext zip bpData)) - for (k <- bpValid.indices) { - XSDebug(RegNext(validQue(i) && !srcRdyVec(i)(j) && hit && hitVec(k)), - "BypassDataHit: IQIdx:%d Src%d:%d Ports:%d Data:%x Pc:%x RoqIdx:%x\n", - i.U, j.U, psrc(i)(j), k.U, bpData(k), io.bypassUops(k).bits.cf.pc, io.bypassUops(k).bits.roqIdx.asUInt) - } - } - } - - // Enqueue Bypass - val enqCtrl = io.enqCtrl - val enqPsrc = List(enqCtrl.bits.psrc1, enqCtrl.bits.psrc2, enqCtrl.bits.psrc3) - val enqSrcType = List(enqCtrl.bits.ctrl.src1Type, enqCtrl.bits.ctrl.src2Type, enqCtrl.bits.ctrl.src3Type) - for (i 
<- 0 until srcListenNum) { - val hitVec = bpValid.indices.map(j => enqPsrc(i)===bpPdest(j) && bpValid(j) && (enqSrcType(i)===SrcType.reg && bprfWen(j) && bpPdest(j) =/= 0.U || enqSrcType(i)===SrcType.fp && bpfpWen(j))) - val hitVecNext = hitVec.map(RegNext(_)) - val hit = ParallelOR(hitVec).asBool - when (enqFire && hit && !enqSrcRdy(i)) { - srcRdyVec(enqSelIq)(i) := true.B - } - when (RegNext(enqFire && hit && !enqSrcRdy(i))) { - srcDataWire(enqSelIqNext)(i) := ParallelMux(hitVecNext zip bpData) - } - // XSDebug(enqFire && hit, "EnqBypassCtrl: enqSelIq:%d Src:(%d|%d) Hit:%d HitVec:%b \n", enqSelIq, i.U, enqPsrc(i), hit, VecInit(hitVec).asUInt) - for (k <- bpValid.indices) { - XSDebug(enqFire && hit && !enqSrcRdy(i) && hitVec(k), - "EnqBypassCtrlHit: enqSelIq:%d Src%d:%d Ports:%d Pc:%x RoqIdx:%x\n", - enqSelIq, i.U, enqPsrc(i), k.U, io.bypassUops(k).bits.cf.pc, io.bypassUops(k).bits.roqIdx.asUInt) - } - // XSDebug(RegNext(enqFire && hit), "EnqBypassData: enqSelIqNext:%d Src:(%d|%d) HitVecNext:%b Data:%x (for last cycle's Ctrl)\n", enqSelIqNext, i.U, enqPsrc(i), VecInit(hitVecNext).asUInt, ParallelMux(hitVecNext zip bpData)) - for (k <- bpValid.indices) { - XSDebug(RegNext(enqFire && hit && !enqSrcRdy(i) && hitVec(k)), - "EnqBypassDataHit: enqSelIq:%d Src%d:%d Ports:%d Data:%x Pc:%x RoqIdx:%x\n", - enqSelIq, i.U, enqPsrc(i), k.U, bpData(k), io.bypassUops(k).bits.cf.pc, io.bypassUops(k).bits.roqIdx.asUInt) - } - } - - if (enableBypass) { - // send out bypass - val sel = io.selectedUop - sel.valid := toIssFire && !enqSendEnable - sel.bits := DontCare - sel.bits.pdest := issQue(deqSelIq).uop.pdest - sel.bits.cf.pc := issQue(deqSelIq).uop.cf.pc - sel.bits.roqIdx := issQue(deqSelIq).uop.roqIdx - sel.bits.ctrl.rfWen := issQue(deqSelIq).uop.ctrl.rfWen - sel.bits.ctrl.fpWen := issQue(deqSelIq).uop.ctrl.fpWen - } - XSInfo(io.redirect.valid, "Redirect: valid:%d isExp:%d isFpp:%d brTag:%d redHitVec:%b redIdHitVec:%b enqHit:%d selIsRed:%d\n", io.redirect.valid, 
io.redirect.bits.isException, io.redirect.bits.isFlushPipe, io.redirect.bits.brTag.value, VecInit(redHitVec).asUInt, VecInit(redIdHitVec).asUInt, enqRedHit, selIsRed) - XSInfo(enqFire, - s"EnqCtrl(%d %d) enqSelIq:%d Psrc/Rdy(%d:%d %d:%d %d:%d) Dest:%d oldDest:%d pc:%x roqIdx:%x\n", - io.enqCtrl.valid, io.enqCtrl.ready, enqSelIq, io.enqCtrl.bits.psrc1, io.enqCtrl.bits.src1State, - io.enqCtrl.bits.psrc2, io.enqCtrl.bits.src2State, io.enqCtrl.bits.psrc3, io.enqCtrl.bits.src3State, - io.enqCtrl.bits.pdest, io.enqCtrl.bits.old_pdest, io.enqCtrl.bits.cf.pc, io.enqCtrl.bits.roqIdx.asUInt) - XSInfo(enqFireNext, - "EnqData: src1:%x src2:%x src3:%x pc:%x roqIdx:%x(for last cycle's Ctrl)\n", - io.enqData.src1, io.enqData.src2, io.enqData.src3, issQue(enqSelIqNext).uop.cf.pc, issQue(enqSelIqNext).uop.roqIdx.asUInt) - XSInfo(deqFire, - "Deq:(%d %d) [%d|%x][%d|%x][%d|%x] pdest:%d pc:%x roqIdx:%x\n", - io.deq.valid, io.deq.ready, io.deq.bits.uop.psrc1, io.deq.bits.src1, io.deq.bits.uop.psrc2, io.deq.bits.src2, io.deq.bits.uop.psrc3, - io.deq.bits.src3, io.deq.bits.uop.pdest, io.deq.bits.uop.cf.pc, io.deq.bits.uop.roqIdx.asUInt) - XSDebug("tailAll:%d KID(%d%d%d) tailDot:%b tailDot2:%b selDot:%b popDot:%b moveDot:%b In(%d %d) Out(%d %d)\n", tailAll, tailKeep, tailInc, tailDec, tailDot, tailDot2, selDot, popDot, moveDot, io.enqCtrl.valid, io.enqCtrl.ready, io.deq.valid, io.deq.ready) - XSInfo(issueToExuValid, - "FireStage:Out(%d %d) src1(%d|%x) src2(%d|%x) src3(%d|%x) deqFlush:%d pc:%x roqIdx:%d\n", - io.deq.valid, io.deq.ready, issueToExu.uop.psrc1, issueToExu.src1, issueToExu.uop.psrc2, issueToExu.src2, issueToExu.uop.psrc3, issueToExu.src3, - deqFlushHit, issueToExu.uop.cf.pc, issueToExu.uop.roqIdx.asUInt) - if(enableBypass) { - XSDebug("popOne:%d isPop:%d popSel:%d deqSel:%d deqCanIn:%d toIssFire:%d has1Rdy:%d selIsRed:%d nonValid:%b SelUop:(%d, %d)\n", popOne, isPop, popSel, deqSel, deqCanIn, toIssFire, has1Rdy, selIsRed, nonValid, io.selectedUop.valid, 
io.selectedUop.bits.pdest) - } else { - XSDebug("popOne:%d isPop:%d popSel:%d deqSel:%d deqCanIn:%d toIssFire:%d has1Rdy:%d selIsRed:%d nonValid:%b\n", popOne, isPop, popSel, deqSel, deqCanIn, toIssFire, has1Rdy, selIsRed, nonValid) - } - - XSDebug(enqSendEnable, p"NoDelayIss: enqALRdy:${enqAlreadyRdy} *Next:${enqALRdyNext} En:${enqSendEnable} flush:${enqSendFlushHit} enqSelIqNext:${enqSelIqNext} deqSelIq:${deqSelIq} deqReady:${io.deq.ready}\n") - XSDebug(s"id|v|r|psrc|r| src1 |psrc|r| src2 |psrc|r| src3 |brTag| pc |roqIdx Exu:${exuCfg.name}\n") - - for (i <- 0 until iqSize) { - when (i.U===tail && tailAll=/=8.U) { - XSDebug("%d |%d|%d| %d|%b|%x| %d|%b|%x| %d|%b|%x| %x |%x|%x <-\n", - idQue(i), - idValidQue(i), - srcRdy(idQue(i)), - psrc(idQue(i))(0), - srcRdyVec(idQue(i))(0), - srcData(idQue(i))(0), - psrc(idQue(i))(1), - srcRdyVec(idQue(i))(1), - srcData(idQue(i))(1), - psrc(idQue(i))(2), - srcRdyVec(idQue(i))(2), - srcData(idQue(i))(2), - issQue(idQue(i)).uop.brTag.value, - issQue(idQue(i)).uop.cf.pc, - issQue(idQue(i)).uop.roqIdx.asUInt - ) - }.otherwise { - XSDebug("%d |%d|%d| %d|%b|%x| %d|%b|%x| %d|%b|%x| %x |%x|%x\n", - idQue(i), - idValidQue(i), - srcRdy(idQue(i)), - psrc(idQue(i))(0), - srcRdyVec(idQue(i))(0), - srcData(idQue(i))(0), - psrc(idQue(i))(1), - srcRdyVec(idQue(i))(1), - srcData(idQue(i))(1), - psrc(idQue(i))(2), - srcRdyVec(idQue(i))(2), - srcData(idQue(i))(2), - issQue(idQue(i)).uop.brTag.value, - issQue(idQue(i)).uop.cf.pc, - issQue(idQue(i)).uop.roqIdx.asUInt - ) - } - } - - XSPerf("utilization", tailAll) -} diff --git a/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala b/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala new file mode 100644 index 0000000000000000000000000000000000000000..3d045b6cb78e32f35be575ac564e5aa9b3d4c937 --- /dev/null +++ b/src/main/scala/xiangshan/backend/issue/ReservationStationNew.scala @@ -0,0 +1,457 @@ +package xiangshan.backend.issue + +import chisel3._ +import chisel3.util._ 
import xiangshan._
import utils._
import xiangshan.backend.exu.{Exu, ExuConfig}

/** Delay line that carries a selected uop from `io.in` to `io.out`.
  *
  *  - `number < 0`  : the output is never valid.
  *  - `number == 0` : combinational pass-through of `io.in`.
  *  - `number > 0`  : `number` register stages; when a uop advances from
  *                    stage i to stage i+1 its valid bit is cleared if
  *                    `roqIdx.needFlush(io.redirect)` holds.
  *
  * Note that the redirect check is applied only on stage-to-stage moves:
  * stage 0 latches `io.in.valid` unchanged and `io.out.valid` is the raw
  * valid bit of the last stage.
  *
  * @param number number of register stages (see above for <= 0)
  */
class BypassQueue(number: Int) extends XSModule {
  val io = IO(new Bundle {
    val in = Flipped(ValidIO(new MicroOp))
    val out = ValidIO(new MicroOp)
    val redirect = Flipped(ValidIO(new Redirect))
  })
  if (number < 0) {
    io.out.valid := false.B
    io.out.bits := DontCare
  } else if(number == 0) {
    io.in <> io.out
    io.out.valid := io.in.valid
  } else {
    val queue = Seq.fill(number)(RegInit(0.U.asTypeOf(new Bundle{
      val valid = Bool()
      val bits = new MicroOp
    })))
    queue(0).valid := io.in.valid
    queue(0).bits := io.in.bits
    for (i <- 0 until (number-1)) {
      // bulk copy first, then override valid (last connect wins)
      queue(i+1) := queue(i)
      queue(i+1).valid := queue(i).valid && !queue(i).bits.roqIdx.needFlush(io.redirect)
    }
    io.out.valid := queue(number-1).valid
    io.out.bits := queue(number-1).bits
    for (i <- 0 until number) {
      XSDebug(queue(i).valid, p"BPQue(${i.U}): pc:${Hexadecimal(queue(i).bits.cf.pc)} roqIdx:${queue(i).bits.roqIdx}" +
        p" pdest:${queue(i).bits.pdest} rfWen:${queue(i).bits.ctrl.rfWen} fpWen:${queue(i).bits.ctrl.fpWen}\n")
    }
  }
}

/** Signals exchanged between [[ReservationStationCtrl]] and
  * [[ReservationStationData]].
  *
  * Directions are written from the Ctrl side; the Data module instantiates
  * this bundle `Flipped`.
  */
class RSCtrlDataIO extends XSBundle {
  // TODO: current: Ctrl to Data, next: Data to Ctrl
  // Ctrl -> Data: storage slot the enqueuing uop is written to
  val enqPtr = Output(UInt(log2Up(IssQueSize).W))
  // Ctrl -> Data: storage slot selected for issue
  val deqPtr = ValidIO(UInt(log2Up(IssQueSize).W)) // one cycle earlier
  // Ctrl -> Data: the enqueuing uop, valid when the enqueue fires
  val enqCtrl = ValidIO(new MicroOp)

  // Data -> Ctrl: function unit can accept an issue
  val fuReady = Input(Bool())
  // Data -> Ctrl: per-slot, per-source "operand became available" flags
  val srcUpdate = Input(Vec(IssQueSize+1, Vec(3, Bool()))) // Note: the last one for enq
  // Data -> Ctrl: per-slot "must be flushed by the current redirect" bits
  val redVec = Input(UInt(IssQueSize.W))
  // Data -> Ctrl: per-slot "feedback addresses this slot" flags
  val feedback = Input(Vec(IssQueSize+1, Bool())) // Note: the last one for hit
}

/** Control half of the reservation station.
  *
  * Keeps, per queue position, an entry state (`s_idle`/`s_valid`/`s_wait`/
  * `s_replay`), the source-ready bits and the index of the storage slot in
  * the Data module (`idxQueue`). Each cycle it selects a ready entry, finds
  * bubbles (idle entries below the tail), compacts the queue with `moveMask`,
  * and handles redirect, feedback/replay, enqueue and wakeup.
  *
  * Indexing: `stateQueue`, `srcQueue` and `idxQueue` are indexed by queue
  * position; `cntQueue` and everything coming from `io.data` are indexed by
  * storage slot, i.e. through `idxQueue(position)`.
  *
  * @param exuCfg              execution unit configuration (not referenced in this class body)
  * @param wakeupCnt           not referenced in this class body
  * @param extraListenPortsCnt not referenced in this class body
  * @param srcNum              number of source-ready bits tracked per entry
  * @param feedback            when true, an issued entry moves to `s_wait` and is
  *                            released or replayed by `io.data.feedback`
  * @param fixedDelay          >= 0 enables fast wakeup / non-blocked issue, -1 disables it
  * @param replayDelay         a replayed entry's counter is loaded with `replayDelay-1`
  */
class ReservationStationCtrl
(
  val exuCfg: ExuConfig,
  wakeupCnt: Int,
  extraListenPortsCnt: Int,
  srcNum: Int = 3,
  feedback: Boolean,
  fixedDelay: Int,
  replayDelay: Int = 10
) extends XSModule with HasCircularQueuePtrHelper {

  val iqSize = IssQueSize
  val iqIdxWidth = log2Up(iqSize)
  // Must use the same condition as ReservationStationData (`>= 0`): a fixedDelay
  // of 0 is a valid fast-wakeup configuration, only -1 turns it off.
  val fastWakeup = fixedDelay >= 0 // NOTE: if do not enable fastWakeup(bypass), set fixedDelay to -1
  val nonBlocked = fastWakeup

  val io = IO(new XSBundle {
    // flush
    val redirect = Flipped(ValidIO(new Redirect))

    // enq Ctrl sigs at dispatch-2, only use srcState
    val enqCtrl = Flipped(DecoupledIO(new MicroOp))

    // to DataPart
    val data = new RSCtrlDataIO

    // to Dispatch
    val numExist = Output(UInt(iqIdxWidth.W))
  })

  // control part:

  val s_idle :: s_valid :: s_wait :: s_replay :: Nil = Enum(4)

  // elaboration-time parameters lifted to hardware constants
  val needFeedback = if (feedback) true.B else false.B
  val notBlock = if (nonBlocked) true.B else false.B
  val stateQueue = RegInit(VecInit(Seq.fill(iqSize)(s_idle)))
  val validQueue = stateQueue.map(_ === s_valid)
  val emptyQueue = stateQueue.map(_ === s_idle)
  val srcQueue = Reg(Vec(iqSize, Vec(srcNum, Bool())))
  val cntQueue = Reg(Vec(iqSize, UInt(log2Up(replayDelay).W)))

  // rs queue part:
  // val tailPtr = RegInit(0.U((iqIdxWidth+1).W))
  val tailPtr = RegInit(0.U.asTypeOf(new CircularQueuePtr(iqSize)))
  // queue position -> storage slot; initially the identity mapping
  val idxQueue = RegInit(VecInit((0 until iqSize).map(_.U(iqIdxWidth.W))))
  // an entry is ready when it is valid and all of its source bits are set
  val readyQueue = VecInit(srcQueue.zip(validQueue).map{ case (a,b) => Cat(a).andR & b })

  // redirect
  val redHitVec = VecInit((0 until iqSize).map(i => io.data.redVec(idxQueue(i))))
  val fbMatchVec = (0 until iqSize).map(i => needFeedback && io.data.feedback(idxQueue(i)) &&
    (stateQueue(i) === s_wait || stateQueue(i)===s_valid))
  val fbHit = io.data.feedback(IssQueSize)

  // select ready
  // for no replay, select just equal to deq (attached)
  // with replay, select is just two stage with deq.
  val issFire = Wire(Bool())
  val moveMask = WireInit(0.U(iqSize.W))
  val selectedIdxRegOH = Wire(UInt(iqSize.W))
  val selectMask = WireInit(VecInit(
    (0 until iqSize).map(i =>
      readyQueue(i) && Mux(notBlock, true.B, !(selectedIdxRegOH(i) && (issFire)))
      // NOTE: if nonBlocked, then change state at sel stage
    )
  ))
  val haveBubble = Wire(Bool())
  val (selectedIdxWire, selected) = PriorityEncoderWithFlag(selectMask)
  val redSel = redHitVec(selectedIdxWire)
  val selValid = !redSel && selected && !haveBubble
  val selReg = RegNext(selValid)
  // the registered position is corrected by one if the entry is shifted this cycle
  val selectedIdxReg = RegNext(selectedIdxWire - moveMask(selectedIdxWire))
  selectedIdxRegOH := UIntToOH(selectedIdxReg)

  // sel bubble
  // TODO:
  val bubIdxRegOH = Wire(UInt(iqSize.W))
  val bubMask = WireInit(VecInit(
    (0 until iqSize).map(i => emptyQueue(i) && !bubIdxRegOH(i) &&
      Mux(notBlock, !selectedIdxRegOH(i), true.B)
  )))
  val (firstBubble, findBubble) = PriorityEncoderWithFlag(bubMask)
  haveBubble := findBubble && (firstBubble < tailPtr.asUInt)
  val bubValid = haveBubble
  val bubReg = RegNext(bubValid)
  val bubIdxReg = RegNext(firstBubble - moveMask(firstBubble))
  bubIdxRegOH := UIntToOH(bubIdxReg)

  // deq
  // TODO: divide needFeedback and not needFeedback
  val deqValid = bubReg/*fire an bubble*/ || (issFire && !needFeedback/*fire an rdy*/)
  val deqIdx = Mux(bubReg, bubIdxReg, selectedIdxReg) // TODO: may have one more cycle delay than fire slot
  // every position at or above deqIdx shifts down by one when a dequeue happens
  moveMask := {
    (Fill(iqSize, 1.U(1.W)) << deqIdx)(iqSize-1, 0)
  } & Fill(iqSize, deqValid)

  // move
  for(i <- 0 until iqSize-1){
    when(moveMask(i)){
      idxQueue(i) := idxQueue(i+1)
      srcQueue(i).zip(srcQueue(i+1)).foreach{ case (a,b) => a := b }
      stateQueue(i) := stateQueue(i+1)
    }
  }
  when (notBlock && selValid) { // if notBlock, disable at select stage
    stateQueue(selectedIdxWire - moveMask(selectedIdxWire)) := s_idle
    // TODO: may have long latency
  }
  when(deqValid){
    // the freed storage slot is recycled at the last queue position
    idxQueue.last := idxQueue(deqIdx)
    stateQueue.last := s_idle
  }
  when (issFire && needFeedback) {
    stateQueue(selectedIdxReg) := s_wait
  }


  // redirect and feedback
  for (i <- 0 until iqSize) {
    val cnt = cntQueue(idxQueue(i))

    if (i != 0) { // TODO: combine the two case
      // destination position after this cycle's shift
      val nextIdx = i.U - moveMask(i-1)
      when (stateQueue(i)===s_replay) {
        when (cnt===0.U) { stateQueue(nextIdx) := s_valid }
        .otherwise { cnt := cnt - 1.U }
      }
      when (fbMatchVec(i)) {
        stateQueue(nextIdx) := Mux(fbHit, s_idle, s_replay)
        cnt := Mux(fbHit, cnt, (replayDelay-1).U)
      }
      when (redHitVec(i)) { stateQueue(nextIdx) := s_idle }
    } else { when (!moveMask(i)) {
      // position 0 has nowhere to shift to; only update it when it is not being dequeued
      val nextIdx = i
      when (stateQueue(i)===s_replay) {
        when (cnt===0.U) { stateQueue(nextIdx) := s_valid }
        .otherwise { cnt := cnt - 1.U }
      }
      when (fbMatchVec(i)) {
        stateQueue(nextIdx) := Mux(fbHit, s_idle, s_replay)
        cnt := Mux(fbHit, cnt, (replayDelay-1).U)
      }
      when (redHitVec(i)) { stateQueue(nextIdx) := s_idle }
    }}
  }

  // output
  val issValid = selReg && !redHitVec(selectedIdxReg)
  issFire := issValid && Mux(notBlock, true.B, io.data.fuReady)
  if (nonBlocked) { assert(RegNext(io.data.fuReady), "if fu wanna fast wakeup, it should not block")}

  // enq
  val tailAfterRealDeq = tailPtr - (issFire && !needFeedback|| bubReg)
  val isFull = tailAfterRealDeq.flag // tailPtr===qsize.U
  // agreement with dispatch: don't fire when io.redirect.valid
  val enqFire = io.enqCtrl.fire() && !io.redirect.valid
  tailPtr := tailAfterRealDeq + enqFire

  io.enqCtrl.ready := !isFull
  val enqUop = io.enqCtrl.bits
  val srcSeq = Seq(enqUop.psrc1, enqUop.psrc2, enqUop.psrc3)
  val srcTypeSeq = Seq(enqUop.ctrl.src1Type, enqUop.ctrl.src2Type, enqUop.ctrl.src3Type)
  val srcStateSeq = Seq(enqUop.src1State, enqUop.src2State, enqUop.src3State)

  val enqIdx_ctrl = tailAfterRealDeq.value
  val enqBpVec = io.data.srcUpdate(IssQueSize)

  /** True when the source needs no register value: its type is neither
    * `SrcType.reg` nor `SrcType.fp`, or it is integer register 0.
    */
  def stateCheck(src: UInt, srcType: UInt): Bool = {
    (srcType =/= SrcType.reg && srcType =/= SrcType.fp) ||
    (srcType === SrcType.reg && src === 0.U)
  }

  when (enqFire) {
    stateQueue(enqIdx_ctrl) := s_valid
    srcQueue(enqIdx_ctrl).zipWithIndex.foreach{ case (s, i) =>
      s := Mux(enqBpVec(i) || stateCheck(srcSeq(i), srcTypeSeq(i)), true.B,
        srcStateSeq(i)===SrcState.rdy)
    }
    XSDebug(p"EnqCtrl: roqIdx:${enqUop.roqIdx} pc:0x${Hexadecimal(enqUop.cf.pc)} " +
      p"src1:${srcSeq(0)} state:${srcStateSeq(0)} type:${srcTypeSeq(0)} src2:${srcSeq(1)} " +
      p" state:${srcStateSeq(1)} type:${srcTypeSeq(1)} src3:${srcSeq(2)} state:${srcStateSeq(2)} " +
      p"type:${srcTypeSeq(2)}\n")
  }

  // wakeup
  for(i <- 0 until IssQueSize) {
    val hitVec = io.data.srcUpdate(idxQueue(i))
    for(j <- 0 until srcNum) {
      when (hitVec(j) && validQueue(i)) {
        // write to the position the entry will occupy after this cycle's shift
        srcQueue(i.U - moveMask(i))(j) := true.B
        XSDebug(p"srcHit: i:${i.U} j:${j.U}\n")
      }
    }
  }

  // other to Data
  io.data.enqPtr := idxQueue(Mux(tailPtr.flag, deqIdx, tailPtr.value))
  io.data.deqPtr.valid := selValid
  io.data.deqPtr.bits := idxQueue(selectedIdxWire)
  io.data.enqCtrl.valid := enqFire
  io.data.enqCtrl.bits := io.enqCtrl.bits

  // other io
  io.numExist := Mux(tailPtr.flag, (iqSize-1).U, tailPtr.value) // NOTE: numExist is iqIdxWidth.W, maybe a bug

  // assert
  assert(RegNext(Mux(tailPtr.flag, tailPtr.value===0.U, true.B)))

  // NOTE(review): this debug print reads srcQueue(i)(0..2) and therefore assumes srcNum >= 3
  val print = !(tailPtr.asUInt===0.U) || io.enqCtrl.valid
  XSDebug(print || true.B, p"In(${io.enqCtrl.valid} ${io.enqCtrl.ready}) Out(${issValid} ${io.data.fuReady})\n")
  XSDebug(print , p"tailPtr:${tailPtr} tailPtrAdq:${tailAfterRealDeq} isFull:${isFull} " +
    p"needFeed:${needFeedback} vQue:${Binary(VecInit(validQueue).asUInt)} rQue:${Binary(readyQueue.asUInt)}\n")
  XSDebug(print && Cat(redHitVec).orR, p"Redirect: ${Hexadecimal(redHitVec.asUInt)}\n")
  XSDebug(print && Cat(fbMatchVec).orR, p"Feedback: ${Hexadecimal(VecInit(fbMatchVec).asUInt)} Hit:${fbHit}\n")
  XSDebug(print, p"moveMask:${Binary(moveMask)} selMask:${Binary(selectMask.asUInt)} haveBub:${haveBubble}\n")
  XSDebug(print, p"selIdxWire:${selectedIdxWire} selected:${selected} redSel:${redSel}" +
    p"selV:${selValid} selReg:${selReg} selIdxReg:${selectedIdxReg} selIdxRegOH:${Binary(selectedIdxRegOH)}\n")
  XSDebug(print, p"bubMask:${Binary(bubMask.asUInt)} firstBub:${firstBubble} findBub:${findBubble} " +
    p"bubReg:${bubReg} bubIdxReg:${bubIdxReg} bubIdxRegOH:${Binary(bubIdxRegOH)}\n")
  XSDebug(p" :Idx|v|r|s |cnt|s1:s2:s3\n")
  for(i <- srcQueue.indices) {
    XSDebug(p"${i.U}: ${idxQueue(i)}|${validQueue(i)}|${readyQueue(i)}|${stateQueue(i)}|" +
      p"${cntQueue(i)}|${srcQueue(i)(0)}:${srcQueue(i)(1)}:${srcQueue(i)(2)}\n")
  }
}

/** Data half of the reservation station.
  *
  * Stores the uop and the operand data of every slot. The uop is written
  * when `io.ctrl.enqCtrl.valid` is set, and the operands from `io.enqData`
  * one cycle later. Operands are also captured from `io.extraListenPorts`
  * (data arrives with the hit) and from `io.broadcastedUops` (data is taken
  * from `io.writeBackedData` one cycle after the hit). The slot named by
  * `io.ctrl.deqPtr` is presented on `io.deq` one cycle after the pointer was
  * valid.
  *
  * @param exuCfg              execution unit configuration (not referenced in this class body)
  * @param wakeupCnt           number of broadcast uop / write-back data ports
  * @param extraListenPortsCnt number of extra write-back ports
  * @param fixedDelay          >= 0 enables fast wakeup and sets the depth of the
  *                            [[BypassQueue]] driving `io.selectedUop`; -1 disables it
  * @param feedback            when true, `io.feedback` is forwarded to the Ctrl module
  * @param srcNum              width of the per-slot operand storage
  *                            (NOTE(review): the body indexes operands 0..2 directly,
  *                            so values below 3 look unsupported; confirm)
  */
class ReservationStationData
(
  val exuCfg: ExuConfig,
  wakeupCnt: Int,
  extraListenPortsCnt: Int,
  fixedDelay: Int,
  feedback: Boolean,
  srcNum: Int = 3
) extends XSModule {

  val iqSize = IssQueSize
  val iqIdxWidth = log2Up(iqSize)
  val fastWakeup = fixedDelay >= 0 // NOTE: if do not enable fastWakeup(bypass), set fixedDelay to -1
  val nonBlocked = fastWakeup
  val notBlock = if (nonBlocked) true.B else false.B

  val io = IO(new XSBundle {
    // flush
    val redirect = Flipped(ValidIO(new Redirect))

    // enq Data at next cycle (regfile has 1 cycle latency)
    val enqData = Input(new ExuInput)

    // send to exu
    val deq = DecoupledIO(new ExuInput)

    // listen to RSCtrl
    val ctrl = Flipped(new RSCtrlDataIO)

    // broadcast selected uop to other issue queues
    val selectedUop = ValidIO(new MicroOp)

    // receive broadcasted uops from any related issue queue;
    // to simplify the wake-up logic, the uops broadcasted by this queue itself
    // are also in 'broadcastedUops'
    val broadcastedUops = Vec(wakeupCnt, Flipped(ValidIO(new MicroOp)))

    // listen to write back data bus (certain latency)
    // and extra write back (uncertain latency)
    val writeBackedData = Vec(wakeupCnt, Input(UInt(XLEN.W)))
    val extraListenPorts = Vec(extraListenPortsCnt, Flipped(ValidIO(new ExuOutput)))

    // tlb feedback
    val feedback = Flipped(ValidIO(new TlbFeedback))
  })

  val uop = Reg(Vec(iqSize, new MicroOp))
  val data = Reg(Vec(iqSize, Vec(srcNum, UInt(XLEN.W))))

  // TODO: change srcNum

  val enq = io.ctrl.enqPtr
  val sel = io.ctrl.deqPtr
  val deq = RegEnable(sel.bits, sel.valid)
  val enqCtrl = io.ctrl.enqCtrl
  val enqUop = enqCtrl.bits

  // enq
  val enqPtr = enq(log2Up(IssQueSize)-1,0)
  val enqPtrReg = RegEnable(enqPtr, enqCtrl.valid)
  val enqEn = enqCtrl.valid
  val enqEnReg = RegNext(enqEn)
  when (enqEn) {
    uop(enqPtr) := enqUop
    XSDebug(p"enqCtrl: enqPtr:${enqPtr} src1:${enqUop.psrc1}|${enqUop.src1State}|${enqUop.ctrl.src1Type}" +
      p" src2:${enqUop.psrc2}|${enqUop.src2State}|${enqUop.ctrl.src2Type} src3:${enqUop.psrc3}|" +
      p"${enqUop.src3State}|${enqUop.ctrl.src3Type} pc:0x${Hexadecimal(enqUop.cf.pc)} roqIdx:${enqUop.roqIdx}\n")
  }
  when (enqEnReg) { // TODO: turn to srcNum, not the 3
    data(enqPtrReg)(0) := io.enqData.src1
    data(enqPtrReg)(1) := io.enqData.src2
    data(enqPtrReg)(2) := io.enqData.src3
    XSDebug(p"enqData: enqPtrReg:${enqPtrReg} src1:${Hexadecimal(io.enqData.src1)}" +
      p" src2:${Hexadecimal(io.enqData.src2)} src3:${Hexadecimal(io.enqData.src3)}\n")
  }

  /** True when `uop` writes the register that `src` of type `srctype` reads:
    * same physical register and a matching write enable (integer register 0
    * never matches).
    */
  def wbHit(uop: MicroOp, src: UInt, srctype: UInt): Bool = {
    (src === uop.pdest) &&
    ((srctype === SrcType.reg && uop.ctrl.rfWen && src=/=0.U) ||
     (srctype === SrcType.fp && uop.ctrl.fpWen))
  }

  // wakeup and bypass
  /** Match a source against `io.extraListenPorts`.
    * @return (hit in this cycle, data of the hitting port)
    */
  def wakeup(src: UInt, srcType: UInt, valid: Bool = true.B) : (Bool, UInt) = {
    val hitVec = io.extraListenPorts.map(port => wbHit(port.bits.uop, src, srcType) && port.valid && valid)
    // at most one port may hit a given source
    assert(RegNext(PopCount(hitVec)===0.U || PopCount(hitVec)===1.U))

    val hit = ParallelOR(hitVec)
    (hit, ParallelMux(hitVec zip io.extraListenPorts.map(_.bits.data)))
  }

  /** Match a source against `io.broadcastedUops`.
    * @return (hit in this cycle, hit delayed by one cycle,
    *          data selected from `io.writeBackedData` by the delayed hit vector)
    */
  def bypass(src: UInt, srcType: UInt, valid: Bool = true.B) : (Bool, Bool, UInt) = {
    val hitVec = io.broadcastedUops.map(port => wbHit(port.bits, src, srcType) && port.valid && valid)
    // at most one port may hit a given source
    assert(RegNext(PopCount(hitVec)===0.U || PopCount(hitVec)===1.U))

    val hit = ParallelOR(hitVec)
    (hit, RegNext(hit), ParallelMux(hitVec.map(RegNext(_)) zip io.writeBackedData))
  }

  // default: no update; overridden below by later connections
  io.ctrl.srcUpdate.foreach(_.foreach(_ := false.B))
  for (i <- 0 until iqSize) {
    val srcSeq = Seq(uop(i).psrc1, uop(i).psrc2, uop(i).psrc3)
    val srcTypeSeq = Seq(uop(i).ctrl.src1Type, uop(i).ctrl.src2Type, uop(i).ctrl.src3Type)
    for (j <- 0 until 3) {
      val (wuHit, wuData) = wakeup(srcSeq(j), srcTypeSeq(j))
      val (bpHit, bpHitReg, bpData) = bypass(srcSeq(j), srcTypeSeq(j))
      when (wuHit || bpHit) { io.ctrl.srcUpdate(i)(j) := true.B }
      when (wuHit) { data(i)(j) := wuData }
      // skip the delayed bypass write when this slot has just been re-allocated
      when (bpHitReg && !(enqPtrReg===i.U && enqEnReg)) { data(i)(j) := bpData }
      // NOTE: the hit is computed from the stored uop's info, so there is an error case:
      // at enqueue, the hit uses the previous instruction's info, not the enqueuing one.
      // Correcting it here would add long latency, so it is handled in ctrl or elsewhere;
      // the enqueue bypass is done below
      XSDebug(wuHit, p"WUHit: (${i.U})(${j.U}) Data:0x${Hexadecimal(wuData)} i:${i.U} j:${j.U}\n")
      XSDebug(bpHit, p"BPHit: (${i.U})(${j.U}) i:${i.U} j:${j.U}\n")
      XSDebug(bpHitReg, p"BPHitData: (${i.U})(${j.U}) Data:0x${Hexadecimal(bpData)} i:${i.U} j:${j.U}\n")
    }
  }

  // deq
  io.deq.bits.uop := uop(deq)
  io.deq.bits.src1 := data(deq)(0)
  io.deq.bits.src2 := data(deq)(1)
  io.deq.bits.src3 := data(deq)(2)
  io.deq.valid := RegNext(sel.valid)
  if (nonBlocked) { assert(RegNext(io.deq.ready), s"${name} if fu wanna fast wakeup, it should not block")}

  // to ctrl
  val srcSeq = Seq(enqUop.psrc1, enqUop.psrc2, enqUop.psrc3)
  val srcTypeSeq = Seq(enqUop.ctrl.src1Type, enqUop.ctrl.src2Type, enqUop.ctrl.src3Type)
  io.ctrl.srcUpdate(IssQueSize).zipWithIndex.foreach{ case (h, i) =>
    val (bpHit, bpHitReg, bpData)= bypass(srcSeq(i), srcTypeSeq(i), enqCtrl.valid)
    when (bpHitReg) { data(enqPtrReg)(i) := bpData }
    h := bpHit
    // NOTE: enq bp is done here
    XSDebug(bpHit, p"EnqBPHit: (${i.U})\n")
    XSDebug(bpHitReg, p"EnqBPHitData: (${i.U}) data:${Hexadecimal(bpData)}\n")
  }
  io.ctrl.fuReady := Mux(notBlock, true.B, io.deq.ready)
  io.ctrl.redVec := VecInit(uop.map(_.roqIdx.needFlush(io.redirect))).asUInt

  io.ctrl.feedback := DontCare
  if (feedback) {
    (0 until IssQueSize).foreach(i =>
      io.ctrl.feedback(i) := uop(i).roqIdx.asUInt === io.feedback.bits.roqIdx.asUInt && io.feedback.valid)
    io.ctrl.feedback(IssQueSize) := io.feedback.bits.hit
  }


  // bypass send
  io.selectedUop <> DontCare
  if (fastWakeup) {
    val bpQueue = Module(new BypassQueue(fixedDelay))
    bpQueue.io.in.valid := sel.valid // FIXME: error when function is blocked => fu should not be blocked
    bpQueue.io.in.bits := uop(sel.bits)
    bpQueue.io.redirect := io.redirect
    io.selectedUop.valid := bpQueue.io.out.valid
    io.selectedUop.bits := bpQueue.io.out.bits

    XSDebug(io.selectedUop.valid, p"SelUop: pc:0x${Hexadecimal(io.selectedUop.bits.cf.pc)}" +
      p" roqIdx:${io.selectedUop.bits.roqIdx} pdest:${io.selectedUop.bits.pdest} " +
      p"rfWen:${io.selectedUop.bits.ctrl.rfWen} fpWen:${io.selectedUop.bits.ctrl.fpWen}\n" )
  }


  // log
  XSDebug(io.feedback.valid, p"feedback: roqIdx:${io.feedback.bits.roqIdx} hit:${io.feedback.bits.hit}\n")
  XSDebug(true.B, p"out(${io.deq.valid} ${io.deq.ready})\n")
  XSDebug(io.deq.valid, p"Deq(${io.deq.valid} ${io.deq.ready}): deqPtr:${deq} pc:${Hexadecimal(io.deq.bits.uop.cf.pc)}" +
    p" roqIdx:${io.deq.bits.uop.roqIdx} src1:${Hexadecimal(io.deq.bits.src1)} " +
    p" src2:${Hexadecimal(io.deq.bits.src2)} src3:${Hexadecimal(io.deq.bits.src3)}\n")
  XSDebug(p"Data: | src1:data | src2:data | src3:data |hit|pdest:rf:fp| roqIdx | pc\n")
  for(i <- data.indices) {
    XSDebug(p"${i.U}:|${uop(i).psrc1}:${Hexadecimal(data(i)(0))}|${uop(i).psrc2}:" +
      p"${Hexadecimal(data(i)(1))}|${uop(i).psrc3}:${Hexadecimal(data(i)(2))}|" +
      p"${Binary(io.ctrl.srcUpdate(i).asUInt)}|${uop(i).pdest}:${uop(i).ctrl.rfWen}:" +
      p"${uop(i).ctrl.fpWen}|${uop(i).roqIdx} |${Hexadecimal(uop(i).cf.pc)}\n")
  }
}
a/src/main/scala/xiangshan/backend/regfile/Regfile.scala b/src/main/scala/xiangshan/backend/regfile/Regfile.scala index 7ed3bc9e06ed46d1dda923b6eb3d5f7248e153d2..04e833a76f5836fd8ae057760db21eab1ae367b9 100644 --- a/src/main/scala/xiangshan/backend/regfile/Regfile.scala +++ b/src/main/scala/xiangshan/backend/regfile/Regfile.scala @@ -2,18 +2,17 @@ package xiangshan.backend.regfile import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils import xiangshan._ class RfReadPort extends XSBundle { val addr = Input(UInt(PhyRegIdxWidth.W)) - val data = Output(UInt(XLEN.W)) + val data = Output(UInt((XLEN + 1).W)) } class RfWritePort extends XSBundle { val wen = Input(Bool()) val addr = Input(UInt(PhyRegIdxWidth.W)) - val data = Input(UInt(XLEN.W)) + val data = Input(UInt((XLEN + 1).W)) } class Regfile @@ -21,44 +20,150 @@ class Regfile numReadPorts: Int, numWirtePorts: Int, hasZero: Boolean, - isMemRf: Boolean = false + len: Int ) extends XSModule { val io = IO(new Bundle() { val readPorts = Vec(numReadPorts, new RfReadPort) val writePorts = Vec(numWirtePorts, new RfWritePort) }) - val mem = Mem(NRPhyRegs, UInt(XLEN.W)) - - val debugRegSync = WireInit(0.U(XLEN.W)) - val debugCnt = RegInit(0.U((PhyRegIdxWidth+1).W)) - when(!debugCnt.head(1).asBool()){ - debugCnt := debugCnt + 1.U - if(isMemRf){ - BoringUtils.addSink(debugRegSync, "DEBUG_REG_SYNC") - mem(debugCnt) := debugRegSync - } else if (hasZero) { - debugRegSync := mem(debugCnt) - BoringUtils.addSource(debugRegSync, "DEBUG_REG_SYNC") - } - } - for(r <- io.readPorts){ - val addr_reg = RegNext(r.addr) - r.data := {if(hasZero) Mux(addr_reg===0.U, 0.U, mem(addr_reg)) else mem(addr_reg)} - } - for(w <- io.writePorts){ - when(w.wen){ - mem(w.addr) := w.data - } - } + if (!env.FPGAPlatform) { + + + val mem = Mem(NRPhyRegs, UInt(len.W)) + + for(r <- io.readPorts){ + val addr_reg = RegNext(r.addr) + r.data := {if(hasZero) Mux(addr_reg===0.U, 0.U, mem(addr_reg)) else mem(addr_reg)} + } + + for(w <- 
io.writePorts){ + when(w.wen){ + mem(w.addr) := w.data + } + } - if(!isMemRf){ val debugArchRat = WireInit(VecInit(Seq.fill(32)(0.U(PhyRegIdxWidth.W)))) - BoringUtils.addSink(debugArchRat, if(hasZero) "DEBUG_INI_ARCH_RAT" else "DEBUG_FP_ARCH_RAT") + ExcitingUtils.addSink( + debugArchRat, + if(hasZero) "DEBUG_INI_ARCH_RAT" else "DEBUG_FP_ARCH_RAT", + ExcitingUtils.Debug + ) + + val debugArchReg = WireInit(VecInit(debugArchRat.zipWithIndex.map( + x => if(hasZero && x._2==0) 0.U else mem(x._1) + ))) + ExcitingUtils.addSource( + debugArchReg, + if(hasZero) "DEBUG_INT_ARCH_REG" else "DEBUG_FP_ARCH_REG", + ExcitingUtils.Debug + ) + } else { + + val regfile = Module(new regfile_160x64_10w16r_sim) + + regfile.io.clk := this.clock + regfile.io.gpr := hasZero.B + + regfile.io.wen0 := io.writePorts(0).wen + regfile.io.waddr0 := io.writePorts(0).addr + regfile.io.wdata0 := io.writePorts(0).data + + regfile.io.wen1 := io.writePorts(1).wen + regfile.io.waddr1 := io.writePorts(1).addr + regfile.io.wdata1 := io.writePorts(1).data + + regfile.io.wen2 := io.writePorts(2).wen + regfile.io.waddr2 := io.writePorts(2).addr + regfile.io.wdata2 := io.writePorts(2).data + + regfile.io.wen3 := io.writePorts(3).wen + regfile.io.waddr3 := io.writePorts(3).addr + regfile.io.wdata3 := io.writePorts(3).data + + regfile.io.wen4 := io.writePorts(4).wen + regfile.io.waddr4 := io.writePorts(4).addr + regfile.io.wdata4 := io.writePorts(4).data + + regfile.io.wen5 := io.writePorts(5).wen + regfile.io.waddr5 := io.writePorts(5).addr + regfile.io.wdata5 := io.writePorts(5).data - val debugArchReg = WireInit(VecInit(debugArchRat.zipWithIndex.map(x => if(hasZero && x._2==0) 0.U else mem(x._1)))) - BoringUtils.addSource(debugArchReg, if(hasZero) "DEBUG_INT_ARCH_REG" else "DEBUG_FP_ARCH_REG") + regfile.io.wen6 := io.writePorts(6).wen + regfile.io.waddr6 := io.writePorts(6).addr + regfile.io.wdata6 := io.writePorts(6).data + + regfile.io.wen7 := io.writePorts(7).wen + regfile.io.waddr7 := 
io.writePorts(7).addr + regfile.io.wdata7 := io.writePorts(7).data + + regfile.io.wen8 := false.B //io.writePorts(8).wen + regfile.io.waddr8 := DontCare //io.writePorts(8).addr + regfile.io.wdata8 := DontCare //io.writePorts(8).data + + regfile.io.wen9 := false.B //io.writePorts(9).wen + regfile.io.waddr9 := DontCare //io.writePorts(9).addr + regfile.io.wdata9 := DontCare //io.writePorts(9).data + + + regfile.io.raddr0 := io.readPorts(0).addr + regfile.io.raddr1 := io.readPorts(1).addr + regfile.io.raddr2 := io.readPorts(2).addr + regfile.io.raddr3 := io.readPorts(3).addr + regfile.io.raddr4 := io.readPorts(4).addr + regfile.io.raddr5 := io.readPorts(5).addr + regfile.io.raddr6 := io.readPorts(6).addr + regfile.io.raddr7 := io.readPorts(7).addr + regfile.io.raddr8 := io.readPorts(8).addr + regfile.io.raddr9 := io.readPorts(9).addr + regfile.io.raddr10 := io.readPorts(10).addr + regfile.io.raddr11 := io.readPorts(11).addr + regfile.io.raddr12 := io.readPorts(12).addr + regfile.io.raddr13 := io.readPorts(13).addr + regfile.io.raddr14 := DontCare //io.readPorts(14).addr + regfile.io.raddr15 := DontCare //io.readPorts(15).addr + + io.readPorts(0).data := regfile.io.rdata0 + io.readPorts(1).data := regfile.io.rdata1 + io.readPorts(2).data := regfile.io.rdata2 + io.readPorts(3).data := regfile.io.rdata3 + io.readPorts(4).data := regfile.io.rdata4 + io.readPorts(5).data := regfile.io.rdata5 + io.readPorts(6).data := regfile.io.rdata6 + io.readPorts(7).data := regfile.io.rdata7 + io.readPorts(8).data := regfile.io.rdata8 + io.readPorts(9).data := regfile.io.rdata9 + io.readPorts(10).data := regfile.io.rdata10 + io.readPorts(11).data := regfile.io.rdata11 + io.readPorts(12).data := regfile.io.rdata12 + io.readPorts(13).data := regfile.io.rdata13 } + } + +class regfile_160x64_10w16r_sim extends BlackBox with HasBlackBoxResource { + + val io = IO(new Bundle{ + val clk = Input(Clock()) + val gpr = Input(Bool()) + + // write + val wen0, wen1, wen2, wen3, wen4, wen5, wen6, wen7, 
wen8, wen9 = Input(Bool()) + val waddr0, waddr1, waddr2, waddr3, waddr4, waddr5, waddr6, waddr7, waddr8, waddr9 = Input(UInt(8.W)) + val wdata0, wdata1, wdata2, wdata3, wdata4, wdata5, wdata6, wdata7, wdata8, wdata9 = Input(UInt(64.W)) + + // read + val raddr0, raddr1, raddr2, raddr3, raddr4, raddr5, raddr6, raddr7 = Input(UInt(8.W)) + val raddr8, raddr9, raddr10, raddr11, raddr12, raddr13, raddr14, raddr15 = Input(UInt(8.W)) + val rdata0, rdata1, rdata2, rdata3, rdata4, rdata5, rdata6, rdata7 = Output(UInt(64.W)) + val rdata8, rdata9, rdata10, rdata11, rdata12, rdata13, rdata14, rdata15 = Output(UInt(64.W)) + }) + + val vsrc = "/vsrc/regfile_160x64_10w16r_sim.v" + println(s"Regfile: Using verilog source at: $vsrc") + setResource(vsrc) + +} + diff --git a/src/main/scala/xiangshan/backend/rename/BusyTable.scala b/src/main/scala/xiangshan/backend/rename/BusyTable.scala index 4dd138598fdea656e50b0abdfea4b3e64f124da9..4ad32d8ebe912dbaecca7c3e38d2ac66e5c420b7 100644 --- a/src/main/scala/xiangshan/backend/rename/BusyTable.scala +++ b/src/main/scala/xiangshan/backend/rename/BusyTable.scala @@ -12,8 +12,6 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int) extends XSModule { val allocPregs = Vec(RenameWidth, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) // set preg state to ready (write back regfile + roq walk) val wbPregs = Vec(numWritePorts, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) - // set preg state to busy when replay - val replayPregs = Vec(ReplayWidth, Flipped(ValidIO(UInt(PhyRegIdxWidth.W)))) // read preg state val rfReadAddr = Vec(numReadPorts, Input(UInt(PhyRegIdxWidth.W))) val pregRdy = Vec(numReadPorts, Output(Bool())) @@ -27,17 +25,15 @@ class BusyTable(numReadPorts: Int, numWritePorts: Int) extends XSModule { val wbMask = reqVecToMask(io.wbPregs) val allocMask = reqVecToMask(io.allocPregs) - val replayMask = reqVecToMask(io.replayPregs) val tableAfterWb = table & (~wbMask).asUInt val tableAfterAlloc = tableAfterWb | allocMask - val tableAfterReplay = 
tableAfterAlloc | replayMask for((raddr, rdy) <- io.rfReadAddr.zip(io.pregRdy)){ rdy := !tableAfterWb(raddr) } - table := tableAfterReplay + table := tableAfterAlloc // for((alloc, i) <- io.allocPregs.zipWithIndex){ // when(alloc.valid){ diff --git a/src/main/scala/xiangshan/backend/rename/FreeList.scala b/src/main/scala/xiangshan/backend/rename/FreeList.scala index 7416e8673d5949d66a499beb07920805b383c194..eac9bb60dd6e87e6ad85451dce4b066757015ba0 100644 --- a/src/main/scala/xiangshan/backend/rename/FreeList.scala +++ b/src/main/scala/xiangshan/backend/rename/FreeList.scala @@ -3,7 +3,7 @@ package xiangshan.backend.rename import chisel3._ import chisel3.util._ import xiangshan._ -import utils.XSDebug +import utils.{CircularQueuePtr, HasCircularQueuePtrHelper, XSDebug} import xiangshan.backend.brq.BrqPtr trait HasFreeListConsts extends HasXSParameter { @@ -11,29 +11,19 @@ trait HasFreeListConsts extends HasXSParameter { def PTR_WIDTH = log2Up(FL_SIZE) } -class FreeListPtr extends Bundle with HasFreeListConsts { +class FreeListPtr extends CircularQueuePtr(FreeListPtr.FL_SIZE) { - val flag = Bool() - val value = UInt(PTR_WIDTH.W) +// final def ===(that: FreeListPtr): Bool = { +// (this.value===that.value) && (this.flag===that.flag) +// } - final def +(inc: Bool): FreeListPtr = { - Mux(inc && (value === (FL_SIZE-1).U), - FreeListPtr(!flag, 0.U), - FreeListPtr(flag, value + inc) - ) - } - - final def ===(that: FreeListPtr): Bool = { - (this.value===that.value) && (this.flag===that.flag) - } - - override def toPrintable: Printable = { - p"$flag:$value" - } +// override def toPrintable: Printable = { +// p"$flag:$value" +// } } -object FreeListPtr { +object FreeListPtr extends HasFreeListConsts { def apply(f: Bool, v:UInt): FreeListPtr = { val ptr = Wire(new FreeListPtr) ptr.flag := f @@ -42,7 +32,7 @@ object FreeListPtr { } } -class FreeList extends XSModule with HasFreeListConsts { +class FreeList extends XSModule with HasFreeListConsts with HasCircularQueuePtrHelper{ 
val io = IO(new Bundle() { val redirect = Flipped(ValidIO(new Redirect)) @@ -53,6 +43,7 @@ class FreeList extends XSModule with HasFreeListConsts { // do checkpoints val cpReqs = Vec(RenameWidth, Flipped(ValidIO(new BrqPtr))) + val walk = Flipped(ValidIO(UInt(log2Up(RenameWidth).W))) // dealloc phy regs val deallocReqs = Input(Vec(CommitWidth, Bool())) @@ -66,52 +57,64 @@ class FreeList extends XSModule with HasFreeListConsts { val checkPoints = Reg(Vec(BrqSize, new FreeListPtr())) - def isEmpty(ptr1: FreeListPtr, ptr2: FreeListPtr): Bool = ptr1===ptr2 - // dealloc: commited instructions's 'old_pdest' enqueue - var tailPtrNext = WireInit(tailPtr) - for((deallocValid, deallocReg) <- io.deallocReqs.zip(io.deallocPregs)){ - when(deallocValid){ - freeList(tailPtrNext.value) := deallocReg - XSDebug(p"dealloc preg: $deallocReg\n") + for(i <- 0 until CommitWidth){ + val offset = if(i == 0) 0.U else PopCount(io.deallocReqs.take(i)) + val ptr = tailPtr + offset + val idx = ptr.value + when(io.deallocReqs(i)){ + freeList(idx) := io.deallocPregs(i) + XSDebug(p"dealloc preg: ${io.deallocPregs(i)}\n") } - tailPtrNext = tailPtrNext + deallocValid } + val tailPtrNext = tailPtr + PopCount(io.deallocReqs) tailPtr := tailPtrNext // allocate new pregs to rename instructions - var empty = WireInit(isEmpty(headPtr, tailPtr)) - var headPtrNext = WireInit(headPtr) - for( - (((allocReq, canAlloc),pdest),cpReq) <- - io.allocReqs.zip(io.canAlloc).zip(io.pdests).zip(io.cpReqs) - ){ - canAlloc := !empty - pdest := freeList(headPtrNext.value) - headPtrNext = headPtrNext + (allocReq && canAlloc) - when(cpReq.valid){ - checkPoints(cpReq.bits.value) := headPtrNext - XSDebug(p"do checkPt at BrqIdx=${cpReq.bits.value} headPtr:$headPtrNext\n") + + // number of free regs in freelist + val freeRegs = Wire(UInt()) + // use RegNext for better timing + val hasEnoughRegs = RegNext(freeRegs >= RenameWidth.U, true.B) + XSDebug(p"free regs: $freeRegs\n") + + + val newHeadPtrs = ((0 until RenameWidth) map {i 
=> + if(i == 0) headPtr else headPtr + PopCount(io.allocReqs.take(i)) + }) :+ (headPtr + PopCount(io.allocReqs)) + + for(i <- 0 until RenameWidth){ + val ptr = newHeadPtrs(i) + val idx = ptr.value + io.canAlloc(i) := hasEnoughRegs + io.pdests(i) := freeList(idx) + when(io.cpReqs(i).valid){ + checkPoints(io.cpReqs(i).bits.value) := newHeadPtrs(i+1) + XSDebug(p"do checkPt at BrqIdx=${io.cpReqs(i).bits.value} ${newHeadPtrs(i+1)}\n") } - empty = isEmpty(headPtrNext, tailPtr) - XSDebug(p"req:$allocReq canAlloc:$canAlloc pdest:$pdest headNext:$headPtrNext\n") + XSDebug(p"req:${io.allocReqs(i)} canAlloc:$hasEnoughRegs pdest:${io.pdests(i)}\n") } - - headPtr := Mux(io.redirect.valid, // mispredict or exception happen - Mux(io.redirect.bits.isException || io.redirect.bits.isFlushPipe, // TODO: need check by JiaWei - FreeListPtr(!tailPtrNext.flag, tailPtrNext.value), - Mux(io.redirect.bits.isMisPred, - checkPoints(io.redirect.bits.brTag.value), - headPtrNext // replay - ) - ), - headPtrNext + val headPtrNext = Mux(hasEnoughRegs, newHeadPtrs.last, headPtr) + freeRegs := distanceBetween(tailPtr, headPtrNext) + + // when mispredict or exception happens, reset headPtr to tailPtr (freelist is full). + val resetHeadPtr = io.redirect.valid && (io.redirect.bits.isException || io.redirect.bits.isFlushPipe) + headPtr := Mux(resetHeadPtr, + FreeListPtr(!tailPtrNext.flag, tailPtrNext.value), + Mux(io.walk.valid, headPtr - io.walk.bits, headPtrNext) ) XSDebug(p"head:$headPtr tail:$tailPtr\n") XSDebug(io.redirect.valid, p"redirect: brqIdx=${io.redirect.bits.brTag.value}\n") - + val enableFreelistCheck = false + if(env.EnableDebug && enableFreelistCheck){ + for( i <- 0 until FL_SIZE){ + for(j <- i+1 until FL_SIZE){ + assert(freeList(i) != freeList(j), s"Found same entry in freelist! 
(i=$i j=$j)") + } + } + } } diff --git a/src/main/scala/xiangshan/backend/rename/Rename.scala b/src/main/scala/xiangshan/backend/rename/Rename.scala index fba2acfcb414acee7f93b69aa01d32aefe072ac8..b64aa07d3d03881a69894af988dcda994cb42636 100644 --- a/src/main/scala/xiangshan/backend/rename/Rename.scala +++ b/src/main/scala/xiangshan/backend/rename/Rename.scala @@ -5,22 +5,22 @@ import chisel3.util._ import xiangshan._ import utils._ +class RenameBypassInfo extends XSBundle { + val lsrc1_bypass = MixedVec(List.tabulate(RenameWidth-1)(i => UInt((i+1).W))) + val lsrc2_bypass = MixedVec(List.tabulate(RenameWidth-1)(i => UInt((i+1).W))) + val lsrc3_bypass = MixedVec(List.tabulate(RenameWidth-1)(i => UInt((i+1).W))) + val ldest_bypass = MixedVec(List.tabulate(RenameWidth-1)(i => UInt((i+1).W))) +} + class Rename extends XSModule { val io = IO(new Bundle() { val redirect = Flipped(ValidIO(new Redirect)) val roqCommits = Vec(CommitWidth, Flipped(ValidIO(new RoqCommit))) - val wbIntResults = Vec(NRIntWritePorts, Flipped(ValidIO(new ExuOutput))) - val wbFpResults = Vec(NRFpWritePorts, Flipped(ValidIO(new ExuOutput))) - val intRfReadAddr = Vec(NRIntReadPorts + NRMemReadPorts, Input(UInt(PhyRegIdxWidth.W))) - val fpRfReadAddr = Vec(NRFpReadPorts, Input(UInt(PhyRegIdxWidth.W))) - val intPregRdy = Vec(NRIntReadPorts + NRMemReadPorts, Output(Bool())) - val fpPregRdy = Vec(NRFpReadPorts, Output(Bool())) - // set preg to busy when replay - val replayPregReq = Vec(ReplayWidth, Input(new ReplayPregReq)) // from decode buffer val in = Vec(RenameWidth, Flipped(DecoupledIO(new CfCtrl))) // to dispatch1 val out = Vec(RenameWidth, DecoupledIO(new MicroOp)) + val renameBypass = Output(new RenameBypassInfo) }) def printRenameInfo(in: DecoupledIO[CfCtrl], out: DecoupledIO[MicroOp]) = { @@ -43,8 +43,6 @@ class Rename extends XSModule { val fpFreeList, intFreeList = Module(new FreeList).io val fpRat = Module(new RenameTable(float = true)).io val intRat = Module(new RenameTable(float = 
false)).io - val fpBusyTable = Module(new BusyTable(NRFpReadPorts, NRFpWritePorts)).io - val intBusyTable = Module(new BusyTable(NRIntReadPorts+NRMemReadPorts, NRIntWritePorts)).io fpFreeList.redirect := io.redirect intFreeList.redirect := io.redirect @@ -52,12 +50,15 @@ class Rename extends XSModule { val flush = io.redirect.valid && (io.redirect.bits.isException || io.redirect.bits.isFlushPipe) // TODO: need check by JiaWei fpRat.flush := flush intRat.flush := flush - fpBusyTable.flush := flush - intBusyTable.flush := flush def needDestReg[T <: CfCtrl](fp: Boolean, x: T): Bool = { {if(fp) x.ctrl.fpWen else x.ctrl.rfWen && (x.ctrl.ldest =/= 0.U)} } + val walkValid = Cat(io.roqCommits.map(_.valid)).orR && io.roqCommits(0).bits.isWalk + fpFreeList.walk.valid := walkValid + intFreeList.walk.valid := walkValid + fpFreeList.walk.bits := PopCount(io.roqCommits.map(c => c.valid && needDestReg(true, c.bits.uop))) + intFreeList.walk.bits := PopCount(io.roqCommits.map(c => c.valid && needDestReg(false, c.bits.uop))) val uops = Wire(Vec(RenameWidth, new MicroOp)) @@ -70,11 +71,12 @@ class Rename extends XSModule { uop.roqIdx := DontCare uop.diffTestDebugLrScValid := DontCare uop.debugInfo := DontCare - uop.lsroqIdx := DontCare uop.lqIdx := DontCare uop.sqIdx := DontCare }) + val needFpDest = Wire(Vec(RenameWidth, Bool())) + val needIntDest = Wire(Vec(RenameWidth, Bool())) var lastReady = WireInit(io.out(0).ready) // debug assert val outRdy = Cat(io.out.map(_.ready)) @@ -87,17 +89,17 @@ class Rename extends XSModule { val inValid = io.in(i).valid // alloc a new phy reg - val needFpDest = inValid && needDestReg(fp = true, io.in(i).bits) - val needIntDest = inValid && needDestReg(fp = false, io.in(i).bits) - fpFreeList.allocReqs(i) := needFpDest && lastReady - intFreeList.allocReqs(i) := needIntDest && lastReady + needFpDest(i) := inValid && needDestReg(fp = true, io.in(i).bits) + needIntDest(i) := inValid && needDestReg(fp = false, io.in(i).bits) + fpFreeList.allocReqs(i) := 
needFpDest(i) && lastReady + intFreeList.allocReqs(i) := needIntDest(i) && lastReady val fpCanAlloc = fpFreeList.canAlloc(i) val intCanAlloc = intFreeList.canAlloc(i) val this_can_alloc = Mux( - needIntDest, + needIntDest(i), intCanAlloc, Mux( - needFpDest, + needFpDest(i), fpCanAlloc, true.B ) @@ -112,7 +114,7 @@ class Rename extends XSModule { lastReady = io.in(i).ready - uops(i).pdest := Mux(needIntDest, + uops(i).pdest := Mux(needIntDest(i), intFreeList.pdests(i), Mux( uops(i).ctrl.ldest===0.U && uops(i).ctrl.rfWen, @@ -127,7 +129,6 @@ class Rename extends XSModule { def writeRat(fp: Boolean) = { val rat = if(fp) fpRat else intRat val freeList = if(fp) fpFreeList else intFreeList - val busyTable = if(fp) fpBusyTable else intBusyTable // speculative inst write val specWen = freeList.allocReqs(i) && freeList.canAlloc(i) // walk back write @@ -155,9 +156,6 @@ class Rename extends XSModule { freeList.deallocReqs(i) := rat.archWritePorts(i).wen freeList.deallocPregs(i) := io.roqCommits(i).bits.uop.old_pdest - // set phy reg status to busy - busyTable.allocPregs(i).valid := specWen - busyTable.allocPregs(i).bits := freeList.pdests(i) } writeRat(fp = false) @@ -191,27 +189,28 @@ class Rename extends XSModule { uops(i).old_pdest := Mux(uops(i).ctrl.rfWen, intOldPdest, fpOldPdest) } - - def updateBusyTable(fp: Boolean) = { - val wbResults = if(fp) io.wbFpResults else io.wbIntResults - val busyTable = if(fp) fpBusyTable else intBusyTable - for((wb, setPhyRegRdy) <- wbResults.zip(busyTable.wbPregs)){ - setPhyRegRdy.valid := wb.valid && needDestReg(fp, wb.bits.uop) - setPhyRegRdy.bits := wb.bits.uop.pdest - } - } - - updateBusyTable(false) - updateBusyTable(true) - - intBusyTable.rfReadAddr <> io.intRfReadAddr - intBusyTable.pregRdy <> io.intPregRdy - for(i <- io.replayPregReq.indices){ - intBusyTable.replayPregs(i).valid := io.replayPregReq(i).isInt - fpBusyTable.replayPregs(i).valid := io.replayPregReq(i).isFp - intBusyTable.replayPregs(i).bits := 
io.replayPregReq(i).preg - fpBusyTable.replayPregs(i).bits := io.replayPregReq(i).preg + // We don't bypass the old_pdest from valid instructions with the same ldest currently in rename stage. + // Instead, we determine whether there're some dependences between the valid instructions. + for (i <- 1 until RenameWidth) { + io.renameBypass.lsrc1_bypass(i-1) := Cat((0 until i).map(j => { + val fpMatch = needFpDest(j) && io.in(i).bits.ctrl.src1Type === SrcType.fp + val intMatch = needIntDest(j) && io.in(i).bits.ctrl.src1Type === SrcType.reg + (fpMatch || intMatch) && io.in(j).bits.ctrl.ldest === io.in(i).bits.ctrl.lsrc1 + }).reverse) + io.renameBypass.lsrc2_bypass(i-1) := Cat((0 until i).map(j => { + val fpMatch = needFpDest(j) && io.in(i).bits.ctrl.src2Type === SrcType.fp + val intMatch = needIntDest(j) && io.in(i).bits.ctrl.src2Type === SrcType.reg + (fpMatch || intMatch) && io.in(j).bits.ctrl.ldest === io.in(i).bits.ctrl.lsrc2 + }).reverse) + io.renameBypass.lsrc3_bypass(i-1) := Cat((0 until i).map(j => { + val fpMatch = needFpDest(j) && io.in(i).bits.ctrl.src3Type === SrcType.fp + val intMatch = needIntDest(j) && io.in(i).bits.ctrl.src3Type === SrcType.reg + (fpMatch || intMatch) && io.in(j).bits.ctrl.ldest === io.in(i).bits.ctrl.lsrc3 + }).reverse) + io.renameBypass.ldest_bypass(i-1) := Cat((0 until i).map(j => { + val fpMatch = needFpDest(j) && needFpDest(i) + val intMatch = needIntDest(j) && needIntDest(i) + (fpMatch || intMatch) && io.in(j).bits.ctrl.ldest === io.in(i).bits.ctrl.ldest + }).reverse) } - fpBusyTable.rfReadAddr <> io.fpRfReadAddr - fpBusyTable.pregRdy <> io.fpPregRdy } diff --git a/src/main/scala/xiangshan/backend/rename/RenameTable.scala b/src/main/scala/xiangshan/backend/rename/RenameTable.scala index 5884aeb50e9303e6ce95a6fc85cf6e19e66e20a2..6409d90b8068d87141ca118c25769b42607fe4c6 100644 --- a/src/main/scala/xiangshan/backend/rename/RenameTable.scala +++ b/src/main/scala/xiangshan/backend/rename/RenameTable.scala @@ -2,7 +2,6 @@ package 
xiangshan.backend.rename import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils import xiangshan._ class RatReadPort extends XSBundle { @@ -36,9 +35,9 @@ class RenameTable(float: Boolean) extends XSModule { for((r, i) <- io.readPorts.zipWithIndex){ r.rdata := spec_table(r.addr) - for(w <- io.specWritePorts.take(i/{if(float) 4 else 3})){ // bypass - when(w.wen && (w.addr === r.addr)){ r.rdata := w.wdata } - } + // for(w <- io.specWritePorts.take(i/{if(float) 4 else 3})){ // bypass + // when(w.wen && (w.addr === r.addr)){ r.rdata := w.wdata } + // } } for(w <- io.archWritePorts){ @@ -52,5 +51,11 @@ class RenameTable(float: Boolean) extends XSModule { } } - BoringUtils.addSource(arch_table, if(float) "DEBUG_FP_ARCH_RAT" else "DEBUG_INI_ARCH_RAT") -} \ No newline at end of file + if (!env.FPGAPlatform) { + ExcitingUtils.addSource( + arch_table, + if(float) "DEBUG_FP_ARCH_RAT" else "DEBUG_INI_ARCH_RAT", + ExcitingUtils.Debug + ) + } +} diff --git a/src/main/scala/xiangshan/backend/roq/Roq.scala b/src/main/scala/xiangshan/backend/roq/Roq.scala index dd052b67f68cb8434d740df2f86fad692c734a19..fb6088d8dabb38726a41ff3815dea11093097169 100644 --- a/src/main/scala/xiangshan/backend/roq/Roq.scala +++ b/src/main/scala/xiangshan/backend/roq/Roq.scala @@ -5,10 +5,15 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ -import chisel3.util.experimental.BoringUtils import xiangshan.backend.LSUOpType -import xiangshan.backend.decode.isa.Privileged.WFI - +import xiangshan.backend.fu.fpu.Fflags +object roqDebugId extends Function0[Integer] { + var x = 0 + def apply(): Integer = { + x = x + 1 + return x + } +} class RoqPtr extends CircularQueuePtr(RoqPtr.RoqSize) with HasCircularQueuePtrHelper { def needFlush(redirect: Valid[Redirect]): Bool = { @@ -25,33 +30,65 @@ object RoqPtr extends HasXSParameter { } } +class RoqCSRIO extends XSBundle { + val intrBitSet = Input(Bool()) + val trapTarget = Input(UInt(VAddrBits.W)) -class Roq 
extends XSModule with HasCircularQueuePtrHelper { + val fflags = Output(new Fflags) + val dirty_fs = Output(Bool()) +} + +class Roq(numWbPorts: Int) extends XSModule with HasCircularQueuePtrHelper { val io = IO(new Bundle() { val brqRedirect = Input(Valid(new Redirect)) val memRedirect = Input(Valid(new Redirect)) - val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) - val roqIdxs = Output(Vec(RenameWidth, new RoqPtr)) + val enq = new Bundle { + val canAccept = Output(Bool()) + val isEmpty = Output(Bool()) + val extraWalk = Vec(RenameWidth, Input(Bool())) + val req = Vec(RenameWidth, Flipped(ValidIO(new MicroOp))) + val resp = Vec(RenameWidth, Output(new RoqPtr)) + } val redirect = Output(Valid(new Redirect)) val exception = Output(new MicroOp) // exu + brq - val exeWbResults = Vec(exuParameters.ExuCnt + 1, Flipped(ValidIO(new ExuOutput))) + val exeWbResults = Vec(numWbPorts, Flipped(ValidIO(new ExuOutput))) val commits = Vec(CommitWidth, Valid(new RoqCommit)) val bcommit = Output(UInt(BrTagWidth.W)) - val commitRoqIndex = Output(Valid(new RoqPtr)) val roqDeqPtr = Output(new RoqPtr) + val csr = new RoqCSRIO }) - val numWbPorts = io.exeWbResults.length - - val microOp = Mem(RoqSize, new MicroOp) + // instvalid field val valid = RegInit(VecInit(List.fill(RoqSize)(false.B))) - val flag = RegInit(VecInit(List.fill(RoqSize)(false.B))) + + // status val writebacked = Reg(Vec(RoqSize, Bool())) - val exuData = Reg(Vec(RoqSize, UInt(XLEN.W)))//for debug - val exuDebug = Reg(Vec(RoqSize, new DebugBundle))//for debug + // data for redirect, exception, etc. 
+ val flagBkup = RegInit(VecInit(List.fill(RoqSize)(false.B))) + val exuFflags = Mem(RoqSize, new Fflags) + + // uop field used when commit + // flushPipe (wb) (commit) (used in roq) + // lidx (wb) (commit) + // sidx (wb) (commit) + // uop.ctrl.commitType (wb) (commit) (L/S) + // exceptionVec (wb) (commit) + // roqIdx (dispatch) (commit) + // crossPageIPFFix (dispatch) (commit) + + // uop field used when walk + // ctrl.fpWen (dispatch) (walk) + // ctrl.rfWen (dispatch) (walk) + // ldest (dispatch) (walk) + + // data for debug + val microOp = Mem(RoqSize, new MicroOp) + val debug_exuData = Reg(Vec(RoqSize, UInt(XLEN.W)))//for debug + val debug_exuDebug = Reg(Vec(RoqSize, new DebugBundle))//for debug + // ptr val enqPtrExt = RegInit(0.U.asTypeOf(new RoqPtr)) val deqPtrExt = RegInit(0.U.asTypeOf(new RoqPtr)) val walkPtrExt = Reg(new RoqPtr) @@ -63,42 +100,71 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { val isFull = enqPtr === deqPtr && enqPtrExt.flag =/= deqPtrExt.flag val notFull = !isFull + val emptyEntries = RoqSize.U - distanceBetween(enqPtrExt, deqPtrExt) + val s_idle :: s_walk :: s_extrawalk :: Nil = Enum(3) val state = RegInit(s_idle) io.roqDeqPtr := deqPtrExt + // common signal + val enqPtrValPlus = Wire(Vec(RenameWidth, UInt(log2Up(RoqSize).W))) + val enqPtrFlagPlus = Wire(Vec(RenameWidth, Bool())) + for (i <- 0 until RenameWidth) { + val offset = PopCount(io.enq.req.map(_.valid).take(i)) + val roqIdxExt = enqPtrExt + offset + enqPtrValPlus(i) := roqIdxExt.value + enqPtrFlagPlus(i) := roqIdxExt.flag + } + + val deqPtrExtPlus = Wire(Vec(RenameWidth, UInt(log2Up(RoqSize).W))) + for(i <- 0 until CommitWidth){ + val roqIdxExt = deqPtrExt + i.U + deqPtrExtPlus(i) := roqIdxExt.value + } + // Dispatch - val noSpecEnq = io.dp1Req.map(i => i.bits.ctrl.noSpecExec) - val hasNoSpec = RegInit(false.B) - when(isEmpty){ hasNoSpec:= false.B } - val validDispatch = io.dp1Req.map(_.valid) - val timer = GTimer() - XSDebug("(ready, valid): ") + val 
hasBlockBackward = RegInit(false.B) + val hasNoSpecExec = RegInit(false.B) + // When blockBackward instruction leaves Roq (commit or walk), hasBlockBackward should be set to false.B + val blockBackwardLeave = Cat(io.commits.map(c => c.valid && c.bits.uop.ctrl.blockBackward)).orR || io.redirect.valid + when(blockBackwardLeave){ hasBlockBackward:= false.B } + // When noSpecExec instruction commits (it should not be walked except when it has not entered Roq), + // hasNoSpecExec should be set to false.B + val noSpecExecCommit = Cat(io.commits.map(c => c.valid && !c.bits.isWalk && c.bits.uop.ctrl.noSpecExec)).orR || io.redirect.valid + when(noSpecExecCommit){ hasNoSpecExec:= false.B } + // Assertion on that noSpecExec should never be walked since it's the only instruction in Roq. + // Extra walk should be ok since noSpecExec has not enter Roq. + val walkNoSpecExec = Cat(io.commits.map(c => c.valid && c.bits.isWalk && c.bits.uop.ctrl.noSpecExec)).orR + XSError(state =/= s_extrawalk && walkNoSpecExec, "noSpecExec should not walk\n") + + val validDispatch = io.enq.req.map(_.valid) for (i <- 0 until RenameWidth) { val offset = PopCount(validDispatch.take(i)) val roqIdxExt = enqPtrExt + offset val roqIdx = roqIdxExt.value - when(io.dp1Req(i).fire()){ - microOp(roqIdx) := io.dp1Req(i).bits - valid(roqIdx) := true.B - flag(roqIdx) := roqIdxExt.flag - writebacked(roqIdx) := false.B - when(noSpecEnq(i)){ hasNoSpec := true.B } + when(io.enq.req(i).valid) { + microOp(roqIdx) := io.enq.req(i).bits + when(io.enq.req(i).bits.ctrl.blockBackward) { + hasBlockBackward := true.B + } + when(io.enq.req(i).bits.ctrl.noSpecExec) { + hasNoSpecExec := true.B + } } - io.dp1Req(i).ready := (notFull && !valid(roqIdx) && state === s_idle) && - (!noSpecEnq(i) || isEmpty) && - !hasNoSpec - io.roqIdxs(i) := roqIdxExt - XSDebug(false, true.B, "(%d, %d) ", io.dp1Req(i).ready, io.dp1Req(i).valid) + io.enq.resp(i) := roqIdxExt } - XSDebug(false, true.B, "\n") - val firedDispatch = 
Cat(io.dp1Req.map(_.fire())) + val validEntries = distanceBetween(enqPtrExt, deqPtrExt) + val firedDispatch = Cat(io.enq.req.map(_.valid)) + io.enq.canAccept := (validEntries <= (RoqSize - RenameWidth).U) && !hasBlockBackward + io.enq.isEmpty := isEmpty + XSDebug(p"(ready, valid): ${io.enq.canAccept}, ${Binary(firedDispatch)}\n") + val dispatchCnt = PopCount(firedDispatch) - when(firedDispatch.orR){ - enqPtrExt := enqPtrExt + dispatchCnt + enqPtrExt := enqPtrExt + dispatchCnt + when (firedDispatch.orR) { XSInfo("dispatched %d insts\n", dispatchCnt) } @@ -109,68 +175,67 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { when(io.exeWbResults(i).fire()){ val wbIdxExt = io.exeWbResults(i).bits.uop.roqIdx val wbIdx = wbIdxExt.value - writebacked(wbIdx) := true.B microOp(wbIdx).cf.exceptionVec := io.exeWbResults(i).bits.uop.cf.exceptionVec - microOp(wbIdx).lsroqIdx := io.exeWbResults(i).bits.uop.lsroqIdx microOp(wbIdx).lqIdx := io.exeWbResults(i).bits.uop.lqIdx microOp(wbIdx).sqIdx := io.exeWbResults(i).bits.uop.sqIdx microOp(wbIdx).ctrl.flushPipe := io.exeWbResults(i).bits.uop.ctrl.flushPipe microOp(wbIdx).diffTestDebugLrScValid := io.exeWbResults(i).bits.uop.diffTestDebugLrScValid + debug_exuData(wbIdx) := io.exeWbResults(i).bits.data + debug_exuDebug(wbIdx) := io.exeWbResults(i).bits.debug microOp(wbIdx).debugInfo.issueTime := io.exeWbResults(i).bits.uop.debugInfo.issueTime microOp(wbIdx).debugInfo.writebackTime := io.exeWbResults(i).bits.uop.debugInfo.writebackTime - exuData(wbIdx) := io.exeWbResults(i).bits.data - exuDebug(wbIdx) := io.exeWbResults(i).bits.debug - val debugUop = microOp(wbIdx) + val debug_Uop = microOp(wbIdx) XSInfo(true.B, - p"writebacked pc 0x${Hexadecimal(debugUop.cf.pc)} wen ${debugUop.ctrl.rfWen} " + - p"data 0x${Hexadecimal(io.exeWbResults(i).bits.data)} ldst ${debugUop.ctrl.ldest} pdst ${debugUop.ctrl.ldest} " + + p"writebacked pc 0x${Hexadecimal(debug_Uop.cf.pc)} wen ${debug_Uop.ctrl.rfWen} " + + p"data 
0x${Hexadecimal(io.exeWbResults(i).bits.data)} ldst ${debug_Uop.ctrl.ldest} pdst ${debug_Uop.ctrl.ldest} " + p"skip ${io.exeWbResults(i).bits.debug.isMMIO} roqIdx: ${wbIdxExt}\n" ) } } - // roq redirect only used for exception - val intrBitSet = WireInit(false.B) - ExcitingUtils.addSink(intrBitSet, "intrBitSetIDU") - val trapTarget = WireInit(0.U(VAddrBits.W)) - ExcitingUtils.addSink(trapTarget, "trapTarget") - + // Interrupt val deqUop = microOp(deqPtr) val deqPtrWritebacked = writebacked(deqPtr) && valid(deqPtr) - val intrEnable = intrBitSet && !isEmpty && !hasNoSpec && + val intrEnable = io.csr.intrBitSet && !isEmpty && !hasNoSpecExec && deqUop.ctrl.commitType =/= CommitType.STORE && deqUop.ctrl.commitType =/= CommitType.LOAD// TODO: wanna check why has hasCsr(hasNoSpec) val exceptionEnable = deqPtrWritebacked && Cat(deqUop.cf.exceptionVec).orR() val isFlushPipe = deqPtrWritebacked && deqUop.ctrl.flushPipe io.redirect := DontCare io.redirect.valid := (state === s_idle) && (intrEnable || exceptionEnable || isFlushPipe)// TODO: add fence flush to flush the whole pipe io.redirect.bits.isException := intrEnable || exceptionEnable - io.redirect.bits.isFlushPipe := isFlushPipe - io.redirect.bits.target := Mux(isFlushPipe, deqUop.cf.pc + 4.U, trapTarget) + // reuse isFlushPipe to represent interrupt for CSR + io.redirect.bits.isFlushPipe := isFlushPipe || intrEnable + io.redirect.bits.target := Mux(isFlushPipe, deqUop.cf.pc + 4.U, io.csr.trapTarget) io.exception := deqUop - XSDebug(io.redirect.valid, "generate redirect: pc 0x%x intr %d excp %d flushpp %d target:0x%x Traptarget 0x%x exceptionVec %b\n", io.exception.cf.pc, intrEnable, exceptionEnable, isFlushPipe, io.redirect.bits.target, trapTarget, Cat(microOp(deqPtr).cf.exceptionVec)) + XSDebug(io.redirect.valid, + "generate redirect: pc 0x%x intr %d excp %d flushpp %d target:0x%x Traptarget 0x%x exceptionVec %b\n", + io.exception.cf.pc, intrEnable, exceptionEnable, isFlushPipe, io.redirect.bits.target, 
io.csr.trapTarget, + Cat(microOp(deqPtr).cf.exceptionVec)) // Commit uop to Rename (walk) + val walkCounter = Reg(UInt(log2Up(RoqSize).W)) val shouldWalkVec = Wire(Vec(CommitWidth, Bool())) - val walkPtrMatchVec = Wire(Vec(CommitWidth, Bool())) val walkPtrVec = Wire(Vec(CommitWidth, new RoqPtr)) for(i <- shouldWalkVec.indices){ walkPtrVec(i) := walkPtrExt - i.U - walkPtrMatchVec(i) := walkPtrVec(i) === walkTgtExt - if(i == 0) shouldWalkVec(i) := !walkPtrMatchVec(i) - else shouldWalkVec(i) := shouldWalkVec(i-1) && !walkPtrMatchVec(i) + shouldWalkVec(i) := i.U < walkCounter } - val walkFinished = Cat(walkPtrMatchVec).orR() + val walkFinished = walkCounter <= CommitWidth.U //&& // walk finish in this cycle + //!io.brqRedirect.valid // no new redirect comes and update walkptr // extra space is used weh roq has no enough space, but mispredict recovery needs such info to walk regmap val needExtraSpaceForMPR = WireInit(VecInit( - List.tabulate(RenameWidth)(i => io.brqRedirect.valid && io.dp1Req(i).valid && !io.dp1Req(i).ready) + List.tabulate(RenameWidth)(i => io.brqRedirect.valid && io.enq.extraWalk(i)) )) val extraSpaceForMPR = Reg(Vec(RenameWidth, new MicroOp)) val usedSpaceForMPR = Reg(Vec(RenameWidth, Bool())) val storeCommitVec = WireInit(VecInit(Seq.fill(CommitWidth)(false.B))) val cfiCommitVec = WireInit(VecInit(Seq.fill(CommitWidth)(false.B))) + // wiring to csr + val fflags = WireInit(0.U.asTypeOf(new Fflags)) + val dirty_fs = WireInit(false.B) for(i <- 0 until CommitWidth){ io.commits(i) := DontCare switch(state){ @@ -190,28 +255,29 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { cfiCommitVec(i) := io.commits(i).valid && !commitUop.cf.brUpdate.pd.notCFI - when(io.commits(i).valid){v := false.B} + val commitFflags = exuFflags(commitIdx) + when(io.commits(i).valid){ + when(commitFflags.asUInt.orR()){ + // update fflags + fflags := exuFflags(commitIdx) + } + when(commitUop.ctrl.fpWen){ + // set fs to dirty + dirty_fs := true.B + } + } + 
XSInfo(io.commits(i).valid, - "retired pc %x wen %d ldest %d pdest %x old_pdest %x data %x\n", + "retired pc %x wen %d ldest %d pdest %x old_pdest %x data %x fflags: %b\n", commitUop.cf.pc, commitUop.ctrl.rfWen, commitUop.ctrl.ldest, commitUop.pdest, commitUop.old_pdest, - exuData(commitIdx) + debug_exuData(commitIdx), + exuFflags(commitIdx).asUInt ) - when (io.commits(i).valid) { - printf( - "retired pc %x wen %d ldest %d pdest %x old_pdest %x data %x\n", - commitUop.cf.pc, - commitUop.ctrl.rfWen, - commitUop.ctrl.ldest, - commitUop.pdest, - commitUop.old_pdest, - exuData(commitIdx) - ) - } - XSInfo(io.commits(i).valid && exuDebug(commitIdx).isMMIO, + XSInfo(io.commits(i).valid && debug_exuDebug(commitIdx).isMMIO, "difftest skiped pc0x%x\n", commitUop.cf.pc ) @@ -230,7 +296,7 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { walkUop.cf.pc, walkUop.ctrl.rfWen, walkUop.ctrl.ldest, - exuData(idx) + debug_exuData(idx) ) } @@ -250,69 +316,150 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { io.commits(i).bits.isWalk := state =/= s_idle } + io.csr.fflags := fflags + io.csr.dirty_fs := dirty_fs + val validCommit = io.commits.map(_.valid) + val commitCnt = PopCount(validCommit) when(state===s_walk) { //exit walk state when all roq entry is commited when(walkFinished) { state := s_idle } walkPtrExt := walkPtrExt - CommitWidth.U - // ringBufferWalkExtended := ringBufferWalkExtended - validCommit - XSInfo("rolling back: enqPtr %d deqPtr %d walk %d:%d\n", enqPtr, deqPtr, walkPtrExt.flag, walkPtr) + walkCounter := walkCounter - commitCnt + XSInfo("rolling back: enqPtr %d deqPtr %d walk %d:%d walkcnt %d\n", enqPtr, deqPtr, walkPtrExt.flag, walkPtr, walkCounter) } // move tail ptr - val commitCnt = PopCount(validCommit) when(state === s_idle){ deqPtrExt := deqPtrExt + commitCnt } val retireCounter = Mux(state === s_idle, commitCnt, 0.U) XSInfo(retireCounter > 0.U, "retired %d insts\n", retireCounter) - val commitOffset = PriorityEncoder((validCommit :+ 
false.B).map(!_)) - io.commitRoqIndex.valid := io.commits(0).valid && !io.commits(0).bits.isWalk - io.commitRoqIndex.bits := deqPtrExt + commitOffset // commit branch to brq io.bcommit := PopCount(cfiCommitVec) - val hasWFI = io.commits.map(c => c.valid && state===s_idle && c.bits.uop.cf.instr===WFI).reduce(_||_) - ExcitingUtils.addSource(hasWFI, "isWFI") - // when redirect, walk back roq entries when(io.brqRedirect.valid){ // TODO: need check if consider exception redirect? state := s_walk - walkPtrExt := Mux(state === s_walk && !walkFinished, walkPtrExt - CommitWidth.U, Mux(state === s_extrawalk, walkPtrExt, enqPtrExt - 1.U + dispatchCnt)) - walkTgtExt := io.brqRedirect.bits.roqIdx + val nextEnqPtr = (enqPtrExt - 1.U) + dispatchCnt + walkPtrExt := Mux(state === s_walk, + walkPtrExt - Mux(walkFinished, walkCounter, CommitWidth.U), + Mux(state === s_extrawalk, walkPtrExt, nextEnqPtr)) + // walkTgtExt := io.brqRedirect.bits.roqIdx + val currentWalkPtr = Mux(state === s_walk || state === s_extrawalk, walkPtrExt, nextEnqPtr) + walkCounter := distanceBetween(currentWalkPtr, io.brqRedirect.bits.roqIdx) - Mux(state === s_walk, commitCnt, 0.U) enqPtrExt := io.brqRedirect.bits.roqIdx + 1.U } // no enough space for walk, allocate extra space when(needExtraSpaceForMPR.asUInt.orR && io.brqRedirect.valid){ usedSpaceForMPR := needExtraSpaceForMPR - (0 until RenameWidth).foreach(i => extraSpaceForMPR(i) := io.dp1Req(i).bits) + (0 until RenameWidth).foreach(i => extraSpaceForMPR(i) := io.enq.req(i).bits) state := s_extrawalk XSDebug("roq full, switched to s_extrawalk. 
needExtraSpaceForMPR: %b\n", needExtraSpaceForMPR.asUInt) } - // when rollback, reset writebacked entry to valid - when(io.memRedirect.valid) { // TODO: opt timing - for (i <- 0 until RoqSize) { - val recRoqIdx = RoqPtr(flag(i), i.U) - when (valid(i) && isAfter(recRoqIdx, io.memRedirect.bits.roqIdx)) { - writebacked(i) := false.B - } - } - } - // when exception occurs, cancels all when (io.redirect.valid) { // TODO: need check for flushPipe enqPtrExt := 0.U.asTypeOf(new RoqPtr) deqPtrExt := 0.U.asTypeOf(new RoqPtr) + } + + // instvalid field + + // write + // enqueue logic writes 6 valid + for (i <- 0 until RenameWidth) { + when(io.enq.req(i).fire()){ + valid(enqPtrValPlus(i)) := true.B + } + } + // dequeue/walk logic writes 6 valid, dequeue and walk will not happen at the same time + for(i <- 0 until CommitWidth){ + switch(state){ + is(s_idle){ + when(io.commits(i).valid){valid(deqPtrExtPlus(i)) := false.B} + } + is(s_walk){ + val idx = walkPtrVec(i).value + when(shouldWalkVec(i)){ + valid(idx) := false.B + } + } + } + } + + // read + // enqueue logic reads 6 valid + // dequeue/walk logic reads 6 valid, dequeue and walk will not happen at the same time + // rollback reads all valid? is it necessary? 
+ + // reset + // when exception, reset all valid to false + when (io.redirect.valid) { for (i <- 0 until RoqSize) { valid(i) := false.B } } + // status field: writebacked + + // write + // enqueue logic set 6 writebacked to false + for (i <- 0 until RenameWidth) { + when(io.enq.req(i).fire()){ + writebacked(enqPtrValPlus(i)) := false.B + } + } + // writeback logic set numWbPorts writebacked to true + for(i <- 0 until numWbPorts){ + when(io.exeWbResults(i).fire()){ + val wbIdxExt = io.exeWbResults(i).bits.uop.roqIdx + val wbIdx = wbIdxExt.value + writebacked(wbIdx) := true.B + } + } + // rollback: write all + // when rollback, reset writebacked entry to valid + // when(io.memRedirect.valid) { // TODO: opt timing + // for (i <- 0 until RoqSize) { + // val recRoqIdx = RoqPtr(flagBkup(i), i.U) + // when (valid(i) && isAfter(recRoqIdx, io.memRedirect.bits.roqIdx)) { + // writebacked(i) := false.B + // } + // } + // } + + // read + // deqPtrWritebacked + // gen io.commits(i).valid read 6 (CommitWidth) + + // flagBkup + // write: update when enqueue + // enqueue logic set 6 flagBkup at most + for (i <- 0 until RenameWidth) { + when(io.enq.req(i).fire()){ + flagBkup(enqPtrValPlus(i)) := enqPtrFlagPlus(i) + } + } + // read: used in rollback logic + // all flagBkup will be used + + // exuFflags + // write: writeback logic set numWbPorts exuFflags + for(i <- 0 until numWbPorts){ + when(io.exeWbResults(i).fire()){ + val wbIdxExt = io.exeWbResults(i).bits.uop.roqIdx + val wbIdx = wbIdxExt.value + exuFflags(wbIdx) := io.exeWbResults(i).bits.fflags + } + } + // read: used in commit logic + // read CommitWidth exuFflags + // debug info XSDebug(p"enqPtr ${enqPtrExt} deqPtr ${deqPtrExt}\n") XSDebug("") @@ -337,7 +484,7 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { XSPerf("commitInstrLoad", PopCount(io.commits.map(c => c.valid && !c.bits.isWalk && c.bits.uop.ctrl.commitType === CommitType.LOAD))) XSPerf("commitInstrStore", PopCount(io.commits.map(c => c.valid && 
!c.bits.isWalk && c.bits.uop.ctrl.commitType === CommitType.STORE))) XSPerf("writeback", PopCount((0 until RoqSize).map(i => valid(i) && writebacked(i)))) - XSPerf("enqInstr", PopCount(io.dp1Req.map(_.fire()))) + // XSPerf("enqInstr", PopCount(io.enq.map(_.fire()))) XSPerf("walkInstr", PopCount(io.commits.map(c => c.valid && c.bits.isWalk))) XSPerf("walkCycle", state === s_walk || state === s_extrawalk) val deqNotWritebacked = valid(deqPtr) && !writebacked(deqPtr) @@ -347,70 +494,72 @@ class Roq extends XSModule with HasCircularQueuePtrHelper { XSPerf("waitLoadCycle", deqNotWritebacked && deqUopCommitType === CommitType.LOAD) XSPerf("waitStoreCycle", deqNotWritebacked && deqUopCommitType === CommitType.STORE) - //difftest signals - val firstValidCommit = deqPtr + PriorityMux(validCommit, VecInit(List.tabulate(CommitWidth)(_.U))) - - val skip = Wire(Vec(CommitWidth, Bool())) - val wen = Wire(Vec(CommitWidth, Bool())) - val wdata = Wire(Vec(CommitWidth, UInt(XLEN.W))) - val wdst = Wire(Vec(CommitWidth, UInt(32.W))) - val diffTestDebugLrScValid = Wire(Vec(CommitWidth, Bool())) - val wpc = Wire(Vec(CommitWidth, UInt(XLEN.W))) - val trapVec = Wire(Vec(CommitWidth, Bool())) - val isRVC = Wire(Vec(CommitWidth, Bool())) - for(i <- 0 until CommitWidth){ - // io.commits(i).valid - val idx = deqPtr+i.U - val uop = io.commits(i).bits.uop - val DifftestSkipSC = false - if(!DifftestSkipSC){ - skip(i) := exuDebug(idx).isMMIO && io.commits(i).valid - }else{ - skip(i) := ( - exuDebug(idx).isMMIO || - uop.ctrl.fuType === FuType.mou && uop.ctrl.fuOpType === LSUOpType.sc_d || - uop.ctrl.fuType === FuType.mou && uop.ctrl.fuOpType === LSUOpType.sc_w - ) && io.commits(i).valid - } - wen(i) := io.commits(i).valid && uop.ctrl.rfWen && uop.ctrl.ldest =/= 0.U - wdata(i) := exuData(idx) - wdst(i) := uop.ctrl.ldest - diffTestDebugLrScValid(i) := uop.diffTestDebugLrScValid - wpc(i) := SignExt(uop.cf.pc, XLEN) - trapVec(i) := io.commits(i).valid && (state===s_idle) && uop.ctrl.isXSTrap - 
isRVC(i) := uop.cf.brUpdate.pd.isRVC - } - - val scFailed = !diffTestDebugLrScValid(0) && - io.commits(0).bits.uop.ctrl.fuType === FuType.mou && - (io.commits(0).bits.uop.ctrl.fuOpType === LSUOpType.sc_d || io.commits(0).bits.uop.ctrl.fuOpType === LSUOpType.sc_w) - - val instrCnt = RegInit(0.U(64.W)) - instrCnt := instrCnt + retireCounter - + val id = roqDebugId() val difftestIntrNO = WireInit(0.U(XLEN.W)) val difftestCause = WireInit(0.U(XLEN.W)) - ExcitingUtils.addSink(difftestIntrNO, "difftestIntrNOfromCSR") - ExcitingUtils.addSink(difftestCause, "difftestCausefromCSR") - - XSDebug(difftestIntrNO =/= 0.U, "difftest intrNO set %x\n", difftestIntrNO) - val retireCounterFix = Mux(io.redirect.valid, 1.U, retireCounter) - val retirePCFix = SignExt(Mux(io.redirect.valid, microOp(deqPtr).cf.pc, microOp(firstValidCommit).cf.pc), XLEN) - val retireInstFix = Mux(io.redirect.valid, microOp(deqPtr).cf.instr, microOp(firstValidCommit).cf.instr) - if(!env.FPGAPlatform){ - BoringUtils.addSource(RegNext(retireCounterFix), "difftestCommit") - BoringUtils.addSource(RegNext(retirePCFix), "difftestThisPC")//first valid PC - BoringUtils.addSource(RegNext(retireInstFix), "difftestThisINST")//first valid inst - BoringUtils.addSource(RegNext(skip.asUInt), "difftestSkip") - // BoringUtils.addSource(RegNext(false.B), "difftestIsRVC")//FIXIT - BoringUtils.addSource(RegNext(isRVC.asUInt), "difftestIsRVC") - BoringUtils.addSource(RegNext(wen.asUInt), "difftestWen") - BoringUtils.addSource(RegNext(wpc), "difftestWpc") - BoringUtils.addSource(RegNext(wdata), "difftestWdata") - BoringUtils.addSource(RegNext(wdst), "difftestWdst") - BoringUtils.addSource(RegNext(scFailed), "difftestScFailed") - BoringUtils.addSource(RegNext(difftestIntrNO), "difftestIntrNO") - BoringUtils.addSource(RegNext(difftestCause), "difftestCause") + ExcitingUtils.addSink(difftestIntrNO, s"difftestIntrNOfromCSR$id") + ExcitingUtils.addSink(difftestCause, s"difftestCausefromCSR$id") + + if(!env.FPGAPlatform){ + + 
//difftest signals + val firstValidCommit = deqPtr + PriorityMux(validCommit, VecInit(List.tabulate(CommitWidth)(_.U))) + + val skip = Wire(Vec(CommitWidth, Bool())) + val wen = Wire(Vec(CommitWidth, Bool())) + val wdata = Wire(Vec(CommitWidth, UInt(XLEN.W))) + val wdst = Wire(Vec(CommitWidth, UInt(32.W))) + val diffTestDebugLrScValid = Wire(Vec(CommitWidth, Bool())) + val wpc = Wire(Vec(CommitWidth, UInt(XLEN.W))) + val trapVec = Wire(Vec(CommitWidth, Bool())) + val isRVC = Wire(Vec(CommitWidth, Bool())) + for(i <- 0 until CommitWidth){ + // io.commits(i).valid + val idx = deqPtr+i.U + val uop = io.commits(i).bits.uop + val DifftestSkipSC = false + if(!DifftestSkipSC){ + skip(i) := debug_exuDebug(idx).isMMIO && io.commits(i).valid + }else{ + skip(i) := ( + debug_exuDebug(idx).isMMIO || + uop.ctrl.fuType === FuType.mou && uop.ctrl.fuOpType === LSUOpType.sc_d || + uop.ctrl.fuType === FuType.mou && uop.ctrl.fuOpType === LSUOpType.sc_w + ) && io.commits(i).valid + } + wen(i) := io.commits(i).valid && uop.ctrl.rfWen && uop.ctrl.ldest =/= 0.U + wdata(i) := debug_exuData(idx) + wdst(i) := uop.ctrl.ldest + diffTestDebugLrScValid(i) := uop.diffTestDebugLrScValid + wpc(i) := SignExt(uop.cf.pc, XLEN) + trapVec(i) := io.commits(i).valid && (state===s_idle) && uop.ctrl.isXSTrap + isRVC(i) := uop.cf.brUpdate.pd.isRVC + } + + val scFailed = !diffTestDebugLrScValid(0) && + io.commits(0).bits.uop.ctrl.fuType === FuType.mou && + (io.commits(0).bits.uop.ctrl.fuOpType === LSUOpType.sc_d || io.commits(0).bits.uop.ctrl.fuOpType === LSUOpType.sc_w) + + val instrCnt = RegInit(0.U(64.W)) + instrCnt := instrCnt + retireCounter + + XSDebug(difftestIntrNO =/= 0.U, "difftest intrNO set %x\n", difftestIntrNO) + val retireCounterFix = Mux(io.redirect.valid, 1.U, retireCounter) + val retirePCFix = SignExt(Mux(io.redirect.valid, microOp(deqPtr).cf.pc, microOp(firstValidCommit).cf.pc), XLEN) + val retireInstFix = Mux(io.redirect.valid, microOp(deqPtr).cf.instr, microOp(firstValidCommit).cf.instr) 
+ + ExcitingUtils.addSource(RegNext(retireCounterFix), "difftestCommit", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(retirePCFix), "difftestThisPC", ExcitingUtils.Debug)//first valid PC + ExcitingUtils.addSource(RegNext(retireInstFix), "difftestThisINST", ExcitingUtils.Debug)//first valid inst + ExcitingUtils.addSource(RegNext(skip.asUInt), "difftestSkip", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(isRVC.asUInt), "difftestIsRVC", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(wen.asUInt), "difftestWen", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(wpc), "difftestWpc", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(wdata), "difftestWdata", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(wdst), "difftestWdst", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(scFailed), "difftestScFailed", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(difftestIntrNO), "difftestIntrNO", ExcitingUtils.Debug) + ExcitingUtils.addSource(RegNext(difftestCause), "difftestCause", ExcitingUtils.Debug) val hitTrap = trapVec.reduce(_||_) val trapCode = PriorityMux(wdata.zip(trapVec).map(x => x._2 -> x._1)) diff --git a/src/main/scala/xiangshan/cache/L1plusCache.scala b/src/main/scala/xiangshan/cache/L1plusCache.scala new file mode 100644 index 0000000000000000000000000000000000000000..26ba86e8b3f039ec2ea8164e1599defd89b90061 --- /dev/null +++ b/src/main/scala/xiangshan/cache/L1plusCache.scala @@ -0,0 +1,806 @@ +package xiangshan.cache + +import chisel3._ +import chisel3.util._ +import utils.{Code, RandomReplacement, HasTLDump, XSDebug, SRAMTemplate} +import xiangshan.{HasXSLog} + +import chipsalliance.rocketchip.config.Parameters +import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp, IdRange} +import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters, + TLMasterParameters, TLMasterPortParameters, TLArbiter, + TLEdgeOut, TLBundleA, TLBundleD, + ClientStates, ClientMetadata +} + +import scala.math.max + 
+
+// L1plusCache specific parameters
+// L1 L1plusCache is 64set, 8way-associative, with 64byte block, a total of 32KB
+// It's a virtually indexed, physically tagged cache.
+// NOTE(review): L1plusCacheReq.addr is PAddrBits wide and the pipe derives the
+// set index from that address with get_idx -- confirm that "virtually indexed"
+// is still the right description for this cache.
+case class L1plusCacheParameters
+(
+    nSets: Int = 64,                // number of sets
+    nWays: Int = 8,                 // associativity
+    rowBits: Int = 64,              // bits per data row
+    tagECC: Option[String] = None,  // code name handed to Code.fromString for tags
+    dataECC: Option[String] = None, // code name handed to Code.fromString for data rows
+    nMissEntries: Int = 1,          // also sizes the TileLink source id range in L1plusCache
+    blockBytes: Int = 64            // cache block size in bytes
+) extends L1CacheParameters {
+
+  def tagCode: Code = Code.fromString(tagECC)
+  def dataCode: Code = Code.fromString(dataECC)
+
+  // a new replacement-policy object is built on every call
+  def replacement = new RandomReplacement(nWays)
+}
+
+// Mixin that exposes the L1plus parameter set (cfg) and the ICache parameter
+// set (icfg) and checks the cache geometry when it is mixed in.
+trait HasL1plusCacheParameters extends HasL1CacheParameters {
+  val cacheParams = l1plusCacheParameters
+  val icacheParams = icacheParameters
+  val cfg = cacheParams
+  val icfg = icacheParams
+
+  // width of one data row after encoding with dataCode
+  def encRowBits = cacheParams.dataCode.width(rowBits)
+
+  // id widths derived from the number of miss entries on each side
+  def missQueueEntryIdWidth = log2Up(cfg.nMissEntries)
+  def icacheMissQueueEntryIdWidth = log2Up(icfg.nMissEntries)
+
+  require(isPow2(nSets), s"nSets($nSets) must be pow2")
+  require(isPow2(nWays), s"nWays($nWays) must be pow2")
+  require(full_divide(beatBits, rowBits), s"beatBits($beatBits) must be multiple of rowBits($rowBits)")
+}
+
+abstract class L1plusCacheModule extends L1CacheModule
+  with HasL1plusCacheParameters
+
+abstract class L1plusCacheBundle extends L1CacheBundle
+  with HasL1plusCacheParameters
+
+// basic building blocks for L1plusCache
+// MetaArray and DataArray
+// TODO: dedup with DCache
+
+// Per-way metadata: a valid bit and the tag.
+class L1plusCacheMetadata extends L1plusCacheBundle {
+  val valid = Bool()
+  val tag = UInt(tagBits.W)
+}
+
+object L1plusCacheMetadata {
+  // Builds a metadata wire driven by the given tag and valid bit.
+  def apply(tag: Bits, valid: Bool) = {
+    val meta = Wire(new L1plusCacheMetadata)
+    meta.tag := tag
+    meta.valid := valid
+    meta
+  }
+}
+
+// Metadata read request: set index plus a way-enable mask.
+class L1plusCacheMetaReadReq extends L1plusCacheBundle {
+  val idx = UInt(idxBits.W)
+  val way_en = UInt(nWays.W)
+  val tag = UInt(tagBits.W)
+}
+
+// Metadata write request: the read request fields plus the metadata to store.
+class L1plusCacheMetaWriteReq extends L1plusCacheMetaReadReq {
+  val data = new L1plusCacheMetadata
+}
+
+// Data read request: row mask, way mask and the untagged part of the address.
+class L1plusCacheDataReadReq extends L1plusCacheBundle {
+  // you can choose which bank to read to save power
+  val rmask = Bits(blockRows.W)
+  val way_en = Bits(nWays.W)
+  val addr = Bits(untagBits.W)
+}
+
+// Now, we can write a cache-block in a single cycle
+class L1plusCacheDataWriteReq extends L1plusCacheDataReadReq {
+  val wmask = Bits(blockRows.W)
+  val data = Vec(blockRows, Bits(encRowBits.W))
+}
+
+// Data storage: one SRAMTemplate (set=nSets, way=1) is instantiated for every
+// (way, row) pair; the read data of each is registered once (RegNext) before
+// it is driven onto io.resp.
+class L1plusCacheDataArray extends L1plusCacheModule {
+  val io = IO(new L1plusCacheBundle {
+    val read = Flipped(DecoupledIO(new L1plusCacheDataReadReq))
+    val write = Flipped(DecoupledIO(new L1plusCacheDataWriteReq))
+    val resp = Output(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
+  })
+
+  val singlePort = true
+
+  // write is always ready
+  io.write.ready := true.B
+  // drop the block-offset bits to obtain the SRAM set address
+  val waddr = (io.write.bits.addr >> blockOffBits).asUInt()
+  val raddr = (io.read.bits.addr >> blockOffBits).asUInt()
+
+  // for single port SRAM, do not allow read and write in the same cycle
+  // for dual port SRAM, raddr === waddr is undefined behavior
+  val rwhazard = if(singlePort) io.write.valid else io.write.valid && waddr === raddr
+  io.read.ready := !rwhazard
+
+  for (w <- 0 until nWays) {
+    for (r <- 0 until blockRows) {
+      val array = Module(new SRAMTemplate(Bits(encRowBits.W), set=nSets, way=1,
+        shouldReset=false, holdRead=false, singlePort=singlePort))
+      // data write
+      array.io.w.req.valid := io.write.bits.way_en(w) && io.write.bits.wmask(r).asBool && io.write.valid
+      array.io.w.req.bits.apply(
+        setIdx=waddr,
+        data=io.write.bits.data(r),
+        waymask=1.U)
+
+      // data read
+      array.io.r.req.valid := io.read.bits.way_en(w) && io.read.bits.rmask(r) && io.read.valid
+      array.io.r.req.bits.apply(setIdx=raddr)
+      io.resp(w)(r) := RegNext(array.io.r.resp.data(0))
+    }
+  }
+
+  // debug output
+  def dumpRead() = {
+    when (io.read.valid) {
+      XSDebug(s"DataArray Read valid way_en: %x addr: %x\n",
+        io.read.bits.way_en, io.read.bits.addr)
+    }
+  }
+
+  def dumpWrite() = {
+    when (io.write.valid) {
+      XSDebug(s"DataArray Write valid way_en: %x addr: %x\n",
+        io.write.bits.way_en, io.write.bits.addr)
+
+      (0 until blockRows) map { r =>
+        XSDebug(s"cycle: $r data: %x wmask: %x\n",
+          io.write.bits.data(r), io.write.bits.wmask(r))
+      }
+    }
+  }
+
+  def dumpResp() = {
+    XSDebug(s"DataArray ReadResp\n")
+    (0 until nWays) map { i =>
+      (0 until blockRows) map { r =>
+        XSDebug(s"way: $i cycle: $r data: %x\n", io.resp(i)(r))
+      }
+    }
+  }
+
+  def dump() = {
+    dumpRead
+    dumpWrite
+    dumpResp
+  }
+}
+
+// Metadata storage: tags live in one nWays-wide SRAMTemplate, valid bits live
+// in a register vector (valid_array) that is cleared on reset or io.flush.
+class L1plusCacheMetadataArray extends L1plusCacheModule {
+  val io = IO(new Bundle {
+    val read = Flipped(Decoupled(new L1plusCacheMetaReadReq))
+    val write = Flipped(Decoupled(new L1plusCacheMetaWriteReq))
+    val resp = Output(Vec(nWays, new L1plusCacheMetadata))
+    val flush = Input(Bool())
+  })
+  val waddr = io.write.bits.idx
+  val wvalid = io.write.bits.data.valid
+  val wtag = io.write.bits.data.tag.asUInt
+  val wmask = Mux((nWays == 1).B, (-1).asSInt, io.write.bits.way_en.asSInt).asBools
+  // NOTE(review): rmask is not referenced anywhere else in this class.
+  val rmask = Mux((nWays == 1).B, (-1).asSInt, io.read.bits.way_en.asSInt).asBools
+
+  def encTagBits = cacheParams.tagCode.width(tagBits)
+  val tag_array = Module(new SRAMTemplate(UInt(encTagBits.W), set=nSets, way=nWays,
+    shouldReset=false, holdRead=false, singlePort=true))
+  val valid_array = Reg(Vec(nSets, UInt(nWays.W)))
+  // reset.asBool replaces the deprecated reset.toBool (same value)
+  when (reset.asBool || io.flush) {
+    for (i <- 0 until nSets) {
+      valid_array(i) := 0.U
+    }
+  }
+  XSDebug("valid_array:%x flush:%d\n",valid_array.asUInt,io.flush)
+
+  // tag write
+  val wen = io.write.valid && !reset.asBool && !io.flush
+  tag_array.io.w.req.valid := wen
+  tag_array.io.w.req.bits.apply(
+    setIdx=waddr,
+    data=cacheParams.tagCode.encode(wtag),
+    waymask=VecInit(wmask).asUInt)
+
+  // set or clear the valid bits of the written ways
+  when (wen) {
+    when (wvalid) {
+      valid_array(waddr) := valid_array(waddr) | io.write.bits.way_en
+    } .otherwise {
+      valid_array(waddr) := valid_array(waddr) & ~io.write.bits.way_en
+    }
+  }
+
+  // tag read
+  tag_array.io.r.req.valid := io.read.fire()
+  tag_array.io.r.req.bits.apply(setIdx=io.read.bits.idx)
+  val rtags = tag_array.io.r.resp.data.map(rdata =>
+    cacheParams.tagCode.decode(rdata).corrected)
+
+  for (i <- 0 until nWays) {
+    // valid bits come from registers, so they are delayed by one RegNext here
+    io.resp(i).valid := RegNext(valid_array(io.read.bits.idx)(i))
+    io.resp(i).tag := rtags(i)
+  }
+
+  // we use single port SRAM
+  // do not allow read and write in the same cycle
+  io.read.ready := !io.write.valid && !reset.asBool && !io.flush && tag_array.io.r.req.ready
+  io.write.ready := !reset.asBool && !io.flush && tag_array.io.w.req.ready
+
+  def dumpRead() = {
+    when (io.read.fire()) {
+      XSDebug("MetaArray Read: idx: %d way_en: %x tag: %x\n",
+        io.read.bits.idx, io.read.bits.way_en, io.read.bits.tag)
+    }
+  }
+
+  def dumpWrite() = {
+    when (io.write.fire()) {
+      XSDebug("MetaArray Write: idx: %d way_en: %x tag: %x new_tag: %x new_valid: %x\n",
+        io.write.bits.idx, io.write.bits.way_en, io.write.bits.tag, io.write.bits.data.tag, io.write.bits.data.valid)
+    }
+  }
+
+  def dumpResp() = {
+    (0 until nWays) map { i =>
+      XSDebug(s"MetaArray Resp: way: $i tag: %x valid: %x\n",
+        io.resp(i).tag, io.resp(i).valid)
+    }
+  }
+
+  def dump() = {
+    dumpRead
+    dumpWrite
+    dumpResp
+  }
+}
+
+// Request into the L1plus cache; id is sized by the ICache miss-entry count.
+class L1plusCacheReq extends L1plusCacheBundle
+{
+  val cmd = UInt(M_SZ.W)
+  val addr = UInt(PAddrBits.W)
+  val id = UInt(icacheMissQueueEntryIdWidth.W)
+}
+
+// Response: one whole cache block plus the id of the originating request.
+class L1plusCacheResp extends L1plusCacheBundle
+{
+  val data = UInt((cfg.blockBytes * 8).W)
+  val id = UInt(icacheMissQueueEntryIdWidth.W)
+}
+
+// Requester-side view of the cache port (L1plusCacheImp uses it Flipped).
+class L1plusCacheIO extends L1plusCacheBundle
+{
+  val req = DecoupledIO(new L1plusCacheReq)
+  val resp = Flipped(DecoupledIO(new L1plusCacheResp))
+  val flush = Output(Bool())
+  val empty = Input(Bool())
+}
+
+// Diplomacy wrapper: declares one TileLink client with cfg.nMissEntries source
+// ids and instantiates L1plusCacheImp as its module.
+class L1plusCache()(implicit p: Parameters) extends LazyModule with HasL1plusCacheParameters {
+
+  val clientParameters = TLMasterPortParameters.v1(
+    Seq(TLMasterParameters.v1(
+      name = "l1plusCache",
+      sourceId = IdRange(0, cfg.nMissEntries)
+    ))
+  )
+
+  val clientNode = TLClientNode(Seq(clientParameters))
+
+  lazy val module = new L1plusCacheImp(this)
+}
+
+
+// Module implementation of the L1plus cache: connects the request pipe, the
+// miss queue, the data/meta arrays and the TileLink port, and runs the flush
+// state machine.
+class L1plusCacheImp(outer: L1plusCache) extends LazyModuleImp(outer) with HasL1plusCacheParameters with HasXSLog {
+
+  val io = IO(Flipped(new L1plusCacheIO))
+
+  val (bus, edge) = outer.clientNode.out.head
+  require(bus.d.bits.data.getWidth == l1BusDataWidth, "L1plusCache: tilelink width does not match")
+
+  //----------------------------------------
+  // core data structures
+  val dataArray = Module(new L1plusCacheDataArray)
+
+  val metaArray = Module(new L1plusCacheMetadataArray())
+  dataArray.dump()
+  metaArray.dump()
+
+
+  //----------------------------------------
+  val pipe = Module(new L1plusCachePipe)
+  val missQueue = Module(new L1plusCacheMissQueue(edge))
+  val resp_arb = Module(new Arbiter(new L1plusCacheResp, 2))
+
+  // a request is held back while its set index is in flight or a flush is running
+  val flush_block_req = Wire(Bool())
+  val req_block = block_req(io.req.bits.addr) || flush_block_req
+  block_decoupled(io.req, pipe.io.req, req_block)
+  XSDebug(req_block, "Request blocked\n")
+
+  pipe.io.data_read <> dataArray.io.read
+  pipe.io.data_resp <> dataArray.io.resp
+  pipe.io.meta_read <> metaArray.io.read
+  pipe.io.meta_resp <> metaArray.io.resp
+
+  missQueue.io.req <> pipe.io.miss_req
+  bus.a <> missQueue.io.mem_acquire
+  missQueue.io.mem_grant <> bus.d
+  metaArray.io.write <> missQueue.io.meta_write
+  dataArray.io.write <> missQueue.io.refill
+
+  // response
+  io.resp <> resp_arb.io.out
+  resp_arb.io.in(0) <> pipe.io.resp
+  resp_arb.io.in(1) <> missQueue.io.resp
+
+  // TileLink channels B, C and E are never used by this cache
+  bus.b.ready := false.B
+  bus.c.valid := false.B
+  bus.c.bits := DontCare
+  bus.e.valid := false.B
+  bus.e.bits := DontCare
+
+  // flush state machine
+  // s_invalid -> (io.flush) s_drain_cache -> (pipe and miss queue empty)
+  // s_flush_cache -> s_send_resp -> s_invalid
+  val s_invalid :: s_drain_cache :: s_flush_cache :: s_send_resp :: Nil = Enum(4)
+  val state = RegInit(s_invalid)
+
+  switch (state) {
+    is (s_invalid) {
+      when (io.flush) {
+        state := s_drain_cache
+      }
+    }
+    is (s_drain_cache) {
+      when (pipe.io.empty && missQueue.io.empty) {
+        state := s_flush_cache
+      }
+    }
+    is (s_flush_cache) {
+      state := s_send_resp
+    }
+    is (s_send_resp) {
+      state := s_invalid
+    }
+  }
+  metaArray.io.flush := state === s_flush_cache
+  io.empty := state === s_send_resp
+  flush_block_req := state =/= s_invalid
+
+  when (state =/= s_invalid) {
+    XSDebug(s"L1plusCache flush state machine: %d\n", state)
+  }
+
+  // to simplify synchronization, we do not allow reqs with same indexes
+  def block_req(addr: UInt) = {
+    val pipe_idx_matches = VecInit(pipe.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
+    val pipe_idx_match = pipe_idx_matches.reduce(_||_)
+
+    val miss_idx_matches = VecInit(missQueue.io.inflight_req_idxes map (entry => entry.valid && entry.bits === get_idx(addr)))
+    val miss_idx_match = miss_idx_matches.reduce(_||_)
+
+    pipe_idx_match || miss_idx_match
+  }
+
+  // Connects source to sink, but suppresses the handshake in both directions
+  // while block_signal is high.
+  def block_decoupled[T <: Data](source: DecoupledIO[T], sink: DecoupledIO[T], block_signal: Bool) = {
+    sink.valid := source.valid && !block_signal
+    source.ready := sink.ready && !block_signal
+    sink.bits := source.bits
+  }
+
+  // debug output
+  when (io.req.valid) {
+    XSDebug(s"L1plusCache req cmd: %x addr: %x id: %d\n",
+      io.req.bits.cmd, io.req.bits.addr, io.req.bits.id)
+  }
+
+  when (io.resp.valid) {
+    XSDebug(s"L1plusCache resp data: %x id: %d\n",
+      io.resp.bits.data, io.resp.bits.id)
+  }
+
+  when (io.flush) {
+    XSDebug(s"L1plusCache flush\n")
+  }
+
+  when (io.empty) {
+    XSDebug(s"L1plusCache empty\n")
+  }
+}
+
+// Three-stage request pipe:
+//   s0 - accept the request and issue the meta and data array reads
+//   s1 - compare the returned tags against the request tag
+//   s2 - on a hit answer from the selected way, otherwise send a miss request
+class L1plusCachePipe extends L1plusCacheModule
+{
+  val io = IO(new L1plusCacheBundle{
+    val req = Flipped(DecoupledIO(new L1plusCacheReq))
+    val resp = DecoupledIO(new L1plusCacheResp)
+    val data_read = DecoupledIO(new L1plusCacheDataReadReq)
+    val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W))))
+    val meta_read = DecoupledIO(new L1plusCacheMetaReadReq)
+    val meta_resp = Input(Vec(nWays, new L1plusCacheMetadata))
+    val miss_req = DecoupledIO(new L1plusCacheMissReq)
+    val inflight_req_idxes = Output(Vec(2, Valid(UInt())))
+    val empty = Output(Bool())
+  })
+
+  // sN_passdown: stage N hands its request to the next stage this cycle
+  val s0_passdown = Wire(Bool())
+  val s1_passdown = Wire(Bool())
+  val s2_passdown = Wire(Bool())
+
+  val s0_valid = Wire(Bool())
+  val s1_valid = Wire(Bool())
+  val s2_valid = Wire(Bool())
+
+  // requests
+  val can_accept_req = !s1_valid || s1_passdown
+  io.req.ready := io.meta_read.ready && io.data_read.ready && can_accept_req
+  io.meta_read.valid := io.req.valid && can_accept_req
+  io.data_read.valid := io.req.valid && can_accept_req
+
+  val meta_read = io.meta_read.bits
+  val data_read = io.data_read.bits
+
+  // Tag read for new requests
+  meta_read.idx := get_idx(io.req.bits.addr)
+  meta_read.way_en := ~0.U(nWays.W)
+  meta_read.tag := DontCare
+  // Data read for new requests
+  data_read.addr := io.req.bits.addr
+  data_read.way_en := ~0.U(nWays.W)
+  data_read.rmask := ~0.U(blockRows.W)
+
+  // Pipeline
+  // stage 0
+  s0_valid := io.req.fire()
+  val s0_req = io.req.bits
+
+  s0_passdown := s0_valid
+
+  assert(!(s0_valid && s0_req.cmd =/= MemoryOpConstants.M_XRD), "L1plusCachePipe only accepts read req")
+
+  dump_pipeline_reqs("L1plusCachePipe s0", s0_valid, s0_req)
+  // stage 1
+  val s1_req = RegEnable(s0_req, s0_passdown)
+  val s1_valid_reg = RegEnable(s0_valid, init = false.B, enable = s0_passdown)
+  val s1_addr = s1_req.addr
+  // the stage empties when its request moves on and no new one arrives
+  when (s1_passdown && !s0_passdown) {
+    s1_valid_reg := false.B
+  }
+  s1_valid := s1_valid_reg
+
+  dump_pipeline_reqs("L1plusCachePipe s1", s1_valid, s1_req)
+
+  val meta_resp = io.meta_resp
+  // tag check
+  def wayMap[T <: Data](f: Int => T) = VecInit((0 until nWays).map(f))
+  val s1_tag_eq_way = wayMap((w: Int) => meta_resp(w).tag === (get_tag(s1_addr))).asUInt
+  val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).valid).asUInt
+
+  s1_passdown := s1_valid && (!s2_valid || s2_passdown)
+
+  // stage 2
+  val s2_req = RegEnable(s1_req, s1_passdown)
+  val s2_valid_reg = RegEnable(s1_valid, init=false.B, enable=s1_passdown)
+  when (s2_passdown && !s1_passdown) {
+    s2_valid_reg := false.B
+  }
+  s2_valid := s2_valid_reg
+
+  dump_pipeline_reqs("L1plusCachePipe s2", s2_valid, s2_req)
+
+  val s2_tag_match_way = RegEnable(s1_tag_match_way, s1_passdown)
+  val s2_hit = s2_tag_match_way.orR
+  val s2_hit_way = OHToUInt(s2_tag_match_way, nWays)
+
+  // decode every row of the hit way and concatenate them, row 0 in the low bits
+  val data_resp = io.data_resp
+  val s2_data = data_resp(s2_hit_way)
+  val s2_data_decoded = Cat((0 until blockRows).reverse map { r =>
+    val data = s2_data(r)
+    val decoded = cacheParams.dataCode.decode(data)
+    assert(!(s2_valid && s2_hit && decoded.uncorrectable))
+    decoded.corrected
+  })
+
+  io.resp.valid := s2_valid && s2_hit
+  io.resp.bits.data := s2_data_decoded
+  io.resp.bits.id := s2_req.id
+
+  // replacement policy
+  val replacer = cacheParams.replacement
+  val replaced_way_en = UIntToOH(replacer.way)
+
+  io.miss_req.valid := s2_valid && !s2_hit
+  io.miss_req.bits.id := s2_req.id
+  io.miss_req.bits.cmd := M_XRD
+  io.miss_req.bits.addr := s2_req.addr
+  io.miss_req.bits.way_en := replaced_way_en
+
+  s2_passdown := s2_valid && ((s2_hit && io.resp.ready) || (!s2_hit && io.miss_req.ready))
+
+  when (io.miss_req.fire()) {
+    replacer.miss
+  }
+
+  val resp = io.resp
+  when (resp.valid) {
+    XSDebug(s"L1plusCachePipe resp: data: %x id: %d\n",
+      resp.bits.data, resp.bits.id)
+  }
+
+  // set indexes of the requests currently held in s1 and s2
+  io.inflight_req_idxes(0).valid := s1_valid
+  io.inflight_req_idxes(0).bits := get_idx(s1_req.addr)
+  io.inflight_req_idxes(1).valid := s2_valid
+  io.inflight_req_idxes(1).bits := get_idx(s2_req.addr)
+
+  io.empty := !s0_valid && !s1_valid && !s2_valid
+
+  // -------
+  // Debug logging functions
+  def dump_pipeline_reqs(pipeline_stage_name: String, valid: Bool, req: L1plusCacheReq) = {
+    when (valid) {
+      XSDebug(
+        s"$pipeline_stage_name cmd: %x addr: %x id: %d\n",
+        req.cmd, req.addr, req.id
+      )
+    }
+  }
+}
+
+// Miss request sent from the pipe to the miss queue.
+class L1plusCacheMissReq extends L1plusCacheBundle
+{
+  // transaction id
+  // This is the requester's id: the pipe copies it from L1plusCacheReq.id and
+  // the miss entry returns it in L1plusCacheResp.id. Both of those fields are
+  // icacheMissQueueEntryIdWidth wide, so this field uses the same width; the
+  // previous missQueueEntryIdWidth would silently truncate a wider id.
+  val id = UInt(icacheMissQueueEntryIdWidth.W)
+  val cmd = UInt(M_SZ.W)
+  val addr = UInt(PAddrBits.W)
+  val way_en = UInt(nWays.W)
+}
+
+// One miss-handling entry. State sequence:
+//   s_invalid -> s_refill_req -> s_refill_resp -> s_send_resp
+//             -> s_data_write_req -> s_meta_write_req -> s_invalid
+// io.id is this entry's own number and is used as the TileLink source id.
+class L1plusCacheMissEntry(edge: TLEdgeOut) extends L1plusCacheModule
+{
+  val io = IO(new Bundle {
+    val id = Input(UInt())
+    val req = Flipped(DecoupledIO(new L1plusCacheMissReq))
+    val resp = DecoupledIO(new L1plusCacheResp)
+
+    val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
+    val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
+
+    val meta_write = DecoupledIO(new L1plusCacheMetaWriteReq)
+    val refill = DecoupledIO(new L1plusCacheDataWriteReq)
+
+    val idx = Output(Valid(UInt()))
+    val tag = Output(Valid(UInt()))
+  })
+
+  val s_invalid :: s_refill_req :: s_refill_resp :: s_send_resp :: s_data_write_req :: s_meta_write_req :: Nil = Enum(6)
+
+  val state = RegInit(s_invalid)
+
+  val req = Reg(new L1plusCacheMissReq)
+  val req_idx = get_idx(req.addr)
+  val req_tag = get_tag(req.addr)
+
+  val (_, _, refill_done, refill_address_inc) = edge.addr_inc(io.mem_grant)
+
+  // counts the data beats received so far for the current refill
+  val refill_ctr = Reg(UInt(log2Up(refillCycles).W))
+
+  // the entry advertises its set index and tag whenever it is busy
+  io.idx.valid := state =/= s_invalid
+  io.tag.valid := state =/= s_invalid
+  io.idx.bits := req_idx
+  io.tag.bits := req_tag
+
+  // assign default values to output signals
+  io.req.ready := false.B
+  io.resp.valid := false.B
+  io.resp.bits := DontCare
+
+  io.mem_acquire.valid := false.B
+  io.mem_acquire.bits := DontCare
+
+  io.mem_grant.ready := false.B
+
+  io.meta_write.valid := false.B
+  io.meta_write.bits := DontCare
+
+  io.refill.valid := false.B
+  io.refill.bits := DontCare
+
+  when (state =/= s_invalid) {
+    XSDebug("entry: %d state: %d\n", io.id, state)
+    XSDebug("entry: %d idx_valid: %b idx: %x tag_valid: %b tag: %x\n",
+      io.id, io.idx.valid, io.idx.bits, io.tag.valid, io.tag.bits)
+  }
+
+
+  // --------------------------------------------
+  // s_invalid: receive requests
+  when (state === s_invalid) {
+    io.req.ready := true.B
+
+    when (io.req.fire()) {
+      refill_ctr := 0.U
+      req := io.req.bits
+      state := s_refill_req
+    }
+  }
+
+  // --------------------------------------------
+  // refill
+  when (state === s_refill_req) {
+    io.mem_acquire.valid := true.B
+    // a TileLink Get for the whole block, tagged with this entry's id
+    io.mem_acquire.bits := edge.Get(
+      fromSource = io.id,
+      toAddress = req.addr,
+      lgSize = (log2Up(cfg.blockBytes)).U)._2
+    when (io.mem_acquire.fire()) {
+      state := s_refill_resp
+    }
+  }
+
+  val refill_data = Reg(Vec(blockRows, UInt(encRowBits.W)))
+  // not encoded data
+  val refill_data_raw = Reg(Vec(blockRows, UInt(rowBits.W)))
+  when (state === s_refill_resp) {
+    io.mem_grant.ready := true.B
+
+    when (edge.hasData(io.mem_grant.bits)) {
+      when (io.mem_grant.fire()) {
+        refill_ctr := refill_ctr + 1.U
+        // split the beat into rows; keep both the encoded and the raw copy
+        for (i <- 0 until beatRows) {
+          val row = io.mem_grant.bits.data(rowBits * (i + 1) - 1, rowBits * i)
+          refill_data((refill_ctr << log2Floor(beatRows)) + i.U) := cacheParams.dataCode.encode(row)
+          refill_data_raw((refill_ctr << log2Floor(beatRows)) + i.U) := row
+        }
+
+        when (refill_ctr === (refillCycles - 1).U) {
+          assert(refill_done, "refill not done!")
+          state := s_send_resp
+        }
+      }
+    }
+  }
+
+  // --------------------------------------------
+  // send the raw (not encoded) block back to the requester
+  when (state === s_send_resp) {
+
+    val resp_data = Cat((0 until blockRows).reverse map { r => refill_data_raw(r) })
+    io.resp.valid := true.B
+    io.resp.bits.data := resp_data
+    io.resp.bits.id := req.id
+
+    when (io.resp.fire()) {
+      state := s_data_write_req
+    }
+  }
+
+  // --------------------------------------------
+  // data write
+  when (state === s_data_write_req) {
+    io.refill.valid := true.B
+    io.refill.bits.addr := req.addr
+    io.refill.bits.way_en := req.way_en
+    io.refill.bits.wmask := ~0.U(blockRows.W)
+    io.refill.bits.rmask := DontCare
+    io.refill.bits.data := refill_data
+
+    when (io.refill.fire()) {
+      state := s_meta_write_req
+    }
+  }
+
+  // --------------------------------------------
+  // meta write
+  when (state === s_meta_write_req) {
+    io.meta_write.valid := true.B
+    io.meta_write.bits.idx := req_idx
+    io.meta_write.bits.data.valid := true.B
+    io.meta_write.bits.data.tag := req_tag
+    io.meta_write.bits.way_en := req.way_en
+
+    when (io.meta_write.fire()) {
+      state := s_invalid
+    }
+  }
+}
+
+class L1plusCacheMissQueue(edge: TLEdgeOut) extends L1plusCacheModule with HasTLDump
+{
+  val io = IO(new
Bundle { + val req = Flipped(DecoupledIO(new L1plusCacheMissReq)) + val resp = DecoupledIO(new L1plusCacheResp) + + val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle)) + val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle))) + + val meta_write = DecoupledIO(new L1plusCacheMetaWriteReq) + val refill = DecoupledIO(new L1plusCacheDataWriteReq) + val inflight_req_idxes = Output(Vec(cfg.nMissEntries, Valid(UInt()))) + val empty = Output(Bool()) + }) + + val resp_arb = Module(new Arbiter(new L1plusCacheResp, cfg.nMissEntries)) + val meta_write_arb = Module(new Arbiter(new L1plusCacheMetaWriteReq, cfg.nMissEntries)) + val refill_arb = Module(new Arbiter(new L1plusCacheDataWriteReq, cfg.nMissEntries)) + + // assign default values to output signals + io.mem_grant.ready := false.B + + val idx_matches = Wire(Vec(cfg.nMissEntries, Bool())) + val tag_matches = Wire(Vec(cfg.nMissEntries, Bool())) + + val tag_match = Mux1H(idx_matches, tag_matches) + val idx_match = idx_matches.reduce(_||_) + + val req = io.req + val entry_alloc_idx = Wire(UInt()) + val pri_rdy = WireInit(false.B) + val pri_val = req.valid && !idx_match + + val entries = (0 until cfg.nMissEntries) map { i => + val entry = Module(new L1plusCacheMissEntry(edge)) + + entry.io.id := i.U(missQueueEntryIdWidth.W) + + idx_matches(i) := entry.io.idx.valid && entry.io.idx.bits === get_idx(req.bits.addr) + tag_matches(i) := entry.io.tag.valid && entry.io.tag.bits === get_tag(req.bits.addr) + io.inflight_req_idxes(i) <> entry.io.idx + + // req and resp + entry.io.req.valid := (i.U === entry_alloc_idx) && pri_val + when (i.U === entry_alloc_idx) { + pri_rdy := entry.io.req.ready + } + entry.io.req.bits := req.bits + + resp_arb.io.in(i) <> entry.io.resp + + // tilelink + entry.io.mem_grant.valid := false.B + entry.io.mem_grant.bits := DontCare + when (io.mem_grant.bits.source === i.U) { + entry.io.mem_grant <> io.mem_grant + } + + // meta and data write + meta_write_arb.io.in(i) <> entry.io.meta_write + 
refill_arb.io.in(i) <> entry.io.refill + + entry + } + + entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.req.ready)) + + // whenever index matches, do not let it in + req.ready := pri_rdy && !idx_match + io.resp <> resp_arb.io.out + io.meta_write <> meta_write_arb.io.out + io.refill <> refill_arb.io.out + + TLArbiter.lowestFromSeq(edge, io.mem_acquire, entries.map(_.io.mem_acquire)) + + io.empty := VecInit(entries.map(m=>m.io.req.ready)).asUInt.andR + + // print all input/output requests for debug purpose + // print req + XSDebug(req.fire(), "req id: %d cmd: %x addr: %x way_en: %x\n", + req.bits.id, req.bits.cmd, req.bits.addr, req.bits.way_en) + + val resp = io.resp + XSDebug(resp.fire(), s"resp: data: %x id: %d\n", + resp.bits.data, resp.bits.id) + + // print refill + XSDebug(io.refill.fire(), "refill addr %x\n", io.refill.bits.addr) + + // print meta_write + XSDebug(io.meta_write.fire(), "meta_write idx %x way_en: %x old_tag: %x new_valid: %d new_tag: %x\n", + io.meta_write.bits.idx, io.meta_write.bits.way_en, io.meta_write.bits.tag, + io.meta_write.bits.data.valid, io.meta_write.bits.data.tag) + + // print tilelink messages + when (io.mem_acquire.fire()) { + XSDebug("mem_acquire ") + io.mem_acquire.bits.dump + } + when (io.mem_grant.fire()) { + XSDebug("mem_grant ") + io.mem_grant.bits.dump + } +} diff --git a/src/main/scala/xiangshan/cache/atomics.scala b/src/main/scala/xiangshan/cache/atomics.scala index 4323cc14e19cf2e2f2772c6a83a8ca473b392184..8991f0a0f9ba480d99d2f28e045d261cdf3bec38 100644 --- a/src/main/scala/xiangshan/cache/atomics.scala +++ b/src/main/scala/xiangshan/cache/atomics.scala @@ -2,7 +2,6 @@ package xiangshan.cache import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils import utils.{XSDebug} @@ -69,7 +68,7 @@ class AtomicsPipe extends DCacheModule // --------------------------------------- // stage 2 val s2_req = RegNext(s1_req) - val s2_valid = RegNext(s1_valid && !io.lsu.s1_kill, init = false.B) + val 
s2_valid = RegNext(s1_valid, init = false.B) dump_pipeline_reqs("AtomicsPipe s2", s2_valid, s2_req) @@ -122,8 +121,6 @@ class AtomicsPipe extends DCacheModule val s2_sc_fail = s2_sc && !s2_lrsc_addr_match val s2_sc_resp = Mux(s2_sc_fail, 1.U, 0.U) - // BoringUtils.addSource(RegEnable(lrsc_addr, s2_valid && s2_lr), "difftestLrscAddr") - // we have permission on this block // but we can not finish in this pass // we need to go to miss queue to update meta and set dirty first diff --git a/src/main/scala/xiangshan/cache/atomicsMissQueue.scala b/src/main/scala/xiangshan/cache/atomicsMissQueue.scala index 490e27b82a9c933a9f88d265e1dbd7336546db00..54592f58e0c14ba87e4bb009f68e6edb48aa2a26 100644 --- a/src/main/scala/xiangshan/cache/atomicsMissQueue.scala +++ b/src/main/scala/xiangshan/cache/atomicsMissQueue.scala @@ -34,7 +34,6 @@ class AtomicsMissQueue extends DCacheModule io.replay.req.valid := false.B io.replay.req.bits := DontCare io.replay.resp.ready := false.B - io.replay.s1_kill := false.B io.miss_req.valid := false.B io.miss_req.bits := DontCare diff --git a/src/main/scala/xiangshan/cache/dcache.scala b/src/main/scala/xiangshan/cache/dcache.scala index cabb46815922a78c819c74f3d59e81ea4026ef9a..fc5c5cbecf8ffa9ead1739a7d7e8521e95ac6213 100644 --- a/src/main/scala/xiangshan/cache/dcache.scala +++ b/src/main/scala/xiangshan/cache/dcache.scala @@ -3,7 +3,7 @@ package xiangshan.cache import chisel3._ import chisel3.util._ import freechips.rocketchip.tilelink.{ClientMetadata, TLClientParameters, TLEdgeOut} -import utils.{Code, RandomReplacement, XSDebug} +import utils.{Code, RandomReplacement, XSDebug, SRAMTemplate} import scala.math.max @@ -178,25 +178,40 @@ abstract class AbstractDataArray extends DCacheModule { class DuplicatedDataArray extends AbstractDataArray { + val singlePort = true // write is always ready io.write.ready := true.B val waddr = (io.write.bits.addr >> blockOffBits).asUInt() for (j <- 0 until LoadPipelineWidth) { val raddr = (io.read(j).bits.addr >> 
blockOffBits).asUInt() - // raddr === waddr is undefined behavior! - // block read in this case - io.read(j).ready := !io.write.valid || raddr =/= waddr + + // for single port SRAM, do not allow read and write in the same cycle + // for dual port SRAM, raddr === waddr is undefined behavior + val rwhazard = if(singlePort) io.write.valid else io.write.valid && waddr === raddr + io.read(j).ready := !rwhazard + for (w <- 0 until nWays) { for (r <- 0 until blockRows) { - val array = SyncReadMem(nSets, Vec(rowWords, Bits(encWordBits.W))) - // data write - when (io.write.bits.way_en(w) && io.write.valid) { - val data = VecInit((0 until rowWords) map (i => io.write.bits.data(r)(encWordBits*(i+1)-1,encWordBits*i))) - array.write(waddr, data, io.write.bits.wmask(r).asBools) + val resp = Seq.fill(rowWords)(Wire(Bits(encWordBits.W))) + io.resp(j)(w)(r) := Cat((0 until rowWords).reverse map (k => resp(k))) + + for (k <- 0 until rowWords) { + val array = Module(new SRAMTemplate(Bits(encWordBits.W), set=nSets, way=1, + shouldReset=false, holdRead=false, singlePort=singlePort)) + // data write + val wen = io.write.valid && io.write.bits.way_en(w) && io.write.bits.wmask(r)(k) + array.io.w.req.valid := wen + array.io.w.req.bits.apply( + setIdx=waddr, + data=io.write.bits.data(r)(encWordBits*(k+1)-1,encWordBits*k), + waymask=1.U) + + // data read + val ren = io.read(j).valid && io.read(j).bits.way_en(w) && io.read(j).bits.rmask(r) + array.io.r.req.valid := ren + array.io.r.req.bits.apply(setIdx=raddr) + resp(k) := RegNext(array.io.r.resp.data(0)) } - // data read - io.resp(j)(w)(r) := RegNext(array.read(raddr, io.read(j).bits.way_en(w) - && io.read(j).bits.rmask(r) && io.read(j).valid).asUInt) } } io.nacks(j) := false.B @@ -221,12 +236,21 @@ class L1MetadataArray(onReset: () => L1Metadata) extends DCacheModule { val metaBits = rstVal.getWidth val encMetaBits = cacheParams.tagCode.width(metaBits) - val tag_array = SyncReadMem(nSets, Vec(nWays, UInt(encMetaBits.W))) + val tag_array = 
Module(new SRAMTemplate(UInt(encMetaBits.W), set=nSets, way=nWays, + shouldReset=false, holdRead=false, singlePort=true)) + + // tag write val wen = rst || io.write.valid - when (wen) { - tag_array.write(waddr, VecInit(Array.fill(nWays)(cacheParams.tagCode.encode(wdata))), wmask) - } - io.resp := tag_array.read(io.read.bits.idx, io.read.fire()).map(rdata => + tag_array.io.w.req.valid := wen + tag_array.io.w.req.bits.apply( + setIdx=waddr, + data=cacheParams.tagCode.encode(wdata), + waymask=VecInit(wmask).asUInt) + + // tag read + tag_array.io.r.req.valid := io.read.fire() + tag_array.io.r.req.bits.apply(setIdx=io.read.bits.idx) + io.resp := tag_array.io.r.resp.data.map(rdata => cacheParams.tagCode.decode(rdata).corrected.asTypeOf(rstVal)) io.read.ready := !wen diff --git a/src/main/scala/xiangshan/cache/dcacheWrapper.scala b/src/main/scala/xiangshan/cache/dcacheWrapper.scala index da27669b678603924018d352c92ee87c79028668..efc7c19eeac2a9a7766e4da42d24abdd97bebda8 100644 --- a/src/main/scala/xiangshan/cache/dcacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcacheWrapper.scala @@ -25,7 +25,19 @@ class DCacheMeta extends DCacheBundle { val replay = Bool() // whether it's a replayed request? 
} -// ordinary load and special memory operations(lr/sc, atomics) +// for load from load unit +// cycle 0: vaddr +// cycle 1: paddr +class DCacheLoadReq extends DCacheBundle +{ + val cmd = UInt(M_SZ.W) + val addr = UInt(VAddrBits.W) + val data = UInt(DataBits.W) + val mask = UInt((DataBits/8).W) + val meta = new DCacheMeta +} + +// special memory operations(lr/sc, atomics) class DCacheWordReq extends DCacheBundle { val cmd = UInt(M_SZ.W) @@ -45,6 +57,16 @@ class DCacheLineReq extends DCacheBundle val meta = new DCacheMeta } +class DCacheLoadResp extends DCacheBundle +{ + val data = UInt(DataBits.W) + val meta = new DCacheMeta + // cache req missed, send it to miss queue + val miss = Bool() + // cache req nacked, replay it later + val nack = Bool() +} + class DCacheWordResp extends DCacheBundle { val data = UInt(DataBits.W) @@ -65,12 +87,19 @@ class DCacheLineResp extends DCacheBundle val nack = Bool() } -class DCacheWordIO extends DCacheBundle +class DCacheLoadIO extends DCacheBundle { - val req = DecoupledIO(new DCacheWordReq ) + val req = DecoupledIO(new DCacheWordReq) val resp = Flipped(DecoupledIO(new DCacheWordResp)) // kill previous cycle's req - val s1_kill = Output(Bool()) + val s1_kill = Output(Bool()) + val s1_paddr = Output(UInt(PAddrBits.W)) +} + +class DCacheWordIO extends DCacheBundle +{ + val req = DecoupledIO(new DCacheWordReq) + val resp = Flipped(DecoupledIO(new DCacheWordResp)) } class DCacheLineIO extends DCacheBundle @@ -80,8 +109,8 @@ class DCacheLineIO extends DCacheBundle } class DCacheToLsuIO extends DCacheBundle { - val load = Vec(LoadPipelineWidth, Flipped(new DCacheWordIO)) // for speculative load - val lsroq = Flipped(new DCacheLineIO) // lsroq load/store + val load = Vec(LoadPipelineWidth, Flipped(new DCacheLoadIO)) // for speculative load + val lsq = Flipped(new DCacheLineIO) // lsq load/store val store = Flipped(new DCacheLineIO) // for sbuffer val atomics = Flipped(new DCacheWordIO) // atomics reqs } @@ -229,6 +258,7 @@ class 
DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame for (w <- 0 until LoadPipelineWidth) { val load_w_nack = nack_load(io.lsu.load(w).req.bits.addr) ldu(w).io.lsu.req <> io.lsu.load(w).req + ldu(w).io.lsu.s1_paddr <> io.lsu.load(w).s1_paddr ldu(w).io.nack := load_w_nack XSDebug(load_w_nack, s"LoadUnit $w nacked\n") @@ -243,7 +273,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame } // load miss queue - loadMissQueue.io.lsu <> io.lsu.lsroq + loadMissQueue.io.lsu <> io.lsu.lsq //---------------------------------------- // store pipe and store miss queue @@ -289,8 +319,6 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame "MMIO requests should not go to cache") assert(!(atomicsReq.fire() && atomicsReq.bits.meta.tlb_miss), "TLB missed requests should not go to cache") - assert(!io.lsu.atomics.s1_kill, "Lsroq should never use s1 kill on atomics") - //---------------------------------------- // miss queue @@ -417,6 +445,17 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame TLArbiter.lowestFromSeq(edge, bus.c, Seq(prober.io.rep, wb.io.release)) + // dcache should only deal with DRAM addresses + when (bus.a.fire()) { + assert(bus.a.bits.address >= 0x80000000L.U) + } + when (bus.b.fire()) { + assert(bus.b.bits.address >= 0x80000000L.U) + } + when (bus.c.fire()) { + assert(bus.c.bits.address >= 0x80000000L.U) + } + // synchronization stuff def nack_load(addr: UInt) = { val store_addr_matches = VecInit(stu.io.inflight_req_block_addrs map (entry => entry.valid && entry.bits === get_block_addr(addr))) diff --git a/src/main/scala/xiangshan/cache/dtlb.scala b/src/main/scala/xiangshan/cache/dtlb.scala index 1b8543eb04fe8d16b1b81162d5b2a93c25946b93..0494e2ee82751137c4718aff3d869ac9b6b651cf 100644 --- a/src/main/scala/xiangshan/cache/dtlb.scala +++ b/src/main/scala/xiangshan/cache/dtlb.scala @@ -4,11 +4,9 @@ import chisel3._ import chisel3.util._ import 
xiangshan._ import utils._ -import chisel3.util.experimental.BoringUtils import xiangshan.backend.decode.XSTrap import xiangshan.backend.roq.RoqPtr import xiangshan.mem._ -import bus.simplebus._ import xiangshan.backend.fu.HasCSRConst import chisel3.ExcitingUtils._ @@ -58,7 +56,7 @@ class PermBundle(val hasV: Boolean = true) extends TlbBundle { if (hasV) { val v = Bool() } override def toPrintable: Printable = { - p"d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}"// + + p"d:${d} a:${a} g:${g} u:${u} x:${x} w:${w} r:${r}"// + //(if(hasV) (p"v:${v}") else p"") } } @@ -116,6 +114,73 @@ class TlbEntry extends TlbBundle { } } +class TlbEntires(num: Int, tagLen: Int) extends TlbBundle { + require(log2Up(num)==log2Down(num)) + /* vpn can be divide into three part */ + // vpn: tagPart + addrPart + val cutLen = log2Up(num) + + val tag = UInt(tagLen.W) // NOTE: high part of vpn + val level = UInt(log2Up(Level).W) + val ppns = Vec(num, UInt(ppnLen.W)) + val perms = Vec(num, new PermBundle(hasV = false)) + val vs = Vec(num, Bool()) + + def tagClip(vpn: UInt, level: UInt) = { // full vpn => tagLen + Mux(level===0.U, Cat(vpn(vpnLen-1, vpnnLen*2+cutLen), 0.U(vpnnLen*2+cutLen)), + Mux(level===1.U, Cat(vpn(vpnLen-1, vpnnLen*1+cutLen), 0.U(vpnnLen*1+cutLen)), + Cat(vpn(vpnLen-1, vpnnLen*0+cutLen), 0.U(vpnnLen*0+cutLen))))(tagLen-1, 0) + } + + // NOTE: get insize idx + def idxClip(vpn: UInt, level: UInt) = { + Mux(level===0.U, vpn(vpnnLen*2+cutLen-1, vpnnLen*2), + Mux(level===1.U, vpn(vpnnLen*1+cutLen-1, vpnnLen*1), + vpn(vpnnLen*0+cutLen-1, vpnnLen*0))) + } + + def hit(vpn: UInt) = { + (tag === tagClip(vpn, level)) && vs(idxClip(vpn, level)) + } + + def genEntries(data: UInt, level: UInt, vpn: UInt): TlbEntires = { + require((data.getWidth / XLEN) == num, + "input data length must be multiple of pte length") + assert(level=/=3.U, "level should not be 3") + + val ts = Wire(new TlbEntires(num, tagLen)) + ts.tag := tagClip(vpn, level) + ts.level := level + for (i <- 0 until num) { + 
val pte = data((i+1)*XLEN-1, i*XLEN).asTypeOf(new PteBundle) + ts.ppns(i) := pte.ppn + ts.perms(i):= pte.perm // this.perms has no v + ts.vs(i) := !pte.isPf(level) && pte.isLeaf() // legal and leaf, store to l2Tlb + } + + ts + } + + def get(vpn: UInt): TlbEntry = { + val t = Wire(new TlbEntry()) + val idx = idxClip(vpn, level) + t.vpn := vpn // Note: Use input vpn, not vpn in TlbL2 + t.ppn := ppns(idx) + t.level := level + t.perm := perms(idx) + t + } + + override def cloneType: this.type = (new TlbEntires(num, tagLen)).asInstanceOf[this.type] + override def toPrintable: Printable = { + require(num == 4, "if num is not 4, please comment this toPrintable") + // NOTE: if num is not 4, please comment this toPrintable + p"tag:${Hexadecimal(tag)} level:${level} ppn(0):${Hexadecimal(ppns(0))} ppn(1):${Hexadecimal(ppns(1))}" + + p"ppn(2):${Hexadecimal(ppns(2))} ppn(3):${Hexadecimal(ppns(3))} " + + p"perms(0):${perms(0)} perms(1):${perms(1)} perms(2):${perms(2)} perms(3):${perms(3)} vs:${Binary(vs.asUInt)}" + } +} + object TlbCmd { def read = "b00".U def write = "b01".U @@ -133,11 +198,10 @@ class TlbReq extends TlbBundle { val roqIdx = new RoqPtr val debug = new Bundle { val pc = UInt(XLEN.W) - val lsroqIdx = UInt(LsroqIdxWidth.W) // FIXME: need update } override def toPrintable: Printable = { - p"vaddr:0x${Hexadecimal(vaddr)} cmd:${cmd} pc:0x${Hexadecimal(debug.pc)} roqIdx:${roqIdx} lsroqIdx:${debug.lsroqIdx}" + p"vaddr:0x${Hexadecimal(vaddr)} cmd:${cmd} pc:0x${Hexadecimal(debug.pc)} roqIdx:${roqIdx}" } } @@ -157,10 +221,8 @@ class TlbResp extends TlbBundle { } class TlbRequestIO() extends TlbBundle { - val req = Valid(new TlbReq) - val resp = Flipped(Valid(new TlbResp)) - - // override def cloneType: this.type = (new TlbRequestIO(Width)).asInstanceOf[this.type] + val req = DecoupledIO(new TlbReq) + val resp = Flipped(DecoupledIO(new TlbResp)) } class BlockTlbRequestIO() extends TlbBundle { @@ -176,6 +238,8 @@ class TlbPtwIO extends TlbBundle { class TlbIO(Width: Int) 
extends TlbBundle { val requestor = Vec(Width, Flipped(new TlbRequestIO)) val ptw = new TlbPtwIO + val sfence = Input(new SfenceBundle) + val csr = Input(new TlbCsrBundle) override def cloneType: this.type = (new TlbIO(Width)).asInstanceOf[this.type] } @@ -188,16 +252,14 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ val resp = io.requestor.map(_.resp) val ptw = io.ptw - val sfence = WireInit(0.U.asTypeOf(new SfenceBundle)) - val csr = WireInit(0.U.asTypeOf(new TlbCsrBundle)) + val sfence = io.sfence + val csr = io.csr val satp = csr.satp val priv = csr.priv val ifecth = if (isDtlb) false.B else true.B val mode = if (isDtlb) priv.dmode else priv.imode // val vmEnable = satp.mode === 8.U // && (mode < ModeM) // FIXME: fix me when boot xv6/linux... val vmEnable = satp.mode === 8.U && (mode < ModeM) - BoringUtils.addSink(sfence, "SfenceBundle") - BoringUtils.addSink(csr, "TLBCSRIO") val reqAddr = req.map(_.bits.vaddr.asTypeOf(vaBundle)) val cmd = req.map(_.bits.cmd) @@ -234,6 +296,7 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ 2.U -> Cat(hitppn(i), reqAddr(i).off) )) + req(i).ready := resp(i).ready resp(i).valid := valid(i) resp(i).bits.paddr := Mux(vmEnable, paddr, SignExt(req(i).bits.vaddr, PAddrBits)) resp(i).bits.miss := miss(i) @@ -282,7 +345,7 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ } // reset pf when pf hit - val pfHitReset = ParallelOR(widthMap{i => Mux(valid(i), VecInit(pfHitVec(i)).asUInt, 0.U) }) + val pfHitReset = ParallelOR(widthMap{i => Mux(resp(i).fire(), VecInit(pfHitVec(i)).asUInt, 0.U) }) val pfHitRefill = ParallelOR(pfHitReset.asBools) // refill @@ -359,8 +422,8 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ // Log for(i <- 0 until Width) { - XSDebug(req(i).valid, p"req(${i.U}): ${req(i).bits}\n") - XSDebug(resp(i).valid, p"resp(${i.U}): ${resp(i).bits}\n") + XSDebug(req(i).valid, p"req(${i.U}): (${req(i).valid} 
${req(i).ready}) ${req(i).bits}\n") + XSDebug(resp(i).valid, p"resp(${i.U}): (${resp(i).valid} ${resp(i).ready}) ${resp(i).bits}\n") } XSDebug(sfence.valid, p"Sfence: ${sfence}\n") @@ -392,39 +455,47 @@ class TLB(Width: Int, isDtlb: Boolean) extends TlbModule with HasCSRConst{ // assert(req(i).bits.vaddr===resp(i).bits.paddr, "vaddr:0x%x paddr:0x%x hitVec:%x ", req(i).bits.vaddr, resp(i).bits.paddr, VecInit(hitVec(i)).asUInt) // } // FIXME: remove me when tlb may be ok // } - + // assert((v&pf)===0.U, "v and pf can't be true at same time: v:0x%x pf:0x%x", v, pf) } object TLB { - def apply(in: Seq[BlockTlbRequestIO], width: Int, isDtlb: Boolean, shouldBlock: Boolean) = { + def apply + ( + in: Seq[BlockTlbRequestIO], + sfence: SfenceBundle, + csr: TlbCsrBundle, + width: Int, + isDtlb: Boolean, + shouldBlock: Boolean + ) = { require(in.length == width) - + val tlb = Module(new TLB(width, isDtlb)) - + + tlb.io.sfence <> sfence + tlb.io.csr <> csr + if (!shouldBlock) { // dtlb for (i <- 0 until width) { - tlb.io.requestor(i).req.valid := in(i).req.valid - tlb.io.requestor(i).req.bits := in(i).req.bits - in(i).req.ready := DontCare - - in(i).resp.valid := tlb.io.requestor(i).resp.valid - in(i).resp.bits := tlb.io.requestor(i).resp.bits + tlb.io.requestor(i) <> in(i) + // tlb.io.requestor(i).req.valid := in(i).req.valid + // tlb.io.requestor(i).req.bits := in(i).req.bits + // in(i).req.ready := tlb.io.requestor(i).req.ready + + // in(i).resp.valid := tlb.io.requestor(i).resp.valid + // in(i).resp.bits := tlb.io.requestor(i).resp.bits + // tlb.io.requestor(i).resp.ready := in(i).resp.ready } } else { // itlb require(width == 1) tlb.io.requestor(0).req.valid := in(0).req.valid tlb.io.requestor(0).req.bits := in(0).req.bits - in(0).req.ready := !tlb.io.requestor(0).resp.bits.miss && in(0).resp.ready - - // val pf = LookupTree(tlb.io.requestor(0).req.bits.cmd, List( - // TlbCmd.read -> tlb.io.requestor(0).resp.bits.excp.pf.ld, - // TlbCmd.write -> 
tlb.io.requestor(0).resp.bits.excp.pf.st, - // TlbCmd.exec -> tlb.io.requestor(0).resp.bits.excp.pf.instr - // )) + in(0).req.ready := !tlb.io.requestor(0).resp.bits.miss && in(0).resp.ready && tlb.io.requestor(0).req.ready in(0).resp.valid := tlb.io.requestor(0).resp.valid && !tlb.io.requestor(0).resp.bits.miss in(0).resp.bits := tlb.io.requestor(0).resp.bits + tlb.io.requestor(0).resp.ready := in(0).resp.ready } tlb.io.ptw diff --git a/src/main/scala/xiangshan/cache/icache.scala b/src/main/scala/xiangshan/cache/icache.scala index bbd62f4305a0b97a97b86b91de69e79fba6e5fa8..4724608072e4afb2ce7dc2cf9dbc048e090c3412 100644 --- a/src/main/scala/xiangshan/cache/icache.scala +++ b/src/main/scala/xiangshan/cache/icache.scala @@ -7,13 +7,6 @@ import xiangshan._ import xiangshan.frontend._ import utils._ import chisel3.ExcitingUtils._ -import chisel3.util.experimental.BoringUtils -import chipsalliance.rocketchip.config.Parameters - -import freechips.rocketchip.tilelink.{TLBundleA,TLBundleD,TLBundleE,TLEdgeOut} -import freechips.rocketchip.diplomacy.{AddressSet,IdRange,LazyModule, LazyModuleImp, TransferSizes} -import freechips.rocketchip.tilelink.{TLClientNode, TLClientParameters, TLMasterParameters, TLMasterPortParameters, TLArbiter} -import bus.tilelink.{TLParameters, TLPermissions, ClientMetadata} case class ICacheParameters( nSets: Int = 64, @@ -24,6 +17,7 @@ case class ICacheParameters( dataECC: Option[String] = None, nSDQ: Int = 17, nRPQ: Int = 16, + nMissEntries: Int = 1, nMMIOs: Int = 1, blockBytes: Int = 64 )extends L1CacheParameters { @@ -40,8 +34,16 @@ trait HasICacheParameters extends HasL1CacheParameters { def cacheID = 0 // RVC instruction length def RVCInsLen = 16 - val groupAlign = log2Up(FetchWidth * 4 * 2) + + // icache Queue + val groupAlign = log2Up(cacheParams.blockBytes) def groupPC(pc: UInt): UInt = Cat(pc(PAddrBits-1, groupAlign), 0.U(groupAlign.W)) + + //ECC encoding + def encRowBits = cacheParams.dataCode.width(rowBits) + def encTagBits = 
cacheParams.tagCode.width(tagBits) + + // ICache MSHR settings require(isPow2(nSets), s"nSets($nSets) must be pow2") require(isPow2(nWays), s"nWays($nWays) must be pow2") @@ -54,21 +56,25 @@ trait HasICacheParameters extends HasL1CacheParameters { abstract class ICacheBundle extends XSBundle with HasICacheParameters -abstract class ICacheModule(outer: ICache) extends LazyModuleImp(outer) - with HasICacheParameters - with HasXSLog +abstract class ICacheModule extends XSModule + with HasICacheParameters with ICacheBase +abstract class ICacheArray extends XSModule + with HasICacheParameters -sealed class ICacheMetaBundle extends ICacheBundle -{ - val tag = UInt(tagBits.W) -} +abstract class ICachArray extends XSModule + with HasICacheParameters -sealed class ICacheDataBundle extends ICacheBundle -{ - val data = UInt(wordBits.W) -} +// sealed class ICacheMetaBundle extends ICacheBundle +// { +// val tag = UInt(tagBits.W) +// } + +// sealed class ICacheDataBundle extends ICacheBundle +// { +// val data = UInt(encRowBits.W) +// } class ICacheReq extends ICacheBundle { @@ -85,12 +91,16 @@ class ICacheResp extends ICacheBundle } -class ICacheIO(edge: TLEdgeOut) extends ICacheBundle +class ICacheIO extends ICacheBundle { val req = Flipped(DecoupledIO(new ICacheReq)) val resp = DecoupledIO(new ICacheResp) + val mem_acquire = DecoupledIO(new L1plusCacheReq) + val mem_grant = Flipped(DecoupledIO(new L1plusCacheResp)) val tlb = new BlockTlbRequestIO val flush = Input(UInt(2.W)) + val l1plusflush = Output(Bool()) + val fencei = Input(Bool()) } /* ------------------------------------------------------------ @@ -126,29 +136,105 @@ trait ICacheBase extends HasICacheParameters } -/* ------------------------------------------------------------ - * This module is the Top tilelink module of Icache - * ------------------------------------------------------------ - */ -class ICache()(implicit p: Parameters) extends LazyModule - with HasICacheParameters +class ICacheMetaWriteBundle 
extends ICacheBundle { - val clientParameters = TLMasterPortParameters.v1( - Seq(TLMasterParameters.v1( - name = "icache")) - ) - val clientNode = TLClientNode(Seq(clientParameters)) - lazy val module = new ICacheImp(this) - + val virIdx = UInt(idxBits.W) + val phyTag = UInt(tagBits.W) + val waymask = UInt(nWays.W) + + def apply(tag:UInt, idx:UInt, waymask:UInt){ + this.virIdx := idx + this.phyTag := tag + this.waymask := waymask + } + } +class ICacheDataWriteBundle extends ICacheBundle +{ + val virIdx = UInt(idxBits.W) + val data = UInt(blockBits.W) + val waymask = UInt(nWays.W) + + def apply(data:UInt, idx:UInt, waymask:UInt){ + this.virIdx := idx + this.data := data + this.waymask := waymask + } + +} + +class ICacheMetaArray extends ICachArray +{ + val io=IO{new Bundle{ + val write = Flipped(DecoupledIO(new ICacheMetaWriteBundle)) + val read = Flipped(DecoupledIO(UInt(idxBits.W))) + val readResp = Output(Vec(nWays,UInt(tagBits.W))) + }} + + val metaArray = Module(new SRAMTemplate(UInt(encTagBits.W), set=nSets, way=nWays, shouldReset = true)) + + //read + metaArray.io.r.req.valid := io.read.valid + io.read.ready := metaArray.io.r.req.ready + io.write.ready := DontCare + metaArray.io.r.req.bits.apply(setIdx=io.read.bits) + + val rtag = metaArray.io.r.resp.asTypeOf(Vec(nWays,UInt(encTagBits.W))) + val tag_encoded = VecInit(rtag.map(wtag => cacheParams.tagCode.decode(wtag).corrected)) + io.readResp :=tag_encoded.asTypeOf(Vec(nWays,UInt(tagBits.W))) + //write + val write = io.write.bits + val wdata_encoded = cacheParams.tagCode.encode(write.phyTag.asUInt) + metaArray.io.w.req.valid := io.write.valid + metaArray.io.w.req.bits.apply(data=wdata_encoded, setIdx=write.virIdx, waymask=write.waymask) + + +} + +class ICacheDataArray extends ICachArray +{ + val io=IO{new Bundle{ + val write = Flipped(DecoupledIO(new ICacheDataWriteBundle)) + val read = Flipped(DecoupledIO(UInt(idxBits.W))) + val readResp = Output(Vec(blockWords,Vec(nWays,UInt(encRowBits.W)))) + }} + + val 
dataArray = List.fill(blockWords){ Module(new SRAMTemplate(UInt(encRowBits.W), set=nSets, way = nWays))} + + //read + //do ECC decoding after way choose + for(b <- 0 until blockWords){ + dataArray(b).io.r.req.valid := io.read.valid + dataArray(b).io.r.req.bits.apply(setIdx=io.read.bits) + } + val dataArrayReadyVec = dataArray.map(b => b.io.r.req.ready) + + io.read.ready := ParallelOR(dataArrayReadyVec) + io.write.ready := DontCare + io.readResp := VecInit(dataArray.map(b => b.io.r.resp.asTypeOf(Vec(nWays,UInt(encRowBits.W))))) + + //write + val write = io.write.bits + val write_data = write.data.asTypeOf(Vec(blockWords,UInt(rowBits.W))) + val write_data_encoded = write_data.map(wdata => cacheParams.tagCode.encode(wdata)) + + for(b <- 0 until blockWords){ + dataArray(b).io.w.req.valid := io.write.valid + dataArray(b).io.w.req.bits.apply( setIdx=write.virIdx, + data=write_data_encoded(b), + waymask=write.waymask) + + } + +} /* ------------------------------------------------------------ * This module is a SRAM with 4-way associated mapping * The hardware implementation of ICache * ------------------------------------------------------------ */ -class ICacheImp(outer: ICache) extends ICacheModule(outer) +class ICache extends ICacheModule { // cut a cacheline into a fetch packet def cutHelper(sourceVec: Vec[UInt], startPtr: UInt, mask: UInt): UInt = { @@ -158,9 +244,10 @@ class ICacheImp(outer: ICache) extends ICacheModule(outer) sourceVec_16bit(i*4 + j) := sourceVec(i)(j*16+15, j*16) } } - val cutPacket = WireInit(VecInit(Seq.fill(blockWords * 2){0.U(RVCInsLen.W)})) - (0 until blockWords * 2).foreach{ i => - cutPacket(i) := Mux(mask(i).asBool,sourceVec_16bit(startPtr + i.U),0.U) + val cutPacket = WireInit(VecInit(Seq.fill(PredictWidth){0.U(RVCInsLen.W)})) + val start = Cat(startPtr(4,3),0.U(3.W)) + (0 until PredictWidth ).foreach{ i => + cutPacket(i) := Mux(mask(i).asBool,sourceVec_16bit(start + i.U),0.U) } cutPacket.asUInt } @@ -174,16 +261,13 @@ class 
ICacheImp(outer: ICache) extends ICacheModule(outer) } - val (bus, edge) = outer.clientNode.out.head - require(bus.d.bits.data.getWidth == l1BusDataWidth, "ICache: tilelink width does not match") - val io = IO(new ICacheIO(edge)) - val (_, _, refill_done, refill_cnt) = edge.count(bus.d) + val io = IO(new ICacheIO) //---------------------------- // Memory Part //---------------------------- - val metaArray = Module(new SRAMTemplate(new ICacheMetaBundle, set=nSets, way=nWays, shouldReset = true)) - val dataArray = List.fill(blockWords){ Module(new SRAMTemplate(new ICacheDataBundle, set=nSets, way = nWays))} + val metaArray = Module(new ICacheMetaArray) + val dataArray = Module(new ICacheDataArray) // 256-bit valid val validArray = RegInit(0.U((nSets * nWays).W)) @@ -198,12 +282,11 @@ class ICacheImp(outer: ICache) extends ICacheModule(outer) // SRAM(Meta and Data) read request val s1_idx = get_idx(s1_req_pc) - metaArray.io.r.req.valid := s1_valid - metaArray.io.r.req.bits.apply(setIdx=s1_idx) - for(b <- 0 until blockWords){ - dataArray(b).io.r.req.valid := s1_valid - dataArray(b).io.r.req.bits.apply(setIdx=s1_idx) - } + metaArray.io.read.valid := s1_valid + metaArray.io.read.bits :=s1_idx + dataArray.io.read.valid := s1_valid + dataArray.io.read.bits :=s1_idx + XSDebug("[Stage 1] v : r : f (%d %d %d) request pc: 0x%x mask: %b\n",s1_valid,s2_ready,s1_fire,s1_req_pc,s1_req_mask) XSDebug("[Stage 1] index: %d\n",s1_idx) @@ -221,13 +304,13 @@ class ICacheImp(outer: ICache) extends ICacheModule(outer) .elsewhen(s2_fire) { s2_valid := false.B} // SRAM(Meta and Data) read reseponse - val metas = metaArray.io.r.resp.asTypeOf(Vec(nWays,new ICacheMetaBundle)) - val datas =dataArray.map(b => RegEnable(next=b.io.r.resp.asTypeOf(Vec(nWays,new ICacheDataBundle)), enable=s2_fire)) + val metas = metaArray.io.readResp + val datas =RegEnable(next=dataArray.io.readResp, enable=s2_fire) val validMeta = Cat((0 until nWays).map{w => validArray(Cat(s2_idx, w.U(2.W)))}.reverse).asUInt // hit 
check and generate victim cacheline mask - val hitVec = VecInit((0 until nWays).map{w => metas(w).tag === s2_tag && validMeta(w) === 1.U}) + val hitVec = VecInit((0 until nWays).map{w => metas(w)=== s2_tag && validMeta(w) === 1.U}) val victimWayMask = (1.U << LFSR64()(log2Up(nWays)-1,0)) val invalidVec = ~validMeta val hasInvalidWay = invalidVec.orR @@ -254,134 +337,86 @@ class ICacheImp(outer: ICache) extends ICacheModule(outer) val s3_hit = RegEnable(next=s2_hit,init=false.B,enable=s2_fire) val s3_wayMask = RegEnable(next=waymask,init=0.U,enable=s2_fire) val s3_miss = s3_valid && !s3_hit + val s3_idx = get_idx(s3_req_pc) when(io.flush(1)) { s3_valid := false.B } .elsewhen(s2_fire) { s3_valid := s2_valid } .elsewhen(io.resp.fire()) { s3_valid := false.B } val refillDataReg = Reg(Vec(refillCycles,UInt(beatBits.W))) // icache hit + // data ECC encoding // simply cut the hit cacheline - val dataHitWay = s3_data.map(b => Mux1H(s3_wayMask,b).asUInt) + val dataHitWay = VecInit(s3_data.map(b => Mux1H(s3_wayMask,b).asUInt)) val outPacket = Wire(UInt((FetchWidth * 32).W)) - outPacket := cutHelper(VecInit(dataHitWay),s3_req_pc(5,1).asUInt,s3_req_mask.asUInt) - - //icache miss - val s_idle :: s_memReadReq :: s_memReadResp :: s_wait_resp :: Nil = Enum(4) - val state = RegInit(s_idle) - val readBeatCnt = Counter(refillCycles) - - //pipeline flush register - val needFlush = RegInit(false.B) - when(io.flush(1) && (state =/= s_idle) && (state =/= s_wait_resp)){ needFlush := true.B } - .elsewhen((state=== s_wait_resp) && needFlush){ needFlush := false.B } - + val dataHitWayDecoded = VecInit( + (0 until blockWords).map{r => + val row = dataHitWay.asTypeOf(Vec(blockWords,UInt(encRowBits.W)))(r) + val decodedRow = cacheParams.dataCode.decode(row) + assert(!(s3_valid && s3_hit && decodedRow.uncorrectable)) + decodedRow.corrected + } + ) + outPacket := cutHelper(dataHitWay,s3_req_pc(5,1).asUInt,s3_req_mask.asUInt) + + //ICache MissQueue + val icacheMissQueue = Module(new 
IcacheMissQueue) + val blocking = RegInit(false.B) + val isICacheResp = icacheMissQueue.io.resp.valid && icacheMissQueue.io.resp.bits.clientID === cacheID.U(2.W) + icacheMissQueue.io.req.valid := s3_miss && !io.flush(1) && !blocking//TODO: specificate flush condition + icacheMissQueue.io.req.bits.apply(missAddr=groupPC(s3_tlb_resp.paddr),missIdx=s3_idx,missWaymask=s3_wayMask,source=cacheID.U(2.W)) + icacheMissQueue.io.resp.ready := io.resp.ready + icacheMissQueue.io.flush := io.flush(1) + + when(icacheMissQueue.io.req.fire()){blocking := true.B} + .elsewhen(blocking && ((icacheMissQueue.io.resp.fire() && isICacheResp) || io.flush(1)) ){blocking := false.B} + + XSDebug(blocking && io.flush(1),"check for icache non-blocking") //cache flush register - val icacheFlush = WireInit(false.B) + val icacheFlush = io.fencei val cacheflushed = RegInit(false.B) - BoringUtils.addSink(icacheFlush, "FenceI") XSDebug("[Fence.i] icacheFlush:%d, cacheflushed:%d\n",icacheFlush,cacheflushed) - when(icacheFlush && (state =/= s_idle) && (state =/= s_wait_resp)){ cacheflushed := true.B} - .elsewhen((state=== s_wait_resp) && cacheflushed) {cacheflushed := false.B } - - val waitForRefillDone = needFlush || cacheflushed - - // state change to wait for a cacheline refill - val countFull = readBeatCnt.value === (refillCycles - 1).U - switch(state){ - is(s_idle){ - when(s3_miss && io.flush === 0.U){ - state := s_memReadReq - readBeatCnt.value := 0.U - } - } - - - // memory request - is(s_memReadReq){ - when(bus.a.fire()){ - state := s_memReadResp - } - } - - is(s_memReadResp){ - when (edge.hasData(bus.d.bits) && bus.d.fire()) { - readBeatCnt.inc() - refillDataReg(readBeatCnt.value) := bus.d.bits.data - when(countFull){ - assert(refill_done, "refill not done!") - state := s_wait_resp - } - } - } - - is(s_wait_resp){ - when(io.resp.fire() || needFlush ){state := s_idle} - } - - } + when(icacheFlush && blocking && !isICacheResp){ cacheflushed := true.B} + .elsewhen(isICacheResp && cacheflushed) 
{cacheflushed := false.B } + //TODO: Prefetcher //refill write - val metaWrite = Wire(new ICacheMetaBundle) - val refillFinalOneBeat = (state === s_memReadResp) && bus.d.fire() && refill_done - val wayNum = OHToUInt(s3_wayMask.asTypeOf(Vec(nWays,Bool()))) - val validPtr = Cat(get_idx(s3_req_pc),wayNum) - metaWrite.tag := s3_tag - metaArray.io.w.req.valid := refillFinalOneBeat - metaArray.io.w.req.bits.apply(data=metaWrite, setIdx=get_idx(s3_req_pc), waymask=s3_wayMask) - - if(beatBits == 64){ - for(b <- 0 until blockWords){ - val writeOneBeat = (state === s_memReadResp) && bus.d.fire() && (b.U === readBeatCnt.value) - dataArray(b).io.w.req.valid := writeOneBeat - dataArray(b).io.w.req.bits.apply( setIdx=get_idx(s3_req_pc), - data=bus.d.bits.data.asTypeOf(new ICacheDataBundle), - waymask=s3_wayMask) - - } - } - else{ - val writeFirstHalf = (state === s_memReadResp) && bus.d.fire() && (readBeatCnt.value === 0.U) - (0 until blockWords/2).foreach{ b => - dataArray(b).io.w.req.valid := writeFirstHalf - dataArray(b).io.w.req.bits.apply( setIdx=get_idx(s3_req_pc), - data=bus.d.bits.data(b * 64 +63, b*64).asTypeOf(new ICacheDataBundle), - waymask=s3_wayMask) - - } - val writeLastHalf = (state === s_memReadResp) && bus.d.fire() && (readBeatCnt.value === 1.U) - (blockWords/2 until blockWords).foreach{ b => - val index = b - blockWords/2 - dataArray(b).io.w.req.valid := writeLastHalf - dataArray(b).io.w.req.bits.apply( setIdx=get_idx(s3_req_pc), - data=bus.d.bits.data(index * 64 +63, index*64).asTypeOf(new ICacheDataBundle), - waymask=s3_wayMask) - - } - - } - - when(refillFinalOneBeat && !cacheflushed){ + val metaWriteReq = icacheMissQueue.io.meta_write.bits + icacheMissQueue.io.meta_write.ready := true.B + metaArray.io.write.valid := icacheMissQueue.io.meta_write.valid + metaArray.io.write.bits.apply(tag=metaWriteReq.meta_write_tag, + idx=metaWriteReq.meta_write_idx, + waymask=metaWriteReq.meta_write_waymask) + + val wayNum = 
OHToUInt(metaWriteReq.meta_write_waymask.asTypeOf(Vec(nWays,Bool()))) + val validPtr = Cat(metaWriteReq.meta_write_idx,wayNum) + when(icacheMissQueue.io.meta_write.valid && !cacheflushed){ validArray := validArray.bitSet(validPtr, true.B) } + //data + icacheMissQueue.io.refill.ready := true.B + val refillReq = icacheMissQueue.io.refill.bits + dataArray.io.write.valid := icacheMissQueue.io.refill.valid + dataArray.io.write.bits.apply(data=refillReq.refill_data, + idx=refillReq.refill_idx, + waymask=refillReq.refill_waymask) + //icache flush: only flush valid Array register when(icacheFlush){ validArray := 0.U } - val refillDataVec = refillDataReg.asTypeOf(Vec(blockWords,UInt(wordBits.W))) + val refillDataVec = icacheMissQueue.io.resp.bits.data.asTypeOf(Vec(blockWords,UInt(wordBits.W))) val refillDataOut = cutHelper(refillDataVec, s3_req_pc(5,1),s3_req_mask ) - s3_ready := ((io.resp.fire() || !s3_valid) && !waitForRefillDone) || (waitForRefillDone && state === s_wait_resp) + s3_ready := ((io.resp.fire() || !s3_valid) && !blocking) || (blocking && icacheMissQueue.io.resp.fire()) //TODO: coherence XSDebug("[Stage 3] valid:%d pc: 0x%x mask: %b ipf:%d\n",s3_valid,s3_req_pc,s3_req_mask,s3_tlb_resp.excp.pf.instr) - XSDebug("[Stage 3] hit:%d miss:%d waymask:%x \n",s3_hit,s3_miss,s3_wayMask.asUInt) - XSDebug("[Stage 3] state: %d\n",state) - XSDebug("[Stage 3] needflush:%d, refilldone:%d\n",needFlush,refill_done) + XSDebug("[Stage 3] hit:%d miss:%d waymask:%x blocking:%d\n",s3_hit,s3_miss,s3_wayMask.asUInt,blocking) XSDebug("[Stage 3] tag: %x idx: %d\n",s3_tag,get_idx(s3_req_pc)) XSDebug(p"[Stage 3] tlb resp: ${s3_tlb_resp}\n") - XSDebug("[Chanel A] valid:%d ready:%d\n",bus.a.valid,bus.a.ready) - XSDebug("[Chanel D] valid:%d ready:%d data:%x readBeatcnt:%d \n",bus.d.valid,bus.d.ready,bus.d.bits.data,readBeatCnt.value) + XSDebug("[mem_acquire] valid:%d ready:%d\n",io.mem_acquire.valid,io.mem_acquire.ready) + XSDebug("[mem_grant] valid:%d ready:%d data:%x id:%d 
\n",io.mem_grant.valid,io.mem_grant.ready,io.mem_grant.bits.data,io.mem_grant.bits.id) XSDebug("[Stage 3] ---------Hit Way--------- \n") for(i <- 0 until blockWords){ XSDebug("[Stage 3] %x\n",dataHitWay(i)) @@ -393,11 +428,10 @@ class ICacheImp(outer: ICache) extends ICacheModule(outer) // Out Put //---------------------------- //icache request - val dataArrayReadyVec = dataArray.map(b => b.io.r.req.ready) - io.req.ready := metaArray.io.r.req.ready && ParallelOR(dataArrayReadyVec) && s2_ready + io.req.ready := metaArray.io.read.ready && dataArray.io.read.ready && s2_ready //icache response: to pre-decoder - io.resp.valid := s3_valid && (s3_hit || state === s_wait_resp) + io.resp.valid := s3_valid && (s3_hit || icacheMissQueue.io.resp.valid) io.resp.bits.data := Mux((s3_valid && s3_hit),outPacket,refillDataOut) io.resp.bits.mask := s3_req_mask io.resp.bits.pc := s3_req_pc @@ -410,26 +444,19 @@ class ICacheImp(outer: ICache) extends ICacheModule(outer) io.tlb.req.bits.cmd := TlbCmd.exec io.tlb.req.bits.roqIdx := DontCare io.tlb.req.bits.debug.pc := s2_req_pc - io.tlb.req.bits.debug.lsroqIdx := DontCare - //tilelink - bus.b.ready := true.B - bus.c.valid := false.B - bus.e.valid := false.B - bus.a.valid := (state === s_memReadReq) - val memTileReq = edge.Get( - fromSource = cacheID.U, - toAddress = groupPC(s3_tlb_resp.paddr), - lgSize = (log2Up(cacheParams.blockBytes)).U )._2 - bus.a.bits := memTileReq - bus.d.ready := true.B + //To L1 plus + io.mem_acquire <> icacheMissQueue.io.mem_acquire + icacheMissQueue.io.mem_grant <> io.mem_grant + + io.l1plusflush := icacheFlush XSDebug("[flush] flush_0:%d flush_1:%d\n",io.flush(0),io.flush(1)) //Performance Counter if (!env.FPGAPlatform ) { - ExcitingUtils.addSource( s3_valid && (state === s_idle), "perfCntIcacheReqCnt", Perf) - ExcitingUtils.addSource( s3_valid && (state === s_idle) && s3_miss, "perfCntIcacheMissCnt", Perf) + ExcitingUtils.addSource( s3_valid && !blocking, "perfCntIcacheReqCnt", Perf) + 
ExcitingUtils.addSource( s3_valid && !blocking && s3_miss, "perfCntIcacheMissCnt", Perf) } } diff --git a/src/main/scala/xiangshan/cache/icacheMissQueue.scala b/src/main/scala/xiangshan/cache/icacheMissQueue.scala new file mode 100644 index 0000000000000000000000000000000000000000..87275a565c76517aecdf5985ddac90e0bb41ce87 --- /dev/null +++ b/src/main/scala/xiangshan/cache/icacheMissQueue.scala @@ -0,0 +1,239 @@ +package xiangshan.cache + +import chisel3._ +import chisel3.util._ +import device._ +import xiangshan._ +import utils._ +import chisel3.ExcitingUtils._ + +abstract class ICacheMissQueueModule extends XSModule + with HasICacheParameters + with HasXSLog + +abstract class ICacheMissQueueBundle extends XSBundle + with HasICacheParameters + +class ICacheRefill extends ICacheMissQueueBundle +{ + val refill_idx = UInt(idxBits.W) + val refill_data = UInt(blockBits.W) + val refill_waymask = UInt(nWays.W) + + def apply(data:UInt, setIdx:UInt, waymask:UInt) = { + this.refill_idx := setIdx + this.refill_data := data + this.refill_waymask := waymask + } +} + +class ICacheMetaWrite extends ICacheMissQueueBundle +{ + val meta_write_idx = UInt(idxBits.W) + val meta_write_tag = UInt(tagBits.W) + val meta_write_waymask = UInt(nWays.W) + + def apply(tag:UInt, setIdx:UInt, waymask:UInt) = { + this.meta_write_idx := setIdx + this.meta_write_tag := tag + this.meta_write_waymask := waymask + } +} + +class IcacheMissReq extends ICacheBundle +{ + val addr = UInt(PAddrBits.W) + val setIdx = UInt(idxBits.W) + val waymask = UInt(PredictWidth.W) + val clientID = UInt(2.W) + def apply(missAddr:UInt, missIdx:UInt, missWaymask:UInt, source:UInt) = { + this.addr := missAddr + this.setIdx := missIdx + this.waymask := missWaymask + this.clientID := source + } +} + +class IcacheMissResp extends ICacheBundle +{ + val data = UInt(blockBits.W) + val clientID = UInt(2.W) +} + +class IcacheMissEntry extends ICacheMissQueueModule +{ + val io = IO(new Bundle{ + // MSHR ID + val id = 
Input(UInt(log2Up(cacheParams.nMissEntries).W)) + + val req = Flipped(DecoupledIO(new IcacheMissReq)) + val resp = DecoupledIO(new IcacheMissResp) + + val mem_acquire = DecoupledIO(new L1plusCacheReq) + val mem_grant = Flipped(DecoupledIO(new L1plusCacheResp)) + + val meta_write = DecoupledIO(new ICacheMetaWrite) + val refill = DecoupledIO(new ICacheRefill) + + val flush = Input(Bool()) + }) + + val s_idle :: s_memReadReq :: s_memReadResp :: s_write_back :: s_wait_resp :: Nil = Enum(5) + val state = RegInit(s_idle) + + //req register + val req = Reg(new IcacheMissReq) + val req_idx = req.setIdx //virtual index + val req_tag = get_tag(req.addr) //physical tag + val req_waymask = req.waymask + + //8 for 64 bits bus and 2 for 256 bits + val readBeatCnt = Counter(refillCycles) + //val respDataReg = Reg(Vec(refillCycles,UInt(beatBits.W))) + val respDataReg = Reg(UInt(blockBits.W)) + + //initial + io.resp.bits := DontCare + io.mem_acquire.bits := DontCare + io.mem_grant.ready := true.B + io.meta_write.bits := DontCare + io.refill.bits := DontCare + + io.req.ready := state === s_idle + io.mem_acquire.valid := state === s_memReadReq + + //flush register + val needFlush = RegInit(false.B) + when(io.flush && (state =/= s_idle) && (state =/= s_wait_resp)){ needFlush := true.B } + .elsewhen((state=== s_wait_resp) && needFlush){ needFlush := false.B } + + //state change + switch(state){ + is(s_idle){ + when(io.req.fire()){ + state := s_memReadReq + req := io.req.bits + } + } + + // memory request + is(s_memReadReq){ + when(io.mem_acquire.fire()){ + state := s_memReadResp + } + } + + is(s_memReadResp){ + when (io.mem_grant.bits.id === io.id && io.mem_grant.fire()) { + respDataReg := io.mem_grant.bits.data + state := Mux(needFlush || io.flush,s_wait_resp,s_write_back) + } + } + + //TODO: Maybe this sate is noe necessary so we don't need respDataReg + is(s_write_back){ + when(io.refill.fire() && io.meta_write.fire()){ + state := s_wait_resp + } + } + + is(s_wait_resp){ + 
io.resp.bits.data := respDataReg.asUInt + io.resp.bits.clientID := req.clientID + when(io.resp.fire() || needFlush ){ state := s_idle } + } + + } + + //refill write and meta write + //WARNING: Maybe could not finish refill in 1 cycle + io.meta_write.valid := (state === s_write_back) && !needFlush + io.meta_write.bits.apply(tag=req_tag, setIdx=req_idx, waymask=req_waymask) + + io.refill.valid := (state === s_write_back) && !needFlush + io.refill.bits.apply(data=respDataReg.asUInt, + setIdx=req_idx, + waymask=req_waymask) + + //mem request + io.mem_acquire.bits.cmd := MemoryOpConstants.M_XRD + io.mem_acquire.bits.addr := req.addr + io.mem_acquire.bits.id := io.id + + //resp to icache + io.resp.valid := (state === s_wait_resp) && !needFlush + + XSDebug("[ICache MSHR %d] (req)valid:%d ready:%d req.addr:%x waymask:%b || Register: req:%x \n",io.id.asUInt,io.req.valid,io.req.ready,io.req.bits.addr,io.req.bits.waymask,req.asUInt) + XSDebug("[ICache MSHR %d] (Info)state:%d needFlush:%d\n",io.id.asUInt,state,needFlush) + XSDebug("[ICache MSHR %d] (mem_acquire) valid%d ready:%d\n",io.id.asUInt,io.mem_acquire.valid,io.mem_acquire.ready) + XSDebug("[ICache MSHR %d] (mem_grant) valid%d ready:%d data:%x \n",io.id.asUInt,io.mem_grant.valid,io.mem_grant.ready,io.mem_grant.bits.data) + XSDebug("[ICache MSHR %d] (meta_write) valid%d ready:%d tag:%x \n",io.id.asUInt,io.meta_write.valid,io.meta_write.ready,io.meta_write.bits.meta_write_tag) + XSDebug("[ICache MSHR %d] (refill) valid%d ready:%d data:%x \n",io.id.asUInt,io.refill.valid,io.refill.ready,io.refill.bits.refill_data) + XSDebug("[ICache MSHR %d] (resp) valid%d ready:%d \n",io.id.asUInt,io.resp.valid,io.resp.ready) + + +} + +class IcacheMissQueue extends ICacheMissQueueModule +{ + val io = IO(new Bundle{ + val req = Flipped(DecoupledIO(new IcacheMissReq)) + val resp = DecoupledIO(new IcacheMissResp) + + val mem_acquire = DecoupledIO(new L1plusCacheReq) + val mem_grant = Flipped(DecoupledIO(new L1plusCacheResp)) + + val 
meta_write = DecoupledIO(new ICacheMetaWrite) + val refill = DecoupledIO(new ICacheRefill) + + val flush = Input(Bool()) + + }) + + val resp_arb = Module(new Arbiter(new IcacheMissResp, cacheParams.nMissEntries)) + val meta_write_arb = Module(new Arbiter(new ICacheMetaWrite, cacheParams.nMissEntries)) + val refill_arb = Module(new Arbiter(new ICacheRefill, cacheParams.nMissEntries)) + val mem_acquire_arb= Module(new Arbiter(new L1plusCacheReq, cacheParams.nMissEntries)) + + //initial + io.mem_grant.ready := true.B + + val entry_alloc_idx = Wire(UInt()) + val req_ready = WireInit(false.B) + + val entries = (0 until cacheParams.nMissEntries) map { i => + val entry = Module(new IcacheMissEntry) + + entry.io.id := i.U(log2Up(cacheParams.nMissEntries).W) + entry.io.flush := io.flush + + // entry req + entry.io.req.valid := (i.U === entry_alloc_idx) && io.req.valid + entry.io.req.bits := io.req.bits + when (i.U === entry_alloc_idx) { + req_ready := entry.io.req.ready + } + + // entry resp + resp_arb.io.in(i) <> entry.io.resp + meta_write_arb.io.in(i) <> entry.io.meta_write + refill_arb.io.in(i) <> entry.io.refill + mem_acquire_arb.io.in(i) <> entry.io.mem_acquire + + entry.io.mem_grant.valid := false.B + entry.io.mem_grant.bits := DontCare + when (io.mem_grant.bits.id === i.U) { + entry.io.mem_grant <> io.mem_grant + } + entry + } + + entry_alloc_idx := PriorityEncoder(entries.map(m=>m.io.req.ready)) + + io.req.ready := req_ready + io.resp <> resp_arb.io.out + io.meta_write <> meta_write_arb.io.out + io.refill <> refill_arb.io.out + io.mem_acquire <> mem_acquire_arb.io.out + +} diff --git a/src/main/scala/xiangshan/cache/ldu.scala b/src/main/scala/xiangshan/cache/ldu.scala index a86cfb7c5d6a3e4a4d08c7a14587d9d6b9713c21..341b067764aa276eb440defb3dea534f82b0cbcb 100644 --- a/src/main/scala/xiangshan/cache/ldu.scala +++ b/src/main/scala/xiangshan/cache/ldu.scala @@ -8,7 +8,7 @@ import utils.XSDebug class LoadPipe extends DCacheModule { val io = IO(new DCacheBundle{ - val 
lsu = Flipped(new DCacheWordIO) + val lsu = Flipped(new DCacheLoadIO) val data_read = DecoupledIO(new L1DataReadReq) val data_resp = Input(Vec(nWays, Vec(blockRows, Bits(encRowBits.W)))) val meta_read = DecoupledIO(new L1MetaReadReq) @@ -56,7 +56,8 @@ class LoadPipe extends DCacheModule // stage 1 val s1_req = RegNext(s0_req) val s1_valid = RegNext(s0_valid, init = false.B) - val s1_addr = s1_req.addr + // in stage 1, load unit gets the physical address + val s1_addr = io.lsu.s1_paddr val s1_nack = RegNext(io.nack) dump_pipeline_reqs("LoadPipe s1", s1_valid, s1_req) @@ -68,7 +69,7 @@ class LoadPipe extends DCacheModule val s1_tag_match_way = wayMap((w: Int) => s1_tag_eq_way(w) && meta_resp(w).coh.isValid()).asUInt assert(!(s1_valid && s1_req.meta.replay && io.lsu.s1_kill), - "lsroq tried to kill an replayed request!") + "lsq tried to kill an replayed request!") // stage 2 val s2_req = RegNext(s1_req) @@ -76,6 +77,7 @@ class LoadPipe extends DCacheModule dump_pipeline_reqs("LoadPipe s2", s2_valid, s2_req) + val s2_addr = RegNext(s1_addr) val s2_tag_match_way = RegNext(s1_tag_match_way) val s2_tag_match = s2_tag_match_way.orR val s2_hit_state = Mux1H(s2_tag_match_way, wayMap((w: Int) => RegNext(meta_resp(w).coh))) @@ -96,12 +98,12 @@ class LoadPipe extends DCacheModule val s2_data = Wire(Vec(nWays, UInt(encRowBits.W))) val data_resp = io.data_resp for (w <- 0 until nWays) { - s2_data(w) := data_resp(w)(get_row(s2_req.addr)) + s2_data(w) := data_resp(w)(get_row(s2_addr)) } val s2_data_muxed = Mux1H(s2_tag_match_way, s2_data) // the index of word in a row, in case rowBits != wordBits - val s2_word_idx = if (rowWords == 1) 0.U else s2_req.addr(log2Up(rowWords*wordBytes)-1, log2Up(wordBytes)) + val s2_word_idx = if (rowWords == 1) 0.U else s2_addr(log2Up(rowWords*wordBytes)-1, log2Up(wordBytes)) val s2_nack_hit = RegNext(s1_nack) // Can't allocate MSHR for same set currently being written back diff --git a/src/main/scala/xiangshan/cache/ptw.scala 
b/src/main/scala/xiangshan/cache/ptw.scala index 8a5bb3351dd7f0cec5cb73b66177a5566d94a5af..cb5ef389d36cade6c225a3981757bfbf4cb76e27 100644 --- a/src/main/scala/xiangshan/cache/ptw.scala +++ b/src/main/scala/xiangshan/cache/ptw.scala @@ -5,15 +5,29 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ -import chisel3.util.experimental.BoringUtils -import xiangshan.backend.decode.XSTrap -import xiangshan.mem._ import chisel3.ExcitingUtils._ import freechips.rocketchip.diplomacy.{LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink.{TLClientNode, TLMasterParameters, TLMasterPortParameters} trait HasPtwConst extends HasTlbConst with MemoryOpConstants{ val PtwWidth = 2 + val MemBandWidth = 256 // TODO: change to IO bandwidth param + val TlbL2LineSize = MemBandWidth/XLEN + val TlbL2LineNum = TlbL2EntrySize/TlbL2LineSize + val PtwL2LineSize = MemBandWidth/XLEN + val PtwL2LineNum = PtwL2EntrySize/PtwL2LineSize + val PtwL1TagLen = PAddrBits - log2Up(XLEN/8) + val PtwL2TagLen = PAddrBits - log2Up(XLEN/8) - log2Up(PtwL2EntrySize) + val TlbL2TagLen = vpnLen - log2Up(TlbL2EntrySize) + + def genPtwL2Idx(addr: UInt) = { + /* tagLen :: outSizeIdxLen :: insideIdxLen*/ + addr(log2Up(PtwL2EntrySize)-1+log2Up(XLEN/8), log2Up(PtwL2LineSize)+log2Up(XLEN/8)) + } + + def genTlbL2Idx(vpn: UInt) = { + vpn(log2Up(TlbL2LineNum)-1+log2Up(TlbL2LineSize), 0+log2Up(TlbL2LineSize)) + } def MakeAddr(ppn: UInt, off: UInt) = { require(off.getWidth == 9) @@ -44,12 +58,19 @@ class PteBundle extends PtwBundle{ val v = Bool() } - def isPf() = { - !perm.v || (!perm.r && perm.w) + def unaligned(level: UInt) = { + assert(level=/=3.U) + isLeaf() && !(level === 2.U || + level === 1.U && ppn(vpnnLen-1, 0) === 0.U || + level === 0.U && ppn(vpnnLen*2-1, 0) === 0.U) + } + + def isPf(level: UInt) = { + !perm.v || (!perm.r && perm.w) || unaligned(level) } def isLeaf() = { - !isPf() && (perm.r || perm.x) + perm.r || perm.x || perm.w } override def toPrintable: Printable = { @@ -60,9 
+81,7 @@ class PteBundle extends PtwBundle{ class PtwEntry(tagLen: Int) extends PtwBundle { val tag = UInt(tagLen.W) val ppn = UInt(ppnLen.W) - val perm = new PermBundle - // TODO: add superpage def hit(addr: UInt) = { require(addr.getWidth >= PAddrBits) tag === addr(PAddrBits-1, PAddrBits-tagLen) @@ -71,21 +90,69 @@ class PtwEntry(tagLen: Int) extends PtwBundle { def refill(addr: UInt, pte: UInt) { tag := addr(PAddrBits-1, PAddrBits-tagLen) ppn := pte.asTypeOf(pteBundle).ppn - perm := pte.asTypeOf(pteBundle).perm } def genPtwEntry(addr: UInt, pte: UInt) = { val e = Wire(new PtwEntry(tagLen)) e.tag := addr(PAddrBits-1, PAddrBits-tagLen) e.ppn := pte.asTypeOf(pteBundle).ppn - e.perm := pte.asTypeOf(pteBundle).perm e } override def cloneType: this.type = (new PtwEntry(tagLen)).asInstanceOf[this.type] override def toPrintable: Printable = { - p"tag:0x${Hexadecimal(tag)} ppn:0x${Hexadecimal(ppn)} perm:${perm}" + // p"tag:0x${Hexadecimal(tag)} ppn:0x${Hexadecimal(ppn)} perm:${perm}" + p"tag:0x${Hexadecimal(tag)} ppn:0x${Hexadecimal(ppn)}" + } +} + +class PtwEntries(num: Int, tagLen: Int) extends PtwBundle { + require(log2Up(num)==log2Down(num)) + + val tag = UInt(tagLen.W) + val ppns = Vec(num, UInt(ppnLen.W)) + val vs = Vec(num, Bool()) + + def tagClip(addr: UInt) = { + require(addr.getWidth==PAddrBits) + + addr(PAddrBits-1, PAddrBits-tagLen) + } + + def hit(idx: UInt, addr: UInt) = { + require(idx.getWidth == log2Up(num), s"PtwEntries.hit: error idx width idxWidth:${idx.getWidth} num:${num}") + + (tag === tagClip(addr)) && vs(idx) + } + + def genEntries(addr: UInt, data: UInt, level: UInt): PtwEntries = { + require((data.getWidth / XLEN) == num, + "input data length must be multiple of pte length") + + val ps = Wire(new PtwEntries(num, tagLen)) + ps.tag := tagClip(addr) + for (i <- 0 until num) { + val pte = data((i+1)*XLEN-1, i*XLEN).asTypeOf(new PteBundle) + ps.ppns(i) := pte.ppn + ps.vs(i) := !pte.isPf(level) && !pte.isLeaf() + } + + ps + } + + def get(idx: UInt) = 
{ + require(idx.getWidth == log2Up(num), s"PtwEntries.get: error idx width idxWidth:${idx.getWidth} num:${num}") + + (vs(idx), ppns(idx)) + } + + override def cloneType: this.type = (new PtwEntries(num, tagLen)).asInstanceOf[this.type] + override def toPrintable: Printable = { + require(num == 4, "if num is not 4, please comment this toPrintable") + // NOTE: if num is not 4, please comment this toPrintable + p"tag:${Hexadecimal(tag)} ppn(0):${Hexadecimal(ppns(0))} ppn(1):${Hexadecimal(ppns(1))}" + + p"ppn(2):${Hexadecimal(ppns(2))} ppn(3):${Hexadecimal(ppns(3))} vs:${Binary(vs.asUInt)}" } } @@ -108,6 +175,8 @@ class PtwResp extends PtwBundle { class PtwIO extends PtwBundle { val tlb = Vec(PtwWidth, Flipped(new TlbPtwIO)) + val sfence = Input(new SfenceBundle) + val csr = Input(new TlbCsrBundle) } object ValidHold { @@ -154,40 +223,33 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ val req = RegEnable(arb.io.out.bits, arb.io.out.fire()) val resp = VecInit(io.tlb.map(_.resp)) - val valid = ValidHold(arb.io.out.fire(), resp(arbChosen).fire()) val validOneCycle = OneCycleValid(arb.io.out.fire()) arb.io.out.ready := !valid// || resp(arbChosen).fire() - val sfence = WireInit(0.U.asTypeOf(new SfenceBundle)) - val csr = WireInit(0.U.asTypeOf(new TlbCsrBundle)) + val sfence = io.sfence + val csr = io.csr val satp = csr.satp val priv = csr.priv - BoringUtils.addSink(sfence, "SfenceBundle") - BoringUtils.addSink(csr, "TLBCSRIO") // two level: l2-tlb-cache && pde/pte-cache // l2-tlb-cache is ram-larger-edition tlb // pde/pte-cache is cache of page-table, speeding up ptw - - // may seperate valid bits to speed up sfence's flush - // Reg/Mem/SyncReadMem is not sure now - val tagLen1 = PAddrBits - log2Up(XLEN/8) - val tagLen2 = PAddrBits - log2Up(XLEN/8) - log2Up(PtwL2EntrySize) - val tlbl2 = SyncReadMem(TlbL2EntrySize, new TlbEntry) - val tlbv = RegInit(0.U(TlbL2EntrySize.W)) // valid - val tlbg = RegInit(0.U(TlbL2EntrySize.W)) // global - val ptwl1 = 
Reg(Vec(PtwL1EntrySize, new PtwEntry(tagLen = tagLen1))) + val tlbl2 = Module(new SRAMTemplate(new TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen), set = TlbL2LineNum)) // (total 256, one line is 4 => 64 lines) + val tlbv = RegInit(0.U(TlbL2LineNum.W)) // valid + val tlbg = Reg(UInt(TlbL2LineNum.W)) // global + val ptwl1 = Reg(Vec(PtwL1EntrySize, new PtwEntry(tagLen = PtwL1TagLen))) val l1v = RegInit(0.U(PtwL1EntrySize.W)) // valid - val l1g = VecInit((ptwl1.map(_.perm.g))).asUInt - val ptwl2 = SyncReadMem(PtwL2EntrySize, new PtwEntry(tagLen = tagLen2)) // NOTE: the Mem could be only single port(r&w) - val l2v = RegInit(0.U(PtwL2EntrySize.W)) // valid - val l2g = RegInit(0.U(PtwL2EntrySize.W)) // global - + val l1g = Reg(UInt(PtwL1EntrySize.W)) + val ptwl2 = Module(new SRAMTemplate(new PtwEntries(num = PtwL2LineSize, tagLen = PtwL2TagLen), set = PtwL2LineNum)) // (total 256, one line is 4 => 64 lines) + val l2v = RegInit(0.U(PtwL2LineNum.W)) // valid + val l2g = Reg(UInt(PtwL2LineNum.W)) // global + // mem alias - // val memRdata = mem.d.bits.data - val memRdata = Wire(UInt(XLEN.W)) - val memPte = memRdata.asTypeOf(new PteBundle) + val memRdata = mem.d.bits.data + val memSelData = Wire(UInt(XLEN.W)) + val memPte = memSelData.asTypeOf(new PteBundle) + val memPtes =(0 until TlbL2LineSize).map(i => memRdata((i+1)*XLEN-1, i*XLEN).asTypeOf(new PteBundle)) val memValid = mem.d.valid val memRespReady = mem.d.ready val memRespFire = mem.d.fire() @@ -206,22 +268,27 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ * tlbl2 */ val (tlbHit, tlbHitData) = { - // tlbl2 is by addr - // TODO: optimize tlbl2'l2 tag len - val ramData = tlbl2.read(req.vpn(log2Up(TlbL2EntrySize)-1, 0), validOneCycle) - val vidx = RegEnable(tlbv(req.vpn(log2Up(TlbL2EntrySize)-1, 0)), validOneCycle) - (ramData.hit(req.vpn) && vidx, ramData) // TODO: optimize tag - // TODO: add exception and refill + assert(tlbl2.io.r.req.ready) + + val ridx = genTlbL2Idx(req.vpn) + val vidx = 
RegEnable(tlbv(ridx), validOneCycle) + tlbl2.io.r.req.valid := validOneCycle + tlbl2.io.r.req.bits.apply(setIdx = ridx) + val ramData = tlbl2.io.r.resp.data(0) + + XSDebug(tlbl2.io.r.req.valid, p"tlbl2 Read rIdx:${Hexadecimal(ridx)}\n") + XSDebug(RegNext(tlbl2.io.r.req.valid), p"tlbl2 RamData:${ramData}\n") + XSDebug(RegNext(tlbl2.io.r.req.valid), p"tlbl2 v:${vidx} hit:${ramData.hit(req.vpn)} tlbPte:${ramData.get(req.vpn)}\n") + (ramData.hit(req.vpn) && vidx, ramData.get(req.vpn)) } /* * ptwl1 */ val l1addr = MakeAddr(satp.ppn, getVpnn(req.vpn, 2)) - val (l1Hit, l1HitData) = { // TODO: add excp - // 16 terms may casue long latency, so divide it into 2 stage, like l2tlb + val (l1Hit, l1HitData) = { val hitVecT = ptwl1.zipWithIndex.map{case (a,b) => a.hit(l1addr) && l1v(b) } - val hitVec = hitVecT.map(RegEnable(_, validOneCycle)) // TODO: could have useless init value + val hitVec = hitVecT.map(RegEnable(_, validOneCycle)) val hitData = ParallelMux(hitVec zip ptwl1) val hit = ParallelOR(hitVec).asBool (hit, hitData) @@ -233,12 +300,21 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ val l1MemBack = memRespFire && state===state_wait_resp && level===0.U val l1Res = Mux(l1Hit, l1HitData.ppn, RegEnable(memPte.ppn, l1MemBack)) val l2addr = MakeAddr(l1Res, getVpnn(req.vpn, 1)) - val (l2Hit, l2HitData) = { // TODO: add excp - val readRam = (l1Hit && level===0.U && state===state_req) || (memRespFire && state===state_wait_resp && level===0.U) - val ridx = l2addr(log2Up(PtwL2EntrySize)-1+log2Up(XLEN/8), log2Up(XLEN/8)) - val ramData = ptwl2.read(ridx, readRam) + val (l2Hit, l2HitPPN) = { + val readRam = (!tlbHit && l1Hit && level===0.U && state===state_req) || (memRespFire && state===state_wait_resp && level===0.U) + val ridx = genPtwL2Idx(l2addr) + val idx = RegEnable(l2addr(log2Up(PtwL2LineSize)+log2Up(XLEN/8)-1, log2Up(XLEN/8)), readRam) val vidx = RegEnable(l2v(ridx), readRam) - (ramData.hit(l2addr) && vidx, ramData) // TODO: optimize tag + + 
assert(ptwl2.io.r.req.ready) + ptwl2.io.r.req.valid := readRam + ptwl2.io.r.req.bits.apply(setIdx = ridx) + val ramData = ptwl2.io.r.resp.data(0) + + XSDebug(ptwl2.io.r.req.valid, p"ptwl2 rIdx:${Hexadecimal(ridx)}\n") + XSDebug(RegNext(ptwl2.io.r.req.valid), p"ptwl2 RamData:${ramData}\n") + XSDebug(RegNext(ptwl2.io.r.req.valid), p"ptwl2 v:${vidx} hit:${ramData.hit(idx, l2addr)}\n") + (ramData.hit(idx, l2addr) && vidx, ramData.get(idx)._2) // TODO: optimize tag } /* ptwl3 @@ -247,7 +323,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ * if l2-tlb does not hit, ptwl3 would not hit (mostly) */ val l2MemBack = memRespFire && state===state_wait_resp && level===1.U - val l2Res = Mux(l2Hit, l2HitData.ppn, RegEnable(memPte.ppn, l2MemBack)) + val l2Res = Mux(l2Hit, l2HitPPN, RegEnable(memPte.ppn, l2MemBack)) val l3addr = MakeAddr(l2Res, getVpnn(req.vpn, 0)) /* @@ -272,7 +348,7 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ state := state_wait_ready } } .elsewhen (l1Hit && level===0.U || l2Hit && level===1.U) { - level := levelNext // TODO: consider superpage + level := levelNext } .elsewhen (memReqReady && !sfenceLatch) { state := state_wait_resp } @@ -280,13 +356,13 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ is (state_wait_resp) { when (memRespFire) { - when (memPte.isLeaf() || memPte.isPf()) { + when (memPte.isLeaf() || memPte.isPf(level)) { when (resp(arbChosen).ready) { state := state_idle }.otherwise { state := state_wait_ready latch.entry := new TlbEntry().genTlbEntry(memRdata, level, req.vpn) - latch.pf := memPte.isPf() + latch.pf := memPte.isPf(level) } }.otherwise { level := levelNext @@ -323,49 +399,77 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ lgSize = log2Up(l1BusDataWidth/8).U )._2 mem.a.bits := pteRead - mem.a.valid := state === state_req && + mem.a.valid := state === state_req && ((level===0.U && !tlbHit && !l1Hit) || (level===1.U && !l2Hit) || (level===2.U)) && !sfenceLatch && !sfence.valid mem.d.ready := state === 
state_wait_resp || sfenceLatch val memAddrLatch = RegEnable(memAddr, mem.a.valid) - memRdata := (mem.d.bits.data >> (memAddrLatch(log2Up(l1BusDataWidth/8) - 1, log2Up(XLEN/8)) << log2Up(XLEN)))(XLEN - 1, 0) + memSelData := memRdata.asTypeOf(Vec(MemBandWidth/XLEN, UInt(XLEN.W)))(memAddrLatch(log2Up(l1BusDataWidth/8) - 1, log2Up(XLEN/8))) /* * resp */ - val ptwFinish = (state===state_req && tlbHit && level===0.U) || ((memPte.isLeaf() || memPte.isPf() || (!memPte.isLeaf() && level===2.U)) && memRespFire && !sfenceLatch) || state===state_wait_ready + val ptwFinish = (state===state_req && tlbHit && level===0.U) || + ((memPte.isLeaf() || memPte.isPf(level) || + (!memPte.isLeaf() && level===2.U)) && memRespFire && !sfenceLatch) || + state===state_wait_ready for(i <- 0 until PtwWidth) { resp(i).valid := valid && arbChosen===i.U && ptwFinish // TODO: add resp valid logic resp(i).bits.entry := Mux(tlbHit, tlbHitData, - Mux(state===state_wait_ready, latch.entry, new TlbEntry().genTlbEntry(memRdata, Mux(level===3.U, 2.U, level), req.vpn))) - resp(i).bits.pf := Mux(level===3.U || notFound, true.B, Mux(tlbHit, false.B, Mux(state===state_wait_ready, latch.pf, memPte.isPf()))) + Mux(state===state_wait_ready, latch.entry, new TlbEntry().genTlbEntry(memSelData, Mux(level===3.U, 2.U, level), req.vpn))) + resp(i).bits.pf := Mux(level===3.U || notFound, true.B, Mux(tlbHit, false.B, Mux(state===state_wait_ready, latch.pf, memPte.isPf(level)))) // TODO: the pf must not be correct, check it } /* * refill */ + ptwl2.io.w.req <> DontCare + tlbl2.io.w.req <> DontCare + ptwl2.io.w.req.valid := false.B + tlbl2.io.w.req.valid := false.B assert(!memRespFire || (state===state_wait_resp || sfenceLatch)) - when (memRespFire && !memPte.isPf() && !sfenceLatch) { + when (memRespFire && !memPte.isPf(level) && !sfenceLatch) { when (level===0.U && !memPte.isLeaf) { val refillIdx = LFSR64()(log2Up(PtwL1EntrySize)-1,0) // TODO: may be LRU - ptwl1(refillIdx).refill(l1addr, memRdata) + 
ptwl1(refillIdx).refill(l1addr, memSelData) l1v := l1v | UIntToOH(refillIdx) + l1g := (l1g & ~UIntToOH(refillIdx)) | Mux(memPte.perm.g, UIntToOH(refillIdx), 0.U) } when (level===1.U && !memPte.isLeaf) { val l2addrStore = RegEnable(l2addr, memReqFire && state===state_req && level===1.U) - val refillIdx = getVpnn(req.vpn, 1)(log2Up(PtwL2EntrySize)-1, 0) - ptwl2.write(refillIdx, new PtwEntry(tagLen2).genPtwEntry(l2addrStore, memRdata)) + val refillIdx = genPtwL2Idx(l2addrStore) //getVpnn(req.vpn, 1)(log2Up(PtwL2EntrySize)-1, 0) + //TODO: check why the old refillIdx is right + + assert(ptwl2.io.w.req.ready) + val ps = new PtwEntries(PtwL2LineSize, PtwL2TagLen).genEntries(l2addrStore, memRdata, level) + ptwl2.io.w.apply( + valid = true.B, + setIdx = refillIdx, + data = ps, + waymask = -1.S.asUInt + ) l2v := l2v | UIntToOH(refillIdx) - l2g := l2g | Mux(memPte.perm.g, UIntToOH(refillIdx), 0.U) + l2g := (l2g & ~UIntToOH(refillIdx)) | Mux(Cat(memPtes.map(_.perm.g)).andR, UIntToOH(refillIdx), 0.U) + XSDebug(p"ptwl2 RefillIdx:${Hexadecimal(refillIdx)} ps:${ps}\n") } when (memPte.isLeaf()) { - val refillIdx = getVpnn(req.vpn, 0)(log2Up(TlbL2EntrySize)-1, 0) - tlbl2.write(refillIdx, new TlbEntry().genTlbEntry(memRdata, level, req.vpn)) + val refillIdx = genTlbL2Idx(req.vpn)//getVpnn(req.vpn, 0)(log2Up(TlbL2EntrySize)-1, 0) + //TODO: check why the old refillIdx is right + + assert(tlbl2.io.w.req.ready) + val ts = new TlbEntires(num = TlbL2LineSize, tagLen = TlbL2TagLen).genEntries(memRdata, level, req.vpn) + tlbl2.io.w.apply( + valid = true.B, + setIdx = refillIdx, + data = ts, + waymask = -1.S.asUInt + ) tlbv := tlbv | UIntToOH(refillIdx) - tlbg := tlbg | Mux(memPte.perm.g, UIntToOH(refillIdx), 0.U) + tlbg := (tlbg & ~UIntToOH(refillIdx)) | Mux(Cat(memPtes.map(_.perm.g)).andR, UIntToOH(refillIdx), 0.U) + XSDebug(p"tlbl2 refillIdx:${Hexadecimal(refillIdx)} ts:${ts}\n") } } @@ -427,16 +531,25 @@ class PTWImp(outer: PTW) extends PtwModule(outer){ XSDebug(false, validOneCycle, 
p"(v:${validOneCycle} r:${arb.io.out.ready}) vpn:0x${Hexadecimal(req.vpn)}\n") XSDebug(resp(arbChosen).fire(), "**Ptw Resp to ") PrintFlag(resp(arbChosen).fire(), arbChosen===0.U, "DTLB**:\n", "ITLB**\n") - XSDebug(resp(arbChosen).fire(), p"(v:${resp(arbChosen).valid} r:${resp(arbChosen).ready}) entry:${resp(arbChosen).bits.entry} pf:${resp(arbChosen).bits.pf}\n") + XSDebug(resp(arbChosen).fire(), p"(v:${resp(arbChosen).valid} r:${resp(arbChosen).ready})" + + p" entry:${resp(arbChosen).bits.entry} pf:${resp(arbChosen).bits.pf}\n") XSDebug(sfence.valid, p"Sfence: sfence instr here ${sfence.bits}\n") XSDebug(valid, p"CSR: ${csr}\n") - XSDebug(valid, p"vpn2:0x${Hexadecimal(getVpnn(req.vpn, 2))} vpn1:0x${Hexadecimal(getVpnn(req.vpn, 1))} vpn0:0x${Hexadecimal(getVpnn(req.vpn, 0))}\n") - XSDebug(valid, p"state:${state} level:${level} tlbHit:${tlbHit} l1addr:0x${Hexadecimal(l1addr)} l1Hit:${l1Hit} l2addr:0x${Hexadecimal(l2addr)} l2Hit:${l2Hit} l3addr:0x${Hexadecimal(l3addr)} memReq(v:${mem.a.valid} r:${mem.a.ready})\n") + XSDebug(valid, p"vpn2:0x${Hexadecimal(getVpnn(req.vpn, 2))} vpn1:0x${Hexadecimal(getVpnn(req.vpn, 1))}" + + p" vpn0:0x${Hexadecimal(getVpnn(req.vpn, 0))}\n") + XSDebug(valid, p"state:${state} level:${level} tlbHit:${tlbHit} l1addr:0x${Hexadecimal(l1addr)} l1Hit:${l1Hit}" + + p" l2addr:0x${Hexadecimal(l2addr)} l2Hit:${l2Hit} l3addr:0x${Hexadecimal(l3addr)} memReq(v:${mem.a.valid} r:${mem.a.ready})\n") XSDebug(memReqFire, p"mem req fire addr:0x${Hexadecimal(memAddr)}\n") - XSDebug(memRespFire, p"mem resp fire rdata:0x${Hexadecimal(mem.d.bits.data)} Pte:${memPte}\n") + XSDebug(memRespFire, p"mem resp fire: \n") + for(i <- 0 until (MemBandWidth/XLEN)) { + XSDebug(memRespFire, p" ${i.U}: ${memPtes(i)} isPf:${memPtes(i).isPf(level)} isLeaf:${memPtes(i).isLeaf}\n") + } + + XSDebug(sfenceLatch, p"ptw has a flushed req waiting for resp... 
" + + p"state:${state} mem.a(${mem.a.valid} ${mem.a.ready}) d($memValid} ${memRespReady})\n") - XSDebug(sfenceLatch, p"ptw has a flushed req waiting for resp... state:${state} mem.a(${mem.a.valid} ${mem.a.ready}) d($memValid} ${memRespReady})\n") + // TODO: add ptw perf cnt } diff --git a/src/main/scala/xiangshan/cache/storeMissQueue.scala b/src/main/scala/xiangshan/cache/storeMissQueue.scala index 902c01245cbb2b17cd73834c6cb24c53dd0fa898..9845c8fa7f8901b41b4317fd103c85eb8a18b039 100644 --- a/src/main/scala/xiangshan/cache/storeMissQueue.scala +++ b/src/main/scala/xiangshan/cache/storeMissQueue.scala @@ -54,7 +54,7 @@ class StoreMissEntry extends DCacheModule when (state =/= s_invalid) { - XSDebug("entry: %d state: %d\n", io.id, state) + XSDebug("entry: %d state: %d idx: %x tag: %x\n", io.id, state, io.idx.bits, io.tag.bits) } // -------------------------------------------- @@ -158,12 +158,13 @@ class StoreMissQueue extends DCacheModule val replay_arb = Module(new Arbiter(new DCacheLineReq, cfg.nStoreMissEntries)) val resp_arb = Module(new Arbiter(new DCacheLineResp, cfg.nStoreMissEntries)) - val idx_matches = Wire(Vec(cfg.nLoadMissEntries, Bool())) - val tag_matches = Wire(Vec(cfg.nLoadMissEntries, Bool())) + val idx_matches = Wire(Vec(cfg.nStoreMissEntries, Bool())) + val tag_matches = Wire(Vec(cfg.nStoreMissEntries, Bool())) val tag_match = Mux1H(idx_matches, tag_matches) val idx_match = idx_matches.reduce(_||_) + XSDebug("idx_match: %b tag_match: %b\n", idx_match, tag_match) val req = io.lsu.req val entry_alloc_idx = Wire(UInt()) diff --git a/src/main/scala/xiangshan/cache/uncache.scala b/src/main/scala/xiangshan/cache/uncache.scala index 73ba3b6038ff1624fd1d61d8e7174615f0ad99af..897c0ae48b578faed798d0984f9bc59db1af296c 100644 --- a/src/main/scala/xiangshan/cache/uncache.scala +++ b/src/main/scala/xiangshan/cache/uncache.scala @@ -6,7 +6,7 @@ import utils.{HasTLDump, PriorityMuxWithFlag, XSDebug} import chipsalliance.rocketchip.config.Parameters import 
freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes} import freechips.rocketchip.tilelink.{TLArbiter, TLBundleA, TLBundleD, TLClientNode, TLEdgeOut, TLMasterParameters, TLMasterPortParameters} -import xiangshan.{HasXSLog, MicroOp, NeedImpl, Redirect} +import xiangshan.{HasXSLog, MicroOp, Redirect} // One miss entry deals with one mmio request class MMIOEntry(edge: TLEdgeOut) extends DCacheModule @@ -119,7 +119,7 @@ class MMIOEntry(edge: TLEdgeOut) extends DCacheModule } class UncacheIO extends DCacheBundle { - val lsroq = Flipped(new DCacheWordIO) + val lsq = Flipped(new DCacheWordIO) } // convert DCacheIO to TileLink @@ -152,8 +152,8 @@ class UncacheImp(outer: Uncache) val resp_arb = Module(new Arbiter(new DCacheWordResp, cfg.nMMIOEntries)) - val req = io.lsroq.req - val resp = io.lsroq.resp + val req = io.lsq.req + val resp = io.lsq.resp val mem_acquire = bus.a val mem_grant = bus.d diff --git a/src/main/scala/xiangshan/frontend/BPU.scala b/src/main/scala/xiangshan/frontend/BPU.scala index 4edd1776d4409fa1c94b2856a5f7b505479a2787..0844dc418ed8a60f1f1506a1daecabda62e502a4 100644 --- a/src/main/scala/xiangshan/frontend/BPU.scala +++ b/src/main/scala/xiangshan/frontend/BPU.scala @@ -6,12 +6,13 @@ import utils._ import xiangshan._ import xiangshan.backend.ALUOpType import xiangshan.backend.JumpOpType +import chisel3.experimental.chiselName trait HasBPUParameter extends HasXSParameter { - val BPUDebug = false - val EnableCFICommitLog = false + val BPUDebug = true + val EnableCFICommitLog = true val EnbaleCFIPredLog = false - val EnableBPUTimeRecord = false + val EnableBPUTimeRecord = EnableCFICommitLog || EnbaleCFIPredLog } class TableAddr(val idxBits: Int, val banks: Int) extends XSBundle { @@ -63,7 +64,43 @@ class PredictorResponse extends XSBundle { val loop = new LoopResp } -abstract class BasePredictor extends XSModule with HasBPUParameter{ +trait PredictorUtils { + // circular shifting + def circularShiftLeft(source: UInt, len: Int, 
shamt: UInt): UInt = { + val res = Wire(UInt(len.W)) + val higher = source << shamt + val lower = source >> (len.U - shamt) + res := higher | lower + res + } + + def circularShiftRight(source: UInt, len: Int, shamt: UInt): UInt = { + val res = Wire(UInt(len.W)) + val higher = source << (len.U - shamt) + val lower = source >> shamt + res := higher | lower + res + } + + // To be verified + def satUpdate(old: UInt, len: Int, taken: Bool): UInt = { + val oldSatTaken = old === ((1 << len)-1).U + val oldSatNotTaken = old === 0.U + Mux(oldSatTaken && taken, ((1 << len)-1).U, + Mux(oldSatNotTaken && !taken, 0.U, + Mux(taken, old + 1.U, old - 1.U))) + } + + def signedSatUpdate(old: SInt, len: Int, taken: Bool): SInt = { + val oldSatTaken = old === ((1 << (len-1))-1).S + val oldSatNotTaken = old === (-(1 << (len-1))).S + Mux(oldSatTaken && taken, ((1 << (len-1))-1).S, + Mux(oldSatNotTaken && !taken, (-(1 << (len-1))).S, + Mux(taken, old + 1.S, old - 1.S))) + } +} +abstract class BasePredictor extends XSModule + with HasBPUParameter with HasIFUConst with PredictorUtils { val metaLen = 0 // An implementation MUST extend the IO bundle with a response @@ -79,214 +116,159 @@ abstract class BasePredictor extends XSModule with HasBPUParameter{ val hist = Input(UInt(HistoryLength.W)) val inMask = Input(UInt(PredictWidth.W)) val update = Flipped(ValidIO(new BranchUpdateInfoWithHist)) + val outFire = Input(Bool()) } val io = new DefaultBasePredictorIO val debug = false - - // circular shifting - def circularShiftLeft(source: UInt, len: Int, shamt: UInt): UInt = { - val res = Wire(UInt(len.W)) - val higher = source << shamt - val lower = source >> (len.U - shamt) - res := higher | lower - res - } - - def circularShiftRight(source: UInt, len: Int, shamt: UInt): UInt = { - val res = Wire(UInt(len.W)) - val higher = source << (len.U - shamt) - val lower = source >> shamt - res := higher | lower - res - } } class BPUStageIO extends XSBundle { val pc = UInt(VAddrBits.W) val mask = 
UInt(PredictWidth.W) val resp = new PredictorResponse - val target = UInt(VAddrBits.W) + // val target = UInt(VAddrBits.W) val brInfo = Vec(PredictWidth, new BranchInfo) - val saveHalfRVI = Bool() + // val saveHalfRVI = Bool() } -abstract class BPUStage extends XSModule with HasBPUParameter{ +abstract class BPUStage extends XSModule with HasBPUParameter with HasIFUConst { class DefaultIO extends XSBundle { val flush = Input(Bool()) - val in = Flipped(Decoupled(new BPUStageIO)) - val pred = Decoupled(new BranchPrediction) - val out = Decoupled(new BPUStageIO) - val predecode = Flipped(ValidIO(new Predecode)) - val recover = Flipped(ValidIO(new BranchUpdateInfo)) - val cacheValid = Input(Bool()) - val debug_hist = Input(UInt(HistoryLength.W)) - val debug_histPtr = Input(UInt(log2Up(ExtHistoryLength).W)) + val in = Input(new BPUStageIO) + val inFire = Input(Bool()) + val pred = Output(new BranchPrediction) // to ifu + val out = Output(new BPUStageIO) // to the next stage + val outFire = Input(Bool()) + + val debug_hist = Input(UInt((if (BPUDebug) (HistoryLength) else 0).W)) + val debug_histPtr = Input(UInt((if (BPUDebug) (ExtHistoryLength) else 0).W)) } val io = IO(new DefaultIO) - val predValid = RegInit(false.B) - - io.in.ready := !predValid || io.out.fire() && io.pred.fire() || io.flush - def npc(pc: UInt, instCount: UInt) = pc + (instCount << 1.U) - val inFire = io.in.fire() - val inLatch = RegEnable(io.in.bits, inFire) - - val outFire = io.out.fire() + val inLatch = RegEnable(io.in, io.inFire) // Each stage has its own logic to decide // takens, notTakens and target val takens = Wire(Vec(PredictWidth, Bool())) - val notTakens = Wire(Vec(PredictWidth, Bool())) + // val notTakens = Wire(Vec(PredictWidth, Bool())) val brMask = Wire(Vec(PredictWidth, Bool())) - val jmpIdx = PriorityEncoder(takens) - val hasNTBr = (0 until PredictWidth).map(i => i.U <= jmpIdx && notTakens(i) && brMask(i)).reduce(_||_) - val taken = takens.reduce(_||_) - // get the last valid inst - 
val lastValidPos = WireInit(PriorityMux(Reverse(inLatch.mask), (PredictWidth-1 to 0 by -1).map(i => i.U))) - val lastHit = Wire(Bool()) - val lastIsRVC = Wire(Bool()) - val saveHalfRVI = ((lastValidPos === jmpIdx && taken) || !taken ) && !lastIsRVC && lastHit - - val targetSrc = Wire(Vec(PredictWidth, UInt(VAddrBits.W))) - val target = Mux(taken, targetSrc(jmpIdx), npc(inLatch.pc, PopCount(inLatch.mask))) - - io.pred.bits <> DontCare - io.pred.bits.redirect := target =/= inLatch.target || inLatch.saveHalfRVI && !saveHalfRVI - io.pred.bits.taken := taken - io.pred.bits.jmpIdx := jmpIdx - io.pred.bits.hasNotTakenBrs := hasNTBr - io.pred.bits.target := target - io.pred.bits.saveHalfRVI := saveHalfRVI - io.pred.bits.takenOnBr := taken && brMask(jmpIdx) - - io.out.bits <> DontCare - io.out.bits.pc := inLatch.pc - io.out.bits.mask := inLatch.mask - io.out.bits.target := target - io.out.bits.resp <> inLatch.resp - io.out.bits.brInfo := inLatch.brInfo - io.out.bits.saveHalfRVI := saveHalfRVI - (0 until PredictWidth).map(i => - io.out.bits.brInfo(i).sawNotTakenBranch := (if (i == 0) false.B else (brMask.asUInt & notTakens.asUInt)(i-1,0).orR)) - - // Default logic - // pred.ready not taken into consideration - // could be broken - when (io.flush) { predValid := false.B } - .elsewhen (inFire) { predValid := true.B } - .elsewhen (outFire) { predValid := false.B } - .otherwise { predValid := predValid } - - io.out.valid := predValid && !io.flush - io.pred.valid := predValid && !io.flush + val jalMask = Wire(Vec(PredictWidth, Bool())) + + val targets = Wire(Vec(PredictWidth, UInt(VAddrBits.W))) + + val firstBankHasHalfRVI = Wire(Bool()) + val lastBankHasHalfRVI = Wire(Bool()) + val lastBankHasInst = WireInit(inLatch.mask(PredictWidth-1, bankWidth).orR) + + io.pred <> DontCare + io.pred.takens := takens.asUInt + io.pred.brMask := brMask.asUInt + io.pred.jalMask := jalMask.asUInt + io.pred.targets := targets + io.pred.firstBankHasHalfRVI := firstBankHasHalfRVI + 
io.pred.lastBankHasHalfRVI := lastBankHasHalfRVI + + io.out <> DontCare + io.out.pc := inLatch.pc + io.out.mask := inLatch.mask + io.out.resp <> inLatch.resp + io.out.brInfo := inLatch.brInfo + (0 until PredictWidth).map(i => io.out.brInfo(i).sawNotTakenBranch := io.pred.sawNotTakenBr(i)) if (BPUDebug) { - XSDebug(io.in.fire(), "in:(%d %d) pc=%x, mask=%b, target=%x\n", - io.in.valid, io.in.ready, io.in.bits.pc, io.in.bits.mask, io.in.bits.target) - XSDebug(io.out.fire(), "out:(%d %d) pc=%x, mask=%b, target=%x\n", - io.out.valid, io.out.ready, io.out.bits.pc, io.out.bits.mask, io.out.bits.target) + val jmpIdx = io.pred.jmpIdx + val taken = io.pred.taken + val target = Mux(taken, io.pred.targets(jmpIdx), snpc(inLatch.pc)) + XSDebug("in(%d): pc=%x, mask=%b\n", io.inFire, io.in.pc, io.in.mask) + XSDebug("inLatch: pc=%x, mask=%b\n", inLatch.pc, inLatch.mask) + XSDebug("out(%d): pc=%x, mask=%b, taken=%d, jmpIdx=%d, target=%x, firstHasHalfRVI=%d, lastHasHalfRVI=%d\n", + io.outFire, io.out.pc, io.out.mask, taken, jmpIdx, target, firstBankHasHalfRVI, lastBankHasHalfRVI) XSDebug("flush=%d\n", io.flush) - XSDebug("taken=%d, takens=%b, notTakens=%b, jmpIdx=%d, hasNTBr=%d, lastValidPos=%d, target=%x\n", - taken, takens.asUInt, notTakens.asUInt, jmpIdx, hasNTBr, lastValidPos, target) - val p = io.pred.bits - XSDebug(io.pred.fire(), "outPred: redirect=%d, taken=%d, jmpIdx=%d, hasNTBrs=%d, target=%x, saveHalfRVI=%d\n", - p.redirect, p.taken, p.jmpIdx, p.hasNotTakenBrs, p.target, p.saveHalfRVI) - XSDebug(io.pred.fire() && p.taken, "outPredTaken: fetchPC:%x, jmpPC:%x\n", - inLatch.pc, inLatch.pc + (jmpIdx << 1.U)) - XSDebug(io.pred.fire() && p.redirect, "outPred: previous target:%x redirected to %x \n", - inLatch.target, p.target) - XSDebug(io.pred.fire(), "outPred targetSrc: ") - for (i <- 0 until PredictWidth) { - XSDebug(false, io.pred.fire(), "(%d):%x ", i.U, targetSrc(i)) - } - XSDebug(false, io.pred.fire(), "\n") + val p = io.pred } } +@chiselName class BPUStage1 extends 
BPUStage { - // 'overrides' default logic - // when flush, the prediction should also starts - when (inFire) { predValid := true.B } - .elsewhen (io.flush) { predValid := false.B } - .elsewhen (outFire) { predValid := false.B } - .otherwise { predValid := predValid } - // io.out.valid := predValid - // ubtb is accessed with inLatch pc in s1, // so we use io.in instead of inLatch - val ubtbResp = io.in.bits.resp.ubtb + val ubtbResp = io.in.resp.ubtb // the read operation is already masked, so we do not need to mask here takens := VecInit((0 until PredictWidth).map(i => ubtbResp.hits(i) && ubtbResp.takens(i))) - notTakens := VecInit((0 until PredictWidth).map(i => ubtbResp.hits(i) && !ubtbResp.takens(i) && ubtbResp.brMask(i))) - targetSrc := ubtbResp.targets + // notTakens := VecInit((0 until PredictWidth).map(i => ubtbResp.hits(i) && !ubtbResp.takens(i) && ubtbResp.brMask(i))) brMask := ubtbResp.brMask + jalMask := DontCare + targets := ubtbResp.targets - lastIsRVC := ubtbResp.is_RVC(lastValidPos) - lastHit := ubtbResp.hits(lastValidPos) + firstBankHasHalfRVI := Mux(lastBankHasInst, false.B, ubtbResp.hits(bankWidth-1) && !ubtbResp.is_RVC(bankWidth-1) && inLatch.mask(bankWidth-1)) + lastBankHasHalfRVI := ubtbResp.hits(PredictWidth-1) && !ubtbResp.is_RVC(PredictWidth-1) && inLatch.mask(PredictWidth-1) // resp and brInfo are from the components, // so it does not need to be latched - io.out.bits.resp <> io.in.bits.resp - io.out.bits.brInfo := io.in.bits.brInfo + io.out.resp <> io.in.resp + io.out.brInfo := io.in.brInfo if (BPUDebug) { - XSDebug(io.pred.fire(), "outPred using ubtb resp: hits:%b, takens:%b, notTakens:%b, isRVC:%b\n", + XSDebug(io.outFire, "outPred using ubtb resp: hits:%b, takens:%b, notTakens:%b, isRVC:%b\n", ubtbResp.hits.asUInt, ubtbResp.takens.asUInt, ~ubtbResp.takens.asUInt & brMask.asUInt, ubtbResp.is_RVC.asUInt) } if (EnableBPUTimeRecord) { - io.out.bits.brInfo.map(_.debug_ubtb_cycle := GTimer()) + io.out.brInfo.map(_.debug_ubtb_cycle := GTimer()) 
} } - +@chiselName class BPUStage2 extends BPUStage { - - io.out.valid := predValid && !io.flush && io.cacheValid // Use latched response from s1 val btbResp = inLatch.resp.btb val bimResp = inLatch.resp.bim takens := VecInit((0 until PredictWidth).map(i => btbResp.hits(i) && (btbResp.types(i) === BTBtype.B && bimResp.ctrs(i)(1) || btbResp.types(i) =/= BTBtype.B))) - notTakens := VecInit((0 until PredictWidth).map(i => btbResp.hits(i) && btbResp.types(i) === BTBtype.B && !bimResp.ctrs(i)(1))) - targetSrc := btbResp.targets - brMask := VecInit(btbResp.types.map(_ === BTBtype.B)) - - lastIsRVC := btbResp.isRVC(lastValidPos) - lastHit := btbResp.hits(lastValidPos) + targets := btbResp.targets + brMask := VecInit(btbResp.types.map(_ === BTBtype.B)) + jalMask := DontCare + firstBankHasHalfRVI := Mux(lastBankHasInst, false.B, btbResp.hits(bankWidth-1) && !btbResp.isRVC(bankWidth-1) && inLatch.mask(bankWidth-1)) + lastBankHasHalfRVI := btbResp.hits(PredictWidth-1) && !btbResp.isRVC(PredictWidth-1) && inLatch.mask(PredictWidth-1) if (BPUDebug) { - XSDebug(io.pred.fire(), "outPred using btb&bim resp: hits:%b, ctrTakens:%b\n", + XSDebug(io.outFire, "outPred using btb&bim resp: hits:%b, ctrTakens:%b\n", btbResp.hits.asUInt, VecInit(bimResp.ctrs.map(_(1))).asUInt) } if (EnableBPUTimeRecord) { - io.out.bits.brInfo.map(_.debug_btb_cycle := GTimer()) + io.out.brInfo.map(_.debug_btb_cycle := GTimer()) } } - +@chiselName class BPUStage3 extends BPUStage { + class S3IO extends XSBundle { - - io.out.valid := predValid && io.predecode.valid && !io.flush + val predecode = Input(new Predecode) + val realMask = Input(UInt(PredictWidth.W)) + val prevHalf = Input(new PrevHalfInstr) + val recover = Flipped(ValidIO(new BranchUpdateInfo)) + } + val s3IO = IO(new S3IO) // TAGE has its own pipelines and the // response comes directly from s3, // so we do not use those from inLatch - val tageResp = io.in.bits.resp.tage + val tageResp = io.in.resp.tage val tageTakens = tageResp.takens - val 
tageHits = tageResp.hits - val tageValidTakens = VecInit((tageTakens zip tageHits).map{case (t, h) => t && h}) - val loopResp = io.in.bits.resp.loop.exit + val loopResp = io.in.resp.loop.exit - val pdMask = io.predecode.bits.mask - val pds = io.predecode.bits.pd + // realMask is in it + val pdMask = s3IO.predecode.mask + val pdLastHalf = s3IO.predecode.lastHalf + val pds = s3IO.predecode.pd - val btbHits = inLatch.resp.btb.hits.asUInt + val btbResp = inLatch.resp.btb + val btbHits = btbResp.hits.asUInt val bimTakens = VecInit(inLatch.resp.bim.ctrs.map(_(1))) val brs = pdMask & Reverse(Cat(pds.map(_.isBr))) @@ -296,97 +278,104 @@ class BPUStage3 extends BPUStage { val rets = pdMask & Reverse(Cat(pds.map(_.isRet))) val RVCs = pdMask & Reverse(Cat(pds.map(_.isRVC))) - val callIdx = PriorityEncoder(calls) - val retIdx = PriorityEncoder(rets) + val callIdx = PriorityEncoder(calls) + val retIdx = PriorityEncoder(rets) - // Use bim results for those who tage does not have an entry for - val brTakens = brs & - (if (EnableBPD) Reverse(Cat((0 until PredictWidth).map(i => tageValidTakens(i) || !tageHits(i) && bimTakens(i)))) else Reverse(Cat((0 until PredictWidth).map(i => bimTakens(i))))) & - (if (EnableLoop) ~loopResp.asUInt else Fill(PredictWidth, 1.U(1.W))) - // if (EnableBPD) { - // brs & Reverse(Cat((0 until PredictWidth).map(i => tageValidTakens(i)))) - // } else { - // brs & Reverse(Cat((0 until PredictWidth).map(i => bimTakens(i)))) - // } + val brPred = (if(EnableBPD) tageTakens else bimTakens).asUInt + val loopRes = (if (EnableLoop) loopResp else VecInit(Fill(PredictWidth, 0.U(1.W)))).asUInt + val prevHalfTaken = s3IO.prevHalf.valid && s3IO.prevHalf.taken + val prevHalfTakenMask = prevHalfTaken.asUInt + val brTakens = ((brs & brPred | prevHalfTakenMask) & ~loopRes) + // VecInit((0 until PredictWidth).map(i => brs(i) && (brPred(i) || (if (i == 0) prevHalfTaken else false.B)) && !loopRes(i))) // predict taken only if btb has a target, jal targets will be provided by 
IFU takens := VecInit((0 until PredictWidth).map(i => (brTakens(i) || jalrs(i)) && btbHits(i) || jals(i))) - // Whether should we count in branches that are not recorded in btb? - // PS: Currently counted in. Whenever tage does not provide a valid - // taken prediction, the branch is counted as a not taken branch - notTakens := ((VecInit((0 until PredictWidth).map(i => brs(i) && !takens(i)))).asUInt | - (if (EnableLoop) { VecInit((0 until PredictWidth).map(i => brs(i) && loopResp(i)))} - else { WireInit(0.U.asTypeOf(UInt(PredictWidth.W))) }).asUInt).asTypeOf(Vec(PredictWidth, Bool())) - targetSrc := inLatch.resp.btb.targets - brMask := WireInit(brs.asTypeOf(Vec(PredictWidth, Bool()))) + + + targets := inLatch.resp.btb.targets + + brMask := WireInit(brs.asTypeOf(Vec(PredictWidth, Bool()))) + jalMask := WireInit(jals.asTypeOf(Vec(PredictWidth, Bool()))) + + lastBankHasInst := s3IO.realMask(PredictWidth-1, bankWidth).orR + firstBankHasHalfRVI := Mux(lastBankHasInst, false.B, pdLastHalf(0)) + lastBankHasHalfRVI := pdLastHalf(1) //RAS if(EnableRAS){ val ras = Module(new RAS) ras.io <> DontCare - ras.io.pc.bits := inLatch.pc - ras.io.pc.valid := io.out.fire()//predValid - ras.io.is_ret := rets.orR && (retIdx === jmpIdx) && io.predecode.valid - ras.io.callIdx.valid := calls.orR && (callIdx === jmpIdx) && io.predecode.valid + ras.io.pc.bits := bankAligned(inLatch.pc) + ras.io.pc.valid := io.outFire//predValid + ras.io.is_ret := rets.orR && (retIdx === io.pred.jmpIdx) + ras.io.callIdx.valid := calls.orR && (callIdx === io.pred.jmpIdx) ras.io.callIdx.bits := callIdx ras.io.isRVC := (calls & RVCs).orR //TODO: this is ugly - ras.io.recover := io.recover + ras.io.isLastHalfRVI := s3IO.predecode.hasLastHalfRVI + ras.io.recover := s3IO.recover for(i <- 0 until PredictWidth){ - io.out.bits.brInfo(i).rasSp := ras.io.branchInfo.rasSp - io.out.bits.brInfo(i).rasTopCtr := ras.io.branchInfo.rasTopCtr - io.out.bits.brInfo(i).rasToqAddr := ras.io.branchInfo.rasToqAddr + 
io.out.brInfo(i).rasSp := ras.io.branchInfo.rasSp + io.out.brInfo(i).rasTopCtr := ras.io.branchInfo.rasTopCtr + io.out.brInfo(i).rasToqAddr := ras.io.branchInfo.rasToqAddr } - takens := VecInit((0 until PredictWidth).map(i => (brTakens(i) || jalrs(i)) && btbHits(i) || jals(i)|| rets(i))) - when(ras.io.is_ret && ras.io.out.valid){targetSrc(retIdx) := ras.io.out.bits.target} - } + takens := VecInit((0 until PredictWidth).map(i => { + ((brTakens(i) || jalrs(i)) && btbHits(i)) || + jals(i) || + (ras.io.out.valid && rets(i)) || + (!ras.io.out.valid && rets(i) && btbHits(i)) + } + )) + for (i <- 0 until PredictWidth) { + when(rets(i) && ras.io.out.valid){ + targets(i) := ras.io.out.bits.target + } + } + } - // when (!io.predecode.bits.isFetchpcEqualFirstpc) { - // lastValidPos := PriorityMux(Reverse(inLatch.mask), (PredictWidth-1 to 0 by -1).map(i => i.U)) + 1.U - // } - lastIsRVC := pds(lastValidPos).isRVC - when (lastValidPos === 1.U) { - lastHit := pdMask(1) | - !pdMask(0) & !pdMask(1) | - pdMask(0) & !pdMask(1) & (pds(0).isRVC | !io.predecode.bits.isFetchpcEqualFirstpc) - }.elsewhen (lastValidPos > 0.U) { - lastHit := pdMask(lastValidPos) | - !pdMask(lastValidPos - 1.U) & !pdMask(lastValidPos) | - pdMask(lastValidPos - 1.U) & !pdMask(lastValidPos) & pds(lastValidPos - 1.U).isRVC - }.otherwise { - lastHit := pdMask(0) | !pdMask(0) & !pds(0).isRVC + // we should provide the prediction for the first half RVI of the end of a fetch packet + // branch taken information would be lost in the prediction of the next packet, + // so we preserve this information here + when (firstBankHasHalfRVI && btbResp.types(bankWidth-1) === BTBtype.B && btbHits(bankWidth-1)) { + takens(bankWidth-1) := brPred(bankWidth-1) && !loopRes(bankWidth-1) + } + when (lastBankHasHalfRVI && btbResp.types(PredictWidth-1) === BTBtype.B && btbHits(PredictWidth-1)) { + takens(PredictWidth-1) := brPred(PredictWidth-1) && !loopRes(PredictWidth-1) } - - io.pred.bits.saveHalfRVI := ((lastValidPos === jmpIdx && 
taken && !(jmpIdx === 0.U && !io.predecode.bits.isFetchpcEqualFirstpc)) || !taken ) && !lastIsRVC && lastHit + // targets would be lost as well, since it is from btb + // unless it is a ret, which target is from ras + when (prevHalfTaken && !rets(0)) { + targets(0) := s3IO.prevHalf.target + } // Wrap tage resp and tage meta in // This is ugly - io.out.bits.resp.tage <> io.in.bits.resp.tage - io.out.bits.resp.loop <> io.in.bits.resp.loop + io.out.resp.tage <> io.in.resp.tage + io.out.resp.loop <> io.in.resp.loop for (i <- 0 until PredictWidth) { - io.out.bits.brInfo(i).tageMeta := io.in.bits.brInfo(i).tageMeta - io.out.bits.brInfo(i).specCnt := io.in.bits.brInfo(i).specCnt + io.out.brInfo(i).tageMeta := io.in.brInfo(i).tageMeta + io.out.brInfo(i).specCnt := io.in.brInfo(i).specCnt } if (BPUDebug) { - XSDebug(io.predecode.valid, "predecode: pc:%x, mask:%b\n", inLatch.pc, io.predecode.bits.mask) + XSDebug(io.inFire, "predecode: pc:%x, mask:%b\n", inLatch.pc, s3IO.predecode.mask) for (i <- 0 until PredictWidth) { - val p = io.predecode.bits.pd(i) - XSDebug(io.predecode.valid && io.predecode.bits.mask(i), "predecode(%d): brType:%d, br:%d, jal:%d, jalr:%d, call:%d, ret:%d, RVC:%d, excType:%d\n", + val p = s3IO.predecode.pd(i) + XSDebug(io.inFire && s3IO.predecode.mask(i), "predecode(%d): brType:%d, br:%d, jal:%d, jalr:%d, call:%d, ret:%d, RVC:%d, excType:%d\n", i.U, p.brType, p.isBr, p.isJal, p.isJalr, p.isCall, p.isRet, p.isRVC, p.excType) } } if (EnbaleCFIPredLog) { val out = io.out - XSDebug(out.fire(), p"cfi_pred: fetchpc(${Hexadecimal(out.bits.pc)}) mask(${out.bits.mask}) brmask(${brMask.asUInt}) hist(${Hexadecimal(io.debug_hist)}) histPtr(${io.debug_histPtr})\n") + XSDebug(io.outFire, p"cfi_pred: fetchpc(${Hexadecimal(out.pc)}) mask(${out.mask}) brmask(${brMask.asUInt}) hist(${Hexadecimal(io.debug_hist)}) histPtr(${io.debug_histPtr})\n") } if (EnableBPUTimeRecord) { - io.out.bits.brInfo.map(_.debug_tage_cycle := GTimer()) + io.out.brInfo.map(_.debug_tage_cycle := 
GTimer()) } } @@ -429,15 +418,17 @@ abstract class BaseBPU extends XSModule with BranchPredictorComponents with HasB val outOfOrderBrInfo = Flipped(ValidIO(new BranchUpdateInfoWithHist)) // from ifu, frontend redirect val flush = Input(Vec(3, Bool())) - val cacheValid = Input(Bool()) // from if1 - val in = Flipped(ValidIO(new BPUReq)) + val in = Input(new BPUReq) + val inFire = Input(Vec(4, Bool())) // to if2/if3/if4 - val out = Vec(3, Decoupled(new BranchPrediction)) + val out = Vec(3, Output(new BranchPrediction)) // from if4 - val predecode = Flipped(ValidIO(new Predecode)) + val predecode = Input(new Predecode) + val realMask = Input(UInt(PredictWidth.W)) + val prevHalf = Input(new PrevHalfInstr) // to if4, some bpu info used for updating - val branchInfo = Decoupled(Vec(PredictWidth, new BranchInfo)) + val branchInfo = Output(Vec(PredictWidth, new BranchInfo)) }) def npc(pc: UInt, instCount: UInt) = pc + (instCount << 1.U) @@ -449,6 +440,11 @@ abstract class BaseBPU extends XSModule with BranchPredictorComponents with HasB val s2 = Module(new BPUStage2) val s3 = Module(new BPUStage3) + val s1_fire = io.inFire(0) + val s2_fire = io.inFire(1) + val s3_fire = io.inFire(2) + val s4_fire = io.inFire(3) + s1.io.flush := io.flush(0) s2.io.flush := io.flush(1) s3.io.flush := io.flush(2) @@ -457,36 +453,28 @@ abstract class BaseBPU extends XSModule with BranchPredictorComponents with HasB s2.io.in <> s1.io.out s3.io.in <> s2.io.out + s1.io.inFire := s1_fire + s2.io.inFire := s2_fire + s3.io.inFire := s3_fire + + s1.io.outFire := s2_fire + s2.io.outFire := s3_fire + s3.io.outFire := s4_fire + io.out(0) <> s1.io.pred io.out(1) <> s2.io.pred io.out(2) <> s3.io.pred - s1.io.predecode <> DontCare - s2.io.predecode <> DontCare - s3.io.predecode <> io.predecode - - io.branchInfo.valid := s3.io.out.valid - io.branchInfo.bits := s3.io.out.bits.brInfo - s3.io.out.ready := io.branchInfo.ready - - s1.io.recover <> DontCare - s2.io.recover <> DontCare - s3.io.recover.valid <> 
io.inOrderBrInfo.valid - s3.io.recover.bits <> io.inOrderBrInfo.bits.ui - - s1.io.cacheValid := DontCare - s2.io.cacheValid := io.cacheValid - s3.io.cacheValid := io.cacheValid - + io.branchInfo := s3.io.out.brInfo if (BPUDebug) { - XSDebug(io.branchInfo.fire(), "branchInfo sent!\n") + XSDebug(io.inFire(3), "branchInfo sent!\n") for (i <- 0 until PredictWidth) { - val b = io.branchInfo.bits(i) - XSDebug(io.branchInfo.fire(), "brInfo(%d): ubtbWrWay:%d, ubtbHit:%d, btbWrWay:%d, btbHitJal:%d, bimCtr:%d, fetchIdx:%d\n", + val b = io.branchInfo(i) + XSDebug(io.inFire(3), "brInfo(%d): ubtbWrWay:%d, ubtbHit:%d, btbWrWay:%d, btbHitJal:%d, bimCtr:%d, fetchIdx:%d\n", i.U, b.ubtbWriteWay, b.ubtbHits, b.btbWriteWay, b.btbHitJal, b.bimCtr, b.fetchIdx) val t = b.tageMeta - XSDebug(io.branchInfo.fire(), " tageMeta: pvder(%d):%d, altDiffers:%d, pvderU:%d, pvderCtr:%d, allocate(%d):%d\n", + XSDebug(io.inFire(3), " tageMeta: pvder(%d):%d, altDiffers:%d, pvderU:%d, pvderCtr:%d, allocate(%d):%d\n", t.provider.valid, t.provider.bits, t.altDiffers, t.providerU, t.providerCtr, t.allocate.valid, t.allocate.bits) } } @@ -497,17 +485,16 @@ abstract class BaseBPU extends XSModule with BranchPredictorComponents with HasB class FakeBPU extends BaseBPU { io.out.foreach(i => { // Provide not takens - i.valid := true.B - i.bits <> DontCare - i.bits.redirect := false.B + i <> DontCare + i.takens := 0.U }) io.branchInfo <> DontCare } - +@chiselName class BPU extends BaseBPU { //**********************Stage 1****************************// - val s1_fire = s1.io.in.fire() + val s1_resp_in = Wire(new PredictorResponse) val s1_brInfo_in = Wire(Vec(PredictWidth, new BranchInfo)) @@ -518,9 +505,9 @@ class BPU extends BaseBPU { val s1_inLatch = RegEnable(io.in, s1_fire) ubtb.io.flush := io.flush(0) // TODO: fix this - ubtb.io.pc.valid := s1_inLatch.valid - ubtb.io.pc.bits := s1_inLatch.bits.pc - ubtb.io.inMask := s1_inLatch.bits.inMask + ubtb.io.pc.valid := s2_fire + ubtb.io.pc.bits := s1_inLatch.pc + 
ubtb.io.inMask := s1_inLatch.inMask @@ -532,9 +519,9 @@ class BPU extends BaseBPU { } btb.io.flush := io.flush(0) // TODO: fix this - btb.io.pc.valid := io.in.valid - btb.io.pc.bits := io.in.bits.pc - btb.io.inMask := io.in.bits.inMask + btb.io.pc.valid := s1_fire + btb.io.pc.bits := io.in.pc + btb.io.inMask := io.in.inMask @@ -546,9 +533,9 @@ class BPU extends BaseBPU { } bim.io.flush := io.flush(0) // TODO: fix this - bim.io.pc.valid := io.in.valid - bim.io.pc.bits := io.in.bits.pc - bim.io.inMask := io.in.bits.inMask + bim.io.pc.valid := s1_fire + bim.io.pc.bits := io.in.pc + bim.io.inMask := io.in.inMask // Wrap bim response into resp_in and brInfo_in @@ -558,25 +545,23 @@ class BPU extends BaseBPU { } - s1.io.in.valid := io.in.valid - s1.io.in.bits.pc := io.in.bits.pc - s1.io.in.bits.mask := io.in.bits.inMask - s1.io.in.bits.target := npc(io.in.bits.pc, PopCount(io.in.bits.inMask)) // Deault target npc - s1.io.in.bits.resp <> s1_resp_in - s1.io.in.bits.brInfo <> s1_brInfo_in - s1.io.in.bits.saveHalfRVI := false.B + s1.io.inFire := s1_fire + s1.io.in.pc := io.in.pc + s1.io.in.mask := io.in.inMask + s1.io.in.resp <> s1_resp_in + s1.io.in.brInfo <> s1_brInfo_in - val s1_hist = RegEnable(io.in.bits.hist, enable=s1_fire) - val s2_hist = RegEnable(s1_hist, enable=s2.io.in.fire()) - val s3_hist = RegEnable(s2_hist, enable=s3.io.in.fire()) + val s1_hist = RegEnable(io.in.hist, enable=s1_fire) + val s2_hist = RegEnable(s1_hist, enable=s2_fire) + val s3_hist = RegEnable(s2_hist, enable=s3_fire) s1.io.debug_hist := s1_hist s2.io.debug_hist := s2_hist s3.io.debug_hist := s3_hist - val s1_histPtr = RegEnable(io.in.bits.histPtr, enable=s1_fire) - val s2_histPtr = RegEnable(s1_histPtr, enable=s2.io.in.fire()) - val s3_histPtr = RegEnable(s2_histPtr, enable=s3.io.in.fire()) + val s1_histPtr = RegEnable(io.in.histPtr, enable=s1_fire) + val s2_histPtr = RegEnable(s1_histPtr, enable=s2_fire) + val s3_histPtr = RegEnable(s2_histPtr, enable=s3_fire) s1.io.debug_histPtr := 
s1_histPtr s2.io.debug_histPtr := s2_histPtr @@ -584,29 +569,42 @@ class BPU extends BaseBPU { //**********************Stage 2****************************// tage.io.flush := io.flush(1) // TODO: fix this - tage.io.pc.valid := s1.io.out.fire() - tage.io.pc.bits := s1.io.out.bits.pc // PC from s1 + tage.io.pc.valid := s2_fire + tage.io.pc.bits := s2.io.in.pc // PC from s1 tage.io.hist := s1_hist // The inst is from s1 - tage.io.inMask := s1.io.out.bits.mask - tage.io.s3Fire := s3.io.in.fire() // Tell tage to march 1 stage - tage.io.bim <> s1.io.out.bits.resp.bim // Use bim results from s1 + tage.io.inMask := s2.io.in.mask + tage.io.s3Fire := s3_fire // Tell tage to march 1 stage + tage.io.bim <> s1.io.out.resp.bim // Use bim results from s1 //**********************Stage 3****************************// // Wrap tage response and meta into s3.io.in.bits // This is ugly loop.io.flush := io.flush(2) - loop.io.pc.valid := s2.io.out.fire() - loop.io.pc.bits := s2.io.out.bits.pc - loop.io.inMask := s2.io.out.bits.mask - - s3.io.in.bits.resp.tage <> tage.io.resp - s3.io.in.bits.resp.loop <> loop.io.resp + loop.io.pc.valid := s3_fire + loop.io.pc.bits := s3.io.in.pc + loop.io.inMask := s3.io.in.mask + loop.io.outFire := s4_fire + loop.io.respIn.taken := s3.io.pred.taken + loop.io.respIn.jmpIdx := s3.io.pred.jmpIdx + + + s3.io.in.resp.tage <> tage.io.resp + s3.io.in.resp.loop <> loop.io.resp for (i <- 0 until PredictWidth) { - s3.io.in.bits.brInfo(i).tageMeta := tage.io.meta(i) - s3.io.in.bits.brInfo(i).specCnt := loop.io.meta.specCnts(i) + s3.io.in.brInfo(i).tageMeta := tage.io.meta(i) + s3.io.in.brInfo(i).specCnt := loop.io.meta.specCnts(i) } + s3.s3IO.predecode <> io.predecode + + s3.s3IO.realMask := io.realMask + + s3.s3IO.prevHalf := io.prevHalf + + s3.s3IO.recover.valid <> io.inOrderBrInfo.valid + s3.s3IO.recover.bits <> io.inOrderBrInfo.bits.ui + if (BPUDebug) { if (debug_verbose) { val uo = ubtb.io.out diff --git a/src/main/scala/xiangshan/frontend/Bim.scala 
b/src/main/scala/xiangshan/frontend/Bim.scala index 67a0f24688e24c4865f58ebd9d3679eaddf868d6..d449f146b0daa98d153dbfb9b603352abbcb763f 100644 --- a/src/main/scala/xiangshan/frontend/Bim.scala +++ b/src/main/scala/xiangshan/frontend/Bim.scala @@ -1,141 +1,136 @@ -package xiangshan.frontend - -import chisel3._ -import chisel3.util._ -import xiangshan._ -import xiangshan.backend.ALUOpType -import utils._ -import chisel3.util.experimental.BoringUtils -import xiangshan.backend.decode.XSTrap - -trait BimParams extends HasXSParameter { - val BimBanks = PredictWidth - val BimSize = 4096 - val nRows = BimSize / BimBanks - val bypassEntries = 4 -} - -class BIM extends BasePredictor with BimParams{ - class BIMResp extends Resp { - val ctrs = Vec(PredictWidth, UInt(2.W)) - } - class BIMMeta extends Meta { - val ctrs = Vec(PredictWidth, UInt(2.W)) - } - class BIMFromOthers extends FromOthers {} - - class BIMIO extends DefaultBasePredictorIO { - val resp = Output(new BIMResp) - val meta = Output(new BIMMeta) - } - - override val io = IO(new BIMIO) - // Update logic - // 1 calculate new 2-bit saturated counter value - def satUpdate(old: UInt, len: Int, taken: Bool): UInt = { - val oldSatTaken = old === ((1 << len)-1).U - val oldSatNotTaken = old === 0.U - Mux(oldSatTaken && taken, ((1 << len)-1).U, - Mux(oldSatNotTaken && !taken, 0.U, - Mux(taken, old + 1.U, old - 1.U))) - } - - val bimAddr = new TableAddr(log2Up(BimSize), BimBanks) - - val pcLatch = RegEnable(io.pc.bits, io.pc.valid) - - val bim = List.fill(BimBanks) { - Module(new SRAMTemplate(UInt(2.W), set = nRows, shouldReset = false, holdRead = true)) - } - - val doing_reset = RegInit(true.B) - val resetRow = RegInit(0.U(log2Ceil(nRows).W)) - resetRow := resetRow + doing_reset - when (resetRow === (nRows-1).U) { doing_reset := false.B } - - val baseBank = bimAddr.getBank(io.pc.bits) - - val realMask = circularShiftRight(io.inMask, BimBanks, baseBank) - - // those banks whose indexes are less than baseBank are in the next 
row - val isInNextRow = VecInit((0 until BtbBanks).map(_.U < baseBank)) - - val baseRow = bimAddr.getBankIdx(io.pc.bits) - - val realRow = VecInit((0 until BimBanks).map(b => Mux(isInNextRow(b.U), (baseRow+1.U)(log2Up(nRows)-1, 0), baseRow))) - - val realRowLatch = VecInit(realRow.map(RegEnable(_, enable=io.pc.valid))) - - for (b <- 0 until BimBanks) { - bim(b).reset := reset.asBool - bim(b).io.r.req.valid := realMask(b) && io.pc.valid - bim(b).io.r.req.bits.setIdx := realRow(b) - } - - val bimRead = VecInit(bim.map(_.io.r.resp.data(0))) - - val baseBankLatch = bimAddr.getBank(pcLatch) - - // e.g: baseBank == 5 => (5, 6,..., 15, 0, 1, 2, 3, 4) - val bankIdxInOrder = VecInit((0 until BimBanks).map(b => (baseBankLatch +& b.U)(log2Up(BimBanks)-1, 0))) - - for (b <- 0 until BimBanks) { - val ctr = bimRead(bankIdxInOrder(b)) - io.resp.ctrs(b) := ctr - io.meta.ctrs(b) := ctr - } - - val u = io.update.bits.ui - - val updateBank = bimAddr.getBank(u.pc) - val updateRow = bimAddr.getBankIdx(u.pc) - - - val wrbypass_ctrs = Reg(Vec(bypassEntries, Vec(BimBanks, UInt(2.W)))) - val wrbypass_ctr_valids = Reg(Vec(bypassEntries, Vec(BimBanks, Bool()))) - val wrbypass_rows = Reg(Vec(bypassEntries, UInt(log2Up(nRows).W))) - val wrbypass_enq_idx = RegInit(0.U(log2Up(bypassEntries).W)) - - val wrbypass_hits = VecInit((0 until bypassEntries).map( i => - !doing_reset && wrbypass_rows(i) === updateRow)) - val wrbypass_hit = wrbypass_hits.reduce(_||_) - val wrbypass_hit_idx = PriorityEncoder(wrbypass_hits) - - val oldCtr = Mux(wrbypass_hit && wrbypass_ctr_valids(wrbypass_hit_idx)(updateBank), wrbypass_ctrs(wrbypass_hit_idx)(updateBank), u.brInfo.bimCtr) - val newTaken = u.taken - val newCtr = satUpdate(oldCtr, 2, newTaken) - // val oldSaturated = newCtr === oldCtr - - val needToUpdate = io.update.valid && u.pd.isBr - - when (reset.asBool) { wrbypass_ctr_valids.foreach(_.foreach(_ := false.B))} - - when (needToUpdate) { - when (wrbypass_hit) { - wrbypass_ctrs(wrbypass_hit_idx)(updateBank) := 
newCtr - wrbypass_ctr_valids(wrbypass_enq_idx)(updateBank) := true.B - } .otherwise { - wrbypass_ctrs(wrbypass_hit_idx)(updateBank) := newCtr - (0 until BimBanks).foreach(b => wrbypass_ctr_valids(wrbypass_enq_idx)(b) := false.B) // reset valid bits - wrbypass_ctr_valids(wrbypass_enq_idx)(updateBank) := true.B - wrbypass_rows(wrbypass_enq_idx) := updateRow - wrbypass_enq_idx := (wrbypass_enq_idx + 1.U)(log2Up(bypassEntries)-1,0) - } - } - - for (b <- 0 until BimBanks) { - bim(b).io.w.req.valid := needToUpdate && b.U === updateBank || doing_reset - bim(b).io.w.req.bits.setIdx := Mux(doing_reset, resetRow, updateRow) - bim(b).io.w.req.bits.data := Mux(doing_reset, 2.U(2.W), newCtr) - } - - if (BPUDebug && debug) { - XSDebug(doing_reset, "Reseting...\n") - XSDebug("[update] v=%d pc=%x pnpc=%x tgt=%x brTgt=%x\n", io.update.valid, u.pc, u.pnpc, u.target, u.brTarget) - XSDebug("[update] taken=%d isMisPred=%d", u.taken, u.isMisPred) - XSDebug(false, true.B, p"brTag=${u.brTag} pd.isBr=${u.pd.isBr} brInfo.bimCtr=${Binary(u.brInfo.bimCtr)}\n") - XSDebug("needToUpdate=%d updateBank=%x updateRow=%x newCtr=%b oldCtr=%b\n", needToUpdate, updateBank, updateRow, newCtr, oldCtr) - XSDebug("[wrbypass] hit=%d hits=%b\n", wrbypass_hit, wrbypass_hits.asUInt) - } - +package xiangshan.frontend + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import xiangshan.backend.ALUOpType +import utils._ +import xiangshan.backend.decode.XSTrap +import chisel3.experimental.chiselName + +trait BimParams extends HasXSParameter { + val BimBanks = PredictWidth + val BimSize = 4096 + val nRows = BimSize / BimBanks + val bypassEntries = 4 +} + +@chiselName +class BIM extends BasePredictor with BimParams { + class BIMResp extends Resp { + val ctrs = Vec(PredictWidth, UInt(2.W)) + } + class BIMMeta extends Meta { + val ctrs = Vec(PredictWidth, UInt(2.W)) + } + class BIMFromOthers extends FromOthers {} + + class BIMIO extends DefaultBasePredictorIO { + val resp = Output(new BIMResp) + val meta = 
Output(new BIMMeta) + } + + override val io = IO(new BIMIO) + override val debug = true + + val bimAddr = new TableAddr(log2Up(BimSize), BimBanks) + + val bankAlignedPC = bankAligned(io.pc.bits) + val pcLatch = RegEnable(bankAlignedPC, io.pc.valid) + + val bim = List.fill(BimBanks) { + Module(new SRAMTemplate(UInt(2.W), set = nRows, shouldReset = false, holdRead = true)) + } + + val doing_reset = RegInit(true.B) + val resetRow = RegInit(0.U(log2Ceil(nRows).W)) + resetRow := resetRow + doing_reset + when (resetRow === (nRows-1).U) { doing_reset := false.B } + + // this bank means cache bank + val startsAtOddBank = bankInGroup(bankAlignedPC)(0) + + val realMask = Mux(startsAtOddBank, + Cat(io.inMask(bankWidth-1,0), io.inMask(PredictWidth-1, bankWidth)), + io.inMask) + + + val isInNextRow = VecInit((0 until BimBanks).map(i => Mux(startsAtOddBank, (i < bankWidth).B, false.B))) + + val baseRow = bimAddr.getBankIdx(bankAlignedPC) + + val realRow = VecInit((0 until BimBanks).map(b => Mux(isInNextRow(b), (baseRow+1.U)(log2Up(nRows)-1, 0), baseRow))) + + val realRowLatch = VecInit(realRow.map(RegEnable(_, enable=io.pc.valid))) + + for (b <- 0 until BimBanks) { + bim(b).io.r.req.valid := realMask(b) && io.pc.valid + bim(b).io.r.req.bits.setIdx := realRow(b) + } + + val bimRead = VecInit(bim.map(_.io.r.resp.data(0))) + + val startsAtOddBankLatch = bankInGroup(pcLatch)(0) + + for (b <- 0 until BimBanks) { + val realBank = (if (b < bankWidth) Mux(startsAtOddBankLatch, (b+bankWidth).U, b.U) + else Mux(startsAtOddBankLatch, (b-bankWidth).U, b.U)) + val ctr = bimRead(realBank) + io.resp.ctrs(b) := ctr + io.meta.ctrs(b) := ctr + } + + val u = io.update.bits.ui + + val updateBank = bimAddr.getBank(u.pc) + val updateRow = bimAddr.getBankIdx(u.pc) + + + val wrbypass_ctrs = Reg(Vec(bypassEntries, Vec(BimBanks, UInt(2.W)))) + val wrbypass_ctr_valids = Reg(Vec(bypassEntries, Vec(BimBanks, Bool()))) + val wrbypass_rows = Reg(Vec(bypassEntries, UInt(log2Up(nRows).W))) + val 
wrbypass_enq_idx = RegInit(0.U(log2Up(bypassEntries).W)) + + val wrbypass_hits = VecInit((0 until bypassEntries).map( i => + !doing_reset && wrbypass_rows(i) === updateRow)) + val wrbypass_hit = wrbypass_hits.reduce(_||_) + val wrbypass_hit_idx = PriorityEncoder(wrbypass_hits) + + val oldCtr = Mux(wrbypass_hit && wrbypass_ctr_valids(wrbypass_hit_idx)(updateBank), wrbypass_ctrs(wrbypass_hit_idx)(updateBank), u.brInfo.bimCtr) + val newTaken = u.taken + val newCtr = satUpdate(oldCtr, 2, newTaken) + // val oldSaturated = newCtr === oldCtr + + val needToUpdate = io.update.valid && u.pd.isBr + + when (reset.asBool) { wrbypass_ctr_valids.foreach(_.foreach(_ := false.B))} + + when (needToUpdate) { + when (wrbypass_hit) { + wrbypass_ctrs(wrbypass_hit_idx)(updateBank) := newCtr + wrbypass_ctr_valids(wrbypass_hit_idx)(updateBank) := true.B // hit: validate the matching entry, not the enqueue slot + } .otherwise { + wrbypass_ctrs(wrbypass_enq_idx)(updateBank) := newCtr // miss: no entry matched, so fill the slot being allocated + (0 until BimBanks).foreach(b => wrbypass_ctr_valids(wrbypass_enq_idx)(b) := false.B) // reset valid bits + wrbypass_ctr_valids(wrbypass_enq_idx)(updateBank) := true.B + wrbypass_rows(wrbypass_enq_idx) := updateRow + wrbypass_enq_idx := (wrbypass_enq_idx + 1.U)(log2Up(bypassEntries)-1,0) + } + } + + for (b <- 0 until BimBanks) { + bim(b).io.w.req.valid := needToUpdate && b.U === updateBank || doing_reset + bim(b).io.w.req.bits.setIdx := Mux(doing_reset, resetRow, updateRow) + bim(b).io.w.req.bits.data := Mux(doing_reset, 2.U(2.W), newCtr) + } + + if (BPUDebug && debug) { + XSDebug(doing_reset, "Reseting...\n") + XSDebug("[update] v=%d pc=%x pnpc=%x tgt=%x brTgt=%x\n", io.update.valid, u.pc, u.pnpc, u.target, u.brTarget) + XSDebug("[update] taken=%d isMisPred=%d", u.taken, u.isMisPred) + XSDebug(false, true.B, p"brTag=${u.brTag} pd.isBr=${u.pd.isBr} brInfo.bimCtr=${Binary(u.brInfo.bimCtr)}\n") + XSDebug("needToUpdate=%d updateBank=%x updateRow=%x newCtr=%b oldCtr=%b\n", needToUpdate, updateBank, updateRow, newCtr, oldCtr) + XSDebug("[wrbypass] hit=%d hits=%b\n",
wrbypass_hit, wrbypass_hits.asUInt) + } + } \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/Btb.scala b/src/main/scala/xiangshan/frontend/Btb.scala index 1aada7f1021e58600d9b39113fdf3cc71c63cb06..a0444c582eaf26a06ddc2a85a512cc3ea8f960c8 100644 --- a/src/main/scala/xiangshan/frontend/Btb.scala +++ b/src/main/scala/xiangshan/frontend/Btb.scala @@ -1,12 +1,14 @@ package xiangshan.frontend import chisel3._ +import chisel3.stage.{ChiselGeneratorAnnotation, ChiselStage} import chisel3.util._ import xiangshan._ import xiangshan.backend.ALUOpType import utils._ -import chisel3.util.experimental.BoringUtils import xiangshan.backend.decode.XSTrap +import chisel3.experimental.chiselName + import scala.math.min @@ -70,7 +72,9 @@ class BTB extends BasePredictor with BTBParams{ override val io = IO(new BTBIO) val btbAddr = new TableAddr(log2Up(BtbSize/BtbWays), BtbBanks) - val pcLatch = RegEnable(io.pc.bits, io.pc.valid) + val bankAlignedPC = bankAligned(io.pc.bits) + + val pcLatch = RegEnable(bankAlignedPC, io.pc.valid) val data = List.fill(BtbWays) { List.fill(BtbBanks) { @@ -82,47 +86,53 @@ class BTB extends BasePredictor with BTBParams{ Module(new SRAMTemplate(new BtbMetaEntry, set = nRows, shouldReset = true, holdRead = true)) } } - val edata = Module(new SRAMTemplate(UInt(VAddrBits.W), set = extendedNRows, shouldReset = true, holdRead = true)) + val edata = List.fill(2)(Module(new SRAMTemplate(UInt(VAddrBits.W), set = extendedNRows/2, shouldReset = true, holdRead = true))) // BTB read requests - val baseBank = btbAddr.getBank(io.pc.bits) - val realMask = circularShiftLeft(io.inMask, BtbBanks, baseBank) + // this bank means cache bank + val startsAtOddBank = bankInGroup(bankAlignedPC)(0) - val realMaskLatch = RegEnable(realMask, io.pc.valid) + val baseBank = btbAddr.getBank(bankAlignedPC) - // those banks whose indexes are less than baseBank are in the next row - val isInNextRow = VecInit((0 until BtbBanks).map(_.U < baseBank)) + val realMask = 
Mux(startsAtOddBank, + Cat(io.inMask(bankWidth-1,0), io.inMask(PredictWidth-1, bankWidth)), + io.inMask) + val realMaskLatch = RegEnable(realMask, io.pc.valid) - val baseRow = btbAddr.getBankIdx(io.pc.bits) + val isInNextRow = VecInit((0 until BtbBanks).map(i => Mux(startsAtOddBank, (i < bankWidth).B, false.B))) + val baseRow = btbAddr.getBankIdx(bankAlignedPC) + val nextRowStartsUp = baseRow.andR - val realRow = VecInit((0 until BtbBanks).map(b => Mux(isInNextRow(b.U), (baseRow+1.U)(log2Up(nRows)-1, 0), baseRow))) + val realRow = VecInit((0 until BtbBanks).map(b => Mux(isInNextRow(b), (baseRow+1.U)(log2Up(nRows)-1, 0), baseRow))) val realRowLatch = VecInit(realRow.map(RegEnable(_, enable=io.pc.valid))) for (w <- 0 until BtbWays) { for (b <- 0 until BtbBanks) { - meta(w)(b).reset := reset.asBool meta(w)(b).io.r.req.valid := realMask(b) && io.pc.valid meta(w)(b).io.r.req.bits.setIdx := realRow(b) - data(w)(b).reset := reset.asBool data(w)(b).io.r.req.valid := realMask(b) && io.pc.valid data(w)(b).io.r.req.bits.setIdx := realRow(b) } } - edata.reset := reset.asBool - edata.io.r.req.valid := io.pc.valid - edata.io.r.req.bits.setIdx := realRow(0) // Use the baseRow + for (b <- 0 to 1) { + edata(b).io.r.req.valid := io.pc.valid + val row = if (b == 0) { Mux(startsAtOddBank, realRow(bankWidth), realRow(0)) } + else { Mux(startsAtOddBank, realRow(0), realRow(bankWidth))} + edata(b).io.r.req.bits.setIdx := row + } // Entries read from SRAM val metaRead = VecInit((0 until BtbWays).map(w => VecInit((0 until BtbBanks).map( b => meta(w)(b).io.r.resp.data(0))))) val dataRead = VecInit((0 until BtbWays).map(w => VecInit((0 until BtbBanks).map( b => data(w)(b).io.r.resp.data(0))))) - val edataRead = edata.io.r.resp.data(0) + val edataRead = VecInit((0 to 1).map(i => edata(i).io.r.resp.data(0))) val baseBankLatch = btbAddr.getBank(pcLatch) + val startsAtOddBankLatch = bankInGroup(pcLatch)(0) val baseTag = btbAddr.getTag(pcLatch) val tagIncremented = VecInit((0 until 
BtbBanks).map(b => RegEnable(isInNextRow(b.U) && nextRowStartsUp, io.pc.valid))) @@ -165,20 +175,22 @@ class BTB extends BasePredictor with BTBParams{ b => Mux(bankHits(b), bankHitWays(b), allocWays(b)) )) - // e.g: baseBank == 5 => (5, 6,..., 15, 0, 1, 2, 3, 4) - val bankIdxInOrder = VecInit((0 until BtbBanks).map(b => (baseBankLatch +& b.U)(log2Up(BtbBanks)-1,0))) for (b <- 0 until BtbBanks) { - val meta_entry = metaRead(bankHitWays(bankIdxInOrder(b)))(bankIdxInOrder(b)) - val data_entry = dataRead(bankHitWays(bankIdxInOrder(b)))(bankIdxInOrder(b)) + val realBank = (if (b < bankWidth) Mux(startsAtOddBankLatch, (b+bankWidth).U, b.U) + else Mux(startsAtOddBankLatch, (b-bankWidth).U, b.U)) + val meta_entry = metaRead(bankHitWays(realBank))(realBank) + val data_entry = dataRead(bankHitWays(realBank))(realBank) + val edataBank = (if (b < bankWidth) Mux(startsAtOddBankLatch, 1.U, 0.U) + else Mux(startsAtOddBankLatch, 0.U, 1.U)) // Use real pc to calculate the target - io.resp.targets(b) := Mux(data_entry.extended, edataRead, (pcLatch.asSInt + (b << 1).S + data_entry.offset).asUInt) - io.resp.hits(b) := bankHits(bankIdxInOrder(b)) + io.resp.targets(b) := Mux(data_entry.extended, edataRead(edataBank), (pcLatch.asSInt + (b << 1).S + data_entry.offset).asUInt) + io.resp.hits(b) := bankHits(realBank) io.resp.types(b) := meta_entry.btbType io.resp.isRVC(b) := meta_entry.isRVC - io.meta.writeWay(b) := writeWay(bankIdxInOrder(b)) - io.meta.hitJal(b) := bankHits(bankIdxInOrder(b)) && meta_entry.btbType === BTBtype.J + io.meta.writeWay(b) := writeWay(realBank) + io.meta.hitJal(b) := bankHits(realBank) && meta_entry.btbType === BTBtype.J } def pdInfoToBTBtype(pd: PreDecodeInfo) = { @@ -200,13 +212,14 @@ class BTB extends BasePredictor with BTBParams{ val updateWay = u.brInfo.btbWriteWay val updateBankIdx = btbAddr.getBank(u.pc) + val updateEBank = updateBankIdx(log2Ceil(BtbBanks)-1) // highest bit of bank idx val updateRow = btbAddr.getBankIdx(u.pc) val updateType = 
pdInfoToBTBtype(u.pd) val metaWrite = BtbMetaEntry(btbAddr.getTag(u.pc), updateType, u.pd.isRVC) val dataWrite = BtbDataEntry(new_offset, new_extended) val jalFirstEncountered = !u.isMisPred && !u.brInfo.btbHitJal && updateType === BTBtype.J - val updateValid = io.update.valid && (u.isMisPred || jalFirstEncountered || !u.isMisPred && u.pd.isBr) + val updateValid = io.update.valid && (u.isMisPred || jalFirstEncountered) // Update btb for (w <- 0 until BtbWays) { for (b <- 0 until BtbBanks) { @@ -218,10 +231,12 @@ class BTB extends BasePredictor with BTBParams{ data(w)(b).io.w.req.bits.data := dataWrite } } - - edata.io.w.req.valid := updateValid && new_extended - edata.io.w.req.bits.setIdx := updateRow - edata.io.w.req.bits.data := u.target + + for (b <- 0 to 1) { + edata(b).io.w.req.valid := updateValid && new_extended && b.U === updateEBank + edata(b).io.w.req.bits.setIdx := updateRow + edata(b).io.w.req.bits.data := u.target + } if (BPUDebug && debug) { @@ -234,7 +249,7 @@ class BTB extends BasePredictor with BTBParams{ }) val validLatch = RegNext(io.pc.valid) - XSDebug(io.pc.valid, "read: pc=0x%x, baseBank=%d, realMask=%b\n", io.pc.bits, baseBank, realMask) + XSDebug(io.pc.valid, "read: pc=0x%x, baseBank=%d, realMask=%b\n", bankAlignedPC, baseBank, realMask) XSDebug(validLatch, "read_resp: pc=0x%x, readIdx=%d-------------------------------\n", pcLatch, btbAddr.getIdx(pcLatch)) if (debug_verbose) { @@ -245,6 +260,9 @@ class BTB extends BasePredictor with BTBParams{ } } } + // e.g: baseBank == 5 => (5, 6,..., 15, 0, 1, 2, 3, 4) + val bankIdxInOrder = VecInit((0 until BtbBanks).map(b => (baseBankLatch +& b.U)(log2Up(BtbBanks)-1,0))) + for (i <- 0 until BtbBanks) { val idx = bankIdxInOrder(i) XSDebug(validLatch && bankHits(bankIdxInOrder(i)), "resp(%d): bank(%d) hits, tgt=%x, isRVC=%d, type=%d\n", diff --git a/src/main/scala/xiangshan/frontend/FakeICache.scala b/src/main/scala/xiangshan/frontend/FakeICache.scala index 
32dfd65c2bccea5afaa4d8ae9979b5d11a61926f..19c1d73b46a511f236c009f7735c6666dca7b435 100644 --- a/src/main/scala/xiangshan/frontend/FakeICache.scala +++ b/src/main/scala/xiangshan/frontend/FakeICache.scala @@ -4,7 +4,7 @@ import chisel3._ import chisel3.util._ import device.RAMHelper import xiangshan._ -import utils.{Debug, GTimer, XSDebug} +import utils.{GTimer, XSDebug} import xiangshan.backend.decode.isa import xiangshan.backend.decode.Decoder diff --git a/src/main/scala/xiangshan/frontend/Frontend.scala b/src/main/scala/xiangshan/frontend/Frontend.scala index f0fbb05223aa4377b9ce2459be26e749a15afe6f..bfc58954129409b4922c52e6dd952b4f01c6654f 100644 --- a/src/main/scala/xiangshan/frontend/Frontend.scala +++ b/src/main/scala/xiangshan/frontend/Frontend.scala @@ -15,10 +15,12 @@ class Frontend extends XSModule { val icacheToTlb = Flipped(new BlockTlbRequestIO) val ptw = new TlbPtwIO val backend = new FrontendToBackendIO + val sfence = Input(new SfenceBundle) + val tlbCsr = Input(new TlbCsrBundle) }) val ifu = Module(new IFU) - val ibuffer = if(EnableLB) Module(new LoopBuffer) else Module(new Ibuffer) + val ibuffer = Module(new Ibuffer) val needFlush = io.backend.redirect.valid @@ -33,6 +35,8 @@ class Frontend extends XSModule { //itlb to ptw io.ptw <> TLB( in = Seq(io.icacheToTlb), + sfence = io.sfence, + csr = io.tlbCsr, width = 1, isDtlb = false, shouldBlock = true @@ -50,4 +54,4 @@ class Frontend extends XSModule { // } -} +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/IFU.scala b/src/main/scala/xiangshan/frontend/IFU.scala index cea3de504cbbdf123d6647f17a1de6abba82674b..4d655bc758dd08b930659152ec207b0086915f0d 100644 --- a/src/main/scala/xiangshan/frontend/IFU.scala +++ b/src/main/scala/xiangshan/frontend/IFU.scala @@ -6,94 +6,161 @@ import device.RAMHelper import xiangshan._ import utils._ import xiangshan.cache._ +import chisel3.experimental.chiselName -trait HasIFUConst { this: XSModule => +trait HasIFUConst extends 
HasXSParameter { val resetVector = 0x80000000L//TODO: set reset vec - val groupAlign = log2Up(FetchWidth * 4 * 2) - def groupPC(pc: UInt): UInt = Cat(pc(VAddrBits-1, groupAlign), 0.U(groupAlign.W)) + def align(pc: UInt, bytes: Int): UInt = Cat(pc(VAddrBits-1, log2Ceil(bytes)), 0.U(log2Ceil(bytes).W)) + val groupBytes = FetchWidth * 4 * 2 // correspond to cache line size + val groupOffsetBits = log2Ceil(groupBytes) + val nBanksInPacket = 2 + val bankBytes = PredictWidth * 2 / nBanksInPacket + val nBanksInGroup = groupBytes / bankBytes + val bankWidth = PredictWidth / nBanksInPacket + val bankOffsetBits = log2Ceil(bankBytes) + // (0, nBanksInGroup-1) + def bankInGroup(pc: UInt) = pc(groupOffsetBits-1,bankOffsetBits) + def isInLastBank(pc: UInt) = bankInGroup(pc) === (nBanksInGroup-1).U + // (0, bankBytes/2-1) + def offsetInBank(pc: UInt) = pc(bankOffsetBits-1,1) + def bankAligned(pc: UInt) = align(pc, bankBytes) + def groupAligned(pc: UInt) = align(pc, groupBytes) // each 1 bit in mask stands for 2 Bytes - def mask(pc: UInt): UInt = (Fill(PredictWidth * 2, 1.U(1.W)) >> pc(groupAlign - 1, 1))(PredictWidth - 1, 0) - def snpc(pc: UInt): UInt = pc + (PopCount(mask(pc)) << 1) - + // 8 bits, in which only the first 7 bits could be 0 + def maskFirstHalf(pc: UInt): UInt = ((~(0.U(bankWidth.W))) >> offsetInBank(pc))(bankWidth-1,0) + // when in loop(buffer), we need to make use of the full packet + // and get the real mask in iCacheResp from loop buffer + // we may make predictions on more instructions than we could get from loop buffer + // and this will be handled in if4 + def maskLastHalf(pc: UInt, inLoop: Bool = false.B): UInt = Mux(isInLastBank(pc) && !inLoop, 0.U(bankWidth.W), ~0.U(bankWidth.W)) + def mask(pc: UInt, inLoop: Bool = false.B): UInt = Reverse(Cat(maskFirstHalf(pc), maskLastHalf(pc, inLoop))) + def snpc(pc: UInt, inLoop: Bool = false.B): UInt = pc + (PopCount(mask(pc, inLoop)) << 1) + + val enableGhistRepair = true val IFUDebug = true } class 
GlobalHistoryInfo() extends XSBundle { + val nowPtr = UInt(log2Ceil(ExtHistoryLength).W) val sawNTBr = Bool() val takenOnBr = Bool() - val saveHalfRVI = Bool() + // val saveHalfRVI = Bool() def shifted = takenOnBr || sawNTBr - def newPtr(ptr: UInt) = Mux(shifted, ptr - 1.U, ptr) + def newPtr(ptr: UInt = nowPtr): UInt = Mux(shifted, ptr - 1.U, ptr) + + final def === (that: GlobalHistoryInfo): Bool = { + shifted === that.shifted && + takenOnBr === that.takenOnBr && + nowPtr === that.nowPtr + } + + final def =/= (that: GlobalHistoryInfo): Bool = !(this === that) + + // def update(): GlobalHistoryInfo = { + // val g = WireInit(this) + // g.nowPtr := nowPtr - Mux(shifted, 1.U, 0.U) + // g.sawNTBr := Mux(saveHalfRVI, sawNTBr, false.B) + // g.takenOnBr := Mux(saveHalfRVI, takenOnBr, false.B) + // // g.saveHalfRVI := false.B + // g + // } + implicit val name = "IFU" - def debug = XSDebug("[GHInfo] sawNTBr=%d, takenOnBr=%d, saveHalfRVI=%d\n", sawNTBr, takenOnBr, saveHalfRVI) + def debug(where: String) = XSDebug(p"[${where}_GHInfo] sawNTBr=${sawNTBr}, takenOnBr=${takenOnBr}\n") // override def toString(): String = "histPtr=%d, sawNTBr=%d, takenOnBr=%d, saveHalfRVI=%d".format(histPtr, sawNTBr, takenOnBr, saveHalfRVI) } class IFUIO extends XSBundle { val fetchPacket = DecoupledIO(new FetchPacket) - val redirect = Flipped(ValidIO(new Redirect)) + val redirect = Flipped(ValidIO(UInt(VAddrBits.W))) val outOfOrderBrInfo = Flipped(ValidIO(new BranchUpdateInfo)) val inOrderBrInfo = Flipped(ValidIO(new BranchUpdateInfo)) val icacheReq = DecoupledIO(new ICacheReq) val icacheResp = Flipped(DecoupledIO(new ICacheResp)) val icacheFlush = Output(UInt(2.W)) + // val loopBufPar = Flipped(new LoopBufferParameters) } +class PrevHalfInstr extends XSBundle { + val valid = Bool() + val taken = Bool() + val ghInfo = new GlobalHistoryInfo() + val fetchpc = UInt(VAddrBits.W) // only for debug + val idx = UInt(VAddrBits.W) // only for debug + val pc = UInt(VAddrBits.W) + val target = 
UInt(VAddrBits.W) + val instr = UInt(16.W) + val ipf = Bool() + val newPtr = UInt(log2Up(ExtHistoryLength).W) +} +@chiselName class IFU extends XSModule with HasIFUConst { val io = IO(new IFUIO) val bpu = BPU(EnableBPU) val pd = Module(new PreDecode) + val loopBuffer = if(EnableLB) { Module(new LoopBuffer) } else { Module(new FakeLoopBuffer) } val if2_redirect, if3_redirect, if4_redirect = WireInit(false.B) val if1_flush, if2_flush, if3_flush, if4_flush = WireInit(false.B) - if4_flush := io.redirect.valid + val loopBufPar = loopBuffer.io.loopBufPar + val inLoop = WireInit(loopBuffer.io.out.valid) + val icacheResp = WireInit(Mux(inLoop, loopBuffer.io.out.bits, io.icacheResp.bits)) + + if4_flush := io.redirect.valid || loopBufPar.LBredirect.valid if3_flush := if4_flush || if4_redirect if2_flush := if3_flush || if3_redirect if1_flush := if2_flush || if2_redirect + loopBuffer.io.flush := io.redirect.valid + //********************** IF1 ****************************// val if1_valid = !reset.asBool && GTimer() > 500.U val if1_npc = WireInit(0.U(VAddrBits.W)) val if2_ready = WireInit(false.B) - val if1_fire = if1_valid && (if2_ready || if1_flush) && io.icacheReq.ready + val if1_fire = if1_valid && (if2_ready || if1_flush) && (inLoop || io.icacheReq.ready) - val if1_histPtr, if2_histPtr, if3_histPtr, if4_histPtr = Wire(UInt(log2Up(ExtHistoryLength).W)) - val if2_newPtr, if3_newPtr, if4_newPtr = Wire(UInt(log2Up(ExtHistoryLength).W)) - + // val if2_newPtr, if3_newPtr, if4_newPtr = Wire(UInt(log2Up(ExtHistoryLength).W)) + val extHist = RegInit(VecInit(Seq.fill(ExtHistoryLength)(0.U(1.W)))) - val shiftPtr = WireInit(false.B) + val updatePtr = WireInit(false.B) val newPtr = Wire(UInt(log2Up(ExtHistoryLength).W)) - val ptr = Mux(shiftPtr, newPtr, if1_histPtr) + val if1_histPtr = RegEnable(next=newPtr, init=0.U(log2Up(ExtHistoryLength).W), enable=updatePtr) + val ptr = Mux(updatePtr, newPtr, if1_histPtr) val hist = Wire(Vec(HistoryLength, UInt(1.W))) for (i <- 0 until 
HistoryLength) { hist(i) := extHist(ptr + i.U) } - shiftPtr := false.B + updatePtr := false.B newPtr := if1_histPtr + - - val if1_GHInfo = Wire(new GlobalHistoryInfo()) - if1_GHInfo := 0.U.asTypeOf(new GlobalHistoryInfo) + def wrapGHInfo(bp: BranchPrediction, ptr: UInt) = { + val ghi = Wire(new GlobalHistoryInfo()) + ghi.sawNTBr := bp.hasNotTakenBrs + ghi.takenOnBr := bp.takenOnBr + // ghi.saveHalfRVI := bp.saveHalfRVI + ghi.nowPtr := ptr + ghi + } //********************** IF2 ****************************// - val if2_valid = RegEnable(next = if1_valid, init = false.B, enable = if1_fire) + val if2_valid = RegInit(init = false.B) val if3_ready = WireInit(false.B) val if2_fire = if2_valid && if3_ready && !if2_flush val if2_pc = RegEnable(next = if1_npc, init = resetVector.U, enable = if1_fire) - val if2_snpc = snpc(if2_pc) - val if2_GHInfo = RegEnable(if1_GHInfo, if1_fire) + val if2_snpc = snpc(if2_pc, inLoop) val if2_predHistPtr = RegEnable(ptr, enable=if1_fire) if2_ready := if2_fire || !if2_valid || if2_flush - when (if2_flush) { if2_valid := if1_fire } - .elsewhen (if1_fire) { if2_valid := if1_valid } - .elsewhen (if2_fire) { if2_valid := false.B } + when (if1_fire) { if2_valid := if1_valid } + .elsewhen (if2_flush) { if2_valid := false.B } + .elsewhen (if2_fire) { if2_valid := false.B } when (RegNext(reset.asBool) && !reset.asBool) { if1_npc := resetVector.U(VAddrBits.W) @@ -103,253 +170,274 @@ class IFU extends XSModule with HasIFUConst if1_npc := RegNext(if1_npc) } - val if2_bp = bpu.io.out(0).bits + val if2_bp = bpu.io.out(0) + + val if2_GHInfo = wrapGHInfo(if2_bp, if2_predHistPtr) // if taken, bp_redirect should be true // when taken on half RVI, we suppress this redirect signal - if2_redirect := if2_fire && bpu.io.out(0).valid && if2_bp.redirect && !if2_bp.saveHalfRVI + if2_redirect := if2_fire && if2_bp.taken when (if2_redirect) { if1_npc := if2_bp.target } - - val if2_realGHInfo = Wire(new GlobalHistoryInfo()) - if2_realGHInfo.sawNTBr := 
if2_bp.hasNotTakenBrs - if2_realGHInfo.takenOnBr := if2_bp.takenOnBr - if2_realGHInfo.saveHalfRVI := if2_bp.saveHalfRVI - - when (if2_fire && if2_realGHInfo.shifted) { - shiftPtr := true.B + when (if2_fire && if2_GHInfo.shifted) { + val if2_newPtr = if2_GHInfo.newPtr() + updatePtr := true.B newPtr := if2_newPtr - } - when (if2_realGHInfo.shifted && if2_newPtr >= ptr) { - hist(if2_newPtr-ptr) := if2_realGHInfo.takenOnBr.asUInt + extHist(if2_newPtr) := if2_GHInfo.takenOnBr.asUInt } //********************** IF3 ****************************// - val if3_valid = RegEnable(next = if2_valid, init = false.B, enable = if2_fire) + val if3_valid = RegInit(init = false.B) val if4_ready = WireInit(false.B) - val if3_fire = if3_valid && if4_ready && io.icacheResp.valid && !if3_flush + val if3_fire = if3_valid && if4_ready && (inLoop || io.icacheResp.valid) && !if3_flush val if3_pc = RegEnable(if2_pc, if2_fire) - val if3_GHInfo = RegEnable(if2_realGHInfo, if2_fire) val if3_predHistPtr = RegEnable(if2_predHistPtr, enable=if2_fire) + val if3_lastGHInfo = RegEnable(if2_GHInfo, enable=if2_fire) + // val if3_nextValidPC = Mux(if2_valid) if3_ready := if3_fire || !if3_valid || if3_flush - when (if3_flush) { if3_valid := false.B } - .elsewhen (if2_fire) { if3_valid := if2_valid } + when (if3_flush) { if3_valid := false.B } + .elsewhen (if2_fire) { if3_valid := true.B } .elsewhen (if3_fire) { if3_valid := false.B } - val if3_bp = bpu.io.out(1).bits - - val if3_realGHInfo = Wire(new GlobalHistoryInfo()) - if3_realGHInfo.sawNTBr := if3_bp.hasNotTakenBrs - if3_realGHInfo.takenOnBr := if3_bp.takenOnBr - if3_realGHInfo.saveHalfRVI := if3_bp.saveHalfRVI - - class PrevHalfInstr extends Bundle { - val valid = Bool() - val taken = Bool() - val ghInfo = new GlobalHistoryInfo() - val fetchpc = UInt(VAddrBits.W) // only for debug - val idx = UInt(VAddrBits.W) // only for debug - val pc = UInt(VAddrBits.W) - val target = UInt(VAddrBits.W) - val instr = UInt(16.W) - val ipf = Bool() - val newPtr = 
UInt(log2Up(ExtHistoryLength).W) - } + val if3_bp = bpu.io.out(1) + + val if3_GHInfo = wrapGHInfo(if3_bp, if3_predHistPtr) + + val prevHalfInstrReq = Wire(new PrevHalfInstr) + // only valid when if4_fire + val hasPrevHalfInstrReq = prevHalfInstrReq.valid val if3_prevHalfInstr = RegInit(0.U.asTypeOf(new PrevHalfInstr)) - val if4_prevHalfInstr = Wire(new PrevHalfInstr) + // val if4_prevHalfInstr = Wire(new PrevHalfInstr) // 32-bit instr crosses 2 pages, and the higher 16-bit triggers page fault val crossPageIPF = WireInit(false.B) - when (if4_prevHalfInstr.valid) { - if3_prevHalfInstr := if4_prevHalfInstr - } - val prevHalfInstr = Mux(if4_prevHalfInstr.valid, if4_prevHalfInstr, if3_prevHalfInstr) - + + val if3_pendingPrevHalfInstr = if3_prevHalfInstr.valid + // the previous half of RVI instruction waits until it meets its last half - val if3_hasPrevHalfInstr = prevHalfInstr.valid && (prevHalfInstr.pc + 2.U) === if3_pc + val if3_prevHalfInstrMet = if3_pendingPrevHalfInstr && (if3_prevHalfInstr.pc + 2.U) === if3_pc && if3_valid // set to invalid once consumed or redirect from backend - val prevHalfConsumed = if3_hasPrevHalfInstr && if3_fire || if4_flush - when (prevHalfConsumed) { + val if3_prevHalfConsumed = if3_prevHalfInstrMet && if3_fire + val if3_prevHalfFlush = if4_flush + when (hasPrevHalfInstrReq) { + if3_prevHalfInstr := prevHalfInstrReq + }.elsewhen (if3_prevHalfConsumed || if3_prevHalfFlush) { if3_prevHalfInstr.valid := false.B } // when bp signal a redirect, we distinguish between taken and not taken // if taken and saveHalfRVI is true, we do not redirect to the target - if3_redirect := if3_fire && bpu.io.out(1).valid && (if3_hasPrevHalfInstr && prevHalfInstr.taken || if3_bp.redirect && (if3_bp.taken && !if3_bp.saveHalfRVI || !if3_bp.taken) ) - when (if3_redirect) { - when (!(if3_hasPrevHalfInstr && prevHalfInstr.taken)) { - if1_npc := if3_bp.target - when (if3_realGHInfo.shifted){ - shiftPtr := true.B - newPtr := if3_newPtr - } - } + def 
if3_nextValidPCNotEquals(pc: UInt) = !if2_valid || if2_valid && if2_pc =/= pc + val if3_prevHalfMetRedirect = if3_pendingPrevHalfInstr && if3_prevHalfInstrMet && if3_prevHalfInstr.taken && if3_nextValidPCNotEquals(if3_prevHalfInstr.target) + val if3_prevHalfNotMetRedirect = if3_pendingPrevHalfInstr && !if3_prevHalfInstrMet && if3_nextValidPCNotEquals(if3_prevHalfInstr.pc + 2.U) + val if3_predTakenRedirect = !if3_pendingPrevHalfInstr && if3_bp.taken && if3_nextValidPCNotEquals(if3_bp.target) + val if3_predNotTakenRedirect = !if3_pendingPrevHalfInstr && !if3_bp.taken && if3_nextValidPCNotEquals(snpc(if3_pc, inLoop)) + // when pendingPrevHalfInstr, if3_GHInfo is set to the info of last prev half instr + val if3_ghInfoNotIdenticalRedirect = !if3_pendingPrevHalfInstr && if3_GHInfo =/= if3_lastGHInfo && enableGhistRepair.B + + if3_redirect := if3_fire && ( + // prevHalf is consumed but the next packet is not where it meant to be + // we do not handle this condition because of the burden of building a correct GHInfo + // prevHalfMetRedirect || + // prevHalf does not match if3_pc and the next fetch packet is not snpc + if3_prevHalfNotMetRedirect || + // pred taken and next fetch packet is not the predicted target + if3_predTakenRedirect || + // pred not taken and next fetch packet is not snpc + if3_predNotTakenRedirect || + // GHInfo from last pred does not corresponds with this packet + if3_ghInfoNotIdenticalRedirect + ) + + val if3_target = WireInit(snpc(if3_pc)) + + /* when (prevHalfMetRedirect) { + if1_npc := if3_prevHalfInstr.target + }.else */ + when (if3_prevHalfNotMetRedirect) { + if3_target := if3_prevHalfInstr.pc + 2.U + }.elsewhen (if3_predTakenRedirect) { + if3_target := if3_bp.target + }.elsewhen (if3_predNotTakenRedirect) { + if3_target := snpc(if3_pc) + }.elsewhen (if3_ghInfoNotIdenticalRedirect) { + if3_target := Mux(if3_bp.taken, if3_bp.target, snpc(if3_pc)) } - // when it does not redirect, we still need to modify hist(wire) - when(if3_realGHInfo.shifted 
&& if3_newPtr >= ptr) { - hist(if3_newPtr-ptr) := if3_realGHInfo.takenOnBr - } - when (if3_hasPrevHalfInstr && prevHalfInstr.ghInfo.shifted && prevHalfInstr.newPtr >= ptr) { - hist(prevHalfInstr.newPtr-ptr) := prevHalfInstr.ghInfo.takenOnBr + when (if3_redirect) { + if1_npc := if3_target + val if3_newPtr = if3_GHInfo.newPtr() + updatePtr := true.B + newPtr := if3_newPtr + extHist(if3_newPtr) := if3_GHInfo.takenOnBr.asUInt } //********************** IF4 ****************************// val if4_pd = RegEnable(pd.io.out, if3_fire) - val if4_ipf = RegEnable(io.icacheResp.bits.ipf || if3_hasPrevHalfInstr && prevHalfInstr.ipf, if3_fire) + val if4_ipf = RegEnable(icacheResp.ipf || if3_prevHalfInstrMet && if3_prevHalfInstr.ipf, if3_fire) val if4_crossPageIPF = RegEnable(crossPageIPF, if3_fire) val if4_valid = RegInit(false.B) val if4_fire = if4_valid && io.fetchPacket.ready val if4_pc = RegEnable(if3_pc, if3_fire) + val if4_lastGHInfo = RegEnable(if3_GHInfo, if3_fire) + // This is the real mask given from icache or loop buffer + val if4_mask = RegEnable(icacheResp.mask, if3_fire) + val if4_snpc = Mux(inLoop, if4_pc + (PopCount(if4_mask) << 1), snpc(if4_pc)) - val if4_GHInfo = RegEnable(if3_realGHInfo, if3_fire) + val if4_predHistPtr = RegEnable(if3_predHistPtr, enable=if3_fire) - if4_ready := (if4_fire || !if4_valid || if4_flush) && GTimer() > 500.U + // wait until prevHalfInstr written into reg + if4_ready := (if4_fire && !hasPrevHalfInstrReq || !if4_valid || if4_flush) && GTimer() > 500.U when (if4_flush) { if4_valid := false.B } - .elsewhen (if3_fire) { if4_valid := if3_valid } - .elsewhen(if4_fire) { if4_valid := false.B } - + .elsewhen (if3_fire) { if4_valid := true.B } + .elsewhen (if4_fire) { if4_valid := false.B } + val if4_bp = Wire(new BranchPrediction) - if4_bp := bpu.io.out(2).bits - - val if4_realGHInfo = Wire(new GlobalHistoryInfo()) - if4_realGHInfo.sawNTBr := if4_bp.hasNotTakenBrs - if4_realGHInfo.takenOnBr := if4_bp.takenOnBr - if4_realGHInfo.saveHalfRVI := 
if4_bp.saveHalfRVI - - - val if4_cfi_jal = if4_pd.instrs(if4_bp.jmpIdx) - val if4_cfi_jal_tgt = if4_pd.pc(if4_bp.jmpIdx) + Mux(if4_pd.pd(if4_bp.jmpIdx).isRVC, - SignExt(Cat(if4_cfi_jal(12), if4_cfi_jal(8), if4_cfi_jal(10, 9), if4_cfi_jal(6), if4_cfi_jal(7), if4_cfi_jal(2), if4_cfi_jal(11), if4_cfi_jal(5, 3), 0.U(1.W)), XLEN), - SignExt(Cat(if4_cfi_jal(31), if4_cfi_jal(19, 12), if4_cfi_jal(20), if4_cfi_jal(30, 21), 0.U(1.W)), XLEN)) - if4_bp.target := Mux(if4_pd.pd(if4_bp.jmpIdx).isJal && if4_bp.taken, if4_cfi_jal_tgt, bpu.io.out(2).bits.target) - if4_bp.redirect := bpu.io.out(2).bits.redirect || if4_pd.pd(if4_bp.jmpIdx).isJal && if4_bp.taken && if4_cfi_jal_tgt =/= bpu.io.out(2).bits.target - - if4_prevHalfInstr := 0.U.asTypeOf(new PrevHalfInstr) - when (bpu.io.out(2).valid && if4_fire && if4_bp.saveHalfRVI) { - if4_prevHalfInstr.valid := true.B - if4_prevHalfInstr.taken := if4_bp.taken - if4_prevHalfInstr.ghInfo := if4_realGHInfo - // Make sure shifted can work - if4_prevHalfInstr.ghInfo.saveHalfRVI := false.B - if4_prevHalfInstr.newPtr := if4_newPtr - if4_prevHalfInstr.fetchpc := if4_pc - if4_prevHalfInstr.idx := PopCount(mask(if4_pc)) - 1.U - if4_prevHalfInstr.pc := if4_pd.pc(if4_prevHalfInstr.idx) - if4_prevHalfInstr.target := if4_bp.target - if4_prevHalfInstr.instr := if4_pd.instrs(if4_prevHalfInstr.idx)(15, 0) - if4_prevHalfInstr.ipf := if4_ipf - } - - // Redirect and npc logic for if4 - when (bpu.io.out(2).valid && if4_fire && if4_bp.redirect) { - if4_redirect := true.B - when (if4_bp.saveHalfRVI) { - if1_npc := snpc(if4_pc) - }.otherwise { - if1_npc := if4_bp.target - } + if4_bp := bpu.io.out(2) + if4_bp.takens := bpu.io.out(2).takens & if4_mask + if4_bp.brMask := bpu.io.out(2).brMask & if4_mask + if4_bp.jalMask := bpu.io.out(2).jalMask & if4_mask + + val if4_GHInfo = wrapGHInfo(if4_bp, if4_predHistPtr) + + def cal_jal_tgt(inst: UInt, rvc: Bool): UInt = { + Mux(rvc, + SignExt(Cat(inst(12), inst(8), inst(10, 9), inst(6), inst(7), inst(2), inst(11), inst(5, 
3), 0.U(1.W)), XLEN), + SignExt(Cat(inst(31), inst(19, 12), inst(20), inst(30, 21), 0.U(1.W)), XLEN) + ) } - // }.elsewhen (bpu.io.out(2).valid && if4_fire/* && !if4_bp.redirect*/) { - // // We redirect the pipeline to the next fetch packet, - // // which contains the last half of the RVI instruction - // when (if4_bp.saveHalfRVI && if4_bp.taken) { - // if4_redirect := true.B - // if1_npc := snpc(if4_pc) - // } - // } + val if4_instrs = if4_pd.instrs + val if4_jals = if4_bp.jalMask + val if4_jal_tgts = VecInit((0 until PredictWidth).map(i => if4_pd.pc(i) + cal_jal_tgt(if4_instrs(i), if4_pd.pd(i).isRVC))) - // This should cover the if4 redirect to snpc when saveHalfRVI - when (if3_redirect) { - when (if3_hasPrevHalfInstr && prevHalfInstr.taken) { - if1_npc := prevHalfInstr.target + (0 until PredictWidth).foreach {i => + when (if4_jals(i)) { + if4_bp.targets(i) := if4_jal_tgts(i) } } - - // history logic for if4 - when (bpu.io.out(2).valid && if4_fire && if4_bp.redirect) { - shiftPtr := true.B - newPtr := if4_newPtr - // }.elsewhen (bpu.io.out(2).valid && if4_fire/* && !if4_bp.redirect*/) { - // // only if we hasn't seen not taken branches and - // // see a not taken branch in if4 should we tell - // // if3 and if4 to update histptr - // // We do not shift global history pointer unless we have the full - // // RVI instruction - // when (if4_newSawNTBrs && !if4_bp.takenOnBr) { - // shiftPtr := true.B - // // newPtr := if4_realGHInfo.newPtr - // } - } - - when (if4_realGHInfo.shifted && if4_newPtr >= ptr) { - hist(if4_newPtr-ptr) := if4_realGHInfo.takenOnBr + + // we need this to tell BPU the prediction of prev half + // because the prediction is with the start of each inst + val if4_prevHalfInstr = RegInit(0.U.asTypeOf(new PrevHalfInstr)) + val if4_pendingPrevHalfInstr = if4_prevHalfInstr.valid + val if4_prevHalfInstrMet = if4_pendingPrevHalfInstr && (if4_prevHalfInstr.pc + 2.U) === if4_pc && if4_valid + val if4_prevHalfConsumed = if4_prevHalfInstrMet && if4_fire + 
val if4_prevHalfFlush = if4_flush + + val if4_takenPrevHalf = WireInit(if4_prevHalfInstrMet && if4_prevHalfInstr.taken) + when (if3_prevHalfConsumed) { + if4_prevHalfInstr := if3_prevHalfInstr + }.elsewhen (if4_prevHalfConsumed || if4_prevHalfFlush) { + if4_prevHalfInstr.valid := false.B } - when (if3_redirect) { - // when redirect and if3_hasPrevHalfInstr, this prevHalfInstr should only be taken - when (if3_hasPrevHalfInstr && prevHalfInstr.ghInfo.shifted) { - shiftPtr := true.B - newPtr := prevHalfInstr.newPtr - extHist(prevHalfInstr.newPtr) := prevHalfInstr.ghInfo.takenOnBr - } + prevHalfInstrReq := 0.U.asTypeOf(new PrevHalfInstr) + when (if4_fire && if4_bp.saveHalfRVI) { + val idx = if4_bp.lastHalfRVIIdx + prevHalfInstrReq.valid := true.B + // this is result of the last half RVI + prevHalfInstrReq.taken := if4_bp.lastHalfRVITaken + prevHalfInstrReq.ghInfo := if4_GHInfo + prevHalfInstrReq.newPtr := if4_GHInfo.newPtr() + prevHalfInstrReq.fetchpc := if4_pc + prevHalfInstrReq.idx := idx + prevHalfInstrReq.pc := if4_pd.pc(idx) + prevHalfInstrReq.target := if4_bp.lastHalfRVITarget + prevHalfInstrReq.instr := if4_pd.instrs(idx)(15, 0) + prevHalfInstrReq.ipf := if4_ipf } - // modify GHR at the end of a prediction lifetime - when (if4_fire && if4_realGHInfo.shifted) { - extHist(if4_newPtr) := if4_realGHInfo.takenOnBr + def if4_nextValidPCNotEquals(pc: UInt) = if3_valid && if3_pc =/= pc || + !if3_valid && (if2_valid && if2_pc =/= pc) || + !if3_valid && !if2_valid + + val if4_prevHalfNextNotMet = hasPrevHalfInstrReq && if4_nextValidPCNotEquals(prevHalfInstrReq.pc+2.U) + val if4_predTakenRedirect = !hasPrevHalfInstrReq && if4_bp.taken && if4_nextValidPCNotEquals(if4_bp.target) + val if4_predNotTakenRedirect = !hasPrevHalfInstrReq && !if4_bp.taken && if4_nextValidPCNotEquals(if4_snpc) + val if4_ghInfoNotIdenticalRedirect = if4_GHInfo =/= if4_lastGHInfo && enableGhistRepair.B + + if4_redirect := if4_fire && ( + // when if4 has a lastHalfRVI, but the next fetch packet is not 
snpc + if4_prevHalfNextNotMet || + // when if4 preds taken, but the pc of next fetch packet is not the target + if4_predTakenRedirect || + // when if4 preds not taken, but the pc of next fetch packet is not snpc + if4_predNotTakenRedirect || + // GHInfo from last pred does not corresponds with this packet + if4_ghInfoNotIdenticalRedirect + ) + + val if4_target = WireInit(if4_snpc) + + when (if4_prevHalfNextNotMet) { + if4_target := prevHalfInstrReq.pc+2.U + }.elsewhen (if4_predTakenRedirect) { + if4_target := if4_bp.target + }.elsewhen (if4_predNotTakenRedirect) { + if4_target := if4_snpc + }.elsewhen (if4_ghInfoNotIdenticalRedirect) { + if4_target := Mux(if4_bp.taken, if4_bp.target, if4_snpc) } - - // This is a histPtr which is only modified when a prediction - // is sent, so that it can get the final prediction info - val finalPredHistPtr = RegInit(0.U(log2Up(ExtHistoryLength).W)) - if4_histPtr := finalPredHistPtr - if4_newPtr := if3_histPtr - when (if4_fire && if4_realGHInfo.shifted) { - finalPredHistPtr := if4_newPtr + when (if4_redirect) { + if1_npc := if4_target + val if4_newPtr = if4_GHInfo.newPtr() + updatePtr := true.B + newPtr := if4_newPtr + extHist(if4_newPtr) := if4_GHInfo.takenOnBr.asUInt } - if3_histPtr := Mux(if4_realGHInfo.shifted && if4_valid && !if4_flush, if4_histPtr - 1.U, if4_histPtr) - if3_newPtr := if2_histPtr - - if2_histPtr := Mux(if3_realGHInfo.shifted && if3_valid && !if3_flush, if3_histPtr - 1.U, if3_histPtr) - if2_newPtr := if1_histPtr - - if1_histPtr := Mux(if2_realGHInfo.shifted && if2_valid && !if2_flush, if2_histPtr - 1.U, if2_histPtr) - - - when (io.outOfOrderBrInfo.valid && io.outOfOrderBrInfo.bits.isMisPred) { val b = io.outOfOrderBrInfo.bits val oldPtr = b.brInfo.histPtr - shiftPtr := true.B + updatePtr := true.B when (!b.pd.isBr && !b.brInfo.sawNotTakenBranch) { // If mispredicted cfi is not a branch, // and there wasn't any not taken branch before it, // we should only recover the pointer to an unshifted state newPtr := 
oldPtr - finalPredHistPtr := oldPtr + // finalPredHistPtr := oldPtr }.otherwise { newPtr := oldPtr - 1.U - finalPredHistPtr := oldPtr - 1.U - hist(0) := Mux(b.pd.isBr, b.taken, 0.U) + // finalPredHistPtr := oldPtr - 1.U + // hist(0) := Mux(b.pd.isBr, b.taken, 0.U) extHist(newPtr) := Mux(b.pd.isBr, b.taken, 0.U) } } + when (loopBufPar.LBredirect.valid) { + if1_npc := loopBufPar.LBredirect.bits + } + when (io.redirect.valid) { - if1_npc := io.redirect.bits.target + if1_npc := io.redirect.bits } - io.icacheReq.valid := if1_valid && if2_ready + when(inLoop) { + io.icacheReq.valid := if4_flush + }.otherwise { + io.icacheReq.valid := if1_valid && if2_ready + } + io.icacheResp.ready := if4_ready io.icacheReq.bits.addr := if1_npc + + // when(if4_bp.taken) { + // when(if4_bp.saveHalfRVI) { + // io.loopBufPar.LBReq := snpc(if4_pc) + // }.otherwise { + // io.loopBufPar.LBReq := if4_bp.target + // } + // }.otherwise { + // io.loopBufPar.LBReq := snpc(if4_pc) + // XSDebug(p"snpc(if4_pc)=${Hexadecimal(snpc(if4_pc))}\n") + // } + loopBufPar.fetchReq := if3_pc + io.icacheReq.bits.mask := mask(if1_npc) - io.icacheResp.ready := if4_ready - //io.icacheResp.ready := if3_valid + io.icacheFlush := Cat(if3_flush, if2_flush) val inOrderBrHist = Wire(Vec(HistoryLength, UInt(1.W))) @@ -361,93 +449,118 @@ class IFU extends XSModule with HasIFUConst // bpu.io.flush := Cat(if4_flush, if3_flush, if2_flush) bpu.io.flush := VecInit(if2_flush, if3_flush, if4_flush) - bpu.io.cacheValid := io.icacheResp.valid - bpu.io.in.valid := if1_fire - bpu.io.in.bits.pc := if1_npc - bpu.io.in.bits.hist := hist.asUInt - bpu.io.in.bits.histPtr := ptr - bpu.io.in.bits.inMask := mask(if1_npc) - bpu.io.out(0).ready := if2_fire - bpu.io.out(1).ready := if3_fire - bpu.io.out(2).ready := if4_fire - bpu.io.predecode.valid := if4_valid - bpu.io.predecode.bits.mask := if4_pd.mask - bpu.io.predecode.bits.pd := if4_pd.pd - bpu.io.predecode.bits.isFetchpcEqualFirstpc := if4_pc === if4_pd.pc(0) - bpu.io.branchInfo.ready := 
if4_fire - - pd.io.in := io.icacheResp.bits - pd.io.prev.valid := if3_hasPrevHalfInstr - pd.io.prev.bits := prevHalfInstr.instr + bpu.io.inFire(0) := if1_fire + bpu.io.inFire(1) := if2_fire + bpu.io.inFire(2) := if3_fire + bpu.io.inFire(3) := if4_fire + bpu.io.in.pc := if1_npc + bpu.io.in.hist := hist.asUInt + bpu.io.in.histPtr := ptr + bpu.io.in.inMask := mask(if1_npc) + bpu.io.predecode.mask := if4_pd.mask + bpu.io.predecode.lastHalf := if4_pd.lastHalf + bpu.io.predecode.pd := if4_pd.pd + bpu.io.predecode.hasLastHalfRVI := if4_pc =/= if4_pd.pc(0) + bpu.io.realMask := if4_mask + bpu.io.prevHalf := if4_prevHalfInstr + + pd.io.in := icacheResp + when(inLoop) { + pd.io.in.mask := loopBuffer.io.out.bits.mask // TODO: Maybe this is unnecessary + // XSDebug("Fetch from LB\n") + // XSDebug(p"pc=${Hexadecimal(io.loopBufPar.LBResp.pc)}\n") + // XSDebug(p"data=${Hexadecimal(io.loopBufPar.LBResp.data)}\n") + // XSDebug(p"mask=${Hexadecimal(io.loopBufPar.LBResp.mask)}\n") + } + + pd.io.prev.valid := if3_prevHalfInstrMet + pd.io.prev.bits := if3_prevHalfInstr.instr // if a fetch packet triggers page fault, set the pf instruction to nop - when (!if3_hasPrevHalfInstr && io.icacheResp.bits.ipf) { + when (!if3_prevHalfInstrMet && icacheResp.ipf) { val instrs = Wire(Vec(FetchWidth, UInt(32.W))) (0 until FetchWidth).foreach(i => instrs(i) := ZeroExt("b0010011".U, 32)) // nop pd.io.in.data := instrs.asUInt - }.elsewhen (if3_hasPrevHalfInstr && (prevHalfInstr.ipf || io.icacheResp.bits.ipf)) { + }.elsewhen (if3_prevHalfInstrMet && (if3_prevHalfInstr.ipf || icacheResp.ipf)) { pd.io.prev.bits := ZeroExt("b0010011".U, 16) val instrs = Wire(Vec(FetchWidth, UInt(32.W))) (0 until FetchWidth).foreach(i => instrs(i) := Cat(ZeroExt("b0010011".U, 16), Fill(16, 0.U(1.W)))) pd.io.in.data := instrs.asUInt - when (io.icacheResp.bits.ipf && !prevHalfInstr.ipf) { crossPageIPF := true.B } // higher 16 bits page fault + when (icacheResp.ipf && !if3_prevHalfInstr.ipf) { crossPageIPF := true.B } // higher 
16 bits page fault } - io.fetchPacket.valid := if4_valid && !io.redirect.valid - io.fetchPacket.bits.instrs := if4_pd.instrs - io.fetchPacket.bits.mask := if4_pd.mask & (Fill(PredictWidth, !if4_bp.taken) | (Fill(PredictWidth, 1.U(1.W)) >> (~if4_bp.jmpIdx))) - io.fetchPacket.bits.pc := if4_pd.pc - (0 until PredictWidth).foreach(i => io.fetchPacket.bits.pnpc(i) := if4_pd.pc(i) + Mux(if4_pd.pd(i).isRVC, 2.U, 4.U)) + //Performance Counter + // if (!env.FPGAPlatform ) { + // ExcitingUtils.addSource(io.fetchPacket.fire && !inLoop, "CntFetchFromICache", Perf) + // ExcitingUtils.addSource(io.fetchPacket.fire && inLoop, "CntFetchFromLoopBuffer", Perf) + // } + + val fetchPacketValid = if4_valid && !io.redirect.valid + val fetchPacketWire = Wire(new FetchPacket) + + // io.fetchPacket.valid := if4_valid && !io.redirect.valid + fetchPacketWire.instrs := if4_pd.instrs + fetchPacketWire.mask := if4_pd.mask & (Fill(PredictWidth, !if4_bp.taken) | (Fill(PredictWidth, 1.U(1.W)) >> (~if4_bp.jmpIdx))) + + loopBufPar.noTakenMask := if4_pd.mask + fetchPacketWire.pc := if4_pd.pc + (0 until PredictWidth).foreach(i => fetchPacketWire.pnpc(i) := if4_pd.pc(i) + Mux(if4_pd.pd(i).isRVC, 2.U, 4.U)) when (if4_bp.taken) { - io.fetchPacket.bits.pnpc(if4_bp.jmpIdx) := if4_bp.target + fetchPacketWire.pnpc(if4_bp.jmpIdx) := if4_bp.target } - io.fetchPacket.bits.brInfo := bpu.io.branchInfo.bits - (0 until PredictWidth).foreach(i => io.fetchPacket.bits.brInfo(i).histPtr := finalPredHistPtr) - (0 until PredictWidth).foreach(i => io.fetchPacket.bits.brInfo(i).predHistPtr := if4_predHistPtr) - io.fetchPacket.bits.pd := if4_pd.pd - io.fetchPacket.bits.ipf := if4_ipf - io.fetchPacket.bits.crossPageIPFFix := if4_crossPageIPF + fetchPacketWire.brInfo := bpu.io.branchInfo + (0 until PredictWidth).foreach(i => fetchPacketWire.brInfo(i).histPtr := if4_predHistPtr) + (0 until PredictWidth).foreach(i => fetchPacketWire.brInfo(i).predHistPtr := if4_predHistPtr) + fetchPacketWire.pd := if4_pd.pd + 
fetchPacketWire.ipf := if4_ipf + fetchPacketWire.crossPageIPFFix := if4_crossPageIPF + + // predTaken Vec + fetchPacketWire.predTaken := if4_bp.taken + + loopBuffer.io.in.bits := fetchPacketWire + io.fetchPacket.bits := fetchPacketWire + io.fetchPacket.valid := fetchPacketValid + loopBuffer.io.in.valid := io.fetchPacket.fire // debug info if (IFUDebug) { XSDebug(RegNext(reset.asBool) && !reset.asBool, "Reseting...\n") XSDebug(io.icacheFlush(0).asBool, "Flush icache stage2...\n") XSDebug(io.icacheFlush(1).asBool, "Flush icache stage3...\n") - XSDebug(io.redirect.valid, "Redirect from backend! isExcp=%d isFpp:%d isMisPred=%d isReplay=%d pc=%x\n", - io.redirect.bits.isException, io.redirect.bits.isFlushPipe, io.redirect.bits.isMisPred, io.redirect.bits.isReplay, io.redirect.bits.pc) - XSDebug(io.redirect.valid, p"Redirect from backend! target=${Hexadecimal(io.redirect.bits.target)} brTag=${io.redirect.bits.brTag}\n") + XSDebug(io.redirect.valid, p"Redirect from backend! target=${Hexadecimal(io.redirect.bits)}\n") XSDebug("[IF1] v=%d fire=%d flush=%d pc=%x ptr=%d mask=%b\n", if1_valid, if1_fire, if1_flush, if1_npc, ptr, mask(if1_npc)) - XSDebug("[IF2] v=%d r=%d fire=%d redirect=%d flush=%d pc=%x ptr=%d snpc=%x\n", if2_valid, if2_ready, if2_fire, if2_redirect, if2_flush, if2_pc, if2_histPtr, if2_snpc) - XSDebug("[IF3] v=%d r=%d fire=%d redirect=%d flush=%d pc=%x ptr=%d crossPageIPF=%d sawNTBrs=%d\n", if3_valid, if3_ready, if3_fire, if3_redirect, if3_flush, if3_pc, if3_histPtr, crossPageIPF, if3_realGHInfo.sawNTBr) - XSDebug("[IF4] v=%d r=%d fire=%d redirect=%d flush=%d pc=%x ptr=%d crossPageIPF=%d sawNTBrs=%d\n", if4_valid, if4_ready, if4_fire, if4_redirect, if4_flush, if4_pc, if4_histPtr, if4_crossPageIPF, if4_realGHInfo.sawNTBr) + XSDebug("[IF2] v=%d r=%d fire=%d redirect=%d flush=%d pc=%x ptr=%d snpc=%x\n", if2_valid, if2_ready, if2_fire, if2_redirect, if2_flush, if2_pc, if2_predHistPtr, if2_snpc) + XSDebug("[IF3] v=%d r=%d fire=%d redirect=%d flush=%d pc=%x ptr=%d 
crossPageIPF=%d sawNTBrs=%d\n", if3_valid, if3_ready, if3_fire, if3_redirect, if3_flush, if3_pc, if3_predHistPtr, crossPageIPF, if3_GHInfo.sawNTBr) + XSDebug("[IF4] v=%d r=%d fire=%d redirect=%d flush=%d pc=%x ptr=%d crossPageIPF=%d sawNTBrs=%d\n", if4_valid, if4_ready, if4_fire, if4_redirect, if4_flush, if4_pc, if4_predHistPtr, if4_crossPageIPF, if4_GHInfo.sawNTBr) XSDebug("[IF1][icacheReq] v=%d r=%d addr=%x\n", io.icacheReq.valid, io.icacheReq.ready, io.icacheReq.bits.addr) - XSDebug("[IF1][ghr] headPtr=%d shiftPtr=%d newPtr=%d ptr=%d\n", if1_histPtr, shiftPtr, newPtr, ptr) + XSDebug("[IF1][ghr] headPtr=%d updatePtr=%d newPtr=%d ptr=%d\n", if1_histPtr, updatePtr, newPtr, ptr) XSDebug("[IF1][ghr] hist=%b\n", hist.asUInt) XSDebug("[IF1][ghr] extHist=%b\n\n", extHist.asUInt) - XSDebug("[IF2][bp] redirect=%d taken=%d jmpIdx=%d hasNTBrs=%d target=%x saveHalfRVI=%d\n\n", if2_bp.redirect, if2_bp.taken, if2_bp.jmpIdx, if2_bp.hasNotTakenBrs, if2_bp.target, if2_bp.saveHalfRVI) - // XSDebug("[IF2][GHInfo]: %s\n", if2_realGHInfo) - if2_realGHInfo.debug + XSDebug("[IF2][bp] taken=%d jmpIdx=%d hasNTBrs=%d target=%x saveHalfRVI=%d\n\n", if2_bp.taken, if2_bp.jmpIdx, if2_bp.hasNotTakenBrs, if2_bp.target, if2_bp.saveHalfRVI) + if2_GHInfo.debug("if2") XSDebug("[IF3][icacheResp] v=%d r=%d pc=%x mask=%b\n", io.icacheResp.valid, io.icacheResp.ready, io.icacheResp.bits.pc, io.icacheResp.bits.mask) - XSDebug("[IF3][bp] redirect=%d taken=%d jmpIdx=%d hasNTBrs=%d target=%x saveHalfRVI=%d\n", if3_bp.redirect, if3_bp.taken, if3_bp.jmpIdx, if3_bp.hasNotTakenBrs, if3_bp.target, if3_bp.saveHalfRVI) + XSDebug("[IF3][bp] taken=%d jmpIdx=%d hasNTBrs=%d target=%x saveHalfRVI=%d\n", if3_bp.taken, if3_bp.jmpIdx, if3_bp.hasNotTakenBrs, if3_bp.target, if3_bp.saveHalfRVI) + XSDebug("[IF3][redirect]: v=%d, prevMet=%d, prevNMet=%d, predT=%d, predNT=%d, ghInfo=%d\n", if3_redirect, if3_prevHalfMetRedirect, if3_prevHalfNotMetRedirect, if3_predTakenRedirect, if3_predNotTakenRedirect, 
if3_ghInfoNotIdenticalRedirect) // XSDebug("[IF3][prevHalfInstr] v=%d redirect=%d fetchpc=%x idx=%d tgt=%x taken=%d instr=%x\n\n", // prev_half_valid, prev_half_redirect, prev_half_fetchpc, prev_half_idx, prev_half_tgt, prev_half_taken, prev_half_instr) XSDebug("[IF3][ prevHalfInstr] v=%d taken=%d fetchpc=%x idx=%d pc=%x tgt=%x instr=%x ipf=%d\n", - prevHalfInstr.valid, prevHalfInstr.taken, prevHalfInstr.fetchpc, prevHalfInstr.idx, prevHalfInstr.pc, prevHalfInstr.target, prevHalfInstr.instr, prevHalfInstr.ipf) + if3_prevHalfInstr.valid, if3_prevHalfInstr.taken, if3_prevHalfInstr.fetchpc, if3_prevHalfInstr.idx, if3_prevHalfInstr.pc, if3_prevHalfInstr.target, if3_prevHalfInstr.instr, if3_prevHalfInstr.ipf) XSDebug("[IF3][if3_prevHalfInstr] v=%d taken=%d fetchpc=%x idx=%d pc=%x tgt=%x instr=%x ipf=%d\n\n", if3_prevHalfInstr.valid, if3_prevHalfInstr.taken, if3_prevHalfInstr.fetchpc, if3_prevHalfInstr.idx, if3_prevHalfInstr.pc, if3_prevHalfInstr.target, if3_prevHalfInstr.instr, if3_prevHalfInstr.ipf) - // XSDebug("[IF3][GHInfo]: %s\n", if3_realGHInfo) - if3_realGHInfo.debug + if3_GHInfo.debug("if3") XSDebug("[IF4][predecode] mask=%b\n", if4_pd.mask) - XSDebug("[IF4][bp] redirect=%d taken=%d jmpIdx=%d hasNTBrs=%d target=%x saveHalfRVI=%d\n", if4_bp.redirect, if4_bp.taken, if4_bp.jmpIdx, if4_bp.hasNotTakenBrs, if4_bp.target, if4_bp.saveHalfRVI) - XSDebug(if4_pd.pd(if4_bp.jmpIdx).isJal && if4_bp.taken, "[IF4] cfi is jal! 
instr=%x target=%x\n", if4_cfi_jal, if4_cfi_jal_tgt) + XSDebug("[IF4][snpc]: %x, realMask=%b\n", if4_snpc, if4_mask) + XSDebug("[IF4][bp] taken=%d jmpIdx=%d hasNTBrs=%d target=%x saveHalfRVI=%d\n", if4_bp.taken, if4_bp.jmpIdx, if4_bp.hasNotTakenBrs, if4_bp.target, if4_bp.saveHalfRVI) + XSDebug("[IF4][redirect]: v=%d, prevNotMet=%d, predT=%d, predNT=%d, ghInfo=%d\n", if4_redirect, if4_prevHalfNextNotMet, if4_predTakenRedirect, if4_predNotTakenRedirect, if4_ghInfoNotIdenticalRedirect) + XSDebug(if4_pd.pd(if4_bp.jmpIdx).isJal && if4_bp.taken, "[IF4] cfi is jal! instr=%x target=%x\n", if4_instrs(if4_bp.jmpIdx), if4_jal_tgts(if4_bp.jmpIdx)) XSDebug("[IF4][if4_prevHalfInstr] v=%d taken=%d fetchpc=%x idx=%d pc=%x tgt=%x instr=%x ipf=%d\n", if4_prevHalfInstr.valid, if4_prevHalfInstr.taken, if4_prevHalfInstr.fetchpc, if4_prevHalfInstr.idx, if4_prevHalfInstr.pc, if4_prevHalfInstr.target, if4_prevHalfInstr.instr, if4_prevHalfInstr.ipf) - // XSDebug("[IF4][GHInfo]: %s\n", if4_realGHInfo) - if4_realGHInfo.debug + if4_GHInfo.debug("if4") XSDebug(io.fetchPacket.fire(), "[IF4][fetchPacket] v=%d r=%d mask=%b ipf=%d crossPageIPF=%d\n", io.fetchPacket.valid, io.fetchPacket.ready, io.fetchPacket.bits.mask, io.fetchPacket.bits.ipf, io.fetchPacket.bits.crossPageIPFFix) for (i <- 0 until PredictWidth) { diff --git a/src/main/scala/xiangshan/frontend/Ibuffer.scala b/src/main/scala/xiangshan/frontend/Ibuffer.scala index d536db1a127d1394e18b7d0ae03f56c2f7af0a40..68f3cdfad8cc1443f21c270ca002f08b5e56a627 100644 --- a/src/main/scala/xiangshan/frontend/Ibuffer.scala +++ b/src/main/scala/xiangshan/frontend/Ibuffer.scala @@ -7,12 +7,14 @@ import xiangshan._ import utils._ import xiangshan.backend.fu.HasExceptionNO +class IBufferIO extends XSBundle { + val flush = Input(Bool()) + val in = Flipped(DecoupledIO(new FetchPacket)) + val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) +} + class Ibuffer extends XSModule { - val io = IO(new Bundle() { - val flush = Input(Bool()) - val in = 
Flipped(DecoupledIO(new FetchPacket)) - val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) - }) + val io = IO(new IBufferIO) class IBufEntry extends XSBundle { val inst = UInt(32.W) @@ -25,6 +27,11 @@ class Ibuffer extends XSModule { } // Ignore + // io.loopBufPar <> DontCare + // io.loopBufPar.LBredirect.valid := false.B + // io.loopBufPar.inLoop := false.B + + for(out <- io.out) { // out.bits.exceptionVec := DontCare out.bits.intrVec := DontCare @@ -40,23 +47,27 @@ class Ibuffer extends XSModule { val enqValid = !io.flush && !ibuf_valid(tail_ptr + PredictWidth.U - 1.U) val deqValid = !io.flush && ibuf_valid(head_ptr) + // Enque io.in.ready := enqValid - // Enque when(io.in.fire) { - var enq_idx = tail_ptr + var enq_idx = WireInit(tail_ptr) for(i <- 0 until PredictWidth) { + var inWire = Wire(new IBufEntry) + inWire := DontCare + ibuf_valid(enq_idx) := io.in.bits.mask(i) - ibuf(enq_idx).inst := io.in.bits.instrs(i) - ibuf(enq_idx).pc := io.in.bits.pc(i) - ibuf(enq_idx).pnpc := io.in.bits.pnpc(i) - ibuf(enq_idx).brInfo := io.in.bits.brInfo(i) - ibuf(enq_idx).pd := io.in.bits.pd(i) - ibuf(enq_idx).ipf := io.in.bits.ipf - ibuf(enq_idx).crossPageIPFFix := io.in.bits.crossPageIPFFix + inWire.inst := io.in.bits.instrs(i) + inWire.pc := io.in.bits.pc(i) + inWire.pnpc := io.in.bits.pnpc(i) + inWire.brInfo := io.in.bits.brInfo(i) + inWire.pd := io.in.bits.pd(i) + inWire.ipf := io.in.bits.ipf + inWire.crossPageIPFFix := io.in.bits.crossPageIPFFix + ibuf(enq_idx) := inWire enq_idx = enq_idx + io.in.bits.mask(i) } @@ -65,28 +76,29 @@ class Ibuffer extends XSModule { // Deque when(deqValid) { - var deq_idx = head_ptr for(i <- 0 until DecodeWidth) { - io.out(i).valid := ibuf_valid(deq_idx) - // Only when the entry is valid can it be set invalid - when (ibuf_valid(deq_idx)) { ibuf_valid(deq_idx) := !io.out(i).fire } - - io.out(i).bits.instr := ibuf(deq_idx).inst - io.out(i).bits.pc := ibuf(deq_idx).pc - // io.out(i).bits.exceptionVec := Mux(ibuf(deq_idx).ipf, 
UIntToOH(instrPageFault.U), 0.U) + val head_wire = head_ptr + i.U + val outWire = WireInit(ibuf(head_wire)) + + io.out(i).valid := ibuf_valid(head_wire) + when(ibuf_valid(head_wire) && io.out(i).ready) { + ibuf_valid(head_wire) := false.B + } + + io.out(i).bits.instr := outWire.inst + io.out(i).bits.pc := outWire.pc + // io.out(i).bits.exceptionVec := Mux(outWire.ipf, UIntToOH(instrPageFault.U), 0.U) io.out(i).bits.exceptionVec := 0.U.asTypeOf(Vec(16, Bool())) - io.out(i).bits.exceptionVec(instrPageFault) := ibuf(deq_idx).ipf - // io.out(i).bits.brUpdate := ibuf(deq_idx).brInfo + io.out(i).bits.exceptionVec(instrPageFault) := outWire.ipf + // io.out(i).bits.brUpdate := outWire.brInfo io.out(i).bits.brUpdate := DontCare - io.out(i).bits.brUpdate.pc := ibuf(deq_idx).pc - io.out(i).bits.brUpdate.pnpc := ibuf(deq_idx).pnpc - io.out(i).bits.brUpdate.pd := ibuf(deq_idx).pd - io.out(i).bits.brUpdate.brInfo := ibuf(deq_idx).brInfo - io.out(i).bits.crossPageIPFFix := ibuf(deq_idx).crossPageIPFFix - - deq_idx = deq_idx + io.out(i).fire + io.out(i).bits.brUpdate.pc := outWire.pc + io.out(i).bits.brUpdate.pnpc := outWire.pnpc + io.out(i).bits.brUpdate.pd := outWire.pd + io.out(i).bits.brUpdate.brInfo := outWire.brInfo + io.out(i).bits.crossPageIPFFix := outWire.crossPageIPFFix } - head_ptr := deq_idx + head_ptr := head_ptr + io.out.map(_.fire).fold(0.U(log2Up(DecodeWidth).W))(_+_) }.otherwise { io.out.foreach(_.valid := false.B) io.out.foreach(_.bits <> DontCare) diff --git a/src/main/scala/xiangshan/frontend/LoopBuffer.scala b/src/main/scala/xiangshan/frontend/LoopBuffer.scala index 34a120011f64ca6b6da96f557fbc25e132ff0970..13c0ee2e2c8f819f07e86eb6ff63af0e7c772d1e 100644 --- a/src/main/scala/xiangshan/frontend/LoopBuffer.scala +++ b/src/main/scala/xiangshan/frontend/LoopBuffer.scala @@ -4,339 +4,246 @@ import chisel3._ import chisel3.util._ import utils._ import xiangshan._ +import xiangshan.cache._ -class LoopBuffer extends XSModule with NeedImpl { - val io = IO(new Bundle() 
{ - val flush = Input(Bool()) - val in = Flipped(DecoupledIO(new FetchPacket)) - val out = Vec(DecodeWidth, DecoupledIO(new CtrlFlow)) - }) - -// class LBufEntry extends XSBundle { -// val inst = UInt(32.W) -// val pc = UInt(VAddrBits.W) -// val fetchOffset = UInt((log2Up(FetchWidth * 4)).W) -// val pnpc = UInt(VAddrBits.W) -// val hist = UInt(HistoryLength.W) -// val btbPredCtr = UInt(2.W) -// val btbHit = Bool() -// val tageMeta = new TageMeta -// val rasSp = UInt(log2Up(RasSize).W) -// val rasTopCtr = UInt(8.W) -// val exceptionVec = Vec(16, Bool()) -// val intrVec = Vec(12, Bool()) -// val isRVC = Bool() -// val isBr = Bool() -// val crossPageIPFFix = Bool() -// -// // val valid = Bool() -// val isTaken = Bool() -// } -// -// // ignore -// for(i <- 0 until DecodeWidth) { -// io.out(i).bits.exceptionVec := DontCare -// io.out(i).bits.intrVec := DontCare -// io.out(i).bits.isBr := DontCare -// io.out(i).bits.crossPageIPFFix := DontCare -// } -// -// // Check Short Backward Branch -// def isSBB(inst: UInt): Bool = { -// inst === BitPat("b1111_???????_111111111_?????_1101111") || inst === BitPat("b1111???_?????_?????_???_????1_1100011") -// } -// -// // Get sbb target -// def SBBOffset(inst: UInt): UInt = { -// val isJal = inst === BitPat("b1111_???????_111111111_?????_1101111") -// val isCon = inst === BitPat("b1111???_?????_?????_???_????1_1100011") -// Mux(isJal, inst(27, 21), Mux(isCon, Cat(inst(27,25), inst(11,8)), 0.U(7.W))) -// } -// -// // Can be replace bt isBr -// def isBranch(inst: UInt): Bool = { -// inst === BitPat("b????????????????????_?????_1101111") || -// inst === BitPat("b????????????????????_?????_1100111") || -// inst === BitPat("b???????_?????_?????_???_?????_1100011") -// } -// -// //Count Register -// val offsetCounter = Reg(UInt((log2Up(IBufSize)+2).W)) -// val tsbbPC = Reg(UInt(VAddrBits.W)) -// -// // def isFull(ptr1: UInt, ptr2: UInt): Bool = ptr1 === ptr2 && lbuf_valid(ptr2) -// // def isEmpty(ptr1: UInt, ptr2: UInt): Bool = ptr1 === 
ptr2 && !lbuf_valid(ptr1) -// def isOverflow(ptr: UInt): Bool = lbuf_valid(ptr) -// -// // Loop Buffer define -// val lbuf = Reg(Vec(IBufSize, new LBufEntry)) -// val lbuf_valid = RegInit(VecInit(Seq.fill(IBufSize)(false.B))) -// val out_isTaken = WireInit(VecInit(Seq.fill(DecodeWidth)(false.B))) -// val head_ptr = RegInit(0.U(log2Up(IBufSize).W)) -// val tail_ptr = RegInit(0.U(log2Up(IBufSize).W)) -// -// val loop_str = RegInit(0.U(log2Up(IBufSize).W)) -// val loop_end = RegInit(0.U(log2Up(IBufSize).W)) -// val loop_ptr = RegInit(0.U(log2Up(IBufSize).W)) -// -// // FSM state define -// val s_idle :: s_fill :: s_active :: Nil = Enum(3) -// val LBstate = RegInit(s_idle) -// -// // val has_sbb = (0 until DecodeWidth).map(i => lbuf_valid(head_ptr + i.U) && isSBB(lbuf(head_ptr + i.U).inst)).reduce(_||_) -// val sbb_vec = (0 until DecodeWidth).map(i => io.out(i).fire && isSBB(io.out(i).bits.instr)) -// val has_sbb = ParallelOR(sbb_vec) -// val sbb_and_taken = (0 until DecodeWidth).map(i => sbb_vec(i) && out_isTaken(i)) -// val sbbIdx = OHToUInt(HighestBit(VecInit(sbb_and_taken).asUInt, DecodeWidth).asUInt) // The first SBB that is predicted to jump -// val sbbTaken = ParallelOR(sbb_and_taken) -// -// val tsbb_vec = (0 until DecodeWidth).map(i => io.out(i).fire && io.out(i).bits.pc === tsbbPC) -// val has_tsbb = ParallelOR(tsbb_vec) -// val tsbbIdx = OHToUInt(HighestBit(VecInit(tsbb_vec).asUInt, DecodeWidth).asUInt) -// val tsbbTaken = Mux(LBstate === s_fill, out_isTaken(tsbbIdx), true.B) -// -// val has_branch = ParallelOR((0 until DecodeWidth).map(i => io.out(i).fire && i.U > sbbIdx && !sbb_vec(i) && out_isTaken(i))) -// -// def flush() = { -// XSDebug("Loop Buffer Flushed.\n") -// LBstate := s_idle -// for(i <- 0 until IBufSize) { -// lbuf(i).inst := 0.U // Delete can improve performance? -// lbuf(i).pc := 0.U // Delete can improve performance? 
-// lbuf_valid(i) := false.B -// } -// head_ptr := 0.U -// tail_ptr := 0.U -// } -// -// // clean invalid insts in LB when out FILL state -// def cleanFILL(str: UInt, end: UInt): Unit = { -// for(i <- 0 until IBufSize) { -// when(str <= end && (str <= i.U && i.U < end)) { -// lbuf_valid(i) := false.B -// }.elsewhen(str > end && (str <= i.U || i.U < end)) { -// lbuf_valid(i) := false.B -// } -// } -// // when(str <= end) { -// // for(i <- 0 until IBufSize) { -// // lbuf_valid(i) := (str > i.U || i.U >= end) && lbuf_valid(i) -// // } -// // }.otherwise { -// // for(i <- 0 until IBufSize) { -// // lbuf_valid(i) := (str <= i.U && i.U < end) && lbuf_valid(i) -// // } -// // } -// } -// -// /*---------------*/ -// /* Dequeue */ -// /*---------------*/ -// var deq_idx = WireInit(0.U(log2Up(DecodeWidth+2).W)) -// -// when(LBstate =/= s_active) { -// for(i <- 0 until DecodeWidth) { -// // io.out(i).valid := !isEmpty(head_ptr + deq_idx, tail_ptr) && lbuf_valid(head_ptr + deq_idx) -// io.out(i).valid := lbuf_valid(head_ptr + deq_idx) -// -// when(io.out(i).fire){ -// io.out(i).bits.instr := lbuf(head_ptr + deq_idx).inst -// io.out(i).bits.pc := lbuf(head_ptr + deq_idx).pc -// io.out(i).bits.fetchOffset := lbuf(head_ptr + deq_idx).fetchOffset -// io.out(i).bits.pnpc := lbuf(head_ptr + deq_idx).pnpc -// io.out(i).bits.hist := lbuf(head_ptr + deq_idx).hist -// io.out(i).bits.btbPredCtr := lbuf(head_ptr + deq_idx).btbPredCtr -// io.out(i).bits.btbHit := lbuf(head_ptr + deq_idx).btbHit -// io.out(i).bits.tageMeta := lbuf(head_ptr + deq_idx).tageMeta -// io.out(i).bits.rasSp := lbuf(head_ptr + deq_idx).rasSp -// io.out(i).bits.rasTopCtr := lbuf(head_ptr + deq_idx).rasTopCtr -// io.out(i).bits.isRVC := false.B -// lbuf_valid(head_ptr + deq_idx) := (lbuf_valid(head_ptr + deq_idx) && LBstate === s_fill) || (has_sbb && sbbTaken && !has_branch && i.U > sbbIdx) -// out_isTaken(i) := lbuf(head_ptr + deq_idx).isTaken -// }.otherwise { -// io.out(i).bits <> DontCare -// } -// -// // 
XSDebug("deq_idx=%d\n", deq_idx) -// deq_idx = deq_idx + (lbuf_valid(head_ptr + deq_idx) && io.out(i).fire) -// } -// -// head_ptr := head_ptr + deq_idx -// }.otherwise { -// deq_idx = 0.U -// for(i <- 0 until DecodeWidth) { -// io.out(i).valid := deq_idx =/= DecodeWidth.U + 1.U && lbuf(loop_ptr + deq_idx).pc <= tsbbPC -// -// when(io.out(i).fire) { -// io.out(i).bits.instr := lbuf(loop_ptr + deq_idx).inst -// io.out(i).bits.pc := lbuf(loop_ptr + deq_idx).pc -// io.out(i).bits.fetchOffset := lbuf(loop_ptr + deq_idx).fetchOffset -// io.out(i).bits.pnpc := lbuf(loop_ptr + deq_idx).pnpc -// io.out(i).bits.hist := lbuf(loop_ptr + deq_idx).hist -// io.out(i).bits.btbPredCtr := lbuf(loop_ptr + deq_idx).btbPredCtr -// io.out(i).bits.btbHit := lbuf(loop_ptr + deq_idx).btbHit -// io.out(i).bits.tageMeta := lbuf(loop_ptr + deq_idx).tageMeta -// io.out(i).bits.rasSp := lbuf(loop_ptr + deq_idx).rasSp -// io.out(i).bits.rasTopCtr := lbuf(loop_ptr + deq_idx).rasTopCtr -// io.out(i).bits.isRVC := false.B -// // out_isTaken(i) := lbuf(loop_ptr + deq_idx).isTaken -// }.otherwise { -// io.out(i).bits <> DontCare -// } -// -// // deq_idx = Mux(deq_idx === DecodeWidth.U + 1.U || loop_ptr + deq_idx === loop_end, DecodeWidth.U + 1.U, deq_idx + io.out(i).fire) -// deq_idx = PriorityMux(Seq( -// (!io.out(i).fire || deq_idx === DecodeWidth.U + 1.U) -> deq_idx, -// (loop_ptr + deq_idx === loop_end) -> (DecodeWidth.U + 1.U), -// (loop_ptr + deq_idx =/= loop_end) -> (deq_idx + 1.U) -// )) -// } -// -// val next_loop_ptr = Mux(deq_idx === DecodeWidth.U + 1.U, loop_str, loop_ptr + deq_idx) -// loop_ptr := next_loop_ptr -// // XSDebug("deq_idx = %d\n", deq_idx) -// // XSDebug("loop_ptr = %d\n", Mux(deq_idx === DecodeWidth.U, loop_str, loop_ptr + deq_idx)) -// } -// -// val offsetCounterWire = WireInit(offsetCounter + (PopCount((0 until DecodeWidth).map(io.out(_).fire())) << 1).asUInt) -// offsetCounter := offsetCounterWire -// XSDebug("countReg=%b\n", offsetCounterWire) -// -// 
/*---------------*/ -// /* Enqueue */ -// /*---------------*/ -// var enq_idx = 0.U(log2Up(FetchWidth+1).W) -// -// io.in.ready := LBstate =/= s_active && !isOverflow(tail_ptr + FetchWidth.U - 1.U) -// -// when(io.in.fire()){ -// for(i <- 0 until FetchWidth) { -// lbuf(tail_ptr + enq_idx).inst := io.in.bits.instrs(i) -// lbuf(tail_ptr + enq_idx).pc := io.in.bits.pc + (enq_idx << 2).asUInt -// lbuf(tail_ptr + enq_idx).pnpc := io.in.bits.pnpc(i<<1) -// lbuf(tail_ptr + enq_idx).fetchOffset := (enq_idx<<2).asUInt -// lbuf(tail_ptr + enq_idx).hist := io.in.bits.hist(i<<1) -// lbuf(tail_ptr + enq_idx).btbPredCtr := io.in.bits.predCtr(i<<1) -// lbuf(tail_ptr + enq_idx).btbHit := io.in.bits.btbHit(i<<1) -// lbuf(tail_ptr + enq_idx).tageMeta := io.in.bits.tageMeta(i<<1) -// lbuf(tail_ptr + enq_idx).rasSp := io.in.bits.rasSp -// lbuf(tail_ptr + enq_idx).rasTopCtr := io.in.bits.rasTopCtr -// -// lbuf_valid(tail_ptr + enq_idx) := io.in.bits.mask(i<<1) // FIXME: need fix me when support RVC -// lbuf(tail_ptr + enq_idx).isTaken := io.in.bits.branchInfo(i) // isTaken can reduce to LBufSize/FetchWidth -// // lbuf(tail_ptr + enq_idx).isTaken := false.B // isTaken can reduce to LBufSize/FetchWidth -// -// enq_idx = enq_idx + io.in.bits.mask(i<<1) -// } -// tail_ptr := tail_ptr + enq_idx -// } -// -// /*-----------------------*/ -// /* Loop Buffer FSM */ -// /*-----------------------*/ -// switch(LBstate) { -// is(s_idle) { -// // To FILL -// // 检测到sbb且跳转,sbb成为triggrting sbb -// XSDebug(has_sbb, "SBB detected\n") -// when(has_sbb && sbbTaken && !has_branch) { -// LBstate := s_fill -// XSDebug("State change: FILL\n") -// offsetCounter := Cat("b1".U, SBBOffset(io.out(sbbIdx).bits.instr)) + ((DecodeWidth.U - sbbIdx)<<1).asUInt -// tsbbPC := io.out(sbbIdx).bits.pc -// loop_str := head_ptr + sbbIdx + 1.U -// XSDebug("loop_str=%d\n", head_ptr + sbbIdx + 1.U) -// } -// } -// is(s_fill) { -// when(offsetCounterWire((log2Up(IBufSize)+2)-1) === 0.U && has_tsbb) { -// when(tsbbTaken) { -// // 
To ACTIVE -// // triggering sbb造成cof -// LBstate := s_active -// XSDebug("State change: ACTIVE\n") -// loop_end := head_ptr + tsbbIdx -// XSDebug("loop_end=%d\n", head_ptr + tsbbIdx) -// // This is so ugly -// loop_ptr := loop_str + PopCount((0 until DecodeWidth).map(io.out(_).fire())) - tsbbIdx - 1.U -// }.otherwise { -// // triggering sbb不跳转 -// // To IDLE -// LBstate := s_idle -// cleanFILL(loop_str, head_ptr + PopCount((0 until DecodeWidth).map(io.out(_).fire()))) -// XSDebug("State change: IDLE\n") -// } -// } -// -// // 非triggering sbb造成的cof -// // 遇到过一个周期内跳转为ACTIVE后又跳转为IDLE,无法重现 -// // when(ParallelOR((0 until DecodeWidth).map(i => io.out(i).valid && !isSBB(io.out(i).bits.instr) && isJal(io.out(i).bits.instr) && out_isTaken(i))).asBool()) { -// when(ParallelOR((0 until DecodeWidth).map(i => out_isTaken(i) && io.out(i).bits.pc =/= tsbbPC))) { -// // To IDLE -// LBstate := s_idle -// cleanFILL(loop_str, head_ptr + PopCount((0 until DecodeWidth).map(io.out(_).fire()))) -// XSDebug("State change: IDLE\n") -// } -// } -// is(s_active) { -// // To IDLE -// // triggering sbb不跳转 -// when(has_tsbb && !tsbbTaken) { -// // To IDLE -// XSDebug("tsbb not taken, State change: IDLE\n") -// flush() -// } -// -// // 非triggering sbb造成的cof -// when(ParallelOR((0 until DecodeWidth).map(i => out_isTaken(i) && io.out(i).bits.pc =/= tsbbPC))) { -// // To IDLE -// XSDebug("cof by other inst, State change: IDLE\n") -// flush() -// } -// } -// } -// -// // flush -// when(io.flush) { -// flush() -// } -// -// // Debug Info -// // XSDebug(io.in.fire(), p"PC= ${Hexadecimal(io.in.bits.pc)}\n") -// XSDebug(io.flush, "Loop Buffer Flushed\n") -// -// XSDebug(LBstate === s_idle, "Current state: IDLE\n") -// XSDebug(LBstate === s_fill, "Current state: FILL\n") -// XSDebug(LBstate === s_active, "Current state: ACTIVE\n") -// -// when(io.in.valid) { -// XSDebug("Enque:\n") -// XSDebug(p"PC=${Hexadecimal(io.in.bits.pc)} MASK=${Binary(io.in.bits.mask)}\n") -// for(i <- 0 until FetchWidth){ -// 
XSDebug(p"${Hexadecimal(io.in.bits.instrs(i))} v=${io.in.valid} r=${io.in.ready} t=${io.in.bits.branchInfo(i)}\n") -// } -// } -// -//// when((0 until DecodeWidth).map(i => io.out(i).ready).reduce(_||_)){ -// XSDebug("Deque:\n") -// for(i <- 0 until DecodeWidth){ -// XSDebug(p"${Hexadecimal(io.out(i).bits.instr)} pnpc=${Hexadecimal(io.out(i).bits.pnpc)} PC=${Hexadecimal(io.out(i).bits.pc)} v=${io.out(i).valid} r=${io.out(i).ready} t=${out_isTaken(i)}\n") -// } -//// } -// -// XSDebug(p"last_head_ptr=$head_ptr last_tail_ptr=$tail_ptr\n") -// -//// Print loop buffer -// for(i <- 0 until IBufSize/8) { -// XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", -// lbuf(i*8+0).inst, lbuf_valid(i*8+0), -// lbuf(i*8+1).inst, lbuf_valid(i*8+1), -// lbuf(i*8+2).inst, lbuf_valid(i*8+2), -// lbuf(i*8+3).inst, lbuf_valid(i*8+3), -// lbuf(i*8+4).inst, lbuf_valid(i*8+4), -// lbuf(i*8+5).inst, lbuf_valid(i*8+5), -// lbuf(i*8+6).inst, lbuf_valid(i*8+6), -// lbuf(i*8+7).inst, lbuf_valid(i*8+7) -// ) -// } +class LoopBufferParameters extends XSBundle { + val LBredirect = ValidIO(UInt(VAddrBits.W)) + val fetchReq = Input(UInt(VAddrBits.W)) + val noTakenMask = Input(UInt(PredictWidth.W)) } + +class LoopBufferIO extends XSBundle { + val flush = Input(Bool()) + val in = Flipped(DecoupledIO(new FetchPacket)) + val out = ValidIO(new ICacheResp) + val loopBufPar = new LoopBufferParameters +} + +class FakeLoopBuffer extends XSModule { + val io = IO(new LoopBufferIO) + + io.out <> DontCare + io.out.valid := false.B + io.in.ready := false.B + io.loopBufPar <> DontCare + io.loopBufPar.LBredirect.valid := false.B +} + +class LoopBuffer extends XSModule with HasIFUConst{ + val io = IO(new LoopBufferIO) + + // FSM state define + val s_idle :: s_fill :: s_active :: Nil = Enum(3) + val LBstate = RegInit(s_idle) + + io.out <> DontCare + io.out.valid := LBstate === s_active + io.in.ready := true.B + + class LBufEntry extends XSBundle { + val inst = UInt(16.W) + // 
val tag = UInt(tagBits.W) + } + + def sbboffset(inst: UInt) = { + val isJal = inst === BitPat("b1111_???????_111111111_?????_1101111") + val isCon = inst === BitPat("b1111???_?????_?????_???_????1_1100011") + val isRVCJal = inst === BitPat("b????????????????_001_1?111??????_01") + val isRVCCon = inst === BitPat("b????????????????_11?_1??_???_?????_01") + + val rst = PriorityMux(Seq( + isJal -> inst(27, 21), + isCon -> Cat(inst(27,25), inst(11,8)), + isRVCJal -> Cat(inst(6), inst(7), inst(2), inst(11), inst(5,3)), + isRVCCon -> Cat(inst(6), inst(5), inst(2), inst(11,10), inst(4,3)), + true.B -> 0.U(7.W) + )) + + ((~rst).asUInt + 1.U, rst) + } + + def isSBB(inst: UInt): Bool = { + val sbboffsetWire = WireInit(sbboffset(inst)._1) + sbboffsetWire > 0.U && sbboffsetWire <= 112.U // TODO < 56.U + } + + // predTaken to OH + val predTakenVec = Mux(io.in.bits.predTaken, Reverse(PriorityEncoderOH(Reverse(io.in.bits.mask))), 0.U(PredictWidth.W)) + + // Loop detect register + val offsetCounter = Reg(UInt((log2Up(IBufSize)+2).W)) + val tsbbPC = RegInit(0.U(VAddrBits.W)) + + val brTaken = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && predTakenVec(i))).orR() + val brIdx = OHToUInt(predTakenVec.asUInt) + val sbbTaken = brTaken && isSBB(io.in.bits.instrs(brIdx)) + + val tsbbVec = Cat((0 until PredictWidth).map(i => io.in.fire && io.in.bits.mask(i) && io.in.bits.pc(i) === tsbbPC)) + val hasTsbb = tsbbVec.orR() + val tsbbIdx = OHToUInt(Reverse(tsbbVec)) + val tsbbTaken = brTaken && io.in.bits.pc(brIdx) === tsbbPC + + val buffer = Mem(IBufSize*2, new LBufEntry) + val bufferValid = RegInit(VecInit(Seq.fill(IBufSize*2)(false.B))) + + val redirect_pc = io.in.bits.pnpc(PredictWidth.U - PriorityEncoder(Reverse(io.in.bits.mask)) - 1.U) + + def flush() = { + XSDebug("Loop Buffer Flushed.\n") + LBstate := s_idle + for(i <- 0 until IBufSize*2) { + // buffer(i).inst := 0.U // TODO: This is to make the debugging information clearer, this can be deleted + bufferValid(i) 
:= false.B + } + } + + // Enque loop body + when(io.in.fire && LBstate === s_fill) { + io.loopBufPar.noTakenMask.asBools().zipWithIndex.map {case(m, i) => + when(m) { + buffer(io.in.bits.pc(i)(7,1)).inst := io.in.bits.instrs(i)(15, 0) + bufferValid(io.in.bits.pc(i)(7,1)) := true.B + when(!io.in.bits.pd(i).isRVC) { + buffer(io.in.bits.pc(i)(7,1) + 1.U).inst := io.in.bits.instrs(i)(31, 16) + bufferValid(io.in.bits.pc(i)(7,1) + 1.U) := true.B // May need to be considered already valid + } + } + } + } + + // This is ugly + val pcStep = (0 until PredictWidth).map(i => Mux(!io.in.fire || !io.in.bits.mask(i), 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) + val offsetCounterWire = WireInit(offsetCounter + pcStep) + offsetCounter := offsetCounterWire + + // Provide ICacheResp to IFU + when(LBstate === s_active) { + val offsetInBankWire = offsetInBank(io.loopBufPar.fetchReq) + io.out.bits.pc := io.loopBufPar.fetchReq + io.out.bits.data := Cat((15 to 0 by -1).map(i => buffer(io.loopBufPar.fetchReq(7,1) + i.U).inst)) >> Cat(offsetInBankWire, 0.U(4.W)) + io.out.bits.mask := Cat((15 to 0 by -1).map(i => bufferValid(io.loopBufPar.fetchReq(7,1) + i.U))) >> offsetInBankWire + io.out.bits.ipf := false.B + } + + io.loopBufPar.LBredirect.valid := false.B + io.loopBufPar.LBredirect.bits := DontCare + + /*-----------------------*/ + /* Loop Buffer FSM */ + /*-----------------------*/ + when(io.in.fire) { + switch(LBstate) { + is(s_idle) { + // To FILL + // 检测到sbb且跳转,sbb成为triggering sbb + when(sbbTaken) { + LBstate := s_fill + XSDebug("State change: FILL\n") + // This is ugly + // offsetCounter := Cat("b1".U, sbboffset(io.in.bits.instrs(brIdx))) + + // (0 until PredictWidth).map(i => Mux(!io.in.bits.mask(i) || i.U < brIdx, 0.U, Mux(io.in.bits.pd(i).isRVC, 1.U, 2.U))).fold(0.U(log2Up(16+1).W))(_+_) + offsetCounter := Cat("b1".U, sbboffset(io.in.bits.instrs(brIdx))._2) + tsbbPC := io.in.bits.pc(brIdx) + } + } + is(s_fill) { + // To AVTIVE + // triggering sbb 
造成cof + when(offsetCounterWire((log2Up(IBufSize)+2)-1) === 0.U){ + when(hasTsbb && tsbbTaken) { + LBstate := s_active + XSDebug("State change: ACTIVE\n") + }.otherwise { + LBstate := s_idle + XSDebug("State change: IDLE\n") + flush() + } + } + + when(brTaken && !tsbbTaken) { + // To IDLE + LBstate := s_idle + XSDebug("State change: IDLE\n") + flush() + } + } + is(s_active) { + // To IDLE + // triggering sbb不跳转 退出循环 + when(hasTsbb && !tsbbTaken) { + XSDebug("tsbb not taken, State change: IDLE\n") + LBstate := s_idle + io.loopBufPar.LBredirect.valid := true.B + io.loopBufPar.LBredirect.bits := redirect_pc + XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") + flush() + } + + when(brTaken && !tsbbTaken) { + XSDebug("cof by other inst, State change: IDLE\n") + LBstate := s_idle + io.loopBufPar.LBredirect.valid := true.B + io.loopBufPar.LBredirect.bits := redirect_pc + XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") + flush() + } + + when(hasTsbb && brTaken && !tsbbTaken) { + XSDebug("tsbb and cof, State change: IDLE\n") + LBstate := s_idle + io.loopBufPar.LBredirect.valid := true.B + io.loopBufPar.LBredirect.bits := redirect_pc + XSDebug(p"redirect pc=${Hexadecimal(redirect_pc)}\n") + flush() + } + } + } + } + + when(io.flush){ + flush() + } + + // XSDebug(io.flush, "LoopBuffer Flushed\n") + // if (!env.FPGAPlatform ) { + // ExcitingUtils.addSource(LBstate === s_active && hasTsbb && !tsbbTaken, "CntExitLoop1", Perf) + // ExcitingUtils.addSource(LBstate === s_active && brTaken && !tsbbTaken, "CntExitLoop2", Perf) + // ExcitingUtils.addSource(LBstate === s_active && hasTsbb && brTaken && !tsbbTaken, "CntExitLoop3", Perf) + // } + + XSDebug(LBstate === s_idle, "Current state: IDLE\n") + XSDebug(LBstate === s_fill, "Current state: FILL\n") + XSDebug(LBstate === s_active, "Current state: ACTIVE\n") + + XSDebug(p"offsetCounter = ${Binary(offsetCounterWire)}\n") + XSDebug(p"tsbbIdx = ${tsbbIdx}\n") + when(io.in.fire) { + XSDebug("Enque:\n") + XSDebug(brTaken, 
p"Detected jump, idx=${brIdx}\n") + XSDebug(p"predTaken=${io.in.bits.predTaken}, predTakenVec=${Binary(predTakenVec)}\n") + XSDebug(p"MASK=${Binary(io.in.bits.mask)}\n") + for(i <- 0 until PredictWidth){ + XSDebug(p"PC=${Hexadecimal(io.in.bits.pc(i))} ${Hexadecimal(io.in.bits.instrs(i))}\n") + } + } + + XSDebug("LoopBuffer:\n") + for(i <- 0 until IBufSize*2/8) { + XSDebug("%x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b | %x v:%b\n", + buffer(i*8+0).inst, bufferValid(i*8+0), + buffer(i*8+1).inst, bufferValid(i*8+1), + buffer(i*8+2).inst, bufferValid(i*8+2), + buffer(i*8+3).inst, bufferValid(i*8+3), + buffer(i*8+4).inst, bufferValid(i*8+4), + buffer(i*8+5).inst, bufferValid(i*8+5), + buffer(i*8+6).inst, bufferValid(i*8+6), + buffer(i*8+7).inst, bufferValid(i*8+7) + ) + } + + XSDebug(io.out.valid, p"fetch pc: ${Hexadecimal(io.loopBufPar.fetchReq)}\n") + XSDebug(io.out.valid, p"fetchIdx: ${io.loopBufPar.fetchReq(7,1)}\n") + XSDebug(io.out.valid, p"out data: ${Hexadecimal(io.out.bits.data)}\n") + XSDebug(io.out.valid, p"out mask: ${Binary(io.out.bits.mask)}\n") + XSDebug(io.out.valid, p"out pc : ${Hexadecimal(io.out.bits.pc)}\n") +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/LoopPredictor.scala b/src/main/scala/xiangshan/frontend/LoopPredictor.scala index 29ecee27df8e5a117f0a605165b164ea103d053c..38447a8d022101d1015c9d88314103187f95f375 100644 --- a/src/main/scala/xiangshan/frontend/LoopPredictor.scala +++ b/src/main/scala/xiangshan/frontend/LoopPredictor.scala @@ -5,6 +5,7 @@ import chisel3.util._ import xiangshan._ import utils._ import xiangshan.backend.brq.BrqPtr +import chisel3.experimental.chiselName trait LTBParams extends HasXSParameter with HasBPUParameter { // +-----------+---------+--------------+-----------+ @@ -64,17 +65,62 @@ class LTBColumnUpdate extends LTBBundle { } // each column/bank of Loop Termination Buffer +@chiselName class LTBColumn extends LTBModule { val io = IO(new Bundle() { // if3 send 
req - val req = Input(Valid(new LTBColumnReq)) + val req = Input(new LTBColumnReq) + val if3_fire = Input(Bool()) + val if4_fire = Input(Bool()) + val outMask = Input(Bool()) // send out resp to if4 val resp = Output(new LTBColumnResp) val update = Input(Valid(new LTBColumnUpdate)) val repair = Input(Bool()) // roll back specCnts in the other 15 LTBs }) - val ltb = Reg(Vec(nRows, new LoopEntry)) + class LTBMem extends LTBModule { + val io = IO(new Bundle { + val rIdx = Input(UInt(idxLen.W)) + val rdata = Output(new LoopEntry) + val urIdx = Input(UInt(idxLen.W)) + val urdata = Output(new LoopEntry) + val wen = Input(Bool()) + val wIdx = Input(UInt(idxLen.W)) + val wdata = Input(new LoopEntry) + val swen = Input(Bool()) + val swIdx = Input(UInt(idxLen.W)) + val swdata = Input(new LoopEntry) + val copyCnt = Input(Vec(nRows, Bool())) + }) + + // val mem = RegInit(0.U.asTypeOf(Vec(nRows, new LoopEntry))) + val mem = Mem(nRows, new LoopEntry) + io.rdata := mem(io.rIdx) + io.urdata := mem(io.urIdx) + val wdata = WireInit(io.wdata) + val swdata = WireInit(io.swdata) + for (i <- 0 until nRows) { + val copyValid = io.copyCnt(i) + when (copyValid && io.swIdx === i.U && io.swen) { + swdata.specCnt := mem(i).nSpecCnt + } + val wd = WireInit(mem(i)) // default for copycnt + val wen = WireInit(io.copyCnt(i) || io.wen && io.wIdx === i.U || io.swen && io.swIdx === i.U) + when (!copyValid) { + when (io.swen) { + wd := swdata + }.elsewhen (io.wen) { + wd := wdata + } + } + when (wen) { + mem.write(i.U, wd) + } + } + } + // val ltb = Reg(Vec(nRows, new LoopEntry)) + val ltb = Module(new LTBMem).io val ltbAddr = new TableAddr(idxLen + 4, PredictWidth) val updateIdx = ltbAddr.getBankIdx(io.update.bits.pc) val updateTag = ltbAddr.getTag(io.update.bits.pc)(tagLen - 1, 0) @@ -86,21 +132,32 @@ class LTBColumn extends LTBModule { when (resetIdx === (nRows - 1).U) { doingReset := false.B } // during branch prediction - val if3_idx = io.req.bits.idx - val if3_tag = io.req.bits.tag - val if3_pc 
= io.req.bits.pc // only for debug - val if3_entry = WireInit(ltb(if3_idx)) + val if4_idx = io.req.idx + val if4_tag = io.req.tag + val if4_pc = io.req.pc // only for debug + ltb.rIdx := if4_idx + val if4_entry = WireInit(ltb.rdata) - io.resp.meta := RegEnable(if3_entry.specCnt + 1.U, io.req.valid) - // io.resp.exit := RegNext(if3_tag === if3_entry.tag && (if3_entry.specCnt + 1.U) === if3_entry.tripCnt/* && if3_entry.isConf*/ && io.req.valid) - io.resp.exit := RegEnable(if3_tag === if3_entry.tag && (if3_entry.specCnt + 1.U) === if3_entry.tripCnt && io.req.valid && !if3_entry.unusable, io.req.valid) + val valid = RegInit(false.B) + when (io.if4_fire) { valid := false.B } + when (io.if3_fire) { valid := true.B } + when (io.update.valid && io.update.bits.misPred) { valid := false.B } + + io.resp.meta := if4_entry.specCnt + 1.U + // io.resp.exit := if4_tag === if4_entry.tag && (if4_entry.specCnt + 1.U) === if4_entry.tripCnt && valid && !if4_entry.unusable + io.resp.exit := if4_tag === if4_entry.tag && (if4_entry.specCnt + 1.U) === if4_entry.tripCnt && valid && if4_entry.isConf // when resolving a branch - val entry = ltb(updateIdx) + ltb.urIdx := updateIdx + val entry = ltb.urdata val tagMatch = entry.tag === updateTag val cntMatch = entry.tripCnt === io.update.bits.meta val wEntry = WireInit(entry) + ltb.wIdx := updateIdx + ltb.wdata := wEntry + ltb.wen := false.B + when (io.update.valid && !doingReset) { // When a branch resolves and is found to not be in the LTB, // it is inserted into the LTB if determined to be a loop-branch and if it is mispredicted by the default predictor. 
@@ -109,20 +166,25 @@ class LTBColumn extends LTBModule { wEntry.conf := 0.U wEntry.age := 7.U wEntry.tripCnt := Fill(cntBits, 1.U(1.W)) - wEntry.specCnt := 1.U - wEntry.nSpecCnt := 1.U + wEntry.specCnt := Mux(io.update.bits.taken, 1.U, 0.U) + wEntry.nSpecCnt := Mux(io.update.bits.taken, 1.U, 0.U) wEntry.brTag := updateBrTag wEntry.unusable := false.B - ltb(updateIdx) := wEntry + // ltb(updateIdx) := wEntry + ltb.wen := true.B }.elsewhen (tagMatch) { // During resolution, a taken branch found in the LTB has its nSpecCnt incremented by one. when (io.update.bits.taken) { wEntry.nSpecCnt := entry.nSpecCnt + 1.U wEntry.specCnt := Mux(io.update.bits.misPred/* && !entry.brTag.needBrFlush(updateBrTag)*/, entry.nSpecCnt + 1.U, entry.specCnt) + wEntry.conf := Mux(io.update.bits.misPred, 0.U, entry.conf) + // wEntry.tripCnt := Fill(cntBits, 1.U(1.W)) + wEntry.tripCnt := Mux(io.update.bits.misPred, Fill(cntBits, 1.U(1.W)), entry.tripCnt) // A not-taken loop-branch found in the LTB during branch resolution updates its trip count and conf. 
}.otherwise { // wEntry.conf := Mux(entry.nSpecCnt === entry.tripCnt, Mux(entry.isLearned, 7.U, entry.conf + 1.U), 0.U) - wEntry.conf := Mux(io.update.bits.misPred, 0.U, Mux(entry.isLearned, 7.U, entry.conf + 1.U)) + // wEntry.conf := Mux(io.update.bits.misPred, 0.U, Mux(entry.isLearned, 7.U, entry.conf + 1.U)) + wEntry.conf := Mux((entry.nSpecCnt + 1.U) === entry.tripCnt, Mux(entry.isLearned, 7.U, entry.conf + 1.U), 0.U) // wEntry.tripCnt := entry.nSpecCnt + 1.U wEntry.tripCnt := io.update.bits.meta wEntry.specCnt := Mux(io.update.bits.misPred, 0.U, entry.specCnt/* - entry.nSpecCnt - 1.U*/) @@ -130,65 +192,68 @@ class LTBColumn extends LTBModule { wEntry.brTag := updateBrTag wEntry.unusable := io.update.bits.misPred && (io.update.bits.meta > entry.tripCnt) } - ltb(updateIdx) := wEntry + // ltb(updateIdx) := wEntry + ltb.wen := true.B } } // speculatively update specCnt - when (io.req.valid && if3_entry.tag === if3_tag) { - when ((if3_entry.specCnt + 1.U) === if3_entry.tripCnt/* && if3_entry.isConf*/) { - ltb(if3_idx).age := 7.U - ltb(if3_idx).specCnt := 0.U + ltb.swen := valid && if4_entry.tag === if4_tag || doingReset + ltb.swIdx := Mux(doingReset, resetIdx, if4_idx) + val swEntry = WireInit(if4_entry) + ltb.swdata := Mux(doingReset, 0.U.asTypeOf(new LoopEntry), swEntry) + when (io.if4_fire && if4_entry.tag === if4_tag && io.outMask) { + when ((if4_entry.specCnt + 1.U) === if4_entry.tripCnt/* && if4_entry.isConf*/) { + swEntry.age := 7.U + swEntry.specCnt := 0.U }.otherwise { - ltb(if3_idx).age := Mux(if3_entry.age === 7.U, 7.U, if3_entry.age + 1.U) - ltb(if3_idx).specCnt := if3_entry.specCnt + 1.U + swEntry.age := Mux(if4_entry.age === 7.U, 7.U, if4_entry.age + 1.U) + swEntry.specCnt := if4_entry.specCnt + 1.U } } // Reseting - when (doingReset) { - ltb(resetIdx) := 0.U.asTypeOf(new LoopEntry) - } + // when (doingReset) { + // ltb(resetIdx) := 0.U.asTypeOf(new LoopEntry) + // } // when a branch misprediction occurs, all of the nSpecCnts copy their values into 
the specCnts for (i <- 0 until nRows) { - when (io.update.valid && io.update.bits.misPred && i.U =/= updateIdx || io.repair) { - ltb(i).specCnt := ltb(i).nSpecCnt - } + ltb.copyCnt(i) := io.update.valid && io.update.bits.misPred && i.U =/= updateIdx || io.repair } - // bypass for if3_entry.specCnt - when (io.update.valid && !doingReset && io.req.valid && updateIdx === if3_idx) { + // bypass for if4_entry.specCnt + when (io.update.valid && !doingReset && valid && updateIdx === if4_idx) { when (!tagMatch && io.update.bits.misPred || tagMatch) { - if3_entry.specCnt := wEntry.specCnt + swEntry.specCnt := wEntry.specCnt } } - when (io.repair && !doingReset && io.req.valid) { - if3_entry.specCnt := if3_entry.nSpecCnt + when (io.repair && !doingReset && valid) { + swEntry.specCnt := if4_entry.nSpecCnt } if (BPUDebug && debug) { //debug info XSDebug(doingReset, "Reseting...\n") - XSDebug("[IF3][req] v=%d pc=%x idx=%x tag=%x\n", io.req.valid, io.req.bits.pc, io.req.bits.idx, io.req.bits.tag) - XSDebug("[IF3][if3_entry] tag=%x conf=%d age=%d tripCnt=%d specCnt=%d nSpecCnt=%d", if3_entry.tag, if3_entry.conf, if3_entry.age, if3_entry.tripCnt, if3_entry.specCnt, if3_entry.nSpecCnt) - XSDebug(false, true.B, p" brTag=${if3_entry.brTag}\n") - // XSDebug("[IF4] idx=%x tag=%x specCnt=%d\n", if4_idx, if4_tag, if4_specCnt) - // XSDebug(RegNext(io.req.valid) && if4_entry.tag === if4_tag, "[IF4][speculative update] new specCnt=%d\n", - // Mux(if4_specCnt === if4_entry.tripCnt && if4_entry.isLearned, 0.U, if4_specCnt + 1.U)) - XSDebug(io.req.valid && if3_entry.tag === if3_tag, "[IF3][speculative update] new specCnt=%d\n", - Mux(if3_entry.specCnt === if3_entry.tripCnt && if3_entry.isConf, 0.U, if3_entry.specCnt + 1.U)) + XSDebug("if3_fire=%d if4_fire=%d valid=%d\n", io.if3_fire, io.if4_fire,valid) + XSDebug("[req] v=%d pc=%x idx=%x tag=%x\n", valid, io.req.pc, io.req.idx, io.req.tag) + XSDebug("[if4_entry] tag=%x conf=%d age=%d tripCnt=%d specCnt=%d nSpecCnt=%d", + if4_entry.tag, 
if4_entry.conf, if4_entry.age, if4_entry.tripCnt, if4_entry.specCnt, if4_entry.nSpecCnt) + XSDebug(false, true.B, p" brTag=${if4_entry.brTag} unusable=${if4_entry.unusable}\n") + XSDebug(io.if4_fire && if4_entry.tag === if4_tag && io.outMask, "[speculative update] new specCnt=%d\n", + Mux((if4_entry.specCnt + 1.U) === if4_entry.tripCnt, 0.U, if4_entry.specCnt + 1.U)) XSDebug("[update] v=%d misPred=%d pc=%x idx=%x tag=%x meta=%d taken=%d tagMatch=%d cntMatch=%d", io.update.valid, io.update.bits.misPred, io.update.bits.pc, updateIdx, updateTag, io.update.bits.meta, io.update.bits.taken, tagMatch, cntMatch) XSDebug(false, true.B, p" brTag=${updateBrTag}\n") XSDebug("[entry ] tag=%x conf=%d age=%d tripCnt=%d specCnt=%d nSpecCnt=%d", entry.tag, entry.conf, entry.age, entry.tripCnt, entry.specCnt, entry.nSpecCnt) - XSDebug(false, true.B, p" brTag=${entry.brTag}\n") + XSDebug(false, true.B, p" brTag=${entry.brTag} unusable=${entry.unusable}\n") XSDebug("[wEntry] tag=%x conf=%d age=%d tripCnt=%d specCnt=%d nSpecCnt=%d", wEntry.tag, wEntry.conf, wEntry.age, wEntry.tripCnt, wEntry.specCnt, wEntry.nSpecCnt) - XSDebug(false, true.B, p" brTag=${wEntry.brTag}\n") + XSDebug(false, true.B, p" brTag=${wEntry.brTag} unusable=${wEntry.unusable}\n") XSDebug(io.update.valid && io.update.bits.misPred || io.repair, "MisPred or repairing, all of the nSpecCnts copy their values into the specCnts\n") } } +@chiselName class LoopPredictor extends BasePredictor with LTBParams { class LoopResp extends Resp { val exit = Vec(PredictWidth, Bool()) @@ -196,8 +261,13 @@ class LoopPredictor extends BasePredictor with LTBParams { class LoopMeta extends Meta { val specCnts = Vec(PredictWidth, UInt(cntBits.W)) } + class LoopRespIn extends XSBundle { + val taken = Bool() + val jmpIdx = UInt(log2Up(PredictWidth).W) + } class LoopIO extends DefaultBasePredictorIO { + val respIn = Input(new LoopRespIn) val resp = Output(new LoopResp) val meta = Output(new LoopMeta) } @@ -208,32 +278,37 @@ class 
LoopPredictor extends BasePredictor with LTBParams { val ltbAddr = new TableAddr(idxLen + 4, PredictWidth) - val baseBank = ltbAddr.getBank(io.pc.bits) - val baseRow = ltbAddr.getBankIdx(io.pc.bits) - val baseTag = ltbAddr.getTag(io.pc.bits) + // Latch for 1 cycle + val pc = RegEnable(io.pc.bits, io.pc.valid) + val inMask = RegEnable(io.inMask, io.pc.valid) + val baseBank = ltbAddr.getBank(pc) + val baseRow = ltbAddr.getBankIdx(pc) + val baseTag = ltbAddr.getTag(pc) val nextRowStartsUp = baseRow.andR // TODO: use parallel andR val isInNextRow = VecInit((0 until PredictWidth).map(_.U < baseBank)) val tagIncremented = VecInit((0 until PredictWidth).map(i => isInNextRow(i.U) && nextRowStartsUp)) val realTags = VecInit((0 until PredictWidth).map(i => Mux(tagIncremented(i), baseTag + 1.U, baseTag)(tagLen - 1, 0))) val bankIdxInOrder = VecInit((0 until PredictWidth).map(i => (baseBank +& i.U)(log2Up(PredictWidth) - 1, 0))) - val realMask = circularShiftLeft(io.inMask, PredictWidth, baseBank) + val realMask = circularShiftLeft(inMask, PredictWidth, baseBank) + val outMask = inMask & (Fill(PredictWidth, !io.respIn.taken) | (Fill(PredictWidth, 1.U(1.W)) >> (~io.respIn.jmpIdx))) for (i <- 0 until PredictWidth) { - ltbs(i).io.req.bits.pc := io.pc.bits + ltbs(i).io.req.pc := pc + ltbs(i).io.outMask := false.B for (j <- 0 until PredictWidth) { when (Mux(isInNextRow(i), baseBank + j.U === (PredictWidth + i).U, baseBank + j.U === i.U)) { - ltbs(i).io.req.bits.pc := io.pc.bits + (j.U << 1) + ltbs(i).io.req.pc := pc + (j.U << 1) + ltbs(i).io.outMask := outMask(j).asBool } } } for (i <- 0 until PredictWidth) { - ltbs(i).io.req.valid := io.pc.valid && !io.flush && realMask(i) - // ltbs(i).io.req.bits.pc := io.pc.bits + (bankIdxInOrder(i) << 1) // only for debug - ltbs(i).io.req.bits.idx := Mux(isInNextRow(i), baseRow + 1.U, baseRow) - ltbs(i).io.req.bits.tag := realTags(i) - // ltbs(i).io.if4_fire := io.if4_fire - // ltbs(i).io.update := io.update + ltbs(i).io.if3_fire := io.pc.valid 
+ ltbs(i).io.if4_fire := io.outFire + ltbs(i).io.req.idx := Mux(isInNextRow(i), baseRow + 1.U, baseRow) + ltbs(i).io.req.tag := realTags(i) + // ltbs(i).io.outMask := outMask(i) ltbs(i).io.update.valid := i.U === ltbAddr.getBank(io.update.bits.ui.pc) && io.update.valid && io.update.bits.ui.pd.isBr ltbs(i).io.update.bits.misPred := io.update.bits.ui.isMisPred ltbs(i).io.update.bits.pc := io.update.bits.ui.pc @@ -243,30 +318,28 @@ class LoopPredictor extends BasePredictor with LTBParams { ltbs(i).io.repair := i.U =/= ltbAddr.getBank(io.update.bits.ui.pc) && io.update.valid && io.update.bits.ui.isMisPred } - val baseBankLatch = RegEnable(baseBank, io.pc.valid) - // val bankIdxInOrder = VecInit((0 until PredictWidth).map(i => (baseBankLatch +& i.U)(log2Up(PredictWidth) - 1, 0)))] - val bankIdxInOrderLatch = RegEnable(bankIdxInOrder, io.pc.valid) val ltbResps = VecInit((0 until PredictWidth).map(i => ltbs(i).io.resp)) - (0 until PredictWidth).foreach(i => io.resp.exit(i) := ltbResps(bankIdxInOrderLatch(i)).exit) - (0 until PredictWidth).foreach(i => io.meta.specCnts(i) := ltbResps(bankIdxInOrderLatch(i)).meta) + (0 until PredictWidth).foreach(i => io.resp.exit(i) := ltbResps(bankIdxInOrder(i)).exit) + (0 until PredictWidth).foreach(i => io.meta.specCnts(i) := ltbResps(bankIdxInOrder(i)).meta) if (BPUDebug && debug) { // debug info - XSDebug("[IF3][req] fire=%d flush=%d fetchpc=%x baseBank=%x baseRow=%x baseTag=%x\n", io.pc.valid, io.flush, io.pc.bits, baseBank, baseRow, baseTag) - XSDebug("[IF3][req] isInNextRow=%b tagInc=%b\n", isInNextRow.asUInt, tagIncremented.asUInt) + XSDebug("[IF3][req] fire=%d flush=%d fetchpc=%x\n", io.pc.valid, io.flush, io.pc.bits) + XSDebug("[IF4][req] fire=%d baseBank=%x baseRow=%x baseTag=%x\n", io.outFire, baseBank, baseRow, baseTag) + XSDebug("[IF4][req] isInNextRow=%b tagInc=%b\n", isInNextRow.asUInt, tagIncremented.asUInt) for (i <- 0 until PredictWidth) { - XSDebug("[IF3][req] bank %d: v=%d mask=%d pc=%x idx=%x tag=%x\n", i.U, 
ltbs(i).io.req.valid, realMask(i), ltbs(i).io.req.bits.pc, ltbs(i).io.req.bits.idx, ltbs(i).io.req.bits.tag) + XSDebug("[IF4][req] bank %d: realMask=%d pc=%x idx=%x tag=%x\n", i.U, realMask(i), ltbs(i).io.req.pc, ltbs(i).io.req.idx, ltbs(i).io.req.tag) } - XSDebug("[IF4] baseBankLatch=%x bankIdxInOrderLatch=", baseBankLatch) + XSDebug("[IF4] baseBank=%x bankIdxInOrder=", baseBank) for (i <- 0 until PredictWidth) { - XSDebug(false, true.B, "%x ", bankIdxInOrderLatch(i)) + XSDebug(false, true.B, "%x ", bankIdxInOrder(i)) } XSDebug(false, true.B, "\n") for (i <- 0 until PredictWidth) { - XSDebug(RegNext(io.pc.valid) && (i.U === 0.U || i.U === 8.U), "[IF4][resps]") - XSDebug(false, RegNext(io.pc.valid), " %d:%d %d", i.U, io.resp.exit(i), io.meta.specCnts(i)) - XSDebug(false, RegNext(io.pc.valid) && (i.U === 7.U || i.U === 15.U), "\n") + XSDebug(io.outFire && (i.U === 0.U || i.U === 8.U), "[IF4][resps]") + XSDebug(false, io.outFire, " %d:%d %d", i.U, io.resp.exit(i), io.meta.specCnts(i)) + XSDebug(false, io.outFire && (i.U === 7.U || i.U === 15.U), "\n") } } } \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/PreDecode.scala b/src/main/scala/xiangshan/frontend/PreDecode.scala index f2dd0fd7bacac9fef773631c7d397aa38b4a12ee..e03d11d2b6361c6f40c412b9cf481a7f5dc4d918 100644 --- a/src/main/scala/xiangshan/frontend/PreDecode.scala +++ b/src/main/scala/xiangshan/frontend/PreDecode.scala @@ -2,7 +2,7 @@ package xiangshan.frontend import chisel3._ import chisel3.util._ -import utils.XSDebug +import utils._ import xiangshan._ import xiangshan.backend.decode.isa.predecode.PreDecodeInst import xiangshan.cache._ @@ -14,7 +14,7 @@ trait HasPdconst{ this: XSModule => val brType::Nil = ListLookup(instr, List(BrType.notBr), PreDecodeInst.brTable) val rd = Mux(isRVC(instr), instr(12), instr(11,7)) val rs = Mux(isRVC(instr), Mux(brType === BrType.jal, 0.U, instr(11, 7)), instr(19, 15)) - val isCall = (brType === BrType.jal || brType === BrType.jalr) && isLink(rd) 
+ val isCall = (brType === BrType.jal && !isRVC(instr) || brType === BrType.jalr) && isLink(rd) // Only for RV64 val isRet = brType === BrType.jalr && isLink(rs) && !isCall List(brType, isCall, isRet) } @@ -45,14 +45,16 @@ class PreDecodeInfo extends XSBundle { // 8 bit def notCFI = brType === BrType.notBr } -class PreDecodeResp extends XSBundle { +class PreDecodeResp extends XSBundle with HasIFUConst { val instrs = Vec(PredictWidth, UInt(32.W)) val pc = Vec(PredictWidth, UInt(VAddrBits.W)) val mask = UInt(PredictWidth.W) + // one for the first bank + val lastHalf = UInt(nBanksInPacket.W) val pd = Vec(PredictWidth, (new PreDecodeInfo)) } -class PreDecode extends XSModule with HasPdconst{ +class PreDecode extends XSModule with HasPdconst with HasIFUConst { val io = IO(new Bundle() { val in = Input(new ICacheResp) val prev = Flipped(ValidIO(UInt(16.W))) @@ -61,38 +63,53 @@ class PreDecode extends XSModule with HasPdconst{ val data = io.in.data val mask = io.in.mask + + val validCount = PopCount(mask) + val bankAlignedPC = bankAligned(io.in.pc) + val bankOffset = offsetInBank(io.in.pc) + val isAligned = bankOffset === 0.U + + val firstValidIdx = bankOffset // io.prev.valid should only occur with firstValidIdx = 0 + XSError(firstValidIdx =/= 0.U && io.prev.valid, p"pc:${io.in.pc}, mask:${io.in.mask}, prevhalfInst valid occurs on unaligned fetch packet\n") + // val lastHalfInstrIdx = Mux(isInLastBank(pc), (bankWidth-1).U, (bankWidth*2-1).U) + // in case loop buffer gives a packet ending at an unaligned position + val lastHalfInstrIdx = PriorityMux(Reverse(mask), (PredictWidth-1 to 0 by -1).map(i => i.U)) val insts = Wire(Vec(PredictWidth, UInt(32.W))) val instsMask = Wire(Vec(PredictWidth, Bool())) + val instsEndMask = Wire(Vec(PredictWidth, Bool())) val instsRVC = Wire(Vec(PredictWidth,Bool())) val instsPC = Wire(Vec(PredictWidth, UInt(VAddrBits.W))) + + val rawInsts = VecInit((0 until PredictWidth).map(i => if (i == PredictWidth-1) Cat(0.U(16.W), data(i*16+15, i*16)) 
+ else data(i*16+31, i*16))) // val nextHalf = Wire(UInt(16.W)) - val lastHalfInstrIdx = PopCount(mask) - 1.U + val lastHalf = Wire(Vec(nBanksInPacket, UInt(1.W))) for (i <- 0 until PredictWidth) { - val inst = Wire(UInt(32.W)) - val valid = Wire(Bool()) - val pc = io.in.pc + (i << 1).U - Mux(io.prev.valid && (i.U === 0.U), 2.U, 0.U) - - if (i==0) { - inst := Mux(io.prev.valid, Cat(data(15,0), io.prev.bits), data(31,0)) - // valid := Mux(lastHalfInstrIdx === 0.U, isRVC(inst), true.B) - valid := Mux(lastHalfInstrIdx === 0.U, Mux(!io.prev.valid, isRVC(inst), true.B), true.B) - } else if (i==1) { - inst := data(47,16) - valid := (io.prev.valid || !(instsMask(0) && !isRVC(insts(0)))) && Mux(lastHalfInstrIdx === 1.U, isRVC(inst), true.B) - } else if (i==PredictWidth-1) { - inst := Cat(0.U(16.W), data(i*16+15, i*16)) - valid := !(instsMask(i-1) && !isRVC(insts(i-1)) || !isRVC(inst)) - } else { - inst := data(i*16+31, i*16) - valid := !(instsMask(i-1) && !isRVC(insts(i-1))) && Mux(i.U === lastHalfInstrIdx, isRVC(inst), true.B) - } + val inst = WireInit(rawInsts(i)) + val validStart = Wire(Bool()) // is the beginning of a valid inst + val validEnd = Wire(Bool()) // is the end of a valid inst + val pc = bankAlignedPC + (i << 1).U - Mux(io.prev.valid && (i.U === firstValidIdx), 2.U, 0.U) + + val isFirstInPacket = i.U === firstValidIdx + val isLastInPacket = i.U === lastHalfInstrIdx + val currentRVC = isRVC(insts(i)) + + val lastIsValidEnd = if (i == 0) { !io.prev.valid } else { instsEndMask(i-1) } + + inst := Mux(io.prev.valid && i.U === 0.U, Cat(rawInsts(i)(15,0), io.prev.bits), rawInsts(i)) + + validStart := lastIsValidEnd && !(isLastInPacket && !currentRVC) + validEnd := validStart && currentRVC || !validStart && !(isLastInPacket && !currentRVC) + + val currentLastHalf = lastIsValidEnd && (isLastInPacket && !currentRVC) insts(i) := inst instsRVC(i) := isRVC(inst) - instsMask(i) := mask(i) && valid + instsMask(i) := (if (i == 0) Mux(io.prev.valid, validEnd, validStart) 
else validStart) + instsEndMask(i) := validEnd instsPC(i) := pc val brType::isCall::isRet::Nil = brInfo(inst) @@ -103,14 +120,18 @@ class PreDecode extends XSModule with HasPdconst{ io.out.pd(i).excType := ExcType.notExc io.out.instrs(i) := insts(i) io.out.pc(i) := instsPC(i) - + + if (i == bankWidth-1) { lastHalf(0) := currentLastHalf } + if (i == PredictWidth-1) { lastHalf(1) := currentLastHalf } } - io.out.mask := instsMask.asUInt + io.out.mask := instsMask.asUInt & mask + io.out.lastHalf := lastHalf.asUInt for (i <- 0 until PredictWidth) { XSDebug(true.B, p"instr ${Hexadecimal(io.out.instrs(i))}, " + p"mask ${Binary(instsMask(i))}, " + + p"endMask ${Binary(instsEndMask(i))}, " + p"pc ${Hexadecimal(io.out.pc(i))}, " + p"isRVC ${Binary(io.out.pd(i).isRVC)}, " + p"brType ${Binary(io.out.pd(i).brType)}, " + diff --git a/src/main/scala/xiangshan/frontend/RAS.scala b/src/main/scala/xiangshan/frontend/RAS.scala index 68f010ceaf4ba36f3cef70a434ba3cb9432f42c1..8fcec24880aa70d7c65ad79687794be2683f88b4 100644 --- a/src/main/scala/xiangshan/frontend/RAS.scala +++ b/src/main/scala/xiangshan/frontend/RAS.scala @@ -5,7 +5,9 @@ import chisel3.util._ import xiangshan._ import xiangshan.backend.ALUOpType import utils._ +import chisel3.experimental.chiselName +@chiselName class RAS extends BasePredictor { class RASResp extends Resp @@ -25,17 +27,102 @@ class RAS extends BasePredictor val is_ret = Input(Bool()) val callIdx = Flipped(ValidIO(UInt(log2Ceil(PredictWidth).W))) val isRVC = Input(Bool()) + val isLastHalfRVI = Input(Bool()) val recover = Flipped(ValidIO(new BranchUpdateInfo)) val out = ValidIO(new RASResp) val branchInfo = Output(new RASBranchInfo) } - def rasEntry() = new Bundle { + class RASEntry() extends XSBundle { val retAddr = UInt(VAddrBits.W) val ctr = UInt(8.W) // layer of nested call functions } + + def rasEntry() = new RASEntry + + object RASEntry { + def apply(retAddr: UInt, ctr: UInt): RASEntry = { + val e = Wire(rasEntry()) + e.retAddr := retAddr + e.ctr := 
ctr + e + } + } + override val io = IO(new RASIO) + @chiselName + class RASStack(val rasSize: Int) extends XSModule { + val io = IO(new Bundle { + val push_valid = Input(Bool()) + val pop_valid = Input(Bool()) + val new_addr = Input(UInt(VAddrBits.W)) + val top_addr = Output(UInt(VAddrBits.W)) + val is_empty = Output(Bool()) + val is_full = Output(Bool()) + val copy_valid = Input(Bool()) + val copy_in_mem = Input(Vec(rasSize, rasEntry())) + val copy_in_sp = Input(UInt(log2Up(rasSize).W)) + val copy_out_mem = Output(Vec(rasSize, rasEntry())) + val copy_out_sp = Output(UInt(log2Up(rasSize).W)) + }) + @chiselName + class Stack(val size: Int) extends XSModule { + val io = IO(new Bundle { + val rIdx = Input(UInt(log2Up(size).W)) + val rdata = Output(rasEntry()) + val wen = Input(Bool()) + val wIdx = Input(UInt(log2Up(size).W)) + val wdata = Input(rasEntry()) + val copyen = Input(Bool()) + val copy_in = Input(Vec(size, rasEntry())) + val copy_out = Output(Vec(size, rasEntry())) + }) + val mem = Reg(Vec(size, rasEntry())) + when (io.wen) { + mem(io.wIdx) := io.wdata + } + io.rdata := mem(io.rIdx) + (0 until size).foreach { i => io.copy_out(i) := mem(i) } + when (io.copyen) { + (0 until size).foreach {i => mem(i) := io.copy_in(i) } + } + } + val sp = RegInit(0.U(log2Up(rasSize).W)) + val stack = Module(new Stack(rasSize)).io + + stack.rIdx := sp - 1.U + val top_entry = stack.rdata + val top_addr = top_entry.retAddr + val top_ctr = top_entry.ctr + val alloc_new = io.new_addr =/= top_addr + stack.wen := io.push_valid || io.pop_valid && top_ctr =/= 1.U + stack.wIdx := Mux(io.pop_valid && top_ctr =/= 1.U, sp - 1.U, Mux(alloc_new, sp, sp - 1.U)) + stack.wdata := Mux(io.pop_valid && top_ctr =/= 1.U, + RASEntry(top_addr, top_ctr - 1.U), + Mux(alloc_new, RASEntry(io.new_addr, 1.U), RASEntry(top_addr, top_ctr + 1.U))) + + when (io.push_valid && alloc_new) { + sp := sp + 1.U + } + + when (io.pop_valid && top_ctr === 1.U) { + sp := Mux(sp === 0.U, 0.U, sp - 1.U) + } + + 
io.copy_out_mem := stack.copy_out + io.copy_out_sp := sp + stack.copyen := io.copy_valid + stack.copy_in := io.copy_in_mem + when (io.copy_valid) { + sp := io.copy_in_sp + } + + io.top_addr := top_addr + io.is_empty := sp === 0.U + io.is_full := sp === (RasSize - 1).U + } + // val ras_0 = Reg(Vec(RasSize, rasEntry())) //RegInit(0.U)asTypeOf(Vec(RasSize,rasEntry)) cause comb loop // val ras_1 = Reg(Vec(RasSize, rasEntry())) // val sp_0 = RegInit(0.U(log2Up(RasSize).W)) @@ -46,120 +133,82 @@ class RAS extends BasePredictor // val commit_ras = Mux(choose_bit, ras_0, ras_1) // val commit_sp = Mux(choose_bit,sp_0,sp_1) - val spec_ras = Reg(Vec(RasSize, rasEntry())) - val spec_sp = RegInit(0.U(log2Up(RasSize).W)) - val commit_ras = Reg(Vec(RasSize, rasEntry())) - val commit_sp = RegInit(0.U(log2Up(RasSize).W)) + // val spec_ras = Reg(Vec(RasSize, rasEntry())) + // val spec_sp = RegInit(0.U(log2Up(RasSize).W)) + // val commit_ras = Reg(Vec(RasSize, rasEntry())) + // val commit_sp = RegInit(0.U(log2Up(RasSize).W)) + val spec_ras = Module(new RASStack(RasSize)).io - val spec_is_empty = spec_sp === 0.U - val spec_is_full = spec_sp === (RasSize - 1).U + val spec_push = WireInit(false.B) + val spec_pop = WireInit(false.B) + val spec_new_addr = WireInit(bankAligned(io.pc.bits) + (io.callIdx.bits << 1.U) + Mux(io.isRVC,2.U,Mux(io.isLastHalfRVI, 2.U, 4.U))) + spec_ras.push_valid := spec_push + spec_ras.pop_valid := spec_pop + spec_ras.new_addr := spec_new_addr + val spec_is_empty = spec_ras.is_empty + val spec_is_full = spec_ras.is_full + val spec_top_addr = spec_ras.top_addr - val spec_ras_top_entry = spec_ras(spec_sp-1.U) - val spec_ras_top_addr = spec_ras_top_entry.retAddr - val spec_ras_top_ctr = spec_ras_top_entry.ctr - //no need to pass the ras branchInfo - io.branchInfo.rasSp := DontCare - io.branchInfo.rasTopCtr := DontCare - io.branchInfo.rasToqAddr := DontCare + spec_push := !spec_is_full && io.callIdx.valid && io.pc.valid + spec_pop := !spec_is_empty && io.is_ret && 
io.pc.valid - io.out.valid := !spec_is_empty && io.is_ret - - // update spec RAS - // speculative update RAS - val spec_push = !spec_is_full && io.callIdx.valid && io.pc.valid - val spec_pop = !spec_is_empty && io.is_ret && io.pc.valid - val spec_new_addr = io.pc.bits + (io.callIdx.bits << 1.U) + Mux(io.isRVC,2.U,4.U) - val spec_ras_write = WireInit(0.U.asTypeOf(rasEntry())) - val sepc_alloc_new = spec_new_addr =/= spec_ras_top_addr - when (spec_push) { - //push - spec_ras_write.ctr := 1.U - spec_ras_write.retAddr := spec_new_addr - when(sepc_alloc_new){ - spec_sp := spec_sp + 1.U - spec_ras(spec_sp) := spec_ras_write - }.otherwise{ - spec_ras_top_ctr := spec_ras_top_ctr + 1.U - } - } - - when (spec_pop) { - //pop - when (spec_ras_top_ctr === 1.U) { - spec_sp := Mux(spec_sp === 0.U, 0.U, spec_sp - 1.U) - }.otherwise { - spec_ras_top_ctr := spec_ras_top_ctr - 1.U - } - } - io.out.bits.target := spec_ras_top_addr + val commit_ras = Module(new RASStack(RasSize)).io + + val commit_push = WireInit(false.B) + val commit_pop = WireInit(false.B) + val commit_new_addr = Mux(io.recover.bits.pd.isRVC,io.recover.bits.pc + 2.U,io.recover.bits.pc + 4.U) + commit_ras.push_valid := commit_push + commit_ras.pop_valid := commit_pop + commit_ras.new_addr := commit_new_addr + val commit_is_empty = commit_ras.is_empty + val commit_is_full = commit_ras.is_full + val commit_top_addr = commit_ras.top_addr + + commit_push := !commit_is_full && io.recover.valid && io.recover.bits.pd.isCall + commit_pop := !commit_is_empty && io.recover.valid && io.recover.bits.pd.isRet + + + io.out.valid := !spec_is_empty + io.out.bits.target := spec_top_addr // TODO: back-up stack for ras // use checkpoint to recover RAS - val commit_is_empty = commit_sp === 0.U - val commit_is_full = commit_sp === (RasSize - 1).U - val commit_ras_top_entry = commit_ras(commit_sp-1.U) - val commit_ras_top_addr = commit_ras_top_entry.retAddr - val commit_ras_top_ctr = commit_ras_top_entry.ctr - //update commit ras - val 
commit_push = !commit_is_full && io.recover.valid && io.recover.bits.pd.isCall - val commit_pop = !commit_is_empty && io.recover.valid && io.recover.bits.pd.isRet - val commit_new_addr = io.recover.bits.pc + 4.U //TODO: consider RVC - val commit_ras_write = WireInit(0.U.asTypeOf(rasEntry())) - val commit_alloc_new = commit_new_addr =/= commit_ras_top_addr - when (commit_push) { - //push - commit_ras_write.ctr := 1.U - commit_ras_write.retAddr := commit_new_addr - when(commit_alloc_new){ - commit_sp := commit_sp + 1.U - commit_ras(commit_sp) := commit_ras_write - }.otherwise{ - commit_ras_top_ctr := commit_ras_top_ctr + 1.U - } - } - - when (commit_pop) { - //pop - when (commit_ras_top_ctr === 1.U) { - commit_sp := Mux(commit_sp === 0.U, 0.U, commit_sp - 1.U) - }.otherwise { - commit_ras_top_ctr := commit_ras_top_ctr - 1.U - } - } - val copy_valid = io.recover.valid && io.recover.bits.isMisPred val copy_next = RegNext(copy_valid) - when(copy_next) - { - for(i <- 0 until RasSize) - { - spec_ras(i) := commit_ras(i) - spec_sp := commit_sp - } - } + spec_ras.copy_valid := copy_next + spec_ras.copy_in_mem := commit_ras.copy_out_mem + spec_ras.copy_in_sp := commit_ras.copy_out_sp + commit_ras.copy_valid := DontCare + commit_ras.copy_in_mem := DontCare + commit_ras.copy_in_sp := DontCare - if (BPUDebug && debug) { - XSDebug("----------------RAS(spec)----------------\n") - XSDebug(" index addr ctr \n") - for(i <- 0 until RasSize){ - XSDebug(" (%d) 0x%x %d",i.U,spec_ras(i).retAddr,spec_ras(i).ctr) - when(i.U === spec_sp){XSDebug(false,true.B," <----sp")} - XSDebug(false,true.B,"\n") - } - XSDebug("----------------RAS(commit)----------------\n") - XSDebug(" index addr ctr \n") - for(i <- 0 until RasSize){ - XSDebug(" (%d) 0x%x %d",i.U,commit_ras(i).retAddr,commit_ras(i).ctr) - when(i.U === commit_sp){XSDebug(false,true.B," <----sp")} - XSDebug(false,true.B,"\n") - } + //no need to pass the ras branchInfo + io.branchInfo.rasSp := DontCare + io.branchInfo.rasTopCtr := DontCare 
+ io.branchInfo.rasToqAddr := DontCare - XSDebug(spec_push, "(spec_ras)push inAddr: 0x%x inCtr: %d | allocNewEntry:%d | sp:%d \n",spec_ras_write.retAddr,spec_ras_write.ctr,sepc_alloc_new,spec_sp.asUInt) - XSDebug(spec_pop, "(spec_ras)pop outValid:%d outAddr: 0x%x \n",io.out.valid,io.out.bits.target) - XSDebug(commit_push, "(commit_ras)push inAddr: 0x%x inCtr: %d | allocNewEntry:%d | sp:%d \n",commit_ras_write.retAddr,commit_ras_write.ctr,sepc_alloc_new,commit_sp.asUInt) - XSDebug(commit_pop, "(commit_ras)pop outValid:%d outAddr: 0x%x \n",io.out.valid,io.out.bits.target) - XSDebug("copyValid:%d copyNext:%d \n",copy_valid,copy_next) + if (BPUDebug && debug) { + // XSDebug("----------------RAS(spec)----------------\n") + // XSDebug(" index addr ctr \n") + // for(i <- 0 until RasSize){ + // XSDebug(" (%d) 0x%x %d",i.U,spec_ras(i).retAddr,spec_ras(i).ctr) + // when(i.U === spec_sp){XSDebug(false,true.B," <----sp")} + // XSDebug(false,true.B,"\n") + // } + // XSDebug("----------------RAS(commit)----------------\n") + // XSDebug(" index addr ctr \n") + // for(i <- 0 until RasSize){ + // XSDebug(" (%d) 0x%x %d",i.U,commit_ras(i).retAddr,commit_ras(i).ctr) + // when(i.U === commit_sp){XSDebug(false,true.B," <----sp")} + // XSDebug(false,true.B,"\n") + // } + + // XSDebug(spec_push, "(spec_ras)push inAddr: 0x%x inCtr: %d | allocNewEntry:%d | sp:%d \n",spec_ras_write.retAddr,spec_ras_write.ctr,sepc_alloc_new,spec_sp.asUInt) + // XSDebug(spec_pop, "(spec_ras)pop outValid:%d outAddr: 0x%x \n",io.out.valid,io.out.bits.target) + // XSDebug(commit_push, "(commit_ras)push inAddr: 0x%x inCtr: %d | allocNewEntry:%d | sp:%d \n",commit_ras_write.retAddr,commit_ras_write.ctr,sepc_alloc_new,commit_sp.asUInt) + // XSDebug(commit_pop, "(commit_ras)pop outValid:%d outAddr: 0x%x \n",io.out.valid,io.out.bits.target) + // XSDebug("copyValid:%d copyNext:%d \n",copy_valid,copy_next) } @@ -200,4 +249,4 @@ class RAS extends BasePredictor // } // } -} \ No newline at end of file +} diff --git 
a/src/main/scala/xiangshan/frontend/SC.scala b/src/main/scala/xiangshan/frontend/SC.scala new file mode 100644 index 0000000000000000000000000000000000000000..0e2917a411b6ee9cb80e680ee633af12f1d96254 --- /dev/null +++ b/src/main/scala/xiangshan/frontend/SC.scala @@ -0,0 +1,148 @@ +package xiangshan.frontend + +import chisel3._ +import chisel3.util._ +import xiangshan._ +import utils._ +import chisel3.experimental.chiselName + +import scala.math.min + +class SCReq extends TageReq + +class SCResp(val ctrBits: Int = 6) extends TageBundle { + val ctr = Vec(2, SInt(ctrBits.W)) +} + +class SCUpdate(val ctrBits: Int = 6) extends TageBundle { + val pc = UInt(VAddrBits.W) + val fetchIdx = UInt(log2Up(TageBanks).W) + val hist = UInt(HistoryLength.W) + val mask = Vec(TageBanks, Bool()) + val oldCtr = SInt(ctrBits.W) + val tagePred = Bool() + val taken = Bool() +} + +class SCTableIO extends TageBundle { + val req = Input(Valid(new SCReq)) + val resp = Output(Vec(TageBanks, new SCResp)) + val update = Input(new SCUpdate) +} + +abstract class BaseSCTable(val r: Int = 1024, val cb: Int = 6, val h: Int = 0) extends TageModule { + val io = IO(new SCTableIO) + def getCenteredValue(ctr: SInt): SInt = (ctr << 1).asSInt + 1.S +} + +class FakeSCTable extends BaseSCTable { + io.resp := 0.U.asTypeOf(Vec(TageBanks, new SCResp)) +} + +@chiselName +class SCTable(val nRows: Int, val ctrBits: Int, val histLen: Int) extends BaseSCTable(nRows, ctrBits, histLen) { + + val table = List.fill(TageBanks) { + List.fill(2) { + Module(new SRAMTemplate(SInt(ctrBits.W), set=nRows, shouldReset=false, holdRead=true, singlePort=false)) + } + } + + def compute_folded_hist(hist: UInt, l: Int) = { + if (histLen > 0) { + val nChunks = (histLen + l - 1) / l + val hist_chunks = (0 until nChunks) map {i => + hist(min((i+1)*l, histLen)-1, i*l) + } + hist_chunks.reduce(_^_) + } + else 0.U + } + + def getIdx(hist: UInt, pc: UInt) = { + (compute_folded_hist(hist, log2Ceil(nRows)) ^ (pc >> 1.U))(log2Ceil(nRows)-1,0) + } 
+ + def ctrUpdate(ctr: SInt, cond: Bool): SInt = signedSatUpdate(ctr, ctrBits, cond) + + val doing_reset = RegInit(true.B) + val reset_idx = RegInit(0.U(log2Ceil(nRows).W)) + reset_idx := reset_idx + doing_reset + when (reset_idx === (nRows-1).U) { doing_reset := false.B } + + val idx = getIdx(io.req.bits.hist, io.req.bits.pc) + val idxLatch = RegEnable(idx, enable=io.req.valid) + + val table_r = WireInit(0.U.asTypeOf(Vec(TageBanks,Vec(2, SInt(ctrBits.W))))) + + val baseBank = io.req.bits.pc(log2Up(TageBanks), 1) + val baseBankLatch = RegEnable(baseBank, enable=io.req.valid) + + val bankIdxInOrder = VecInit((0 until TageBanks).map(b => (baseBankLatch +& b.U)(log2Up(TageBanks)-1, 0))) + val realMask = circularShiftLeft(io.req.bits.mask, TageBanks, baseBank) + val maskLatch = RegEnable(io.req.bits.mask, enable=io.req.valid) + + val update_idx = getIdx(io.update.hist, io.update.pc - (io.update.fetchIdx << 1)) + val update_wdata = ctrUpdate(io.update.oldCtr, io.update.taken) + + + for (b <- 0 until TageBanks) { + for (i <- 0 to 1) { + table(b)(i).reset := reset.asBool + table(b)(i).io.r.req.valid := io.req.valid && realMask(b) + table(b)(i).io.r.req.bits.setIdx := idx + + table_r(b)(i) := table(b)(i).io.r.resp.data(0) + + table(b)(i).io.w.req.valid := (io.update.mask(b) && i.U === io.update.tagePred.asUInt) || doing_reset + table(b)(i).io.w.req.bits.setIdx := Mux(doing_reset, reset_idx, update_idx) + table(b)(i).io.w.req.bits.data := Mux(doing_reset, 0.S, update_wdata) + } + + } + + (0 until TageBanks).map(b => { + io.resp(b).ctr := table_r(bankIdxInOrder(b)) + }) + + if (BPUDebug && debug) { + val u = io.update + val b = PriorityEncoder(u.mask) + XSDebug(io.req.valid, p"scTableReq: pc=0x${io.req.bits.pc}%x, idx=${idx}%d, hist=${io.req.bits.hist}%x, baseBank=${baseBank}%d, mask=${io.req.bits.mask}%b, realMask=${realMask}%b\n") + for (i <- 0 until TageBanks) { + XSDebug(RegNext(io.req.valid), p"scTableResp[${i.U}]: idx=${idxLatch}%d, ctr:${io.resp(i).ctr}\n") + } + 
XSDebug(io.update.mask.reduce(_||_), p"update Table: pc:${u.pc}%x, fetchIdx:${u.fetchIdx}%d, hist:${u.hist}%x, bank:${b}%d, tageTaken:${u.tagePred}%d, taken:${u.taken}%d, oldCtr:${u.oldCtr}%d\n") + } + +} + +class SCThreshold(val ctrBits: Int = 5) extends TageBundle { + val ctr = UInt(ctrBits.W) + def satPos(ctr: UInt = this.ctr) = ctr === ((1.U << ctrBits) - 1.U) + def satNeg(ctr: UInt = this.ctr) = ctr === 0.U + def neutralVal = (1.U << (ctrBits - 1)) + val thres = UInt(5.W) + def minThres = 5.U + def maxThres = 31.U + def update(cause: Bool): SCThreshold = { + val res = Wire(new SCThreshold(this.ctrBits)) + val newCtr = satUpdate(this.ctr, this.ctrBits, cause) + val newThres = Mux(res.satPos(newCtr), this.thres + 1.U, + Mux(res.satNeg(newCtr), this.thres - 1.U, + this.thres)) + res.thres := newThres + res.ctr := Mux(res.satPos(newCtr) || res.satNeg(newCtr), res.neutralVal, newCtr) + // XSDebug(true.B, p"scThres Update: cause${cause} newCtr ${newCtr} newThres ${newThres}\n") + res + } +} + +object SCThreshold { + def apply(bits: Int) = { + val t = Wire(new SCThreshold(ctrBits=bits)) + t.ctr := t.neutralVal + t.thres := t.minThres + t + } +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/Tage.scala b/src/main/scala/xiangshan/frontend/Tage.scala index ccb1b9c6f6f376efbde68dfef5c2ae781afa2a95..ce5ac9eecada0ff6f01eaf0fe8eee0016a7b916f 100644 --- a/src/main/scala/xiangshan/frontend/Tage.scala +++ b/src/main/scala/xiangshan/frontend/Tage.scala @@ -4,6 +4,7 @@ import chisel3._ import chisel3.util._ import xiangshan._ import utils._ +import chisel3.experimental.chiselName import scala.math.min @@ -24,16 +25,21 @@ trait HasTageParameter extends HasXSParameter with HasBPUParameter{ val TageNTables = TableInfo.size val UBitPeriod = 2048 val TageBanks = PredictWidth // FetchWidth - + val TageCtrBits = 3 + val SCHistLens = 0 :: TableInfo.map{ case (_,h,_) => h}.toList + val SCNTables = 6 + val SCCtrBits = 6 + val SCNRows = 1024 + val SCTableInfo = 
Seq.fill(SCNTables)((SCNRows, SCCtrBits)) zip SCHistLens map {case ((n, cb), h) => (n, cb, h)} val TotalBits = TableInfo.map { case (s, h, t) => { - s * (1+t+3) * PredictWidth + s * (1+t+TageCtrBits) * PredictWidth } }.reduce(_+_) } -abstract class TageBundle extends XSBundle with HasTageParameter -abstract class TageModule extends XSModule with HasTageParameter { val debug = false } +abstract class TageBundle extends XSBundle with HasTageParameter with PredictorUtils +abstract class TageModule extends XSModule with HasTageParameter with PredictorUtils { val debug = true } @@ -45,7 +51,7 @@ class TageReq extends TageBundle { } class TageResp extends TageBundle { - val ctr = UInt(3.W) + val ctr = UInt(TageCtrBits.W) val u = UInt(2.W) } @@ -57,7 +63,7 @@ class TageUpdate extends TageBundle { val mask = Vec(TageBanks, Bool()) val taken = Vec(TageBanks, Bool()) val alloc = Vec(TageBanks, Bool()) - val oldCtr = Vec(TageBanks, UInt(3.W)) + val oldCtr = Vec(TageBanks, UInt(TageCtrBits.W)) // update u val uMask = Vec(TageBanks, Bool()) val u = Vec(TageBanks, UInt(2.W)) @@ -72,8 +78,8 @@ class FakeTageTable() extends TageModule { io.resp := DontCare } - -class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPeriod: Int) extends TageModule { +@chiselName +class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPeriod: Int) extends TageModule with HasIFUConst { val io = IO(new Bundle() { val req = Input(Valid(new TageReq)) val resp = Output(Vec(TageBanks, Valid(new TageResp))) @@ -81,7 +87,7 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio }) // override val debug = true // bypass entries for tage update - val wrBypassEntries = 8 + val wrBypassEntries = 4 def compute_folded_hist(hist: UInt, l: Int) = { val nChunks = (histLen + l - 1) / l @@ -100,18 +106,7 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio (idx, tag) } - def inc_ctr(ctr: UInt, taken: Bool): UInt = { - 
Mux(!taken, Mux(ctr === 0.U, 0.U, ctr - 1.U), - Mux(ctr === 7.U, 7.U, ctr + 1.U)) - } - // circular shifting - def circularShiftLeft(source: UInt, len: Int, shamt: UInt): UInt = { - val res = Wire(UInt(len.W)) - val higher = source << shamt - val lower = source >> (len.U - shamt) - res := higher | lower - res - } + def inc_ctr(ctr: UInt, taken: Bool): UInt = satUpdate(ctr, TageCtrBits, taken) val doing_reset = RegInit(true.B) val reset_idx = RegInit(0.U(log2Ceil(nRows).W)) @@ -121,25 +116,63 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio class TageEntry() extends TageBundle { val valid = Bool() val tag = UInt(tagLen.W) - val ctr = UInt(3.W) + val ctr = UInt(TageCtrBits.W) } - val tageEntrySz = 1 + tagLen + 3 + val tageEntrySz = 1 + tagLen + TageCtrBits + val bankAlignedPC = bankAligned(io.req.bits.pc) + // this bank means cache bank + val startsAtOddBank = bankInGroup(bankAlignedPC)(0) // use real address to index // val unhashed_idxes = VecInit((0 until TageBanks).map(b => ((io.req.bits.pc >> 1.U) + b.U) >> log2Up(TageBanks).U)) - val unhashed_idx = io.req.bits.pc >> 1.U + val unhashed_idx = Wire(Vec(2, UInt((log2Ceil(nRows)+tagLen).W))) + // the first bank idx always correspond with pc + unhashed_idx(0) := io.req.bits.pc >> (1+log2Ceil(TageBanks)) + // when pc is at odd bank, the second bank is at the next idx + unhashed_idx(1) := unhashed_idx(0) + startsAtOddBank // val idxes_and_tags = (0 until TageBanks).map(b => compute_tag_and_hash(unhashed_idxes(b.U), io.req.bits.hist)) - val (idx, tag) = compute_tag_and_hash(unhashed_idx, io.req.bits.hist) + // val (idx, tag) = compute_tag_and_hash(unhashed_idx, io.req.bits.hist) + val idxes_and_tags = unhashed_idx.map(compute_tag_and_hash(_, io.req.bits.hist)) // val idxes = VecInit(idxes_and_tags.map(_._1)) // val tags = VecInit(idxes_and_tags.map(_._2)) - val idxLatch = RegEnable(idx, enable=io.req.valid) - val tagLatch = RegEnable(tag, enable=io.req.valid) + val idxes_latch = 
RegEnable(VecInit(idxes_and_tags.map(_._1)), io.req.valid) + val tags_latch = RegEnable(VecInit(idxes_and_tags.map(_._2)), io.req.valid) + // and_tags_latch = RegEnable(idxes_and_tags, enable=io.req.valid) + + // val idxLatch = RegEnable(idx, enable=io.req.valid) + // val tagLatch = RegEnable(tag, enable=io.req.valid) + + class HL_Bank (val nRows: Int = nRows) extends TageModule { + val io = IO(new Bundle { + val r = new Bundle { + val req = Flipped(ValidIO(new Bundle { + val setIdx = UInt(log2Ceil(nRows).W) + })) + val resp = new Bundle { + val data = Output(Bool()) + } + } + val w = new Bundle { + val req = Flipped(ValidIO(new Bundle { + val setIdx = UInt(log2Ceil(nRows).W) + val data = Bool() + })) + } + }) - val hi_us = List.fill(TageBanks)(Module(new SRAMTemplate(Bool(), set=nRows, shouldReset=false, holdRead=true, singlePort=false))) - val lo_us = List.fill(TageBanks)(Module(new SRAMTemplate(Bool(), set=nRows, shouldReset=false, holdRead=true, singlePort=false))) + val mem = Mem(nRows, Bool()) + // 1-cycle latency just as SyncReadMem + io.r.resp.data := RegEnable(mem.read(io.r.req.bits.setIdx), enable=io.r.req.valid) + when (io.w.req.valid) { + mem.write(io.w.req.bits.setIdx, io.w.req.bits.data) + } + } + + val hi_us = List.fill(TageBanks)(Module(new HL_Bank(nRows))) + val lo_us = List.fill(TageBanks)(Module(new HL_Bank(nRows))) val table = List.fill(TageBanks)(Module(new SRAMTemplate(new TageEntry, set=nRows, shouldReset=false, holdRead=true, singlePort=false))) val hi_us_r = WireInit(0.U.asTypeOf(Vec(TageBanks, Bool()))) @@ -151,33 +184,50 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio val bankIdxInOrder = VecInit((0 until TageBanks).map(b => (baseBankLatch +& b.U)(log2Up(TageBanks)-1, 0))) - val realMask = circularShiftLeft(io.req.bits.mask, TageBanks, baseBank) - val maskLatch = RegEnable(io.req.bits.mask, enable=io.req.valid) + val realMask = Mux(startsAtOddBank, + Cat(io.req.bits.mask(bankWidth-1,0), 
io.req.bits.mask(PredictWidth-1, bankWidth)), + io.req.bits.mask) + val maskLatch = RegEnable(realMask, enable=io.req.valid) + + (0 until TageBanks).map( b => { - hi_us(b).reset := reset.asBool - lo_us(b).reset := reset.asBool - table(b).reset := reset.asBool + val idxes = VecInit(idxes_and_tags.map(_._1)) + val idx = (if (b < bankWidth) Mux(startsAtOddBank, idxes(1), idxes(0)) + else Mux(startsAtOddBank, idxes(0), idxes(1))) hi_us(b).io.r.req.valid := io.req.valid && realMask(b) + hi_us(b).io.r.req.bits.setIdx := idx + lo_us(b).io.r.req.valid := io.req.valid && realMask(b) - table(b).io.r.req.valid := io.req.valid && realMask(b) lo_us(b).io.r.req.bits.setIdx := idx - hi_us(b).io.r.req.bits.setIdx := idx + + table(b).reset := reset.asBool + table(b).io.r.req.valid := io.req.valid && realMask(b) table(b).io.r.req.bits.setIdx := idx - hi_us_r(b) := hi_us(b).io.r.resp.data(0) - lo_us_r(b) := lo_us(b).io.r.resp.data(0) + hi_us_r(b) := hi_us(b).io.r.resp.data + lo_us_r(b) := lo_us(b).io.r.resp.data table_r(b) := table(b).io.r.resp.data(0) } ) - val req_rhits = VecInit((0 until TageBanks).map(b => table_r(bankIdxInOrder(b)).valid && table_r(bankIdxInOrder(b)).tag === tagLatch)) + val startsAtOddBankLatch = RegEnable(startsAtOddBank, io.req.valid) + val req_rhits = VecInit((0 until TageBanks).map(b => { + val tag = (if (b < bankWidth) Mux(startsAtOddBank, tags_latch(1), tags_latch(0)) + else Mux(startsAtOddBank, tags_latch(0), tags_latch(1))) + val bank = (if (b < bankWidth) Mux(startsAtOddBankLatch, (b+bankWidth).U, b.U) + else Mux(startsAtOddBankLatch, (b-bankWidth).U, b.U)) + table_r(bank).valid && table_r(bank).tag === tag + })) + (0 until TageBanks).map(b => { + val bank = (if (b < bankWidth) Mux(startsAtOddBankLatch, (b+bankWidth).U, b.U) + else Mux(startsAtOddBankLatch, (b-bankWidth).U, b.U)) io.resp(b).valid := req_rhits(b) && maskLatch(b) - io.resp(b).bits.ctr := table_r(bankIdxInOrder(b)).ctr - io.resp(b).bits.u := 
Cat(hi_us_r(bankIdxInOrder(b)),lo_us_r(bankIdxInOrder(b))) + io.resp(b).bits.ctr := table_r(bank).ctr + io.resp(b).bits.u := Cat(hi_us_r(bank),lo_us_r(bank)) }) @@ -190,7 +240,7 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio val clear_u_idx = clear_u_ctr >> log2Ceil(uBitPeriod) // Use fetchpc to compute hash - val (update_idx, update_tag) = compute_tag_and_hash((io.update.pc >> 1.U) - io.update.fetchIdx, io.update.hist) + val (update_idx, update_tag) = compute_tag_and_hash((io.update.pc >> (1 + log2Ceil(TageBanks))), io.update.hist) val update_wdata = Wire(Vec(TageBanks, new TageEntry)) @@ -217,7 +267,7 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio val wrbypass_tags = Reg(Vec(wrBypassEntries, UInt(tagLen.W))) val wrbypass_idxs = Reg(Vec(wrBypassEntries, UInt(log2Ceil(nRows).W))) - val wrbypass_ctrs = Reg(Vec(wrBypassEntries, Vec(TageBanks, UInt(3.W)))) + val wrbypass_ctrs = Reg(Vec(wrBypassEntries, Vec(TageBanks, UInt(TageCtrBits.W)))) val wrbypass_ctr_valids = Reg(Vec(wrBypassEntries, Vec(TageBanks, Bool()))) val wrbypass_enq_idx = RegInit(0.U(log2Ceil(wrBypassEntries).W)) @@ -229,28 +279,23 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio wrbypass_idxs(i) === update_idx }) - val wrbypass_rhits = VecInit((0 until wrBypassEntries) map { i => - io.req.valid && - wrbypass_tags(i) === tag && - wrbypass_idxs(i) === idx - }) val wrbypass_hit = wrbypass_hits.reduce(_||_) - val wrbypass_rhit = wrbypass_rhits.reduce(_||_) + // val wrbypass_rhit = wrbypass_rhits.reduce(_||_) val wrbypass_hit_idx = PriorityEncoder(wrbypass_hits) - val wrbypass_rhit_idx = PriorityEncoder(wrbypass_rhits) + // val wrbypass_rhit_idx = PriorityEncoder(wrbypass_rhits) - val wrbypass_rctr_hits = VecInit((0 until TageBanks).map( b => wrbypass_ctr_valids(wrbypass_rhit_idx)(b))) + // val wrbypass_rctr_hits = VecInit((0 until TageBanks).map( b => wrbypass_ctr_valids(wrbypass_rhit_idx)(b))) - 
val rhit_ctrs = RegEnable(wrbypass_ctrs(wrbypass_rhit_idx), wrbypass_rhit) + // val rhit_ctrs = RegEnable(wrbypass_ctrs(wrbypass_rhit_idx), wrbypass_rhit) - when (RegNext(wrbypass_rhit)) { - for (b <- 0 until TageBanks) { - when (RegNext(wrbypass_rctr_hits(b.U + baseBank))) { - io.resp(b).bits.ctr := rhit_ctrs(bankIdxInOrder(b)) - } - } - } + // when (RegNext(wrbypass_rhit)) { + // for (b <- 0 until TageBanks) { + // when (RegNext(wrbypass_rctr_hits(b.U + baseBank))) { + // io.resp(b).bits.ctr := rhit_ctrs(bankIdxInOrder(b)) + // } + // } + // } val updateBank = PriorityEncoder(io.update.mask) @@ -290,10 +335,13 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio val u = io.update val b = PriorityEncoder(u.mask) val ub = PriorityEncoder(u.uMask) - XSDebug(io.req.valid, "tableReq: pc=0x%x, hist=%x, idx=%d, tag=%x, baseBank=%d, mask=%b, realMask=%b\n", - io.req.bits.pc, io.req.bits.hist, idx, tag, baseBank, io.req.bits.mask, realMask) + val idx = idxes_and_tags.map(_._1) + val tag = idxes_and_tags.map(_._2) + XSDebug(io.req.valid, "tableReq: pc=0x%x, hist=%x, idx=(%d,%d), tag=(%x,%x), baseBank=%d, mask=%b, realMask=%b\n", + io.req.bits.pc, io.req.bits.hist, idx(0), idx(1), tag(0), tag(1), baseBank, io.req.bits.mask, realMask) for (i <- 0 until TageBanks) { - XSDebug(RegNext(io.req.valid) && req_rhits(i), "TageTableResp[%d]: idx=%d, hit:%d, ctr:%d, u:%d\n", i.U, idxLatch, req_rhits(i), io.resp(i).bits.ctr, io.resp(i).bits.u) + XSDebug(RegNext(io.req.valid) && req_rhits(i), "TageTableResp[%d]: idx=(%d,%d), hit:%d, ctr:%d, u:%d\n", + i.U, idxes_latch(0), idxes_latch(1), req_rhits(i), io.resp(i).bits.ctr, io.resp(i).bits.u) } XSDebug(RegNext(io.req.valid), "TageTableResp: hits:%b, maskLatch is %b\n", req_rhits.asUInt, maskLatch) @@ -311,13 +359,13 @@ class TageTable(val nRows: Int, val histLen: Int, val tagLen: Int, val uBitPerio "wrbypass hits, wridx:%d, tag:%x, idx:%d, hitctr:%d, bank:%d\n", wrbypass_hit_idx, update_tag, update_idx, 
wrbypass_ctrs(wrbypass_hit_idx)(updateBank), updateBank) - when (wrbypass_rhit && wrbypass_ctr_valids(wrbypass_rhit_idx).reduce(_||_)) { - for (b <- 0 until TageBanks) { - XSDebug(wrbypass_ctr_valids(wrbypass_rhit_idx)(b), - "wrbypass rhits, wridx:%d, tag:%x, idx:%d, hitctr:%d, bank:%d\n", - wrbypass_rhit_idx, tag, idx, wrbypass_ctrs(wrbypass_rhit_idx)(b), b.U) - } - } + // when (wrbypass_rhit && wrbypass_ctr_valids(wrbypass_rhit_idx).reduce(_||_)) { + // for (b <- 0 until TageBanks) { + // XSDebug(wrbypass_ctr_valids(wrbypass_rhit_idx)(b), + // "wrbypass rhits, wridx:%d, tag:%x, idx:%d, hitctr:%d, bank:%d\n", + // wrbypass_rhit_idx, tag, idx, wrbypass_ctrs(wrbypass_rhit_idx)(b), b.U) + // } + // } // ------------------------------Debug------------------------------------- val valids = Reg(Vec(TageBanks, Vec(nRows, Bool()))) @@ -354,7 +402,7 @@ class FakeTage extends BaseTage { io.meta <> DontCare } - +@chiselName class Tage extends BaseTage { val tables = TableInfo.map { @@ -368,10 +416,27 @@ class Tage extends BaseTage { } } - // override val debug = true + val scTables = SCTableInfo.map { + case (nRows, ctrBits, histLen) => { + val t = if (EnableSC) Module(new SCTable(nRows/TageBanks, ctrBits, histLen)) else Module(new FakeSCTable) + val req = t.io.req + req.valid := io.pc.valid && !io.flush + req.bits.pc := io.pc.bits + req.bits.hist := io.hist + req.bits.mask := io.inMask + t + } + } + + val scThreshold = RegInit(SCThreshold(5)) + val useThreshold = WireInit(scThreshold.thres) + val updateThreshold = WireInit((useThreshold << 3) + 21.U) + + override val debug = true // Keep the table responses to process in s3 val resps = VecInit(tables.map(t => RegEnable(t.io.resp, enable=io.s3Fire))) + val scResps = VecInit(scTables.map(t => RegEnable(t.io.resp, enable=io.s3Fire))) // val flushLatch = RegNext(io.flush) val s2_bim = RegEnable(io.bim, enable=io.pc.valid) // actually it is s2Fire @@ -387,24 +452,37 @@ class Tage extends BaseTage { val updateValid = 
io.update.valid val updateHist = io.update.bits.hist + val updateIsBr = u.pd.isBr val updateMeta = u.brInfo.tageMeta - val updateMisPred = u.isMisPred && u.pd.isBr + val updateMisPred = u.isMisPred && updateIsBr val updateMask = WireInit(0.U.asTypeOf(Vec(TageNTables, Vec(TageBanks, Bool())))) val updateUMask = WireInit(0.U.asTypeOf(Vec(TageNTables, Vec(TageBanks, Bool())))) val updateTaken = Wire(Vec(TageNTables, Vec(TageBanks, Bool()))) val updateAlloc = Wire(Vec(TageNTables, Vec(TageBanks, Bool()))) - val updateOldCtr = Wire(Vec(TageNTables, Vec(TageBanks, UInt(3.W)))) + val updateOldCtr = Wire(Vec(TageNTables, Vec(TageBanks, UInt(TageCtrBits.W)))) val updateU = Wire(Vec(TageNTables, Vec(TageBanks, UInt(2.W)))) updateTaken := DontCare updateAlloc := DontCare updateOldCtr := DontCare updateU := DontCare + val scUpdateMask = WireInit(0.U.asTypeOf(Vec(SCNTables, Vec(TageBanks, Bool())))) + val scUpdateTagePred = Wire(Bool()) + val scUpdateTaken = Wire(Bool()) + val scUpdateOldCtrs = Wire(Vec(SCNTables, SInt(SCCtrBits.W))) + scUpdateTagePred := DontCare + scUpdateTaken := DontCare + scUpdateOldCtrs := DontCare + + val updateSCMeta = u.brInfo.tageMeta.scMeta + val updateTageMisPred = updateMeta.taken =/= u.taken && updateIsBr + val updateBank = u.pc(log2Ceil(TageBanks), 1) // access tag tables and output meta info for (w <- 0 until TageBanks) { + val tageTaken = WireInit(s3_bim.ctrs(w)(1).asBool) var altPred = s3_bim.ctrs(w)(1) val finalAltPred = WireInit(s3_bim.ctrs(w)(1)) var provided = false.B @@ -416,6 +494,7 @@ class Tage extends BaseTage { val ctr = resps(i)(w).bits.ctr when (hit) { io.resp.takens(w) := Mux(ctr === 3.U || ctr === 4.U, altPred, ctr(2)) // Use altpred on weak taken + tageTaken := Mux(ctr === 3.U || ctr === 4.U, altPred, ctr(2)) finalAltPred := altPred } provided = provided || hit // Once hit then provide @@ -428,6 +507,7 @@ class Tage extends BaseTage { io.meta(w).altDiffers := finalAltPred =/= io.resp.takens(w) io.meta(w).providerU := 
resps(provider)(w).bits.u io.meta(w).providerCtr := resps(provider)(w).bits.ctr + io.meta(w).taken := tageTaken // Create a mask fo tables which did not hit our query, and also contain useless entries // and also uses a longer history than the provider @@ -441,10 +521,53 @@ class Tage extends BaseTage { io.meta(w).allocate.valid := allocatableSlots =/= 0.U io.meta(w).allocate.bits := allocEntry + val scMeta = io.meta(w).scMeta + scMeta := DontCare + val scTableSums = VecInit( + (0 to 1) map { i => { + // val providerCtr = resps(provider)(w).bits.ctr.zext() + // val pvdrCtrCentered = (((providerCtr - 4.S) << 1) + 1.S) << 3 + // sum += pvdrCtrCentered + if (EnableSC) { + (0 until SCNTables) map { j => + scTables(j).getCenteredValue(scResps(j)(w).ctr(i)) + } reduce (_+_) // TODO: rewrite with adder tree + } + else 0.S + } + } + ) + + if (EnableSC) { + scMeta.tageTaken := tageTaken + scMeta.scUsed := provided + scMeta.scPred := tageTaken + scMeta.sumAbs := 0.U + when (provided) { + val providerCtr = resps(provider)(w).bits.ctr.zext() + val pvdrCtrCentered = ((((providerCtr - 4.S) << 1).asSInt + 1.S) << 3).asSInt + val totalSum = scTableSums(tageTaken.asUInt) + pvdrCtrCentered + val sumAbs = totalSum.abs().asUInt + val sumBelowThreshold = totalSum.abs.asUInt < useThreshold + val scPred = totalSum >= 0.S + scMeta.sumAbs := sumAbs + scMeta.ctrs := VecInit(scResps.map(r => r(w).ctr(tageTaken.asUInt))) + for (i <- 0 until SCNTables) { + XSDebug(RegNext(io.s3Fire), p"SCTable(${i.U})(${w.U}): ctr:(${scResps(i)(w).ctr(0)},${scResps(i)(w).ctr(1)})\n") + } + XSDebug(RegNext(io.s3Fire), p"SC(${w.U}): pvdCtr(${providerCtr}), pvdCentred(${pvdrCtrCentered}), totalSum(${totalSum}), abs(${sumAbs}) useThres(${useThreshold}), scPred(${scPred})\n") + // Use prediction from Statistical Corrector + when (!sumBelowThreshold) { + XSDebug(RegNext(io.s3Fire), p"SC(${w.U}) overriden pred to ${scPred}\n") + scMeta.scPred := scPred + io.resp.takens(w) := scPred + } + } + } val isUpdateTaken = 
updateValid && updateBank === w.U && - u.taken && u.pd.isBr - when (u.pd.isBr && updateValid && updateBank === w.U) { + u.taken && updateIsBr + when (updateIsBr && updateValid && updateBank === w.U) { when (updateMeta.provider.valid) { val provider = updateMeta.provider.bits @@ -462,7 +585,7 @@ class Tage extends BaseTage { } } - when (updateValid && updateMisPred) { + when (updateValid && updateTageMisPred) { val idx = updateBank val allocate = updateMeta.allocate when (allocate.valid) { @@ -483,6 +606,28 @@ class Tage extends BaseTage { } } + if (EnableSC) { + when (updateValid && updateSCMeta.scUsed.asBool && updateIsBr) { + val scPred = updateSCMeta.scPred + val tageTaken = updateSCMeta.tageTaken + val sumAbs = updateSCMeta.sumAbs.asUInt + val scOldCtrs = updateSCMeta.ctrs + when (scPred =/= tageTaken && sumAbs < useThreshold - 2.U) { + val newThres = scThreshold.update(scPred =/= u.taken) + scThreshold := newThres + XSDebug(p"scThres update: old d${useThreshold} --> new ${newThres.thres}\n") + } + when (scPred =/= u.taken || sumAbs < updateThreshold) { + scUpdateMask.foreach(t => t(updateBank) := true.B) + scUpdateTagePred := tageTaken + scUpdateTaken := u.taken + (scUpdateOldCtrs zip scOldCtrs).foreach{case (t, c) => t := c} + XSDebug(p"scUpdate: bank(${updateBank}), scPred(${scPred}), tageTaken(${tageTaken}), scSumAbs(${sumAbs}), mispred: sc(${updateMisPred}), tage(${updateTageMisPred})\n") + XSDebug(p"update: sc: ${updateSCMeta}\n") + } + } + } + for (i <- 0 until TageNTables) { for (w <- 0 until TageBanks) { tables(i).io.update.mask(w) := updateMask(i)(w) @@ -499,6 +644,17 @@ class Tage extends BaseTage { tables(i).io.update.fetchIdx := u.brInfo.fetchIdx } + for (i <- 0 until SCNTables) { + scTables(i).io.update.mask := scUpdateMask(i) + scTables(i).io.update.tagePred := scUpdateTagePred + scTables(i).io.update.taken := scUpdateTaken + scTables(i).io.update.oldCtr := scUpdateOldCtrs(i) + scTables(i).io.update.pc := u.pc + scTables(i).io.update.hist := 
updateHist + scTables(i).io.update.fetchIdx := u.brInfo.fetchIdx + } + + if (BPUDebug && debug) { val m = updateMeta @@ -508,9 +664,11 @@ class Tage extends BaseTage { XSDebug(RegNext(io.s3Fire), "s3FireOnLastCycle: resp: pc=%x, hist=%x, hits=%b, takens=%b\n", debug_pc_s3, debug_hist_s3, io.resp.hits.asUInt, io.resp.takens.asUInt) for (i <- 0 until TageNTables) { - XSDebug(RegNext(io.s3Fire), "Table(%d): valids:%b, resp_ctrs:%b, resp_us:%b\n", i.U, VecInit(resps(i).map(_.valid)).asUInt, Cat(resps(i).map(_.bits.ctr)), Cat(resps(i).map(_.bits.u))) + XSDebug(RegNext(io.s3Fire), "TageTable(%d): valids:%b, resp_ctrs:%b, resp_us:%b\n", i.U, VecInit(resps(i).map(_.valid)).asUInt, Cat(resps(i).map(_.bits.ctr)), Cat(resps(i).map(_.bits.u))) } XSDebug(io.update.valid, "update: pc=%x, fetchpc=%x, cycle=%d, hist=%x, taken:%d, misPred:%d, histPtr:%d, bimctr:%d, pvdr(%d):%d, altDiff:%d, pvdrU:%d, pvdrCtr:%d, alloc(%d):%d\n", u.pc, u.pc - (bri.fetchIdx << 1.U), bri.debug_tage_cycle, updateHist, u.taken, u.isMisPred, bri.histPtr, bri.bimCtr, m.provider.valid, m.provider.bits, m.altDiffers, m.providerU, m.providerCtr, m.allocate.valid, m.allocate.bits) + XSDebug(io.update.valid && updateIsBr, p"update: sc: ${updateSCMeta}\n") + XSDebug(true.B, p"scThres: use(${useThreshold}), update(${updateThreshold})\n") } } \ No newline at end of file diff --git a/src/main/scala/xiangshan/frontend/uBTB.scala b/src/main/scala/xiangshan/frontend/uBTB.scala index 8e20a8e93e7148edc52042a5bfc12987fa3d077c..9d5545cd59bc54c35f0b91d50ca5fb1e67ae75f4 100644 --- a/src/main/scala/xiangshan/frontend/uBTB.scala +++ b/src/main/scala/xiangshan/frontend/uBTB.scala @@ -4,18 +4,24 @@ import chisel3._ import chisel3.util._ import utils._ import xiangshan._ +import chisel3.experimental.chiselName import scala.math.min trait MicroBTBPatameter{ val nWays = 16 - val offsetSize = 20 + val lowerBitsSize = 20 + val tagSize = 20 + + val extended_stat = false } +@chiselName class MicroBTB extends BasePredictor with 
MicroBTBPatameter { - val tagSize = VAddrBits - log2Ceil(PredictWidth) - 1 + // val tagSize = VAddrBits - log2Ceil(PredictWidth) - 1 + val untaggedBits = log2Up(PredictWidth) + 1 class MicroBTBResp extends Resp { @@ -44,16 +50,8 @@ class MicroBTB extends BasePredictor override val io = IO(new MicroBTBIO) io.uBTBBranchInfo <> out_ubtb_br_info - def getTag(pc: UInt) = (pc >> (log2Ceil(PredictWidth) + 1)).asUInt() + def getTag(pc: UInt) = (pc >> untaggedBits)(tagSize-1, 0) def getBank(pc: UInt) = pc(log2Ceil(PredictWidth) ,1) - def satUpdate(old: UInt, len: Int, taken: Bool): UInt = { - val oldSatTaken = old === ((1 << len)-1).U - val oldSatNotTaken = old === 0.U - Mux(oldSatTaken && taken, ((1 << len)-1).U, - Mux(oldSatNotTaken && !taken, 0.U, - Mux(taken, old + 1.U, old - 1.U))) - } - class MicroBTBMeta extends XSBundle { @@ -66,17 +64,116 @@ class MicroBTB extends BasePredictor class MicroBTBEntry extends XSBundle { - val offset = SInt(offsetSize.W) + val lower = UInt(lowerBitsSize.W) + } + + // val uBTBMeta = RegInit((0.U).asTypeOf(Vec(nWays, Vec(PredictWidth, new MicroBTBMeta)))) + // val uBTB = Reg(Vec(nWays, Vec(PredictWidth, new MicroBTBEntry))) + + // class UBTBMem[T <: Data](gen: T, nWays: Int) extends XSModule { + // class UBTBBundleR[T <: Data](private val gen: T, val way: Int) extends Bundle { + // val data = Output(Vec(way, gen)) + // } + // class UBTBReadBus[T <: Data](private val gen: T, val way: Int) { + // val resp = Output(new UBTBBundleR(gen, way)) + // } + // class UBTBWriteBus[T <: Data](private val gen: T, val set: Int, val way: Int) extends Bundle { + // val req = + // } + // val io = IO(new Bundle { + // val wen = Input(Bool()) + // val wWay = Input(UInt(log2Up(nWays).W)) + // val wRow = Input(UInt(log2Up(PredictWidth).W)) + // val wdata = Input(new T) + // val entries = Output(Vec(nWays, Vec(PredictWidth, gen))) + // }) + // val mem = RegInit((0.U).asTypeOf(Vec(nWays, Vec(PredictWidth, new T)))) + // io.entries := mem + // when (io.wen) { + 
// mem(wWay)(wRow) := wdata + // } + // } + + class MetaOutput extends XSBundle { + val is_Br = Bool() + val is_RVC = Bool() + val pred = UInt(2.W) } - val uBTBMeta = RegInit((0.U).asTypeOf(Vec(nWays, Vec(PredictWidth, new MicroBTBMeta)))) - val uBTB = Reg(Vec(nWays, Vec(PredictWidth, new MicroBTBEntry))) + @chiselName + class UBTBMetaBank(nWays: Int) extends XSModule { + val io = IO(new Bundle { + val wen = Input(Bool()) + val wWay = Input(UInt(log2Up(nWays).W)) + val wdata = Input(new MicroBTBMeta) + val rtag = Input(UInt(tagSize.W)) + val rdata = Output(new MetaOutput) + val hit_ohs = Output(Vec(nWays, Bool())) + val hit_way = Output(UInt(log2Up(nWays).W)) + val allocatable_way = Valid(UInt(log2Up(nWays).W)) + val rWay = Input(UInt(log2Up(nWays).W)) + val rpred = Output(UInt(2.W)) + }) + val mem = Mem(nWays, new MicroBTBMeta) + val rentries = VecInit((0 until nWays) map (i => mem(i))) + val hit_ohs = VecInit(rentries map (e => e.valid && e.tag === io.rtag)) + val hit_way = PriorityEncoder(hit_ohs) + val hit_entry = rentries(hit_way) + io.hit_ohs := hit_ohs + io.hit_way := hit_way + io.rdata.is_Br := hit_entry.is_Br + io.rdata.is_RVC := hit_entry.is_RVC + io.rdata.pred := hit_entry.pred + val entry_emptys = VecInit(rentries.map(e => !e.valid)) + val allocatable = ParallelOR(entry_emptys) + io.allocatable_way.bits := PriorityEncoder(entry_emptys) + io.allocatable_way.valid := allocatable + io.rpred := rentries(io.rWay).pred + when (io.wen) { + mem.write(io.wWay, io.wdata) + } + } + + @chiselName + class UBTBDataBank(nWays: Int) extends XSModule { + val io = IO(new Bundle { + val wen = Input(Bool()) + val wWay = Input(UInt(log2Up(nWays).W)) + val wdata = Input(new MicroBTBEntry) + val rWay = Input(UInt(log2Up(nWays).W)) + val rdata = Output(new MicroBTBEntry) + }) + val mem = Mem(nWays, new MicroBTBEntry) + val rentries = VecInit((0 until nWays) map (i => mem(i))) + io.rdata := rentries(io.rWay) + when (io.wen) { + mem.write(io.wWay, io.wdata) + } + } + + val 
metaBanks = Seq.fill(PredictWidth)(Module(new UBTBMetaBank(nWays))) + val dataBanks = Seq.fill(PredictWidth)(Module(new UBTBDataBank(nWays))) + val metas = VecInit(metaBanks.map(_.io)) + val datas = VecInit(dataBanks.map(_.io)) + + val uBTBMeta = VecInit(metas.map(m => m.rdata)) + val uBTB = VecInit(datas.map(d => d.rdata)) + + val do_reset = RegInit(true.B) + val reset_way = RegInit(0.U(log2Ceil(nWays).W)) + when (do_reset) { reset_way := reset_way + 1.U } + when (reset_way === nWays.U) { do_reset := false.B } //uBTB read //tag is bank align + val bankAlignedPC = bankAligned(io.pc.bits) + val startsAtOddBank = bankInGroup(bankAlignedPC)(0).asBool + + + val read_valid = io.pc.valid - val read_req_tag = getTag(io.pc.bits) - val read_req_basebank = getBank(io.pc.bits) + val read_req_tag = getTag(bankAlignedPC) + val next_tag = read_req_tag + 1.U // val read_mask = circularShiftLeft(io.inMask, PredictWidth, read_req_basebank) @@ -89,29 +186,28 @@ class MicroBTB extends BasePredictor val is_Br = Bool() } val read_resp = Wire(Vec(PredictWidth,new ReadRespEntry)) - - val read_bank_inOrder = VecInit((0 until PredictWidth).map(b => (read_req_basebank + b.U)(log2Up(PredictWidth)-1,0) )) - val isInNextRow = VecInit((0 until PredictWidth).map(_.U < read_req_basebank)) - val read_hit_ohs = read_bank_inOrder.map{ b => - VecInit((0 until nWays) map {w => - Mux(isInNextRow(b),read_req_tag + 1.U,read_req_tag) === uBTBMeta(w)(b).tag - }) - } + //val read_bank_inOrder = VecInit((0 until PredictWidth).map(b => (read_req_basebank + b.U)(log2Up(PredictWidth)-1,0) )) + // val isInNextRow = VecInit((0 until PredictWidth).map(_.U < read_req_basebank)) + + (0 until PredictWidth).map{ b => metas(b).rtag := Mux(startsAtOddBank && (b > PredictWidth).B,next_tag,read_req_tag) } + val read_hit_ohs = (0 until PredictWidth).map{ b => metas(b).hit_ohs } val read_hit_vec = VecInit(read_hit_ohs.map{oh => ParallelOR(oh).asBool}) - val read_hit_ways = VecInit(read_hit_ohs.map{oh => 
PriorityEncoder(oh)}) - val read_hit = ParallelOR(read_hit_vec).asBool - val read_hit_way = PriorityEncoder(ParallelOR(read_hit_ohs.map(_.asUInt))) + val read_hit_ways = (0 until PredictWidth).map{ b => metas(b).hit_way } + // val read_hit = ParallelOR(read_hit_vec).asBool + // val read_hit_way = PriorityEncoder(ParallelOR(read_hit_ohs.map(_.asUInt))) - val uBTBMeta_resp = VecInit((0 until PredictWidth).map(b => uBTBMeta(read_hit_ways(b))(read_bank_inOrder(b)))) - val btb_resp = VecInit((0 until PredictWidth).map(b => uBTB(read_hit_ways(b))(read_bank_inOrder(b)))) + (0 until PredictWidth).map(b => datas(b).rWay := read_hit_ways(b)) + + val uBTBMeta_resp = VecInit((0 until PredictWidth).map(b => metas(b).rdata)) + val btb_resp = VecInit((0 until PredictWidth).map(b => datas(b).rdata)) for(i <- 0 until PredictWidth){ // do not need to decide whether to produce results\ - read_resp(i).valid := uBTBMeta_resp(i).valid && read_hit_vec(i) && io.inMask(i) + read_resp(i).valid := read_hit_vec(i) && io.inMask(i) read_resp(i).taken := read_resp(i).valid && uBTBMeta_resp(i).pred(1) read_resp(i).is_Br := read_resp(i).valid && uBTBMeta_resp(i).is_Br - read_resp(i).target := ((io.pc.bits).asSInt + (i<<1).S + btb_resp(i).offset).asUInt + read_resp(i).target := Cat(io.pc.bits(VAddrBits-1, lowerBitsSize+1), btb_resp(i).asUInt, 0.U(1.W)) read_resp(i).is_RVC := read_resp(i).valid && uBTBMeta_resp(i).is_RVC out_ubtb_br_info.hits(i) := read_hit_vec(i) @@ -130,12 +226,16 @@ class MicroBTB extends BasePredictor way := Mux(all_valid,chunks.reduce(_^_),PriorityEncoder(~valids)) way } - val alloc_ways = read_bank_inOrder.map{ b => - alloc_way(VecInit(uBTBMeta.map(w => w(b).valid)).asUInt, - VecInit(uBTBMeta.map(w => w(b).tag)).asUInt, - Mux(isInNextRow(b).asBool,read_req_tag + 1.U,read_req_tag)) + + // val alloc_ways = read_bank_inOrder.map{ b => + // alloc_way(VecInit(uBTBMeta.map(w => w(b).valid)).asUInt, + // VecInit(uBTBMeta.map(w => w(b).tag)).asUInt, + // 
Mux(isInNextRow(b).asBool,read_req_tag + 1.U,read_req_tag)) - } + // } + + val alloc_ways = (0 until PredictWidth).map{ b => + Mux(metas(b).allocatable_way.valid, metas(b).allocatable_way.bits, LFSR64()(log2Ceil(nWays)-1,0))} (0 until PredictWidth).map(i => out_ubtb_br_info.writeWay(i) := Mux(read_hit_vec(i).asBool,read_hit_ways(i),alloc_ways(i))) //response @@ -164,50 +264,66 @@ class MicroBTB extends BasePredictor val update_base_bank = getBank(update_fetch_pc) val update_tag = getTag(update_br_pc) val update_target = Mux(u.pd.isBr, u.brTarget, u.target) - val update_taget_offset = update_target.asSInt - update_br_pc.asSInt + val update_target_lower = update_target(lowerBitsSize, 1) val update_is_BR_or_JAL = (u.pd.brType === BrType.branch) || (u.pd.brType === BrType.jal) val jalFirstEncountered = !u.isMisPred && !u.brInfo.btbHitJal && (u.pd.brType === BrType.jal) - val entry_write_valid = io.update.valid && (u.isMisPred || !u.isMisPred && u.pd.isBr || jalFirstEncountered)//io.update.valid //&& update_is_BR_or_JAL - val meta_write_valid = io.update.valid && (u.isMisPred || !u.isMisPred && u.pd.isBr || jalFirstEncountered)//io.update.valid //&& update_is_BR_or_JAL + val entry_write_valid = io.update.valid && (u.isMisPred || jalFirstEncountered)//io.update.valid //&& update_is_BR_or_JAL + val meta_write_valid = io.update.valid && (u.isMisPred || jalFirstEncountered)//io.update.valid //&& update_is_BR_or_JAL //write btb target when miss prediction - when(entry_write_valid) - { - uBTB(update_write_way)(update_bank).offset := update_taget_offset + // when(entry_write_valid) + // { + // uBTB(update_write_way)(update_bank).offset := update_target_offset + // } + for (b <- 0 until PredictWidth) { + datas(b).wen := do_reset || (entry_write_valid && b.U === update_bank) + datas(b).wWay := Mux(do_reset, reset_way, update_write_way) + datas(b).wdata := Mux(do_reset, 0.U.asTypeOf(new MicroBTBEntry), update_target_lower.asTypeOf(new MicroBTBEntry)) } + + + //write the uBTBMeta 
- when(meta_write_valid) - { - //commit update - uBTBMeta(update_write_way)(update_bank).is_Br := u.pd.brType === BrType.branch - uBTBMeta(update_write_way)(update_bank).is_RVC := u.pd.isRVC - //(0 until PredictWidth).foreach{b => uBTBMeta(update_write_way)(b).valid := false.B} - uBTBMeta(update_write_way)(update_bank).valid := true.B - uBTBMeta(update_write_way)(update_bank).tag := update_tag - uBTBMeta(update_write_way)(update_bank).pred := - Mux(!update_hits, - Mux(update_taken,3.U,0.U), - satUpdate( uBTBMeta(update_write_way)(update_bank).pred,2,update_taken) - ) + (0 until PredictWidth).map(i => metas(i).rWay := update_write_way) + val update_write_meta = Wire(new MicroBTBMeta) + update_write_meta.is_Br := u.pd.brType === BrType.branch + update_write_meta.is_RVC := u.pd.isRVC + update_write_meta.valid := true.B + update_write_meta.tag := update_tag + update_write_meta.pred := Mux(!update_hits, + Mux(update_taken,3.U,0.U), + satUpdate( metas(update_bank).rpred,2,update_taken) + ) + + for (b <- 0 until PredictWidth) { + metas(b).wen := do_reset || (meta_write_valid && b.U === update_bank) + metas(b).wWay := Mux(do_reset, reset_way, update_write_way) + metas(b).wdata := Mux(do_reset, 0.U.asTypeOf(new MicroBTBMeta), update_write_meta) } if (BPUDebug && debug) { - XSDebug(read_valid,"uBTB read req: pc:0x%x, tag:%x basebank:%d\n",io.pc.bits,read_req_tag,read_req_basebank) + XSDebug(read_valid,"uBTB read req: pc:0x%x, tag:%x startAtOdd:%d\n",io.pc.bits,read_req_tag,startsAtOddBank) XSDebug(read_valid,"uBTB read resp: read_hit_vec:%b, \n",read_hit_vec.asUInt) for(i <- 0 until PredictWidth) { XSDebug(read_valid,"bank(%d) hit:%d way:%d valid:%d is_RVC:%d taken:%d isBr:%d target:0x%x alloc_way:%d\n", i.U,read_hit_vec(i),read_hit_ways(i),read_resp(i).valid,read_resp(i).is_RVC,read_resp(i).taken,read_resp(i).is_Br,read_resp(i).target,out_ubtb_br_info.writeWay(i)) } - XSDebug(meta_write_valid,"uBTB update: update | pc:0x%x | update hits:%b | | update_write_way:%d | 
update_bank: %d| update_br_index:%d | update_tag:%x | upadate_offset 0x%x\n " - ,update_br_pc,update_hits,update_write_way,update_bank,update_br_idx,update_tag,update_taget_offset(offsetSize-1,0)) + XSDebug(meta_write_valid,"uBTB update: update | pc:0x%x | update hits:%b | | update_write_way:%d | update_bank: %d| update_br_index:%d | update_tag:%x | update_lower 0x%x\n " + ,update_br_pc,update_hits,update_write_way,update_bank,update_br_idx,update_tag,update_target_lower(lowerBitsSize-1,0)) XSDebug(meta_write_valid, "uBTB update: update_taken:%d | old_pred:%b | new_pred:%b\n", - update_taken, uBTBMeta(update_write_way)(update_bank).pred, + update_taken, metas(update_bank).rpred, Mux(!update_hits, - Mux(update_taken,3.U,0.U), - satUpdate( uBTBMeta(update_write_way)(update_bank).pred,2,update_taken))) + Mux(update_taken,3.U,0.U), + satUpdate( metas(update_bank).rpred,2,update_taken) + )) + + } + if (extended_stat) { + val high_identical = update_target(VAddrBits-1, lowerBitsSize) =/= update_fetch_pc(VAddrBits-1, lowerBitsSize) + XSDebug(io.update.valid, "extended_stat: identical %d\n", high_identical) } //bypass:read-after-write diff --git a/src/main/scala/xiangshan/mem/Memend.scala b/src/main/scala/xiangshan/mem/Memend.scala index 5be41ff7a867afdec8355cc1baf343b03ffae210..af1694f82b55ec91c86d0f375818e1ba80b9e416 100644 --- a/src/main/scala/xiangshan/mem/Memend.scala +++ b/src/main/scala/xiangshan/mem/Memend.scala @@ -2,14 +2,11 @@ package xiangshan.mem import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils import xiangshan._ import utils._ -import chisel3.util.experimental.BoringUtils import xiangshan.backend.roq.RoqPtr - import xiangshan.cache._ -import bus.tilelink.{TLArbiter, TLCached, TLMasterUtilities, TLParameters} +import xiangshan.backend.fu.FenceToSbuffer object genWmask { def apply(addr: UInt, sizeEncode: UInt): UInt = { @@ -36,12 +33,13 @@ object genWdata { class LsPipelineBundle extends XSBundle { val vaddr = 
UInt(VAddrBits.W) val paddr = UInt(PAddrBits.W) - val func = UInt(6.W) + val func = UInt(6.W) //fixme??? val mask = UInt(8.W) val data = UInt(XLEN.W) val uop = new MicroOp val miss = Bool() + val tlbMiss = Bool() val mmio = Bool() val rollback = Bool() @@ -55,174 +53,10 @@ class LoadForwardQueryIO extends XSBundle { val uop = Output(new MicroOp) // for replay val pc = Output(UInt(VAddrBits.W)) //for debug val valid = Output(Bool()) //for debug - + val forwardMask = Input(Vec(8, Bool())) val forwardData = Input(Vec(8, UInt(8.W))) - val lsroqIdx = Output(UInt(LsroqIdxWidth.W)) // val lqIdx = Output(UInt(LoadQueueIdxWidth.W)) val sqIdx = Output(new SqPtr) -} - -class MemToBackendIO extends XSBundle { - val ldin = Vec(exuParameters.LduCnt, Flipped(Decoupled(new ExuInput))) - val stin = Vec(exuParameters.StuCnt, Flipped(Decoupled(new ExuInput))) - val ldout = Vec(exuParameters.LduCnt, Decoupled(new ExuOutput)) - val stout = Vec(exuParameters.StuCnt, Decoupled(new ExuOutput)) - val redirect = Flipped(ValidIO(new Redirect)) - // replay all instructions form dispatch - val replayAll = ValidIO(new Redirect) - // replay mem instructions form Load Queue/Store Queue - val tlbFeedback = Vec(exuParameters.LduCnt + exuParameters.LduCnt, ValidIO(new TlbFeedback)) - val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit))) - val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) - val lsIdxs = Output(Vec(RenameWidth, new LSIdx)) - val oldestStore = Output(Valid(new RoqPtr)) - val roqDeqPtr = Input(new RoqPtr) -} - -// Memory pipeline wrapper -// -// Wrap the whole memory access pipeline as a single module "Memend" -class Memend extends XSModule { - val io = IO(new Bundle{ - val backend = new MemToBackendIO - val loadUnitToDcacheVec = Vec(exuParameters.LduCnt, new DCacheWordIO) - val loadMiss = new DCacheLineIO - val atomics = new DCacheWordIO - val sbufferToDcache = new DCacheLineIO - val uncache = new DCacheWordIO - val ptw = new TlbPtwIO - }) - - // inner modules - 
val loadUnits = (0 until exuParameters.LduCnt).map(_ => Module(new LoadUnit)) - val storeUnits = (0 until exuParameters.StuCnt).map(_ => Module(new StoreUnit)) - val atomicsUnit = Module(new AtomicsUnit) - val dtlb = Module(new TLB(Width = DTLBWidth, isDtlb = true)) - val lsroq = Module(new LsqWrappper) - val sbuffer = Module(new NewSbuffer) - // if you wants to stress test dcache store, use FakeSbuffer - // val sbuffer = Module(new FakeSbuffer) - - // dtlb - io.ptw <> dtlb.io.ptw - - // LoadUnit - for (i <- 0 until exuParameters.LduCnt) { - // get input form dispatch - loadUnits(i).io.ldin <> io.backend.ldin(i) - loadUnits(i).io.ldout <> io.backend.ldout(i) - loadUnits(i).io.redirect <> io.backend.redirect - loadUnits(i).io.tlbFeedback <> io.backend.tlbFeedback(i) - // dtlb access - loadUnits(i).io.dtlb <> dtlb.io.requestor(i) - // dcache access - loadUnits(i).io.dcache <> io.loadUnitToDcacheVec(i) - // forward - loadUnits(i).io.lsroq.forward <> lsroq.io.forward(i) - loadUnits(i).io.sbuffer <> sbuffer.io.forward(i) - - // passdown to lsroq - lsroq.io.loadIn(i) <> loadUnits(i).io.lsroq.loadIn - lsroq.io.ldout(i) <> loadUnits(i).io.lsroq.ldout - } - - // StoreUnit - for (i <- 0 until exuParameters.StuCnt) { - // get input form dispatch - storeUnits(i).io.stin <> io.backend.stin(i) - storeUnits(i).io.redirect <> io.backend.redirect - storeUnits(i).io.tlbFeedback <> io.backend.tlbFeedback(exuParameters.LduCnt + i) - - // dtlb access - storeUnits(i).io.dtlb <> dtlb.io.requestor(exuParameters.LduCnt + i) // FIXME - - // passdown to lsroq - storeUnits(i).io.lsroq <> lsroq.io.storeIn(i) - } - - // Lsroq - lsroq.io.stout <> io.backend.stout - lsroq.io.commits <> io.backend.commits - lsroq.io.dp1Req <> io.backend.dp1Req - lsroq.io.oldestStore <> io.backend.oldestStore - lsroq.io.lsIdxs <> io.backend.lsIdxs - lsroq.io.brqRedirect := io.backend.redirect - lsroq.io.roqDeqPtr := io.backend.roqDeqPtr - io.backend.replayAll <> lsroq.io.rollback - - lsroq.io.dcache <> io.loadMiss 
- lsroq.io.uncache <> io.uncache - - // LSROQ to store buffer - lsroq.io.sbuffer <> sbuffer.io.in - - // Sbuffer - sbuffer.io.dcache <> io.sbufferToDcache - - // flush sbuffer - val fenceFlush = WireInit(false.B) - val atomicsFlush = atomicsUnit.io.flush_sbuffer.valid - BoringUtils.addSink(fenceFlush, "FenceUnitSbufferFlush") - val sbEmpty = WireInit(false.B) - sbEmpty := sbuffer.io.flush.empty - BoringUtils.addSource(sbEmpty, "SBufferEmpty") - // if both of them tries to flush sbuffer at the same time - // something must have gone wrong - assert(!(fenceFlush && atomicsFlush)) - sbuffer.io.flush.valid := fenceFlush || atomicsFlush - - // TODO: make 0/1 configurable - // AtomicsUnit - // AtomicsUnit will override other control signials, - // as atomics insts (LR/SC/AMO) will block the pipeline - val st0_atomics = io.backend.stin(0).valid && io.backend.stin(0).bits.uop.ctrl.fuType === FuType.mou - val st1_atomics = io.backend.stin(1).valid && io.backend.stin(1).bits.uop.ctrl.fuType === FuType.mou - // amo should always go through store issue queue 0 - assert(!st1_atomics) - - atomicsUnit.io.dtlb.resp.valid := false.B - atomicsUnit.io.dtlb.resp.bits := DontCare - atomicsUnit.io.out.ready := false.B - - // dispatch 0 takes priority - atomicsUnit.io.in.valid := st0_atomics || st1_atomics - atomicsUnit.io.in.bits := Mux(st0_atomics, io.backend.stin(0).bits, io.backend.stin(1).bits) - when (st0_atomics) { - io.backend.stin(0).ready := atomicsUnit.io.in.ready - // explitly set st1 ready to false, do not let it fire - when (st1_atomics) { io.backend.stin(1).ready := false.B } - } - - when (!st0_atomics && st1_atomics) { io.backend.stin(1).ready := atomicsUnit.io.in.ready } - - // for atomics, do not let them enter store unit - when (st0_atomics) { storeUnits(0).io.stin.valid := false.B } - when (st1_atomics) { storeUnits(1).io.stin.valid := false.B } - - when(atomicsUnit.io.dtlb.req.valid) { - dtlb.io.requestor(0) <> atomicsUnit.io.dtlb // TODO: check it later - // take 
load unit 0's tlb port
-    // make sure not to disturb loadUnit
-    assert(!loadUnits(0).io.dtlb.req.valid)
-    loadUnits(0).io.dtlb.resp.valid := false.B
-  }
-
-  when(atomicsUnit.io.tlbFeedback.valid) {
-    assert(!storeUnits(0).io.tlbFeedback.valid)
-    atomicsUnit.io.tlbFeedback <> io.backend.tlbFeedback(exuParameters.LduCnt + 0)
-  }
-
-  atomicsUnit.io.dcache <> io.atomics
-  atomicsUnit.io.flush_sbuffer.empty := sbEmpty
-
-  atomicsUnit.io.redirect <> io.backend.redirect
-
-  when(atomicsUnit.io.out.valid){
-    io.backend.ldout(0) <> atomicsUnit.io.out
-    // take load unit 0's write back port
-    assert(!loadUnits(0).io.ldout.valid)
-    loadUnits(0).io.ldout.ready := false.B
-  }
-}
+}
\ No newline at end of file
diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
new file mode 100644
index 0000000000000000000000000000000000000000..576c6b5628c1eb3a085f90a8bd68ea210e3c0858
--- /dev/null
+++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala
@@ -0,0 +1,358 @@
+package xiangshan.mem
+
+import chisel3._
+import chisel3.util._
+import utils._
+import xiangshan._
+import xiangshan.cache._
+import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants}
+import xiangshan.backend.LSUOpType
+import xiangshan.mem._
+import xiangshan.backend.roq.RoqPtr
+
+// Lookup port used to read the vaddr recorded for a queue entry:
+// lsIdx / isStore are inputs, vaddr is the output.
+class ExceptionAddrIO extends XSBundle {
+  val lsIdx = Input(new LSIdx)
+  val isStore = Input(Bool())
+  val vaddr = Output(UInt(VAddrBits.W))
+}
+
+
+// Payload stored per entry in the LSQueueData register array.
+class LsqEntry extends XSBundle {
+  val vaddr = UInt(VAddrBits.W) // TODO: need opt
+  val paddr = UInt(PAddrBits.W)
+  val mask = UInt(8.W)
+  val data = UInt(XLEN.W)
+  val exception = UInt(16.W) // TODO: opt size
+  val mmio = Bool()
+  val fwdMask = Vec(8, Bool())
+  val fwdData = Vec(8, UInt(8.W))
+}
+
+// Byte-granular forwarding result: mask(k) marks data(k) as forwarded.
+class FwdEntry extends XSBundle {
+  val mask = Vec(8, Bool())
+  val data = Vec(8, UInt(8.W))
+}
+
+
+// Register-array data storage for a load/store queue.
+//   size     - number of LsqEntry slots
+//   nchannel - number of writeback ports and of forward-query ports
+// The helper methods on io only drive index/data; callers must set the
+// matching wen signal themselves (io is initialised to DontCare).
+class LSQueueData(size: Int, nchannel: Int) extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper {
+  val io = IO(new Bundle() {
+    val wb = Vec(nchannel, new Bundle() {
+      val wen = Input(Bool())
+      val index = Input(UInt(log2Up(size).W))
+      val wdata = Input(new LsqEntry)
+    })
+    val uncache = new Bundle() {
+      val wen = Input(Bool())
+      val index = Input(UInt(log2Up(size).W))
+      val wdata = Input(UInt(XLEN.W))
+    }
+    val refill = new Bundle() {
+      val wen = Input(Vec(size, Bool()))
+      val dcache = Input(new DCacheLineResp)
+    }
+    val needForward = Input(Vec(nchannel, Vec(2, UInt(size.W))))
+    val forward = Vec(nchannel, Flipped(new LoadForwardQueryIO))
+    val rdata = Output(Vec(size, new LsqEntry))
+
+    // val debug = new Bundle() {
+    //   val debug_data = Vec(LoadQueueSize, new LsqEntry)
+    // }
+
+    def wbWrite(channel: Int, index: UInt, wdata: LsqEntry): Unit = {
+      require(channel < nchannel && channel >= 0)
+      // need extra "this.wb(channel).wen := true.B"
+      this.wb(channel).index := index
+      this.wb(channel).wdata := wdata
+    }
+
+    def uncacheWrite(index: UInt, wdata: UInt): Unit = {
+      // need extra "this.uncache.wen := true.B"
+      this.uncache.index := index
+      this.uncache.wdata := wdata
+    }
+
+    def forwardQuery(channel: Int, paddr: UInt, needForward1: Data, needForward2: Data): Unit = {
+      this.needForward(channel)(0) := needForward1
+      this.needForward(channel)(1) := needForward2
+      this.forward(channel).paddr := paddr
+    }
+
+    // def refillWrite(ldIdx: Int): Unit = {
+    // }
+    // use "this.refill.wen(ldIdx) := true.B" instead
+  })
+
+  io := DontCare
+
+  val data = Reg(Vec(size, new LsqEntry))
+
+  // writeback to lq/sq
+  // one write port per declared wb channel (io.wb has nchannel entries)
+  (0 until nchannel).map(i => {
+    when(io.wb(i).wen){
+      data(io.wb(i).index) := io.wb(i).wdata
+    }
+  })
+
+  when(io.uncache.wen){
+    data(io.uncache.index).data := io.uncache.wdata
+  }
+
+  // refill missed load
+  // per byte: keep the forwarded byte when fwdMask(i) is set, else take the refill byte
+  def mergeRefillData(refill: UInt, fwd: UInt, fwdMask: UInt): UInt = {
+    val res = Wire(Vec(8, UInt(8.W)))
+    (0 until 8).foreach(i => {
+      res(i) := Mux(fwdMask(i), fwd(8 * (i + 1) - 1, 8 * i), refill(8 * (i + 1) - 1, 8 * i))
+    })
+    res.asUInt
+  }
+
+  // split dcache result into words
+  val words = VecInit((0 until blockWords) map { i =>
+    io.refill.dcache.data(DataBits * (i + 1) - 1, DataBits * i)
+  })
+
+
+  (0 until size).map(i => {
+    when(io.refill.wen(i) ){
+      val refillData = words(get_word(data(i).paddr))
+      data(i).data := mergeRefillData(refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt)
+      XSDebug("miss resp: pos %d addr %x data %x + %x(%b)\n", i.U, data(i).paddr, refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt)
+    }
+  })
+
+  // forwarding
+  // Compare ringBufferTail (deqPtr) and forward.sqIdx, we have two cases:
+  // (1) if they have the same flag, we need to check range(tail, sqIdx)
+  // (2) if they have different flags, we need to check range(tail, LoadQueueSize) and range(0, sqIdx)
+  // Forward1: Mux(same_flag, range(tail, sqIdx), range(tail, LoadQueueSize))
+  // Forward2: Mux(same_flag, 0.U, range(0, sqIdx) )
+  // i.e. forward1 is the target entries with the same flag bits and forward2 otherwise
+
+  // entry with larger index should have higher priority since its data is younger
+
+  // FIXME: old fwd logic for assertion, remove when rtl freeze
+  (0 until nchannel).map(i => {
+
+    val forwardMask1 = WireInit(VecInit(Seq.fill(8)(false.B)))
+    val forwardData1 = WireInit(VecInit(Seq.fill(8)(0.U(8.W))))
+    val forwardMask2 = WireInit(VecInit(Seq.fill(8)(false.B)))
+    val forwardData2 = WireInit(VecInit(Seq.fill(8)(0.U(8.W))))
+
+    for (j <- 0 until size) {
+      val needCheck = io.forward(i).paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3)
+      (0 until XLEN / 8).foreach(k => {
+        when (needCheck && data(j).mask(k)) {
+          when (io.needForward(i)(0)(j)) {
+            forwardMask1(k) := true.B
+            forwardData1(k) := data(j).data(8 * (k + 1) - 1, 8 * k)
+          }
+          when (io.needForward(i)(1)(j)) {
+            forwardMask2(k) := true.B
+            forwardData2(k) := data(j).data(8 * (k + 1) - 1, 8 * k)
+          }
+          XSDebug(io.needForward(i)(0)(j) || io.needForward(i)(1)(j),
+            p"forwarding $k-th byte ${Hexadecimal(data(j).data(8 * (k + 1) - 1, 8 * k))} " +
+            p"from ptr $j\n")
+        }
+      })
+    }
+
+    // merge forward lookup results
+    // forward2 is younger than forward1 and should have higher priority
+    val oldFwdResult = Wire(new FwdEntry)
+    (0 until XLEN / 8).map(k => {
+      oldFwdResult.mask(k) := RegNext(forwardMask1(k) || forwardMask2(k))
+      oldFwdResult.data(k) := RegNext(Mux(forwardMask2(k), forwardData2(k), forwardData1(k)))
+    })
+
+    // parallel fwd logic
+    val paddrMatch = Wire(Vec(size, Bool()))
+    val matchResultVec = Wire(Vec(size * 2, new FwdEntry))
+
+    def parallelFwd(xs: Seq[Data]): Data = {
+      ParallelOperation(xs, (a: Data, b: Data) => {
+        val l = a.asTypeOf(new FwdEntry)
+        val r = b.asTypeOf(new FwdEntry)
+        val res = Wire(new FwdEntry)
+        (0 until 8).map(p => {
+          res.mask(p) := l.mask(p) || r.mask(p)
+          res.data(p) := Mux(r.mask(p), r.data(p), l.data(p))
+        })
+        res
+      })
+    }
+
+    for (j <- 0 until size) {
+      paddrMatch(j) := io.forward(i).paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3)
+    }
+
+    for (j <- 0 until size) {
+      val needCheck0 = RegNext(paddrMatch(j) && io.needForward(i)(0)(j))
+      val needCheck1 = RegNext(paddrMatch(j) && io.needForward(i)(1)(j))
+      (0 until XLEN / 8).foreach(k => {
+        matchResultVec(j).mask(k) := needCheck0 && data(j).mask(k)
+        matchResultVec(j).data(k) := data(j).data(8 * (k + 1) - 1, 8 * k)
+        matchResultVec(size + j).mask(k) := needCheck1 && data(j).mask(k)
+        matchResultVec(size + j).data(k) := data(j).data(8 * (k + 1) - 1, 8 * k)
+      })
+    }
+
+    val parallelFwdResult = parallelFwd(matchResultVec).asTypeOf(new FwdEntry)
+
+    io.forward(i).forwardMask := parallelFwdResult.mask
+    io.forward(i).forwardData := parallelFwdResult.data
+
+    when(
+      oldFwdResult.mask.asUInt =/= parallelFwdResult.mask.asUInt
+    ){
+      printf("%d: mask error: right: %b false %b\n", GTimer(), oldFwdResult.mask.asUInt, parallelFwdResult.mask.asUInt)
+    }
+
+    for (p <- 0 until 8) {
+      when(
+        oldFwdResult.data(p) =/= parallelFwdResult.data(p) && oldFwdResult.mask(p)
+      ){
+        printf("%d: data "+p+" error: right: %x false %x\n", GTimer(), oldFwdResult.data(p), parallelFwdResult.data(p))
+      }
+    }
+
+  })
+
+  // data read
+  io.rdata := data
+  // io.debug.debug_data := data
+}
+
+// inflight miss block reqs
+class InflightBlockInfo extends XSBundle {
+  val block_addr = UInt(PAddrBits.W)
+  val valid = Bool()
+}
+
+// Load / Store Queue Wrapper for XiangShan Out of Order LSU
+// Instantiates LoadQueue and StoreQueue, fans the shared inputs out to both,
+// and arbitrates the single uncache port between them.
+class LsqWrappper extends XSModule with HasDCacheParameters {
+  val io = IO(new Bundle() {
+    val enq = new Bundle() {
+      val canAccept = Output(Bool())
+      val req = Vec(RenameWidth, Flipped(ValidIO(new MicroOp)))
+      val resp = Vec(RenameWidth, Output(new LSIdx))
+    }
+    val brqRedirect = Input(Valid(new Redirect))
+    val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle)))
+    val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
+    val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
+    val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback load
+    val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
+    val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
+    val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit)))
+    val rollback = Output(Valid(new Redirect))
+    val dcache = new DCacheLineIO
+    val uncache = new DCacheWordIO
+    val roqDeqPtr = Input(new RoqPtr)
+    val exceptionAddr = new ExceptionAddrIO
+  })
+
+  val loadQueue = Module(new LoadQueue)
+  val storeQueue = Module(new StoreQueue)
+
+  // io.enq logic
+  // LSQ: send out canAccept when both load queue and store queue are ready
+  // Dispatch: send instructions to LSQ only when they are ready
+  io.enq.canAccept := loadQueue.io.enq.canAccept && storeQueue.io.enq.canAccept
+  for (i <- 0 until RenameWidth) {
+    val isStore = CommitType.lsInstIsStore(io.enq.req(i).bits.ctrl.commitType)
+    loadQueue.io.enq.req(i).valid := !isStore && io.enq.req(i).valid
+    storeQueue.io.enq.req(i).valid := isStore && io.enq.req(i).valid
+    loadQueue.io.enq.req(i).bits := io.enq.req(i).bits
+    storeQueue.io.enq.req(i).bits := io.enq.req(i).bits
+    io.enq.resp(i).lqIdx := loadQueue.io.enq.resp(i)
+    io.enq.resp(i).sqIdx := storeQueue.io.enq.resp(i)
+
+    XSError(!io.enq.canAccept && io.enq.req(i).valid, "should not enqueue LSQ when not ready\n")
+  }
+
+  // load queue wiring
+  loadQueue.io.brqRedirect <> io.brqRedirect
+  loadQueue.io.loadIn <> io.loadIn
+  loadQueue.io.storeIn <> io.storeIn
+  loadQueue.io.ldout <> io.ldout
+  loadQueue.io.commits <> io.commits
+  loadQueue.io.rollback <> io.rollback
+  loadQueue.io.dcache <> io.dcache
+  loadQueue.io.roqDeqPtr <> io.roqDeqPtr
+  loadQueue.io.exceptionAddr.lsIdx := io.exceptionAddr.lsIdx
+  loadQueue.io.exceptionAddr.isStore := DontCare
+
+  // store queue wiring
+  // storeQueue.io <> DontCare
+  storeQueue.io.brqRedirect <> io.brqRedirect
+  storeQueue.io.storeIn <> io.storeIn
+  storeQueue.io.sbuffer <> io.sbuffer
+  storeQueue.io.mmioStout <> io.mmioStout
+  storeQueue.io.commits <> io.commits
+  storeQueue.io.roqDeqPtr <> io.roqDeqPtr
+  storeQueue.io.exceptionAddr.lsIdx := io.exceptionAddr.lsIdx
+  storeQueue.io.exceptionAddr.isStore := DontCare
+
+  loadQueue.io.forward <> io.forward
+  storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE
+
+  io.exceptionAddr.vaddr := Mux(io.exceptionAddr.isStore, storeQueue.io.exceptionAddr.vaddr, loadQueue.io.exceptionAddr.vaddr)
+
+  // naive uncache arbiter
+  // uncacheState records whether the outstanding request came from the load
+  // or the store queue, so the response is connected back to that same queue
+  val s_idle :: s_load :: s_store :: Nil = Enum(3)
+  val uncacheState = RegInit(s_idle)
+
+  switch(uncacheState){
+    is(s_idle){
+      when(io.uncache.req.fire()){
+        uncacheState := Mux(loadQueue.io.uncache.req.valid, s_load, s_store)
+      }
+    }
+    is(s_load){
+      when(io.uncache.resp.fire()){
+        uncacheState := s_idle
+      }
+    }
+    is(s_store){
+      when(io.uncache.resp.fire()){
+        uncacheState := s_idle
+      }
+    }
+  }
+
+  loadQueue.io.uncache := DontCare
+  storeQueue.io.uncache := DontCare
+  loadQueue.io.uncache.resp.valid := false.B
+  storeQueue.io.uncache.resp.valid := false.B
+  when(loadQueue.io.uncache.req.valid){
+    io.uncache.req <> loadQueue.io.uncache.req
+  }.otherwise{
+    io.uncache.req <> storeQueue.io.uncache.req
+  }
+  when(uncacheState === s_load){
+    io.uncache.resp <> loadQueue.io.uncache.resp
+  }.otherwise{
+    io.uncache.resp <> storeQueue.io.uncache.resp
+  }
+
+  assert(!(loadQueue.io.uncache.req.valid && storeQueue.io.uncache.req.valid))
+  assert(!(loadQueue.io.uncache.resp.valid && storeQueue.io.uncache.resp.valid))
+  assert(!((loadQueue.io.uncache.resp.valid || storeQueue.io.uncache.resp.valid) && uncacheState === s_idle))
+
+}
diff --git a/src/main/scala/xiangshan/mem/lsqueue/separated/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala
similarity index 69%
rename from src/main/scala/xiangshan/mem/lsqueue/separated/LoadQueue.scala
rename to src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala
index 21a64cc349684856a61848c059a0178cdbd720a6..e664e281cb68c6a189a22bb5fcb99291265e4e1b 100644
--- a/src/main/scala/xiangshan/mem/lsqueue/separated/LoadQueue.scala
+++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala
@@ -9,6 +9,7 @@ import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConsta
 import xiangshan.backend.LSUOpType
 import xiangshan.mem._
 import xiangshan.backend.roq.RoqPtr
+import xiangshan.backend.fu.fpu.boxF32ToF64
 
 class LqPtr extends CircularQueuePtr(LqPtr.LoadQueueSize) { }
 
@@ -26,83 +27,86 @@ object LqPtr extends HasXSParameter {
 // Load Queue
 class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper {
   val io = IO(new Bundle() {
-    val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp)))
-    val lqIdxs = Output(Vec(RenameWidth, new LqPtr)) // LSIdx will be assembled in LSQWrapper
+    val enq = new Bundle() {
+      val canAccept = Output(Bool())
+      val req = Vec(RenameWidth, Flipped(ValidIO(new MicroOp)))
+      val resp = Vec(RenameWidth, Output(new LqPtr))
+    }
     val brqRedirect = Input(Valid(new Redirect))
val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle))) val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) // FIXME: Valid() only - val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback store + val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback load val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit))) val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store val dcache = new DCacheLineIO val uncache = new DCacheWordIO val roqDeqPtr = Input(new RoqPtr) + val exceptionAddr = new ExceptionAddrIO // val refill = Flipped(Valid(new DCacheLineReq )) }) - + val uop = Reg(Vec(LoadQueueSize, new MicroOp)) - val data = Reg(Vec(LoadQueueSize, new LsRoqEntry)) // FIXME: use LoadQueueEntry instead + // val data = Reg(Vec(LoadQueueSize, new LsRoqEntry)) + val dataModule = Module(new LSQueueData(LoadQueueSize, LoadPipelineWidth)) + dataModule.io := DontCare val allocated = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // lq entry has been allocated - val valid = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // data is valid + val datavalid = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // data is valid val writebacked = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // inst has been writebacked to CDB val commited = Reg(Vec(LoadQueueSize, Bool())) // inst has been writebacked to CDB val miss = Reg(Vec(LoadQueueSize, Bool())) // load inst missed, waiting for miss queue to accept miss request val listening = Reg(Vec(LoadQueueSize, Bool())) // waiting for refill result val pending = Reg(Vec(LoadQueueSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reachs the end of roq - - val ringBufferHeadExtended = RegInit(0.U.asTypeOf(new LqPtr)) - val ringBufferTailExtended = RegInit(0.U.asTypeOf(new LqPtr)) - val ringBufferHead = ringBufferHeadExtended.value - val 
ringBufferTail = ringBufferTailExtended.value - val ringBufferSameFlag = ringBufferHeadExtended.flag === ringBufferTailExtended.flag - val ringBufferEmpty = ringBufferHead === ringBufferTail && ringBufferSameFlag - val ringBufferFull = ringBufferHead === ringBufferTail && !ringBufferSameFlag - val ringBufferAllowin = !ringBufferFull - + + val enqPtrExt = RegInit(0.U.asTypeOf(new LqPtr)) + val deqPtrExt = RegInit(0.U.asTypeOf(new LqPtr)) + val enqPtr = enqPtrExt.value + val deqPtr = deqPtrExt.value + val sameFlag = enqPtrExt.flag === deqPtrExt.flag + val isEmpty = enqPtr === deqPtr && sameFlag + val isFull = enqPtr === deqPtr && !sameFlag + val allowIn = !isFull + val loadCommit = (0 until CommitWidth).map(i => io.commits(i).valid && !io.commits(i).bits.isWalk && io.commits(i).bits.uop.ctrl.commitType === CommitType.LOAD) val mcommitIdx = (0 until CommitWidth).map(i => io.commits(i).bits.uop.lqIdx.value) - val tailMask = (((1.U((LoadQueueSize + 1).W)) << ringBufferTail).asUInt - 1.U)(LoadQueueSize - 1, 0) - val headMask = (((1.U((LoadQueueSize + 1).W)) << ringBufferHead).asUInt - 1.U)(LoadQueueSize - 1, 0) + val tailMask = (((1.U((LoadQueueSize + 1).W)) << deqPtr).asUInt - 1.U)(LoadQueueSize - 1, 0) + val headMask = (((1.U((LoadQueueSize + 1).W)) << enqPtr).asUInt - 1.U)(LoadQueueSize - 1, 0) val enqDeqMask1 = tailMask ^ headMask - val enqDeqMask = Mux(ringBufferSameFlag, enqDeqMask1, ~enqDeqMask1) - - // TODO: misc arbitor + val enqDeqMask = Mux(sameFlag, enqDeqMask1, ~enqDeqMask1) // Enqueue at dispatch - val emptyEntries = LoadQueueSize.U - distanceBetween(ringBufferHeadExtended, ringBufferTailExtended) - XSDebug("(ready, valid): ") + val validEntries = distanceBetween(enqPtrExt, deqPtrExt) + val firedDispatch = io.enq.req.map(_.valid) + io.enq.canAccept := validEntries <= (LoadQueueSize - RenameWidth).U + XSDebug(p"(ready, valid): ${io.enq.canAccept}, ${Binary(Cat(firedDispatch))}\n") for (i <- 0 until RenameWidth) { - val offset = if (i == 0) 0.U else 
PopCount((0 until i).map(io.dp1Req(_).valid)) - val lqIdx = ringBufferHeadExtended + offset + val offset = if (i == 0) 0.U else PopCount((0 until i).map(firedDispatch(_))) + val lqIdx = enqPtrExt + offset val index = lqIdx.value - when(io.dp1Req(i).fire()) { - uop(index) := io.dp1Req(i).bits + when(io.enq.req(i).valid) { + uop(index) := io.enq.req(i).bits allocated(index) := true.B - valid(index) := false.B + datavalid(index) := false.B writebacked(index) := false.B commited(index) := false.B miss(index) := false.B listening(index) := false.B pending(index) := false.B - // data(index).bwdMask := 0.U(8.W).asBools } - val numTryEnqueue = offset +& io.dp1Req(i).valid - io.dp1Req(i).ready := numTryEnqueue <= emptyEntries - io.lqIdxs(i) := lqIdx - XSDebug(false, true.B, "(%d, %d) ", io.dp1Req(i).ready, io.dp1Req(i).valid) + io.enq.resp(i) := lqIdx + + XSError(!io.enq.canAccept && io.enq.req(i).valid, "should not valid when not ready\n") } - XSDebug(false, true.B, "\n") - val firedDispatch = VecInit((0 until CommitWidth).map(io.dp1Req(_).fire())).asUInt - when(firedDispatch.orR) { - ringBufferHeadExtended := ringBufferHeadExtended + PopCount(firedDispatch) + when(Cat(firedDispatch).orR) { + enqPtrExt := enqPtrExt + PopCount(firedDispatch) XSInfo("dispatched %d insts to lq\n", PopCount(firedDispatch)) } // writeback load (0 until LoadPipelineWidth).map(i => { + dataModule.io.wb(i).wen := false.B when(io.loadIn(i).fire()) { when(io.loadIn(i).bits.miss) { XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", @@ -119,7 +123,7 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP io.loadIn(i).bits.uop.cf.exceptionVec.asUInt ) }.otherwise { - XSInfo(io.loadIn(i).valid, "load hit write to cbd idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", + XSInfo(io.loadIn(i).valid, "load hit write to 
cbd lqidx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", io.loadIn(i).bits.uop.lqIdx.asUInt, io.loadIn(i).bits.uop.cf.pc, io.loadIn(i).bits.vaddr, @@ -134,17 +138,22 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP ) } val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value - valid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio + datavalid(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - // allocated(loadWbIndex) := io.loadIn(i).bits.miss // if hit, lq entry can be recycled - data(loadWbIndex).paddr := io.loadIn(i).bits.paddr - data(loadWbIndex).vaddr := io.loadIn(i).bits.vaddr - data(loadWbIndex).mask := io.loadIn(i).bits.mask - data(loadWbIndex).data := io.loadIn(i).bits.data // for mmio / misc / debug - data(loadWbIndex).mmio := io.loadIn(i).bits.mmio - data(loadWbIndex).fwdMask := io.loadIn(i).bits.forwardMask - data(loadWbIndex).fwdData := io.loadIn(i).bits.forwardData - data(loadWbIndex).exception := io.loadIn(i).bits.uop.cf.exceptionVec.asUInt + allocated(loadWbIndex) := !io.loadIn(i).bits.uop.cf.exceptionVec.asUInt.orR + + val loadWbData = Wire(new LsqEntry) + loadWbData.paddr := io.loadIn(i).bits.paddr + loadWbData.vaddr := io.loadIn(i).bits.vaddr + loadWbData.mask := io.loadIn(i).bits.mask + loadWbData.data := io.loadIn(i).bits.data // for mmio / misc / debug + loadWbData.mmio := io.loadIn(i).bits.mmio + loadWbData.fwdMask := io.loadIn(i).bits.forwardMask + loadWbData.fwdData := io.loadIn(i).bits.forwardData + loadWbData.exception := io.loadIn(i).bits.uop.cf.exceptionVec.asUInt + dataModule.io.wbWrite(i, loadWbIndex, loadWbData) + dataModule.io.wb(i).wen := true.B + val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio miss(loadWbIndex) := dcacheMissed listening(loadWbIndex) := dcacheMissed @@ -160,30 +169,30 @@ class LoadQueue extends 
XSModule with HasDCacheParameters with HasCircularQueueP val missRefillSelVec = VecInit( (0 until LoadQueueSize).map{ i => - val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(data(i).paddr)).reduce(_||_) + val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(dataModule.io.rdata(i).paddr)).reduce(_||_) allocated(i) && miss(i) && !inflight }) val missRefillSel = getFirstOne(missRefillSelVec, tailMask) - val missRefillBlockAddr = get_block_addr(data(missRefillSel).paddr) + val missRefillBlockAddr = get_block_addr(dataModule.io.rdata(missRefillSel).paddr) io.dcache.req.valid := missRefillSelVec.asUInt.orR io.dcache.req.bits.cmd := MemoryOpConstants.M_XRD io.dcache.req.bits.addr := missRefillBlockAddr io.dcache.req.bits.data := DontCare io.dcache.req.bits.mask := DontCare - io.dcache.req.bits.meta.id := DontCare // TODO: // FIXME - io.dcache.req.bits.meta.vaddr := DontCare // data(missRefillSel).vaddr + io.dcache.req.bits.meta.id := DontCare + io.dcache.req.bits.meta.vaddr := DontCare // dataModule.io.rdata(missRefillSel).vaddr io.dcache.req.bits.meta.paddr := missRefillBlockAddr io.dcache.req.bits.meta.uop := uop(missRefillSel) - io.dcache.req.bits.meta.mmio := false.B // data(missRefillSel).mmio + io.dcache.req.bits.meta.mmio := false.B // dataModule.io.rdata(missRefillSel).mmio io.dcache.req.bits.meta.tlb_miss := false.B io.dcache.req.bits.meta.mask := DontCare io.dcache.req.bits.meta.replay := false.B io.dcache.resp.ready := true.B - assert(!(data(missRefillSel).mmio && io.dcache.req.valid)) + assert(!(dataModule.io.rdata(missRefillSel).mmio && io.dcache.req.valid)) when(io.dcache.req.fire()) { miss(missRefillSel) := false.B @@ -217,50 +226,41 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP XSDebug("miss resp: pc:0x%x roqIdx:%d lqIdx:%d (p)addr:0x%x data %x\n", io.dcache.resp.bits.meta.uop.cf.pc, io.dcache.resp.bits.meta.uop.roqIdx.asUInt, 
io.dcache.resp.bits.meta.uop.lqIdx.asUInt, io.dcache.resp.bits.meta.paddr, io.dcache.resp.bits.data - ) + ) } // Refill 64 bit in a cycle // Refill data comes back from io.dcache.resp - def mergeRefillData(refill: UInt, fwd: UInt, fwdMask: UInt): UInt = { - val res = Wire(Vec(8, UInt(8.W))) - (0 until 8).foreach(i => { - res(i) := Mux(fwdMask(i), fwd(8 * (i + 1) - 1, 8 * i), refill(8 * (i + 1) - 1, 8 * i)) - }) - res.asUInt - } + dataModule.io.refill.dcache := io.dcache.resp.bits (0 until LoadQueueSize).map(i => { - val blockMatch = get_block_addr(data(i).paddr) === io.dcache.resp.bits.meta.paddr + val blockMatch = get_block_addr(dataModule.io.rdata(i).paddr) === io.dcache.resp.bits.meta.paddr + dataModule.io.refill.wen(i) := false.B when(allocated(i) && listening(i) && blockMatch && io.dcache.resp.fire()) { - // split them into words - val words = VecInit((0 until blockWords) map { i => - io.dcache.resp.bits.data(DataBits * (i + 1) - 1, DataBits * i) - }) - - val refillData = words(get_word(data(i).paddr)) - data(i).data := mergeRefillData(refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt) - valid(i) := true.B + dataModule.io.refill.wen(i) := true.B + datavalid(i) := true.B listening(i) := false.B - XSDebug("miss resp: pos %d addr %x data %x + %x(%b)\n", i.U, data(i).paddr, refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt) } }) // writeback up to 2 missed load insts to CDB // just randomly pick 2 missed load (data refilled), write them back to cdb val loadWbSelVec = VecInit((0 until LoadQueueSize).map(i => { - allocated(i) && valid(i) && !writebacked(i) + allocated(i) && datavalid(i) && !writebacked(i) })).asUInt() // use uint instead vec to reduce verilog lines val loadWbSel = Wire(Vec(StorePipelineWidth, UInt(log2Up(LoadQueueSize).W))) + val loadWbSelV= Wire(Vec(StorePipelineWidth, Bool())) val lselvec0 = PriorityEncoderOH(loadWbSelVec) val lselvec1 = PriorityEncoderOH(loadWbSelVec & (~lselvec0).asUInt) loadWbSel(0) := OHToUInt(lselvec0) + 
loadWbSelV(0):= lselvec0.orR loadWbSel(1) := OHToUInt(lselvec1) + loadWbSelV(1) := lselvec1.orR (0 until StorePipelineWidth).map(i => { // data select - val rdata = data(loadWbSel(i)).data + val rdata = dataModule.io.rdata(loadWbSel(i)).data val func = uop(loadWbSel(i)).ctrl.fuOpType - val raddr = data(loadWbSel(i)).paddr + val raddr = dataModule.io.rdata(loadWbSel(i)).paddr val rdataSel = LookupTree(raddr(2, 0), List( "b000".U -> rdata(63, 0), "b001".U -> rdata(63, 8), @@ -278,25 +278,28 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP LSUOpType.ld -> SignExt(rdataSel(63, 0), XLEN), LSUOpType.lbu -> ZeroExt(rdataSel(7, 0) , XLEN), LSUOpType.lhu -> ZeroExt(rdataSel(15, 0), XLEN), - LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN) + LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN), + LSUOpType.flw -> boxF32ToF64(rdataSel(31, 0)) )) io.ldout(i).bits.uop := uop(loadWbSel(i)) - io.ldout(i).bits.uop.cf.exceptionVec := data(loadWbSel(i)).exception.asBools + io.ldout(i).bits.uop.cf.exceptionVec := dataModule.io.rdata(loadWbSel(i)).exception.asBools io.ldout(i).bits.uop.lqIdx := loadWbSel(i).asTypeOf(new LqPtr) io.ldout(i).bits.data := rdataPartialLoad io.ldout(i).bits.redirectValid := false.B io.ldout(i).bits.redirect := DontCare io.ldout(i).bits.brUpdate := DontCare - io.ldout(i).bits.debug.isMMIO := data(loadWbSel(i)).mmio - io.ldout(i).valid := loadWbSelVec(loadWbSel(i)) + io.ldout(i).bits.debug.isMMIO := dataModule.io.rdata(loadWbSel(i)).mmio + io.ldout(i).bits.fflags := DontCare + io.ldout(i).valid := loadWbSelVec(loadWbSel(i)) && loadWbSelV(i) when(io.ldout(i).fire()) { writebacked(loadWbSel(i)) := true.B - XSInfo(io.loadIn(i).valid, "load miss write to cbd idx %d pc 0x%x paddr %x data %x mmio %x\n", + XSInfo("load miss write to cbd roqidx %d lqidx %d pc 0x%x paddr %x data %x mmio %x\n", + io.ldout(i).bits.uop.roqIdx.asUInt, io.ldout(i).bits.uop.lqIdx.asUInt, io.ldout(i).bits.uop.cf.pc, - data(loadWbSel(i)).paddr, - 
data(loadWbSel(i)).data, - data(loadWbSel(i)).mmio + dataModule.io.rdata(loadWbSel(i)).paddr, + dataModule.io.rdata(loadWbSel(i)).data, + dataModule.io.rdata(loadWbSel(i)).mmio ) } }) @@ -304,10 +307,10 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP // move tailPtr // allocatedMask: dequeuePtr can go to the next 1-bit val allocatedMask = VecInit((0 until LoadQueueSize).map(i => allocated(i) || !enqDeqMask(i))) - // find the first one from deqPtr (ringBufferTail) - val nextTail1 = getFirstOneWithFlag(allocatedMask, tailMask, ringBufferTailExtended.flag) - val nextTail = Mux(Cat(allocatedMask).orR, nextTail1, ringBufferHeadExtended) - ringBufferTailExtended := nextTail + // find the first one from deqPtr (deqPtr) + val nextTail1 = getFirstOneWithFlag(allocatedMask, tailMask, deqPtrExt.flag) + val nextTail = Mux(Cat(allocatedMask).orR, nextTail1, enqPtrExt) + deqPtrExt := nextTail // When load commited, mark it as !allocated, this entry will be recycled later (0 until CommitWidth).map(i => { @@ -378,14 +381,16 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP val startIndex = io.storeIn(i).bits.uop.lqIdx.value val lqIdxMask = ((1.U((LoadQueueSize + 1).W) << startIndex).asUInt - 1.U)(LoadQueueSize - 1, 0) val xorMask = lqIdxMask ^ headMask - val sameFlag = io.storeIn(i).bits.uop.lqIdx.flag === ringBufferHeadExtended.flag + val sameFlag = io.storeIn(i).bits.uop.lqIdx.flag === enqPtrExt.flag val toEnqPtrMask = Mux(sameFlag, xorMask, ~xorMask) + + // check if load already in lq needs to be rolledback val lqViolationVec = VecInit((0 until LoadQueueSize).map(j => { val addrMatch = allocated(j) && - io.storeIn(i).bits.paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3) - val entryNeedCheck = toEnqPtrMask(j) && addrMatch && (valid(j) || listening(j) || miss(j)) + io.storeIn(i).bits.paddr(PAddrBits - 1, 3) === dataModule.io.rdata(j).paddr(PAddrBits - 1, 3) + val entryNeedCheck = toEnqPtrMask(j) && 
addrMatch && (datavalid(j) || listening(j) || miss(j)) // TODO: update refilled data - val violationVec = (0 until 8).map(k => data(j).mask(k) && io.storeIn(i).bits.mask(k)) + val violationVec = (0 until 8).map(k => dataModule.io.rdata(j).mask(k) && io.storeIn(i).bits.mask(k)) Cat(violationVec).orR() && entryNeedCheck })) val lqViolation = lqViolationVec.asUInt().orR() @@ -404,18 +409,19 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP val wbViolationUop = getOldestInTwo(wbViolationVec, io.loadIn.map(_.bits.uop)) XSDebug(wbViolation, p"${Binary(Cat(wbViolationVec))}, $wbViolationUop\n") - // check if rollback is needed for load in l4 - val l4ViolationVec = VecInit((0 until LoadPipelineWidth).map(j => { + // check if rollback is needed for load in l1 + val l1ViolationVec = VecInit((0 until LoadPipelineWidth).map(j => { io.forward(j).valid && // L4 valid\ isAfter(io.forward(j).uop.roqIdx, io.storeIn(i).bits.uop.roqIdx) && io.storeIn(i).bits.paddr(PAddrBits - 1, 3) === io.forward(j).paddr(PAddrBits - 1, 3) && (io.storeIn(i).bits.mask & io.forward(j).mask).orR })) - val l4Violation = l4ViolationVec.asUInt().orR() - val l4ViolationUop = getOldestInTwo(l4ViolationVec, io.forward.map(_.uop)) + val l1Violation = l1ViolationVec.asUInt().orR() + val l1ViolationUop = getOldestInTwo(l1ViolationVec, io.forward.map(_.uop)) + XSDebug(l1Violation, p"${Binary(Cat(l1ViolationVec))}, $l1ViolationUop\n") - val rollbackValidVec = Seq(lqViolation, wbViolation, l4Violation) - val rollbackUopVec = Seq(lqViolationUop, wbViolationUop, l4ViolationUop) + val rollbackValidVec = Seq(lqViolation, wbViolation, l1Violation) + val rollbackUopVec = Seq(lqViolationUop, wbViolationUop, l1ViolationUop) rollback(i).valid := Cat(rollbackValidVec).orR val mask = getAfterMask(rollbackValidVec, rollbackUopVec) val oneAfterZero = mask(1)(0) @@ -428,7 +434,14 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP rollback(i).bits.isMisPred := 
false.B rollback(i).bits.isException := false.B rollback(i).bits.isFlushPipe := false.B + rollback(i).bits.target := rollbackUop.cf.pc + rollback(i).bits.brTag := rollbackUop.brTag + XSDebug( + l1Violation, + "need rollback (l4 load) pc %x roqidx %d target %x\n", + io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, l1ViolationUop.roqIdx.asUInt + ) XSDebug( lqViolation, "need rollback (ld wb before store) pc %x roqidx %d target %x\n", @@ -439,11 +452,6 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP "need rollback (ld/st wb together) pc %x roqidx %d target %x\n", io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, wbViolationUop.roqIdx.asUInt ) - XSDebug( - l4Violation, - "need rollback (l4 load) pc %x roqidx %d target %x\n", - io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, l4ViolationUop.roqIdx.asUInt - ) }.otherwise { rollback(i).valid := false.B } @@ -467,42 +475,43 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP // setup misc mem access req // mask / paddr / data can be get from lq.data - val commitType = io.commits(0).bits.uop.ctrl.commitType - io.uncache.req.valid := pending(ringBufferTail) && allocated(ringBufferTail) && - commitType === CommitType.LOAD && - io.roqDeqPtr === uop(ringBufferTail).roqIdx && + val commitType = io.commits(0).bits.uop.ctrl.commitType + io.uncache.req.valid := pending(deqPtr) && allocated(deqPtr) && + commitType === CommitType.LOAD && + io.roqDeqPtr === uop(deqPtr).roqIdx && !io.commits(0).bits.isWalk io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD - io.uncache.req.bits.addr := data(ringBufferTail).paddr - io.uncache.req.bits.data := data(ringBufferTail).data - io.uncache.req.bits.mask := data(ringBufferTail).mask + io.uncache.req.bits.addr := dataModule.io.rdata(deqPtr).paddr + io.uncache.req.bits.data := dataModule.io.rdata(deqPtr).data + io.uncache.req.bits.mask := dataModule.io.rdata(deqPtr).mask 
io.uncache.req.bits.meta.id := DontCare // TODO: // FIXME io.uncache.req.bits.meta.vaddr := DontCare - io.uncache.req.bits.meta.paddr := data(ringBufferTail).paddr - io.uncache.req.bits.meta.uop := uop(ringBufferTail) - io.uncache.req.bits.meta.mmio := true.B // data(ringBufferTail).mmio + io.uncache.req.bits.meta.paddr := dataModule.io.rdata(deqPtr).paddr + io.uncache.req.bits.meta.uop := uop(deqPtr) + io.uncache.req.bits.meta.mmio := true.B // dataModule.io.rdata(deqPtr).mmio io.uncache.req.bits.meta.tlb_miss := false.B - io.uncache.req.bits.meta.mask := data(ringBufferTail).mask + io.uncache.req.bits.meta.mask := dataModule.io.rdata(deqPtr).mask io.uncache.req.bits.meta.replay := false.B io.uncache.resp.ready := true.B - io.uncache.s1_kill := false.B when(io.uncache.req.fire()){ - pending(ringBufferTail) := false.B + pending(deqPtr) := false.B } + dataModule.io.uncache.wen := false.B when(io.uncache.resp.fire()){ - valid(ringBufferTail) := true.B - data(ringBufferTail).data := io.uncache.resp.bits.data(XLEN-1, 0) + datavalid(deqPtr) := true.B + dataModule.io.uncacheWrite(deqPtr, io.uncache.resp.bits.data(XLEN-1, 0)) + dataModule.io.uncache.wen := true.B // TODO: write back exception info } when(io.uncache.req.fire()){ XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n", - uop(ringBufferTail).cf.pc, + uop(deqPtr).cf.pc, io.uncache.req.bits.addr, io.uncache.req.bits.data, io.uncache.req.bits.cmd, @@ -511,14 +520,11 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP } when(io.uncache.resp.fire()){ - XSDebug("uncache resp: data %x\n", io.dcache.resp.bits.data) + XSDebug("uncache resp: data %x\n", io.dcache.resp.bits.data) } // Read vaddr for mem exception - val mexcLsIdx = WireInit(0.U.asTypeOf(new LSIdx())) - val memExceptionAddr = WireInit(data(mexcLsIdx.lqIdx.value).vaddr) - ExcitingUtils.addSink(mexcLsIdx, "EXECPTION_LSROQIDX") - ExcitingUtils.addSource(memExceptionAddr, "EXECPTION_LOAD_VADDR") + io.exceptionAddr.vaddr 
:= dataModule.io.rdata(io.exceptionAddr.lsIdx.lqIdx.value).vaddr // misprediction recovery / exception redirect // invalidate lq term using robIdx @@ -526,19 +532,19 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP for (i <- 0 until LoadQueueSize) { needCancel(i) := uop(i).roqIdx.needFlush(io.brqRedirect) && allocated(i) && !commited(i) when(needCancel(i)) { - when(io.brqRedirect.bits.isReplay){ - valid(i) := false.B - writebacked(i) := false.B - listening(i) := false.B - miss(i) := false.B - pending(i) := false.B - }.otherwise{ + // when(io.brqRedirect.bits.isReplay){ + // valid(i) := false.B + // writebacked(i) := false.B + // listening(i) := false.B + // miss(i) := false.B + // pending(i) := false.B + // }.otherwise{ allocated(i) := false.B - } + // } } } when (io.brqRedirect.valid && io.brqRedirect.bits.isMisPred) { - ringBufferHeadExtended := ringBufferHeadExtended - PopCount(needCancel) + enqPtrExt := enqPtrExt - PopCount(needCancel) } // assert(!io.rollback.valid) @@ -547,7 +553,7 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP } // debug info - XSDebug("head %d:%d tail %d:%d\n", ringBufferHeadExtended.flag, ringBufferHead, ringBufferTailExtended.flag, ringBufferTail) + XSDebug("head %d:%d tail %d:%d\n", enqPtrExt.flag, enqPtr, deqPtrExt.flag, deqPtr) def PrintFlag(flag: Bool, name: String): Unit = { when(flag) { @@ -559,9 +565,9 @@ class LoadQueue extends XSModule with HasDCacheParameters with HasCircularQueueP for (i <- 0 until LoadQueueSize) { if (i % 4 == 0) XSDebug("") - XSDebug(false, true.B, "%x ", uop(i).cf.pc) + XSDebug(false, true.B, "%x [%x] ", uop(i).cf.pc, dataModule.io.rdata(i).paddr) PrintFlag(allocated(i), "a") - PrintFlag(allocated(i) && valid(i), "v") + PrintFlag(allocated(i) && datavalid(i), "v") PrintFlag(allocated(i) && writebacked(i), "w") PrintFlag(allocated(i) && commited(i), "c") PrintFlag(allocated(i) && miss(i), "m") diff --git 
a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
new file mode 100644
index 0000000000000000000000000000000000000000..657a251495c1eca7c73685344ca8a50821940d4d
--- /dev/null
+++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala
@@ -0,0 +1,378 @@
+package xiangshan.mem
+
+import chisel3._
+import chisel3.util._
+import utils._
+import xiangshan._
+import xiangshan.cache._
+import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants}
+import xiangshan.backend.LSUOpType
+import xiangshan.backend.roq.RoqPtr
+
+
+class SqPtr extends CircularQueuePtr(SqPtr.StoreQueueSize) { }
+
+object SqPtr extends HasXSParameter {
+  def apply(f: Bool, v: UInt): SqPtr = {
+    val ptr = Wire(new SqPtr)
+    ptr.flag := f
+    ptr.value := v
+    ptr
+  }
+}
+
+// Store Queue: holds stores from dispatch (io.enq) until they leave the queue.
+// Committed non-MMIO stores are offered to the sbuffer from deqPtr; MMIO stores are issued through io.uncache
+// when their roqIdx equals io.roqDeqPtr and are then written back through io.mmioStout.
+class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper {
+  val io = IO(new Bundle() {
+    val enq = new Bundle() {
+      val canAccept = Output(Bool())
+      val req = Vec(RenameWidth, Flipped(ValidIO(new MicroOp)))
+      val resp = Vec(RenameWidth, Output(new SqPtr))
+    }
+    val brqRedirect = Input(Valid(new Redirect))
+    val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
+    val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq))
+    val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
+    val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO))
+    val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit)))
+    val uncache = new DCacheWordIO
+    val roqDeqPtr = Input(new RoqPtr)
+    // val refill = Flipped(Valid(new DCacheLineReq ))
+    val exceptionAddr = new ExceptionAddrIO
+  })
+
+  val uop = Reg(Vec(StoreQueueSize, new MicroOp))
+  // val data = Reg(Vec(StoreQueueSize, new LsqEntry))
+  val dataModule = Module(new LSQueueData(StoreQueueSize, StorePipelineWidth))
+  dataModule.io := DontCare
+  val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been allocated
+  val datavalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio data is valid
+  val writebacked = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // inst has been writebacked to CDB
+  val commited = Reg(Vec(StoreQueueSize, Bool())) // inst has been commited by roq
+  val pending = Reg(Vec(StoreQueueSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reaches the end of roq
+
+  val enqPtrExt = RegInit(0.U.asTypeOf(new SqPtr))
+  val deqPtrExt = RegInit(0.U.asTypeOf(new SqPtr))
+  val enqPtr = enqPtrExt.value
+  val deqPtr = deqPtrExt.value
+  val sameFlag = enqPtrExt.flag === deqPtrExt.flag
+  val isEmpty = enqPtr === deqPtr && sameFlag
+  val isFull = enqPtr === deqPtr && !sameFlag
+  val allowIn = !isFull
+
+  val storeCommit = (0 until CommitWidth).map(i => io.commits(i).valid && !io.commits(i).bits.isWalk && io.commits(i).bits.uop.ctrl.commitType === CommitType.STORE)
+  val mcommitIdx = (0 until CommitWidth).map(i => io.commits(i).bits.uop.sqIdx.value)
+
+  val tailMask = (((1.U((StoreQueueSize + 1).W)) << deqPtr).asUInt - 1.U)(StoreQueueSize - 1, 0)
+  val headMask = (((1.U((StoreQueueSize + 1).W)) << enqPtr).asUInt - 1.U)(StoreQueueSize - 1, 0)
+  val enqDeqMask1 = tailMask ^ headMask
+  val enqDeqMask = Mux(sameFlag, enqDeqMask1, ~enqDeqMask1)
+
+  // Enqueue at dispatch
+  val validEntries = distanceBetween(enqPtrExt, deqPtrExt)
+  val firedDispatch = io.enq.req.map(_.valid)
+  io.enq.canAccept := validEntries <= (StoreQueueSize - RenameWidth).U
+  XSDebug(p"(ready, valid): ${io.enq.canAccept}, ${Binary(Cat(firedDispatch))}\n")
+  for (i <- 0 until RenameWidth) {
+    val offset = if (i == 0) 0.U else PopCount((0 until i).map(firedDispatch(_)))
+    val sqIdx = enqPtrExt + offset
+    val index = sqIdx.value
+    when(io.enq.req(i).valid) {
+      uop(index) := io.enq.req(i).bits
+      allocated(index) := true.B
+      datavalid(index) := false.B
+      writebacked(index) := false.B
+      commited(index) := false.B
+      pending(index) := false.B
+    }
+    io.enq.resp(i) := sqIdx
+
+    XSError(!io.enq.canAccept && io.enq.req(i).valid, "should not valid when not ready\n")
+  }
+
+  when(Cat(firedDispatch).orR) {
+    enqPtrExt := enqPtrExt + PopCount(firedDispatch)
+    XSInfo("dispatched %d insts to sq\n", PopCount(firedDispatch))
+  }
+
+  // writeback store
+  (0 until StorePipelineWidth).map(i => {
+    dataModule.io.wb(i).wen := false.B
+    when(io.storeIn(i).fire()) {
+      val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
+      val hasException = io.storeIn(i).bits.uop.cf.exceptionVec.asUInt.orR
+      val hasWritebacked = !io.storeIn(i).bits.mmio || hasException
+      datavalid(stWbIndex) := hasWritebacked
+      writebacked(stWbIndex) := hasWritebacked
+      pending(stWbIndex) := !hasWritebacked // valid mmio request
+
+      val storeWbData = Wire(new LsqEntry)
+      storeWbData := DontCare
+      storeWbData.paddr := io.storeIn(i).bits.paddr
+      storeWbData.vaddr := io.storeIn(i).bits.vaddr
+      storeWbData.mask := io.storeIn(i).bits.mask
+      storeWbData.data := io.storeIn(i).bits.data
+      storeWbData.mmio := io.storeIn(i).bits.mmio
+      storeWbData.exception := io.storeIn(i).bits.uop.cf.exceptionVec.asUInt
+
+      dataModule.io.wbWrite(i, stWbIndex, storeWbData)
+      dataModule.io.wb(i).wen := true.B
+
+      XSInfo("store write to sq idx %d pc 0x%x vaddr %x paddr %x data %x mmio %x roll %x exc %x\n",
+        io.storeIn(i).bits.uop.sqIdx.value,
+        io.storeIn(i).bits.uop.cf.pc,
+        io.storeIn(i).bits.vaddr,
+        io.storeIn(i).bits.paddr,
+        io.storeIn(i).bits.data,
+        io.storeIn(i).bits.mmio,
+        io.storeIn(i).bits.rollback,
+        io.storeIn(i).bits.uop.cf.exceptionVec.asUInt
+      )
+    }
+  })
+
+  // Index of the lowest set bit of mask that is not covered by startMask; if there is none, the lowest set bit of mask.
+  def getFirstOne(mask: Vec[Bool], startMask: UInt) = {
+    val length = mask.length
+    val highBits = (0 until length).map(i => mask(i) & ~startMask(i))
+    val highBitsUint = Cat(highBits.reverse)
+    PriorityEncoder(Mux(highBitsUint.orR(), highBitsUint, mask.asUInt))
+  }
+
+  // Same search as getFirstOne, returned as an SqPtr whose flag is startFlag, toggled when no bit outside startMask was set.
+  def getFirstOneWithFlag(mask: Vec[Bool], startMask: UInt, startFlag: Bool) = {
+    val length = mask.length
+    val highBits = (0 until length).map(i => mask(i) & ~startMask(i))
+    val highBitsUint = Cat(highBits.reverse)
+    val changeDirection = !highBitsUint.orR()
+    val index = PriorityEncoder(Mux(!changeDirection, highBitsUint, mask.asUInt))
+    SqPtr(startFlag ^ changeDirection, index)
+  }
+
+  // Select up to two set bits of valid via getFirstOne (bits outside startMask first); returns (selValid, selVec).
+  def selectFirstTwo(valid: Vec[Bool], startMask: UInt) = {
+    val selVec = Wire(Vec(2, UInt(log2Up(StoreQueueSize).W)))
+    val selValid = Wire(Vec(2, Bool()))
+    selVec(0) := getFirstOne(valid, startMask)
+    val firstSelMask = UIntToOH(selVec(0))
+    val secondSelVec = VecInit((0 until valid.length).map(i => valid(i) && !firstSelMask(i)))
+    selVec(1) := getFirstOne(secondSelVec, startMask)
+    selValid(0) := Cat(valid).orR
+    selValid(1) := Cat(secondSelVec).orR
+    (selValid, selVec)
+  }
+
+  // Select the two lowest-indexed set bits of valid, ignoring queue order; returns (selValid, selVec).
+  def selectFirstTwoRoughly(valid: Vec[Bool]) = {
+    // TODO: do not select according to seq, just select 2 valid bit randomly
+    val firstSelVec = valid
+    val notFirstVec = Wire(Vec(valid.length, Bool())) // notFirstVec(i): some valid(j) with j < i is set
+    (0 until valid.length).map(i =>
+      notFirstVec(i) := (if (i != 0) { valid(i - 1) || notFirstVec(i - 1) } else { false.B })
+    )
+    val secondSelVec = VecInit((0 until valid.length).map(i => valid(i) && notFirstVec(i)))
+
+    val selVec = Wire(Vec(2, UInt(log2Up(valid.length).W)))
+    val selValid = Wire(Vec(2, Bool()))
+    selVec(0) := PriorityEncoder(firstSelVec)
+    selVec(1) := PriorityEncoder(secondSelVec)
+    selValid(0) := Cat(firstSelVec).orR
+    selValid(1) := Cat(secondSelVec).orR
+    (selValid, selVec)
+  }
+
+  // writeback finished mmio store
+  io.mmioStout.bits.uop := uop(deqPtr)
+  io.mmioStout.bits.uop.sqIdx := deqPtrExt
+  io.mmioStout.bits.uop.cf.exceptionVec := dataModule.io.rdata(deqPtr).exception.asBools
+  io.mmioStout.bits.data := dataModule.io.rdata(deqPtr).data
+  io.mmioStout.bits.redirectValid := false.B
+  io.mmioStout.bits.redirect := DontCare
+  io.mmioStout.bits.brUpdate := DontCare
+  io.mmioStout.bits.debug.isMMIO := true.B
+  io.mmioStout.bits.fflags := DontCare
+  io.mmioStout.valid := allocated(deqPtr) && datavalid(deqPtr) && !writebacked(deqPtr) // finished mmio store
+  when(io.mmioStout.fire()) {
+    writebacked(deqPtr) := true.B
+    allocated(deqPtr) := false.B // potential opt: move deqPtr immediately
+  }
+
+  // remove retired insts from sq, add retired store to sbuffer
+
+  // move tailPtr
+  // TailPtr slow recovery: recycle bubbles in store queue
+  // allocatedMask: dequeuePtr can go to the next 1-bit
+  val allocatedMask = VecInit((0 until StoreQueueSize).map(i => allocated(i) || !enqDeqMask(i)))
+  // find the first one starting from deqPtr
+  val nextTail1 = getFirstOneWithFlag(allocatedMask, tailMask, deqPtrExt.flag)
+  val nextTail = Mux(Cat(allocatedMask).orR, nextTail1, enqPtrExt)
+  deqPtrExt := nextTail
+
+  // TailPtr fast recovery
+  // val tailRecycle = VecInit(List(
+  //   io.uncache.resp.fire() || io.sbuffer(0).fire(),
+  //   io.sbuffer(1).fire()
+  // ))
+
+  when(io.sbuffer(0).fire()){
+    deqPtrExt := deqPtrExt + Mux(io.sbuffer(1).fire(), 2.U, 1.U)
+  }
+
+  // load forward query
+  // check over all sq entries and forward data from the first matched store
+  (0 until LoadPipelineWidth).map(i => {
+    io.forward(i).forwardMask := 0.U(8.W).asBools
+    io.forward(i).forwardData := DontCare
+
+    // Compare deqPtr and forward.sqIdx, we have two cases:
+    // (1) if they have the same flag, we need to check range(tail, sqIdx)
+    // (2) if they have different flags, we need to check range(tail, StoreQueueSize) and range(0, sqIdx)
+    // Forward1: Mux(same_flag, range(tail, sqIdx), range(tail, StoreQueueSize))
+    // Forward2: Mux(same_flag, 0.U, range(0, sqIdx) )
+    // i.e. forward1 is the target entries with the same flag bits and forward2 otherwise
+
+    val differentFlag = deqPtrExt.flag =/= io.forward(i).sqIdx.flag
+    val forwardMask = ((1.U((StoreQueueSize + 1).W)) << io.forward(i).sqIdx.value).asUInt - 1.U
+    val storeWritebackedVec = WireInit(VecInit(Seq.fill(StoreQueueSize)(false.B)))
+    for (j <- 0 until StoreQueueSize) {
+      storeWritebackedVec(j) := datavalid(j) && allocated(j) // all datavalid terms need to be checked
+    }
+    val needForward1 = Mux(differentFlag, ~tailMask, tailMask ^ forwardMask) & storeWritebackedVec.asUInt
+    val needForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) & storeWritebackedVec.asUInt
+
+    XSDebug("" + i + " f1 %b f2 %b sqIdx %d pa %x\n", needForward1, needForward2, io.forward(i).sqIdx.asUInt, io.forward(i).paddr)
+
+    // do real fwd query
+    dataModule.io.forwardQuery(
+      channel = i,
+      paddr = io.forward(i).paddr,
+      needForward1 = needForward1,
+      needForward2 = needForward2
+    )
+
+    io.forward(i).forwardMask := dataModule.io.forward(i).forwardMask
+    io.forward(i).forwardData := dataModule.io.forward(i).forwardData
+  })
+
+  // When a store is commited, mark it as commited (it will no longer be cancelled by a redirect).
+  (0 until CommitWidth).map(i => {
+    when(storeCommit(i)) {
+      commited(mcommitIdx(i)) := true.B
+      XSDebug("store commit %d: idx %d %x\n", i.U, mcommitIdx(i), uop(mcommitIdx(i)).cf.pc)
+    }
+  })
+
+  (0 until 2).map(i => {
+    val ptr = (deqPtrExt + i.U).value
+    val mmio = dataModule.io.rdata(ptr).mmio
+    io.sbuffer(i).valid := allocated(ptr) && commited(ptr) && !mmio
+    io.sbuffer(i).bits.cmd := MemoryOpConstants.M_XWR
+    io.sbuffer(i).bits.addr := dataModule.io.rdata(ptr).paddr
+    io.sbuffer(i).bits.data := dataModule.io.rdata(ptr).data
+    io.sbuffer(i).bits.mask := dataModule.io.rdata(ptr).mask
+    io.sbuffer(i).bits.meta := DontCare
+    io.sbuffer(i).bits.meta.tlb_miss := false.B
+    io.sbuffer(i).bits.meta.uop := DontCare
+    io.sbuffer(i).bits.meta.mmio := mmio
+    io.sbuffer(i).bits.meta.mask := dataModule.io.rdata(ptr).mask
+
+    when(io.sbuffer(i).fire()) {
+      allocated(ptr) := false.B
+      XSDebug("sbuffer "+i+" fire: ptr %d\n", ptr)
+    }
+  })
+
+  // Memory mapped IO / other uncached operations
+
+  // setup misc mem access req
+  // mask / paddr / data are read from the sq data module
+  val commitType = io.commits(0).bits.uop.ctrl.commitType
+  io.uncache.req.valid := pending(deqPtr) && allocated(deqPtr) &&
+    commitType === CommitType.STORE &&
+    io.roqDeqPtr === uop(deqPtr).roqIdx &&
+    !io.commits(0).bits.isWalk
+
+  io.uncache.req.bits.cmd := MemoryOpConstants.M_XWR
+  io.uncache.req.bits.addr := dataModule.io.rdata(deqPtr).paddr
+  io.uncache.req.bits.data := dataModule.io.rdata(deqPtr).data
+  io.uncache.req.bits.mask := dataModule.io.rdata(deqPtr).mask
+
+  io.uncache.req.bits.meta.id := DontCare // TODO: // FIXME
+  io.uncache.req.bits.meta.vaddr := DontCare
+  io.uncache.req.bits.meta.paddr := dataModule.io.rdata(deqPtr).paddr
+  io.uncache.req.bits.meta.uop := uop(deqPtr)
+  io.uncache.req.bits.meta.mmio := true.B // dataModule.io.rdata(deqPtr).mmio
+  io.uncache.req.bits.meta.tlb_miss := false.B
+  io.uncache.req.bits.meta.mask := dataModule.io.rdata(deqPtr).mask
+  io.uncache.req.bits.meta.replay := false.B
+
+  io.uncache.resp.ready := true.B
+
+  when(io.uncache.req.fire()){
+    pending(deqPtr) := false.B
+  }
+
+  when(io.uncache.resp.fire()){
+    datavalid(deqPtr) := true.B // will be written back to CDB in the next cycle
+    // TODO: write back exception info
+  }
+
+  when(io.uncache.req.fire()){
+    XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n",
+      uop(deqPtr).cf.pc,
+      io.uncache.req.bits.addr,
+      io.uncache.req.bits.data,
+      io.uncache.req.bits.cmd,
+      io.uncache.req.bits.mask
+    )
+  }
+
+  // Read vaddr for mem exception
+  io.exceptionAddr.vaddr := dataModule.io.rdata(io.exceptionAddr.lsIdx.sqIdx.value).vaddr
+
+  // misprediction recovery / exception redirect
+  // invalidate sq entries using roqIdx
+  val needCancel = Wire(Vec(StoreQueueSize, Bool()))
+  for (i <- 0 until StoreQueueSize) {
+    needCancel(i) := uop(i).roqIdx.needFlush(io.brqRedirect) && allocated(i) && !commited(i)
+    when(needCancel(i)) {
+      // when(io.brqRedirect.bits.isReplay){
+      //   datavalid(i) := false.B
+      //   writebacked(i) := false.B
+      //   pending(i) := false.B
+      // }.otherwise{
+        allocated(i) := false.B
+      // }
+    }
+  }
+  when (io.brqRedirect.valid && io.brqRedirect.bits.isMisPred) {
+    enqPtrExt := enqPtrExt - PopCount(needCancel)
+  }
+
+  // debug info
+  XSDebug("head %d:%d tail %d:%d\n", enqPtrExt.flag, enqPtr, deqPtrExt.flag, deqPtr)
+
+  // Print name when flag is set, otherwise print a blank in its place.
+  def PrintFlag(flag: Bool, name: String): Unit = {
+    when(flag) {
+      XSDebug(false, true.B, name)
+    }.otherwise {
+      XSDebug(false, true.B, " ")
+    }
+  }
+
+  for (i <- 0 until StoreQueueSize) {
+    if (i % 4 == 0) XSDebug("")
+    XSDebug(false, true.B, "%x [%x] ", uop(i).cf.pc, dataModule.io.rdata(i).paddr)
+    PrintFlag(allocated(i), "a")
+    PrintFlag(allocated(i) && datavalid(i), "v")
+    PrintFlag(allocated(i) && writebacked(i), "w")
+    PrintFlag(allocated(i) && commited(i), "c")
+    PrintFlag(allocated(i) && pending(i), "p")
+    XSDebug(false, true.B, " ")
+    if (i % 4 == 3 || i == StoreQueueSize - 1) XSDebug(false, true.B, "\n")
+  }
+
+}
diff --git a/src/main/scala/xiangshan/mem/lsqueue/separated/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/separated/LSQWrapper.scala
deleted file mode 100644
index 07dec80164006f86b9c2e68a8b2fe654dc7cb3be..0000000000000000000000000000000000000000
--- a/src/main/scala/xiangshan/mem/lsqueue/separated/LSQWrapper.scala
+++ /dev/null
@@ -1,137 +0,0 @@
-package xiangshan.mem
-
-import chisel3._
-import chisel3.util._
-import utils._
-import xiangshan._
-import xiangshan.cache._
-import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants}
-import xiangshan.backend.LSUOpType
-import xiangshan.mem._
-import xiangshan.backend.roq.RoqPtr
-
-// Load / Store Queue Wrapper for XiangShan Out of Order LSU
-//
-// By using this Wrapper, interface of unified lsroq and ldq / stq are the same
-class LsqWrappper extends XSModule with HasDCacheParameters with NeedImpl { - val io = IO(new Bundle() { - val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) - val lsIdxs = Output(Vec(RenameWidth, new LSIdx)) - val brqRedirect = Input(Valid(new Redirect)) - val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle))) - val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) - val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) - val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback store - val stout = Vec(2, DecoupledIO(new ExuOutput)) // writeback store - val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) - val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit))) - val rollback = Output(Valid(new Redirect)) - val dcache = new DCacheLineIO - val uncache = new DCacheWordIO - val roqDeqPtr = Input(new RoqPtr) - val oldestStore = Output(Valid(new RoqPtr)) - }) - - if(EnableUnifiedLSQ){ - val lsroq = Module(new Lsroq) - - lsroq.io.dp1Req <> io.dp1Req - lsroq.io.brqRedirect <> io.brqRedirect - lsroq.io.loadIn <> io.loadIn - lsroq.io.storeIn <> io.storeIn - lsroq.io.sbuffer <> io.sbuffer - lsroq.io.ldout <> io.ldout - lsroq.io.stout <> io.stout - lsroq.io.forward <> io.forward - lsroq.io.commits <> io.commits - lsroq.io.rollback <> io.rollback - lsroq.io.dcache <> io.dcache - lsroq.io.uncache <> io.uncache - lsroq.io.roqDeqPtr <> io.roqDeqPtr - lsroq.io.oldestStore <> io.oldestStore - (0 until RenameWidth).map(i => { - io.lsIdxs(i).lsroqIdx := lsroq.io.lsroqIdxs(i) - }) - } else { - val loadQueue = Module(new LoadQueue) - val storeQueue = Module(new StoreQueue) - - // load queue wiring - loadQueue.io.dp1Req <> io.dp1Req - loadQueue.io.brqRedirect <> io.brqRedirect - loadQueue.io.loadIn <> io.loadIn - loadQueue.io.storeIn <> io.storeIn - loadQueue.io.ldout <> io.ldout - loadQueue.io.commits <> io.commits - loadQueue.io.rollback <> io.rollback - loadQueue.io.dcache <> io.dcache - 
loadQueue.io.roqDeqPtr <> io.roqDeqPtr - - // store queue wiring - // storeQueue.io <> DontCare - storeQueue.io.dp1Req <> io.dp1Req - storeQueue.io.brqRedirect <> io.brqRedirect - storeQueue.io.storeIn <> io.storeIn - storeQueue.io.sbuffer <> io.sbuffer - storeQueue.io.stout <> io.stout - storeQueue.io.commits <> io.commits - storeQueue.io.roqDeqPtr <> io.roqDeqPtr - storeQueue.io.oldestStore <> io.oldestStore - - loadQueue.io.forward <> io.forward - storeQueue.io.forward <> io.forward // overlap forwardMask & forwardData, DO NOT CHANGE SEQUENCE - - // naive uncache arbiter - val s_idle :: s_load :: s_store :: Nil = Enum(3) - val uncacheState = RegInit(s_idle) - - switch(uncacheState){ - is(s_idle){ - when(io.uncache.req.fire()){ - uncacheState := Mux(loadQueue.io.uncache.req.valid, s_load, s_store) - } - } - is(s_load){ - when(io.uncache.resp.fire()){ - uncacheState := s_idle - } - } - is(s_store){ - when(io.uncache.resp.fire()){ - uncacheState := s_idle - } - } - } - - loadQueue.io.uncache := DontCare - storeQueue.io.uncache := DontCare - loadQueue.io.uncache.resp.valid := false.B - storeQueue.io.uncache.resp.valid := false.B - when(loadQueue.io.uncache.req.valid){ - io.uncache.req <> loadQueue.io.uncache.req - }.otherwise{ - io.uncache.req <> storeQueue.io.uncache.req - } - when(uncacheState === s_load){ - io.uncache.resp <> loadQueue.io.uncache.resp - }.otherwise{ - io.uncache.resp <> storeQueue.io.uncache.resp - } - io.uncache.s1_kill := false.B - - assert(!(loadQueue.io.uncache.req.valid && storeQueue.io.uncache.req.valid)) - assert(!(loadQueue.io.uncache.resp.valid && storeQueue.io.uncache.resp.valid)) - assert(!((loadQueue.io.uncache.resp.valid || storeQueue.io.uncache.resp.valid) && uncacheState === s_idle)) - - // fix valid, allocate lq / sq index - (0 until RenameWidth).map(i => { - val isStore = CommitType.lsInstIsStore(io.dp1Req(i).bits.ctrl.commitType) - val prevCanIn = if (i == 0) true.B else Cat((0 until i).map(i => io.dp1Req(i).ready)).andR - 
loadQueue.io.dp1Req(i).valid := !isStore && io.dp1Req(i).valid && prevCanIn - storeQueue.io.dp1Req(i).valid := isStore && io.dp1Req(i).valid && prevCanIn - loadQueue.io.lqIdxs(i) <> io.lsIdxs(i).lqIdx - storeQueue.io.sqIdxs(i) <> io.lsIdxs(i).sqIdx - io.dp1Req(i).ready := storeQueue.io.dp1Req(i).ready && loadQueue.io.dp1Req(i).ready - }) - } -} diff --git a/src/main/scala/xiangshan/mem/lsqueue/separated/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/separated/StoreQueue.scala deleted file mode 100644 index a057c3b22d164ab7216339c24fcc982ac4a4ce2e..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/mem/lsqueue/separated/StoreQueue.scala +++ /dev/null @@ -1,380 +0,0 @@ -package xiangshan.mem - -import chisel3._ -import chisel3.util._ -import utils._ -import xiangshan._ -import xiangshan.cache._ -import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants} -import xiangshan.backend.LSUOpType -import xiangshan.backend.roq.RoqPtr - - -class SqPtr extends CircularQueuePtr(SqPtr.StoreQueueSize) { } - -object SqPtr extends HasXSParameter { - def apply(f: Bool, v: UInt): SqPtr = { - val ptr = Wire(new SqPtr) - ptr.flag := f - ptr.value := v - ptr - } -} - -// Store Queue -class StoreQueue extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper { - val io = IO(new Bundle() { - val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) - val sqIdxs = Output(Vec(RenameWidth, new SqPtr)) - val brqRedirect = Input(Valid(new Redirect)) - val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) - val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) - val stout = Vec(2, DecoupledIO(new ExuOutput)) // writeback store - val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) - val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit))) - val uncache = new DCacheWordIO - val roqDeqPtr = Input(new RoqPtr) - // val refill = Flipped(Valid(new DCacheLineReq )) - 
val oldestStore = Output(Valid(new RoqPtr)) - }) - - val uop = Reg(Vec(StoreQueueSize, new MicroOp)) - val data = Reg(Vec(StoreQueueSize, new LsRoqEntry)) // FIXME: use StoreQueueEntry instead - val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been allocated - val valid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // data is valid - val writebacked = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // inst has been writebacked to CDB - val commited = Reg(Vec(StoreQueueSize, Bool())) // inst has been writebacked to CDB - val miss = Reg(Vec(StoreQueueSize, Bool())) // load inst missed, waiting for miss queue to accept miss request - val listening = Reg(Vec(StoreQueueSize, Bool())) // waiting for refill result - val pending = Reg(Vec(StoreQueueSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reachs the end of roq - - val ringBufferHeadExtended = RegInit(0.U.asTypeOf(new SqPtr)) - val ringBufferTailExtended = RegInit(0.U.asTypeOf(new SqPtr)) - val ringBufferHead = ringBufferHeadExtended.value - val ringBufferTail = ringBufferTailExtended.value - val ringBufferSameFlag = ringBufferHeadExtended.flag === ringBufferTailExtended.flag - val ringBufferEmpty = ringBufferHead === ringBufferTail && ringBufferSameFlag - val ringBufferFull = ringBufferHead === ringBufferTail && !ringBufferSameFlag - val ringBufferAllowin = !ringBufferFull - - val storeCommit = (0 until CommitWidth).map(i => io.commits(i).valid && !io.commits(i).bits.isWalk && io.commits(i).bits.uop.ctrl.commitType === CommitType.STORE) - val mcommitIdx = (0 until CommitWidth).map(i => io.commits(i).bits.uop.sqIdx.value) - - val tailMask = (((1.U((StoreQueueSize + 1).W)) << ringBufferTail).asUInt - 1.U)(StoreQueueSize - 1, 0) - val headMask = (((1.U((StoreQueueSize + 1).W)) << ringBufferHead).asUInt - 1.U)(StoreQueueSize - 1, 0) - val enqDeqMask1 = tailMask ^ headMask - val enqDeqMask = Mux(ringBufferSameFlag, enqDeqMask1, 
~enqDeqMask1) - - // TODO: misc arbitor - - // Enqueue at dispatch - val emptyEntries = StoreQueueSize.U - distanceBetween(ringBufferHeadExtended, ringBufferTailExtended) - XSDebug("(ready, valid): ") - for (i <- 0 until RenameWidth) { - val offset = if (i == 0) 0.U else PopCount((0 until i).map(io.dp1Req(_).valid)) - val sqIdx = ringBufferHeadExtended + offset - val index = sqIdx.value - when(io.dp1Req(i).fire()) { - uop(index) := io.dp1Req(i).bits - allocated(index) := true.B - valid(index) := false.B - writebacked(index) := false.B - commited(index) := false.B - miss(index) := false.B - listening(index) := false.B - pending(index) := false.B - // data(index).bwdMask := 0.U(8.W).asBools - } - val numTryEnqueue = offset +& io.dp1Req(i).valid - io.dp1Req(i).ready := numTryEnqueue <= emptyEntries - io.sqIdxs(i) := sqIdx - XSDebug(false, true.B, "(%d, %d) ", io.dp1Req(i).ready, io.dp1Req(i).valid) - } - XSDebug(false, true.B, "\n") - - val firedDispatch = VecInit((0 until CommitWidth).map(io.dp1Req(_).fire())).asUInt - when(firedDispatch.orR) { - ringBufferHeadExtended := ringBufferHeadExtended + PopCount(firedDispatch) - XSInfo("dispatched %d insts to sq\n", PopCount(firedDispatch)) - } - - // writeback store - (0 until StorePipelineWidth).map(i => { - when(io.storeIn(i).fire()) { - val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value - valid(stWbIndex) := !io.storeIn(i).bits.mmio - data(stWbIndex).paddr := io.storeIn(i).bits.paddr - data(stWbIndex).vaddr := io.storeIn(i).bits.vaddr - data(stWbIndex).mask := io.storeIn(i).bits.mask - data(stWbIndex).data := io.storeIn(i).bits.data - data(stWbIndex).mmio := io.storeIn(i).bits.mmio - data(stWbIndex).exception := io.storeIn(i).bits.uop.cf.exceptionVec.asUInt - miss(stWbIndex) := io.storeIn(i).bits.miss - pending(stWbIndex) := io.storeIn(i).bits.mmio - uop(stWbIndex).debugInfo.issueTime := io.storeIn(i).bits.uop.debugInfo.issueTime - XSInfo("store write to sq idx %d pc 0x%x vaddr %x paddr %x data %x miss %x mmio %x roll 
%x exc %x\n", - io.storeIn(i).bits.uop.sqIdx.value, - io.storeIn(i).bits.uop.cf.pc, - io.storeIn(i).bits.vaddr, - io.storeIn(i).bits.paddr, - io.storeIn(i).bits.data, - io.storeIn(i).bits.miss, - io.storeIn(i).bits.mmio, - io.storeIn(i).bits.rollback, - io.storeIn(i).bits.uop.cf.exceptionVec.asUInt - ) - } - }) - - def getFirstOne(mask: Vec[Bool], startMask: UInt) = { - val length = mask.length - val highBits = (0 until length).map(i => mask(i) & ~startMask(i)) - val highBitsUint = Cat(highBits.reverse) - PriorityEncoder(Mux(highBitsUint.orR(), highBitsUint, mask.asUInt)) - } - - def getFirstOneWithFlag(mask: Vec[Bool], startMask: UInt, startFlag: Bool) = { - val length = mask.length - val highBits = (0 until length).map(i => mask(i) & ~startMask(i)) - val highBitsUint = Cat(highBits.reverse) - val changeDirection = !highBitsUint.orR() - val index = PriorityEncoder(Mux(!changeDirection, highBitsUint, mask.asUInt)) - SqPtr(startFlag ^ changeDirection, index) - } - - def selectFirstTwo(valid: Vec[Bool], startMask: UInt) = { - val selVec = Wire(Vec(2, UInt(log2Up(StoreQueueSize).W))) - val selValid = Wire(Vec(2, Bool())) - selVec(0) := getFirstOne(valid, startMask) - val firstSelMask = UIntToOH(selVec(0)) - val secondSelVec = VecInit((0 until valid.length).map(i => valid(i) && !firstSelMask(i))) - selVec(1) := getFirstOne(secondSelVec, startMask) - selValid(0) := Cat(valid).orR - selValid(1) := Cat(secondSelVec).orR - (selValid, selVec) - } - - // select the last writebacked instruction - val validStoreVec = VecInit((0 until StoreQueueSize).map(i => !(allocated(i) && valid(i)))) - val storeNotValid = SqPtr(false.B, getFirstOne(validStoreVec, tailMask)) - val storeValidIndex = (storeNotValid - 1.U).value - io.oldestStore.valid := allocated(ringBufferTailExtended.value) && valid(ringBufferTailExtended.value) && !commited(storeValidIndex) - io.oldestStore.bits := uop(storeValidIndex).roqIdx - - // writeback up to 2 store insts to CDB - // choose the first two valid store 
requests from deqPtr - val storeWbSelVec = VecInit((0 until StoreQueueSize).map(i => allocated(i) && valid(i) && !writebacked(i))) - val (storeWbValid, storeWbSel) = selectFirstTwo(storeWbSelVec, tailMask) - - (0 until StorePipelineWidth).map(i => { - io.stout(i).bits.uop := uop(storeWbSel(i)) - io.stout(i).bits.uop.sqIdx := storeWbSel(i).asTypeOf(new SqPtr) - io.stout(i).bits.uop.cf.exceptionVec := data(storeWbSel(i)).exception.asBools - io.stout(i).bits.data := data(storeWbSel(i)).data - io.stout(i).bits.redirectValid := false.B - io.stout(i).bits.redirect := DontCare - io.stout(i).bits.brUpdate := DontCare - io.stout(i).bits.debug.isMMIO := data(storeWbSel(i)).mmio - io.stout(i).valid := storeWbSelVec(storeWbSel(i)) && storeWbValid(i) - when(io.stout(i).fire()) { - writebacked(storeWbSel(i)) := true.B - } - }) - - // remove retired insts from sq, add retired store to sbuffer - - // move tailPtr - // allocatedMask: dequeuePtr can go to the next 1-bit - val allocatedMask = VecInit((0 until StoreQueueSize).map(i => allocated(i) || !enqDeqMask(i))) - // find the first one from deqPtr (ringBufferTail) - val nextTail1 = getFirstOneWithFlag(allocatedMask, tailMask, ringBufferTailExtended.flag) - val nextTail = Mux(Cat(allocatedMask).orR, nextTail1, ringBufferHeadExtended) - ringBufferTailExtended := nextTail - - // load forward query - // check over all lq entries and forward data from the first matched store - (0 until LoadPipelineWidth).map(i => { - io.forward(i).forwardMask := 0.U(8.W).asBools - io.forward(i).forwardData := DontCare - - // Compare ringBufferTail (deqPtr) and forward.sqIdx, we have two cases: - // (1) if they have the same flag, we need to check range(tail, sqIdx) - // (2) if they have different flags, we need to check range(tail, LoadQueueSize) and range(0, sqIdx) - // Forward1: Mux(same_flag, range(tail, sqIdx), range(tail, LoadQueueSize)) - // Forward2: Mux(same_flag, 0.U, range(0, sqIdx) ) - // i.e. 
forward1 is the target entries with the same flag bits and forward2 otherwise - val forwardMask1 = WireInit(VecInit(Seq.fill(8)(false.B))) - val forwardData1 = WireInit(VecInit(Seq.fill(8)(0.U(8.W)))) - val forwardMask2 = WireInit(VecInit(Seq.fill(8)(false.B))) - val forwardData2 = WireInit(VecInit(Seq.fill(8)(0.U(8.W)))) - - val differentFlag = ringBufferTailExtended.flag =/= io.forward(i).sqIdx.flag - val forwardMask = ((1.U((StoreQueueSize + 1).W)) << io.forward(i).sqIdx.value).asUInt - 1.U - val needForward1 = Mux(differentFlag, ~tailMask, tailMask ^ forwardMask) - val needForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) - - XSDebug("" + i + " f1 %b f2 %b sqIdx %d pa %x\n", needForward1, needForward2, io.forward(i).sqIdx.asUInt, io.forward(i).paddr) - - // entry with larger index should have higher priority since it's data is younger - for (j <- 0 until StoreQueueSize) { - val needCheck = valid(j) && allocated(j) && // all valid terms need to be checked - io.forward(i).paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3) - (0 until XLEN / 8).foreach(k => { - when (needCheck && data(j).mask(k)) { - when (needForward1(j)) { - forwardMask1(k) := true.B - forwardData1(k) := data(j).data(8 * (k + 1) - 1, 8 * k) - } - when (needForward2(j)) { - forwardMask2(k) := true.B - forwardData2(k) := data(j).data(8 * (k + 1) - 1, 8 * k) - } - XSDebug(needForward1(j) || needForward2(j), - p"forwarding $k-th byte ${Hexadecimal(data(j).data(8 * (k + 1) - 1, 8 * k))} " + - p"from ptr $j pc ${Hexadecimal(uop(j).cf.pc)}\n") - } - }) - } - - // merge forward lookup results - // forward2 is younger than forward1 and should have higher priority - (0 until XLEN / 8).map(k => { - io.forward(i).forwardMask(k) := forwardMask1(k) || forwardMask2(k) - io.forward(i).forwardData(k) := Mux(forwardMask2(k), forwardData2(k), forwardData1(k)) - }) - }) - - (0 until CommitWidth).map(i => { - when(storeCommit(i)) { - commited(mcommitIdx(i)) := true.B - XSDebug("store commit 
%d: idx %d %x\n", i.U, mcommitIdx(i), uop(mcommitIdx(i)).cf.pc) - } - }) - - val storeCommitSelVec = VecInit((0 until StoreQueueSize).map(i => { - allocated(i) && commited(i) - })) - val (storeCommitValid, storeCommitSel) = selectFirstTwo(storeCommitSelVec, tailMask) - - // get no more than 2 commited store from storeCommitedQueue - // send selected store inst to sbuffer - (0 until 2).map(i => { - val ptr = storeCommitSel(i) - val mmio = data(ptr).mmio - io.sbuffer(i).valid := storeCommitValid(i) && !mmio - io.sbuffer(i).bits.cmd := MemoryOpConstants.M_XWR - io.sbuffer(i).bits.addr := data(ptr).paddr - io.sbuffer(i).bits.data := data(ptr).data - io.sbuffer(i).bits.mask := data(ptr).mask - io.sbuffer(i).bits.meta := DontCare - io.sbuffer(i).bits.meta.tlb_miss := false.B - io.sbuffer(i).bits.meta.uop := uop(ptr) - io.sbuffer(i).bits.meta.mmio := mmio - io.sbuffer(i).bits.meta.mask := data(ptr).mask - - // update sq meta if store inst is send to sbuffer - when(storeCommitValid(i) && (mmio || io.sbuffer(i).ready)) { - allocated(ptr) := false.B - } - }) - - // Memory mapped IO / other uncached operations - - // setup misc mem access req - // mask / paddr / data can be get from sq.data - val commitType = io.commits(0).bits.uop.ctrl.commitType - io.uncache.req.valid := pending(ringBufferTail) && allocated(ringBufferTail) && - commitType === CommitType.STORE && - io.roqDeqPtr === uop(ringBufferTail).roqIdx && - !io.commits(0).bits.isWalk - - io.uncache.req.bits.cmd := MemoryOpConstants.M_XWR - io.uncache.req.bits.addr := data(ringBufferTail).paddr - io.uncache.req.bits.data := data(ringBufferTail).data - io.uncache.req.bits.mask := data(ringBufferTail).mask - - io.uncache.req.bits.meta.id := DontCare // TODO: // FIXME - io.uncache.req.bits.meta.vaddr := DontCare - io.uncache.req.bits.meta.paddr := data(ringBufferTail).paddr - io.uncache.req.bits.meta.uop := uop(ringBufferTail) - io.uncache.req.bits.meta.mmio := true.B // data(ringBufferTail).mmio - 
io.uncache.req.bits.meta.tlb_miss := false.B - io.uncache.req.bits.meta.mask := data(ringBufferTail).mask - io.uncache.req.bits.meta.replay := false.B - - io.uncache.resp.ready := true.B - io.uncache.s1_kill := false.B - - when(io.uncache.req.fire()){ - pending(ringBufferTail) := false.B - } - - when(io.uncache.resp.fire()){ - valid(ringBufferTail) := true.B - data(ringBufferTail).data := io.uncache.resp.bits.data(XLEN-1, 0) - // TODO: write back exception info - } - - when(io.uncache.req.fire()){ - XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n", - uop(ringBufferTail).cf.pc, - io.uncache.req.bits.addr, - io.uncache.req.bits.data, - io.uncache.req.bits.cmd, - io.uncache.req.bits.mask - ) - } - - // Read vaddr for mem exception - val mexcLsIdx = WireInit(0.U.asTypeOf(new LSIdx())) - val memExceptionAddr = WireInit(data(mexcLsIdx.sqIdx.value).vaddr) - ExcitingUtils.addSink(mexcLsIdx, "EXECPTION_LSROQIDX") - ExcitingUtils.addSource(memExceptionAddr, "EXECPTION_STORE_VADDR") - - // misprediction recovery / exception redirect - // invalidate sq term using robIdx - val needCancel = Wire(Vec(StoreQueueSize, Bool())) - for (i <- 0 until StoreQueueSize) { - needCancel(i) := uop(i).roqIdx.needFlush(io.brqRedirect) && allocated(i) && !commited(i) - when(needCancel(i)) { - when(io.brqRedirect.bits.isReplay){ - valid(i) := false.B - writebacked(i) := false.B - listening(i) := false.B - miss(i) := false.B - pending(i) := false.B - }.otherwise{ - allocated(i) := false.B - } - } - } - when (io.brqRedirect.valid && io.brqRedirect.bits.isMisPred) { - ringBufferHeadExtended := ringBufferHeadExtended - PopCount(needCancel) - } - - // debug info - XSDebug("head %d:%d tail %d:%d\n", ringBufferHeadExtended.flag, ringBufferHead, ringBufferTailExtended.flag, ringBufferTail) - - def PrintFlag(flag: Bool, name: String): Unit = { - when(flag) { - XSDebug(false, true.B, name) - }.otherwise { - XSDebug(false, true.B, " ") - } - } - - for (i <- 0 until StoreQueueSize) { - if (i % 4 
== 0) XSDebug("") - XSDebug(false, true.B, "%x ", uop(i).cf.pc) - PrintFlag(allocated(i), "a") - PrintFlag(allocated(i) && valid(i), "v") - PrintFlag(allocated(i) && writebacked(i), "w") - PrintFlag(allocated(i) && commited(i), "c") - PrintFlag(allocated(i) && miss(i), "m") - PrintFlag(allocated(i) && listening(i), "l") - PrintFlag(allocated(i) && pending(i), "p") - XSDebug(false, true.B, " ") - if (i % 4 == 3 || i == StoreQueueSize - 1) XSDebug(false, true.B, "\n") - } - -} diff --git a/src/main/scala/xiangshan/mem/lsqueue/unified/Lsroq.scala b/src/main/scala/xiangshan/mem/lsqueue/unified/Lsroq.scala deleted file mode 100644 index 02399b11f06f1d9d3e8fe2565adc69b64ef38b88..0000000000000000000000000000000000000000 --- a/src/main/scala/xiangshan/mem/lsqueue/unified/Lsroq.scala +++ /dev/null @@ -1,763 +0,0 @@ -package xiangshan.mem - -import chisel3._ -import chisel3.util._ -import utils._ -import xiangshan._ -import xiangshan.cache._ -import chisel3.ExcitingUtils._ -import xiangshan.cache.{DCacheWordIO, DCacheLineIO, TlbRequestIO, MemoryOpConstants} -import xiangshan.backend.LSUOpType -import xiangshan.backend.roq.RoqPtr - -class LsRoqEntry extends XSBundle { - val vaddr = UInt(VAddrBits.W) // TODO: need opt - val paddr = UInt(PAddrBits.W) - val op = UInt(6.W) - val mask = UInt(8.W) - val data = UInt(XLEN.W) - val exception = UInt(16.W) // TODO: opt size - val mmio = Bool() - val fwdMask = Vec(8, Bool()) - val fwdData = Vec(8, UInt(8.W)) -} - -// inflight miss block reqs -class InflightBlockInfo extends XSBundle { - val block_addr = UInt(PAddrBits.W) - val valid = Bool() -} - -// Load/Store Roq (Lsroq) for XiangShan Out of Order LSU -class Lsroq extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper{ - val io = IO(new Bundle() { - val dp1Req = Vec(RenameWidth, Flipped(DecoupledIO(new MicroOp))) - val lsroqIdxs = Output(Vec(RenameWidth, UInt(LsroqIdxWidth.W))) - val oldestStore = Output(Valid(new RoqPtr)) - val brqRedirect = Input(Valid(new 
Redirect)) - val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LsPipelineBundle))) - val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) - val sbuffer = Vec(StorePipelineWidth, Decoupled(new DCacheWordReq)) - val ldout = Vec(2, DecoupledIO(new ExuOutput)) // writeback store - val stout = Vec(2, DecoupledIO(new ExuOutput)) // writeback store - val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) - val commits = Flipped(Vec(CommitWidth, Valid(new RoqCommit))) - val rollback = Output(Valid(new Redirect)) - val dcache = new DCacheLineIO - val uncache = new DCacheWordIO - val roqDeqPtr = Input(new RoqPtr) - // val refill = Flipped(Valid(new DCacheLineReq )) - }) - - val uop = Reg(Vec(LsroqSize, new MicroOp)) - val data = Reg(Vec(LsroqSize, new LsRoqEntry)) - val allocated = RegInit(VecInit(List.fill(LsroqSize)(false.B))) // lsroq entry has been allocated - val valid = RegInit(VecInit(List.fill(LsroqSize)(false.B))) // data is valid - val writebacked = RegInit(VecInit(List.fill(LsroqSize)(false.B))) // inst has been writebacked to CDB - val commited = Reg(Vec(LsroqSize, Bool())) // inst has been writebacked to CDB - val store = Reg(Vec(LsroqSize, Bool())) // inst is a store inst - val miss = Reg(Vec(LsroqSize, Bool())) // load inst missed, waiting for miss queue to accept miss request - val listening = Reg(Vec(LsroqSize, Bool())) // waiting for refill result - val pending = Reg(Vec(LsroqSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reachs the end of roq - - val ringBufferHeadExtended = RegInit(0.U(LsroqIdxWidth.W)) - val ringBufferTailExtended = RegInit(0.U(LsroqIdxWidth.W)) - val ringBufferHead = ringBufferHeadExtended(InnerLsroqIdxWidth - 1, 0) - val ringBufferTail = ringBufferTailExtended(InnerLsroqIdxWidth - 1, 0) - val ringBufferSameFlag = ringBufferHeadExtended(InnerLsroqIdxWidth) === ringBufferTailExtended(InnerLsroqIdxWidth) - val ringBufferEmpty = ringBufferHead === 
ringBufferTail && ringBufferSameFlag - val ringBufferFull = ringBufferHead === ringBufferTail && !ringBufferSameFlag - val ringBufferAllowin = !ringBufferFull - - val storeCommit = (0 until CommitWidth).map(i => io.commits(i).valid && !io.commits(i).bits.isWalk && io.commits(i).bits.uop.ctrl.commitType === CommitType.STORE) - val loadCommit = (0 until CommitWidth).map(i => io.commits(i).valid && !io.commits(i).bits.isWalk && io.commits(i).bits.uop.ctrl.commitType === CommitType.LOAD) - val mcommitIdx = (0 until CommitWidth).map(i => io.commits(i).bits.uop.lsroqIdx(InnerLsroqIdxWidth-1,0)) - - val tailMask = (((1.U((LsroqSize + 1).W)) << ringBufferTail).asUInt - 1.U)(LsroqSize - 1, 0) - val headMask = (((1.U((LsroqSize + 1).W)) << ringBufferHead).asUInt - 1.U)(LsroqSize - 1, 0) - val enqDeqMask1 = tailMask ^ headMask - val enqDeqMask = Mux(ringBufferSameFlag, enqDeqMask1, ~enqDeqMask1) - - // TODO: misc arbitor - - // Enqueue at dispatch - val validDispatch = VecInit((0 until RenameWidth).map(io.dp1Req(_).valid)).asUInt - XSDebug("(ready, valid): ") - for (i <- 0 until RenameWidth) { - val offset = if (i == 0) 0.U else PopCount(validDispatch(i - 1, 0)) - val lsroqIdx = ringBufferHeadExtended + offset - val index = lsroqIdx(InnerLsroqIdxWidth - 1, 0) - when(io.dp1Req(i).fire()) { - uop(index) := io.dp1Req(i).bits - allocated(index) := true.B - valid(index) := false.B - writebacked(index) := false.B - commited(index) := false.B - store(index) := false.B - miss(index) := false.B - listening(index) := false.B - pending(index) := false.B - // data(index).bwdMask := 0.U(8.W).asBools - } - if (i == 0) { - io.dp1Req(i).ready := ringBufferAllowin && !allocated(index) - } else { - io.dp1Req(i).ready := ringBufferAllowin && !allocated(index) && io.dp1Req(i - 1).ready - } - io.lsroqIdxs(i) := lsroqIdx - XSDebug(false, true.B, "(%d, %d) ", io.dp1Req(i).ready, io.dp1Req(i).valid) - } - XSDebug(false, true.B, "\n") - - val firedDispatch = VecInit((0 until 
CommitWidth).map(io.dp1Req(_).fire())).asUInt - when(firedDispatch.orR) { - ringBufferHeadExtended := ringBufferHeadExtended + PopCount(firedDispatch) - XSInfo("dispatched %d insts to lsroq\n", PopCount(firedDispatch)) - } - - // writeback load - (0 until LoadPipelineWidth).map(i => { - when(io.loadIn(i).fire()) { - when(io.loadIn(i).bits.miss) { - XSInfo(io.loadIn(i).valid, "load miss write to lsroq idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", - io.loadIn(i).bits.uop.lsroqIdx, - io.loadIn(i).bits.uop.cf.pc, - io.loadIn(i).bits.vaddr, - io.loadIn(i).bits.paddr, - io.loadIn(i).bits.data, - io.loadIn(i).bits.mask, - io.loadIn(i).bits.forwardData.asUInt, - io.loadIn(i).bits.forwardMask.asUInt, - io.loadIn(i).bits.mmio, - io.loadIn(i).bits.rollback, - io.loadIn(i).bits.uop.cf.exceptionVec.asUInt - ) - }.otherwise { - XSInfo(io.loadIn(i).valid, "load hit write to cbd idx %d pc 0x%x vaddr %x paddr %x data %x mask %x forwardData %x forwardMask: %x mmio %x roll %x exc %x\n", - io.loadIn(i).bits.uop.lsroqIdx, - io.loadIn(i).bits.uop.cf.pc, - io.loadIn(i).bits.vaddr, - io.loadIn(i).bits.paddr, - io.loadIn(i).bits.data, - io.loadIn(i).bits.mask, - io.loadIn(i).bits.forwardData.asUInt, - io.loadIn(i).bits.forwardMask.asUInt, - io.loadIn(i).bits.mmio, - io.loadIn(i).bits.rollback, - io.loadIn(i).bits.uop.cf.exceptionVec.asUInt - ) - } - valid(io.loadIn(i).bits.uop.lsroqIdx) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - writebacked(io.loadIn(i).bits.uop.lsroqIdx) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - // allocated(io.loadIn(i).bits.uop.lsroqIdx) := io.loadIn(i).bits.miss // if hit, lsroq entry can be recycled - data(io.loadIn(i).bits.uop.lsroqIdx).paddr := io.loadIn(i).bits.paddr - data(io.loadIn(i).bits.uop.lsroqIdx).vaddr := io.loadIn(i).bits.vaddr - data(io.loadIn(i).bits.uop.lsroqIdx).mask := io.loadIn(i).bits.mask - data(io.loadIn(i).bits.uop.lsroqIdx).data := io.loadIn(i).bits.data // 
for mmio / misc / debug - data(io.loadIn(i).bits.uop.lsroqIdx).mmio := io.loadIn(i).bits.mmio - data(io.loadIn(i).bits.uop.lsroqIdx).fwdMask := io.loadIn(i).bits.forwardMask - data(io.loadIn(i).bits.uop.lsroqIdx).fwdData := io.loadIn(i).bits.forwardData - data(io.loadIn(i).bits.uop.lsroqIdx).exception := io.loadIn(i).bits.uop.cf.exceptionVec.asUInt - val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio - miss(io.loadIn(i).bits.uop.lsroqIdx) := dcacheMissed - listening(io.loadIn(i).bits.uop.lsroqIdx) := dcacheMissed - store(io.loadIn(i).bits.uop.lsroqIdx) := false.B - pending(io.loadIn(i).bits.uop.lsroqIdx) := io.loadIn(i).bits.mmio - } - }) - - // find first store req that has not been writebacked - val storeNotWritebacked = VecInit((0 until LsroqSize).map(i => store(i) && !writebacked(i))) - val firstStore = getFirstOne(storeNotWritebacked, tailMask) - io.oldestStore.valid := false.B - io.oldestStore.bits := DontCare - // writeback store - (0 until StorePipelineWidth).map(i => { - when(io.storeIn(i).fire()) { - valid(io.storeIn(i).bits.uop.lsroqIdx) := !io.storeIn(i).bits.mmio - data(io.storeIn(i).bits.uop.lsroqIdx).paddr := io.storeIn(i).bits.paddr - data(io.storeIn(i).bits.uop.lsroqIdx).vaddr := io.storeIn(i).bits.vaddr - data(io.storeIn(i).bits.uop.lsroqIdx).mask := io.storeIn(i).bits.mask - data(io.storeIn(i).bits.uop.lsroqIdx).data := io.storeIn(i).bits.data - data(io.storeIn(i).bits.uop.lsroqIdx).mmio := io.storeIn(i).bits.mmio - data(io.storeIn(i).bits.uop.lsroqIdx).exception := io.storeIn(i).bits.uop.cf.exceptionVec.asUInt - miss(io.storeIn(i).bits.uop.lsroqIdx) := io.storeIn(i).bits.miss - store(io.storeIn(i).bits.uop.lsroqIdx) := true.B - pending(io.storeIn(i).bits.uop.lsroqIdx) := io.storeIn(i).bits.mmio - XSInfo("store write to lsroq idx %d pc 0x%x vaddr %x paddr %x data %x miss %x mmio %x roll %x exc %x\n", - io.storeIn(i).bits.uop.lsroqIdx(InnerLsroqIdxWidth - 1, 0), - io.storeIn(i).bits.uop.cf.pc, - io.storeIn(i).bits.vaddr, - 
io.storeIn(i).bits.paddr, - io.storeIn(i).bits.data, - io.storeIn(i).bits.miss, - io.storeIn(i).bits.mmio, - io.storeIn(i).bits.rollback, - io.storeIn(i).bits.uop.cf.exceptionVec.asUInt - ) - when (io.storeIn(i).bits.uop.lsroqIdx(InnerLsroqIdxWidth - 1, 0) === firstStore) { - io.oldestStore.valid := true.B - io.oldestStore.bits := io.storeIn(i).bits.uop.roqIdx - } - } - }) - - // cache miss request - val inflightReqs = RegInit(VecInit(Seq.fill(cfg.nLoadMissEntries)(0.U.asTypeOf(new InflightBlockInfo)))) - val inflightReqFull = inflightReqs.map(req => req.valid).reduce(_&&_) - val reqBlockIndex = PriorityEncoder(~VecInit(inflightReqs.map(req => req.valid)).asUInt) - - val missRefillSelVec = VecInit( - (0 until LsroqSize).map{ i => - val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(data(i).paddr)).reduce(_||_) - allocated(i) && miss(i) && !inflight - }) - - val missRefillSel = getFirstOne(missRefillSelVec, tailMask) - val missRefillBlockAddr = get_block_addr(data(missRefillSel).paddr) - io.dcache.req.valid := missRefillSelVec.asUInt.orR - io.dcache.req.bits.cmd := MemoryOpConstants.M_XRD - io.dcache.req.bits.addr := missRefillBlockAddr - io.dcache.req.bits.data := DontCare - io.dcache.req.bits.mask := DontCare - - io.dcache.req.bits.meta.id := DontCare // TODO: // FIXME - io.dcache.req.bits.meta.vaddr := DontCare // data(missRefillSel).vaddr - io.dcache.req.bits.meta.paddr := missRefillBlockAddr - io.dcache.req.bits.meta.uop := uop(missRefillSel) - io.dcache.req.bits.meta.mmio := false.B // data(missRefillSel).mmio - io.dcache.req.bits.meta.tlb_miss := false.B - io.dcache.req.bits.meta.mask := DontCare - io.dcache.req.bits.meta.replay := false.B - - io.dcache.resp.ready := true.B - - assert(!(data(missRefillSel).mmio && io.dcache.req.valid)) - - when(io.dcache.req.fire()) { - miss(missRefillSel) := false.B - listening(missRefillSel) := true.B - - // mark this block as inflight - inflightReqs(reqBlockIndex).valid := true.B - 
inflightReqs(reqBlockIndex).block_addr := missRefillBlockAddr - assert(!inflightReqs(reqBlockIndex).valid) - } - - when(io.dcache.resp.fire()) { - val inflight = inflightReqs.map(req => req.valid && req.block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)).reduce(_||_) - assert(inflight) - for (i <- 0 until cfg.nLoadMissEntries) { - when (inflightReqs(i).valid && inflightReqs(i).block_addr === get_block_addr(io.dcache.resp.bits.meta.paddr)) { - inflightReqs(i).valid := false.B - } - } - } - - - when(io.dcache.req.fire()){ - XSDebug("miss req: pc:0x%x roqIdx:%d lsroqIdx:%d (p)addr:0x%x vaddr:0x%x\n", - io.dcache.req.bits.meta.uop.cf.pc, io.dcache.req.bits.meta.uop.roqIdx.asUInt, io.dcache.req.bits.meta.uop.lsroqIdx, - io.dcache.req.bits.addr, io.dcache.req.bits.meta.vaddr - ) - } - - when(io.dcache.resp.fire()){ - XSDebug("miss resp: pc:0x%x roqIdx:%d lsroqIdx:%d (p)addr:0x%x data %x\n", - io.dcache.resp.bits.meta.uop.cf.pc, io.dcache.resp.bits.meta.uop.roqIdx.asUInt, io.dcache.resp.bits.meta.uop.lsroqIdx, - io.dcache.resp.bits.meta.paddr, io.dcache.resp.bits.data - ) - } - - // Refill 64 bit in a cycle - // Refill data comes back from io.dcache.resp - def mergeRefillData(refill: UInt, fwd: UInt, fwdMask: UInt): UInt = { - val res = Wire(Vec(8, UInt(8.W))) - (0 until 8).foreach(i => { - res(i) := Mux(fwdMask(i), fwd(8 * (i + 1) - 1, 8 * i), refill(8 * (i + 1) - 1, 8 * i)) - }) - res.asUInt - } - - (0 until LsroqSize).map(i => { - val blockMatch = get_block_addr(data(i).paddr) === io.dcache.resp.bits.meta.paddr - when(allocated(i) && listening(i) && blockMatch && io.dcache.resp.fire()) { - // split them into words - val words = VecInit((0 until blockWords) map { i => - io.dcache.resp.bits.data(DataBits * (i + 1) - 1, DataBits * i) - }) - - val refillData = words(get_word(data(i).paddr)) - data(i).data := mergeRefillData(refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt) - valid(i) := true.B - listening(i) := false.B - XSDebug("miss resp: pos %d addr 
%x data %x + %x(%b)\n", i.U, data(i).paddr, refillData, data(i).fwdData.asUInt, data(i).fwdMask.asUInt) - } - }) - - // writeback up to 2 missed load insts to CDB - // just randomly pick 2 missed load (data refilled), write them back to cdb - val loadWbSelVec = VecInit((0 until LsroqSize).map(i => { - allocated(i) && valid(i) && !writebacked(i) && !store(i) - })).asUInt() // use uint instead vec to reduce verilog lines - val loadWbSel = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LsroqSize).W))) - val lselvec0 = PriorityEncoderOH(loadWbSelVec) - val lselvec1 = PriorityEncoderOH(loadWbSelVec & (~lselvec0).asUInt) - loadWbSel(0) := OHToUInt(lselvec0) - loadWbSel(1) := OHToUInt(lselvec1) - (0 until LoadPipelineWidth).map(i => { - // data select - val rdata = data(loadWbSel(i)).data - val func = uop(loadWbSel(i)).ctrl.fuOpType - val raddr = data(loadWbSel(i)).paddr - val rdataSel = LookupTree(raddr(2, 0), List( - "b000".U -> rdata(63, 0), - "b001".U -> rdata(63, 8), - "b010".U -> rdata(63, 16), - "b011".U -> rdata(63, 24), - "b100".U -> rdata(63, 32), - "b101".U -> rdata(63, 40), - "b110".U -> rdata(63, 48), - "b111".U -> rdata(63, 56) - )) - val rdataPartialLoad = LookupTree(func, List( - LSUOpType.lb -> SignExt(rdataSel(7, 0) , XLEN), - LSUOpType.lh -> SignExt(rdataSel(15, 0), XLEN), - LSUOpType.lw -> SignExt(rdataSel(31, 0), XLEN), - LSUOpType.ld -> SignExt(rdataSel(63, 0), XLEN), - LSUOpType.lbu -> ZeroExt(rdataSel(7, 0) , XLEN), - LSUOpType.lhu -> ZeroExt(rdataSel(15, 0), XLEN), - LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN) - )) - io.ldout(i).bits.uop := uop(loadWbSel(i)) - io.ldout(i).bits.uop.cf.exceptionVec := data(loadWbSel(i)).exception.asBools - io.ldout(i).bits.uop.lsroqIdx := loadWbSel(i) - io.ldout(i).bits.data := rdataPartialLoad - io.ldout(i).bits.redirectValid := false.B - io.ldout(i).bits.redirect := DontCare - io.ldout(i).bits.brUpdate := DontCare - io.ldout(i).bits.debug.isMMIO := data(loadWbSel(i)).mmio - io.ldout(i).valid := 
loadWbSelVec(loadWbSel(i)) - when(io.ldout(i).fire()) { - writebacked(loadWbSel(i)) := true.B - XSInfo(io.loadIn(i).valid, "load miss write to cbd idx %d pc 0x%x paddr %x data %x mmio %x\n", - io.ldout(i).bits.uop.lsroqIdx, - io.ldout(i).bits.uop.cf.pc, - data(loadWbSel(i)).paddr, - data(loadWbSel(i)).data, - data(loadWbSel(i)).mmio - ) - } - ExcitingUtils.addSource(io.ldout(i).fire(), "perfCntCacheLoadMiss"+i, Perf) - }) - - // writeback up to 2 store insts to CDB - // choose the first two valid store requests from deqPtr - val storeWbSelVec = VecInit((0 until LsroqSize).map(i => { - allocated(i) && valid(i) && !writebacked(i) && store(i) - })) - val storeWbSel = Wire(Vec(StorePipelineWidth, UInt(log2Up(LsroqSize).W))) - val storeWbValid = Wire(Vec(StorePipelineWidth, Bool())) - storeWbSel(0) := getFirstOne(storeWbSelVec, tailMask) - val firstSelMask = UIntToOH(storeWbSel(0)) - val secondWbSelVec = VecInit((0 until LsroqSize).map(i => storeWbSelVec(i) && !firstSelMask(i))) - storeWbSel(1) := getFirstOne(secondWbSelVec, tailMask) - storeWbValid(0) := Cat(storeWbSelVec).orR - storeWbValid(1) := Cat(secondWbSelVec).orR - - (0 until StorePipelineWidth).map(i => { - io.stout(i).bits.uop := uop(storeWbSel(i)) - io.stout(i).bits.uop.lsroqIdx := storeWbSel(i) - io.stout(i).bits.uop.cf.exceptionVec := data(storeWbSel(i)).exception.asBools - io.stout(i).bits.data := data(storeWbSel(i)).data - io.stout(i).bits.redirectValid := false.B - io.stout(i).bits.redirect := DontCare - io.stout(i).bits.brUpdate := DontCare - io.stout(i).bits.debug.isMMIO := data(storeWbSel(i)).mmio - io.stout(i).valid := storeWbSelVec(storeWbSel(i)) && storeWbValid(i) - when(io.stout(i).fire()) { - writebacked(storeWbSel(i)) := true.B - } - }) - - // remove retired insts from lsroq, add retired store to sbuffer - - // move tailPtr - // allocatedMask: dequeuePtr can go to the next 1-bit - val allocatedMask = VecInit((0 until LsroqSize).map(i => allocated(i) || !enqDeqMask(i))) - // find the first one 
from deqPtr (ringBufferTail) - val nextTail1 = getFirstOneWithFlag(allocatedMask, tailMask, ringBufferTailExtended(InnerLsroqIdxWidth)) - val nextTail = Mux(Cat(allocatedMask).orR, nextTail1, ringBufferHeadExtended) - ringBufferTailExtended := nextTail - - // send commited store inst to sbuffer - // select up to 2 writebacked store insts - // scommitPending, scommitIn, scommitOut are for debug only - val commitedStoreQueue = Module(new MIMOQueue( - UInt(InnerLsroqIdxWidth.W), - entries = LsroqSize, - inCnt = 6, - outCnt = 2, - mem = false, - perf = true - )) - - // scommit counter for debugging - val scommitPending = RegInit(0.U(log2Up(LsroqSize).W)) - val scommitIn = PopCount(VecInit(storeCommit).asUInt) - val scommitOut = PopCount(VecInit((0 until 2).map(i => commitedStoreQueue.io.deq(i).fire())).asUInt) - scommitPending := scommitPending + scommitIn - scommitOut - - commitedStoreQueue.io.flush := false.B - - // When store commited, mark it as commited (will not be influenced by redirect), - // then add store's lsroq ptr into commitedStoreQueue - (0 until CommitWidth).map(i => { - when(storeCommit(i)) { - commited(mcommitIdx(i)) := true.B - XSDebug("store commit %d: idx %d %x\n", i.U, mcommitIdx(i), uop(mcommitIdx(i)).cf.pc) - } - commitedStoreQueue.io.enq(i).valid := storeCommit(i) - commitedStoreQueue.io.enq(i).bits := mcommitIdx(i) - // We assume commitedStoreQueue.io.enq(i).ready === true.B, - // for commitedStoreQueue.size = 64 - }) - - // When load commited, mark it as !allocated, this entry will be recycled later - (0 until CommitWidth).map(i => { - when(loadCommit(i)) { - allocated(mcommitIdx(i)) := false.B - XSDebug("load commit %d: idx %d %x\n", i.U, mcommitIdx(i), uop(mcommitIdx(i)).cf.pc) - } - }) - - // get no more than 2 commited store from storeCommitedQueue - // send selected store inst to sbuffer - (0 until 2).map(i => { - val ptr = commitedStoreQueue.io.deq(i).bits - val mmio = data(ptr).mmio - io.sbuffer(i).valid := 
commitedStoreQueue.io.deq(i).valid && !mmio - io.sbuffer(i).bits.cmd := MemoryOpConstants.M_XWR - io.sbuffer(i).bits.addr := data(ptr).paddr - io.sbuffer(i).bits.data := data(ptr).data - io.sbuffer(i).bits.mask := data(ptr).mask - io.sbuffer(i).bits.meta := DontCare - io.sbuffer(i).bits.meta.tlb_miss := false.B - io.sbuffer(i).bits.meta.uop := uop(ptr) - io.sbuffer(i).bits.meta.mmio := mmio - io.sbuffer(i).bits.meta.mask := data(ptr).mask - - commitedStoreQueue.io.deq(i).ready := io.sbuffer(i).fire() || mmio - - // update lsroq meta if store inst is send to sbuffer - when(commitedStoreQueue.io.deq(i).valid && (mmio || io.sbuffer(i).ready)) { - allocated(commitedStoreQueue.io.deq(i).bits) := false.B - } - }) - - // load forward query - // check over all lsroq entries and forward data from the first matched store - (0 until LoadPipelineWidth).map(i => { - io.forward(i).forwardMask := 0.U(8.W).asBools - io.forward(i).forwardData := DontCare - - // Compare ringBufferTail (deqPtr) and forward.lsroqIdx, we have two cases: - // (1) if they have the same flag, we need to check range(tail, lsroqIdx) - // (2) if they have different flags, we need to check range(tail, lsroqSize) and range(0, lsroqIdx) - // Forward1: Mux(same_flag, range(tail, lsroqIdx), range(tail, lsroqSize)) - // Forward2: Mux(same_flag, 0.U, range(0, lsroqIdx) ) - // i.e. 
forward1 is the target entries with the same flag bits and forward2 otherwise - val forwardMask1 = WireInit(VecInit(Seq.fill(8)(false.B))) - val forwardData1 = WireInit(VecInit(Seq.fill(8)(0.U(8.W)))) - val forwardMask2 = WireInit(VecInit(Seq.fill(8)(false.B))) - val forwardData2 = WireInit(VecInit(Seq.fill(8)(0.U(8.W)))) - - val differentFlag = ringBufferTailExtended(InnerLsroqIdxWidth) =/= io.forward(i).lsroqIdx(InnerLsroqIdxWidth) - val forwardMask = ((1.U((LsroqSize + 1).W)) << io.forward(i).lsroqIdx(InnerLsroqIdxWidth - 1, 0)).asUInt - 1.U - val needForward1 = Mux(differentFlag, ~tailMask, tailMask ^ forwardMask) - val needForward2 = Mux(differentFlag, forwardMask, 0.U(LsroqSize.W)) - - // entry with larger index should have higher priority since it's data is younger - for (j <- 0 until LsroqSize) { - val needCheck = valid(j) && allocated(j) && store(j) && - io.forward(i).paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3) - (0 until XLEN / 8).foreach(k => { - when (needCheck && data(j).mask(k)) { - when (needForward1(j)) { - forwardMask1(k) := true.B - forwardData1(k) := data(j).data(8 * (k + 1) - 1, 8 * k) - } - when (needForward2(j)) { - forwardMask2(k) := true.B - forwardData2(k) := data(j).data(8 * (k + 1) - 1, 8 * k) - } - XSDebug(needForward1(j) || needForward2(j), - p"forwarding $k-th byte ${Hexadecimal(data(j).data(8 * (k + 1) - 1, 8 * k))} " + - p"from ptr $j pc ${Hexadecimal(uop(j).cf.pc)}\n") - } - }) - } - - // merge forward lookup results - // forward2 is younger than forward1 and should have higher priority - (0 until XLEN / 8).map(k => { - io.forward(i).forwardMask(k) := forwardMask1(k) || forwardMask2(k) - io.forward(i).forwardData(k) := Mux(forwardMask2(k), forwardData2(k), forwardData1(k)) - }) - }) - - // rollback check - val rollback = Wire(Vec(StorePipelineWidth, Valid(new Redirect))) - - def getFirstOne(mask: Vec[Bool], startMask: UInt) = { - val length = mask.length - val highBits = (0 until length).map(i => mask(i) & 
~startMask(i)) - val highBitsUint = Cat(highBits.reverse) - PriorityEncoder(Mux(highBitsUint.orR(), highBitsUint, mask.asUInt)) - } - - def getFirstOneWithFlag(mask: Vec[Bool], startMask: UInt, startFlag: UInt) = { - val length = mask.length - val highBits = (0 until length).map(i => mask(i) & ~startMask(i)) - val highBitsUint = Cat(highBits.reverse) - val changeDirection = !highBitsUint.orR() - val index = PriorityEncoder(Mux(!changeDirection, highBitsUint, mask.asUInt)) - Cat(startFlag ^ changeDirection, index) - } - - def getOldestInTwo(valid: Seq[Bool], uop: Seq[MicroOp]) = { - assert(valid.length == uop.length) - assert(valid.length == 2) - Mux(valid(0) && valid(1), - Mux(isAfter(uop(0).roqIdx, uop(1).roqIdx), uop(1), uop(0)), - Mux(valid(0) && !valid(1), uop(0), uop(1))) - } - - def getAfterMask(valid: Seq[Bool], uop: Seq[MicroOp]) = { - assert(valid.length == uop.length) - val length = valid.length - (0 until length).map(i => { - (0 until length).map(j => { - Mux(valid(i) && valid(j), - isAfter(uop(i).roqIdx, uop(j).roqIdx), - Mux(!valid(i), true.B, false.B)) - }) - }) - } - - def rangeMask(start: UInt, end: UInt): UInt = { - val startMask = (1.U((LsroqSize + 1).W) << start(InnerLsroqIdxWidth - 1, 0)).asUInt - 1.U - val endMask = (1.U((LsroqSize + 1).W) << end(InnerLsroqIdxWidth - 1, 0)).asUInt - 1.U - val xorMask = startMask(LsroqSize - 1, 0) ^ endMask(LsroqSize - 1, 0) - Mux(start(InnerLsroqIdxWidth) === end(InnerLsroqIdxWidth), xorMask, ~xorMask) - } - - // store backward query and rollback - // val needCheck = Seq.fill(8)(WireInit(true.B)) - (0 until StorePipelineWidth).foreach(i => { - rollback(i) := DontCare - - when(io.storeIn(i).valid) { - val startIndex = io.storeIn(i).bits.uop.lsroqIdx(InnerLsroqIdxWidth - 1, 0) - val lsroqIdxMask = ((1.U((LsroqSize + 1).W) << startIndex).asUInt - 1.U)(LsroqSize - 1, 0) - val xorMask = lsroqIdxMask ^ headMask - val sameFlag = io.storeIn(i).bits.uop.lsroqIdx(InnerLsroqIdxWidth) === 
ringBufferHeadExtended(InnerLsroqIdxWidth) - val toEnqPtrMask = Mux(sameFlag, xorMask, ~xorMask) - val lsroqViolationVec = VecInit((0 until LsroqSize).map(j => { - val addrMatch = allocated(j) && - io.storeIn(i).bits.paddr(PAddrBits - 1, 3) === data(j).paddr(PAddrBits - 1, 3) - val entryNeedCheck = toEnqPtrMask(j) && addrMatch && !store(j) && (valid(j) || listening(j) || miss(j)) - // TODO: update refilled data - val violationVec = (0 until 8).map(k => data(j).mask(k) && io.storeIn(i).bits.mask(k)) - Cat(violationVec).orR() && entryNeedCheck - })) - val lsroqViolation = lsroqViolationVec.asUInt().orR() - val lsroqViolationIndex = getFirstOne(lsroqViolationVec, lsroqIdxMask) - val lsroqViolationUop = uop(lsroqViolationIndex) - XSDebug(lsroqViolation, p"${Binary(Cat(lsroqViolationVec))}, $startIndex, $lsroqViolationIndex\n") - - // when l/s writeback to roq together, check if rollback is needed - val wbViolationVec = VecInit((0 until LoadPipelineWidth).map(j => { - io.loadIn(j).valid && - isAfter(io.loadIn(j).bits.uop.roqIdx, io.storeIn(i).bits.uop.roqIdx) && - io.storeIn(i).bits.paddr(PAddrBits - 1, 3) === io.loadIn(j).bits.paddr(PAddrBits - 1, 3) && - (io.storeIn(i).bits.mask & io.loadIn(j).bits.mask).orR - })) - val wbViolation = wbViolationVec.asUInt().orR() - val wbViolationUop = getOldestInTwo(wbViolationVec, io.loadIn.map(_.bits.uop)) - XSDebug(wbViolation, p"${Binary(Cat(wbViolationVec))}, $wbViolationUop\n") - - // check if rollback is needed for load in l4 - val l4ViolationVec = VecInit((0 until LoadPipelineWidth).map(j => { - io.forward(j).valid && // L4 valid\ - isAfter(io.forward(j).uop.roqIdx, io.storeIn(i).bits.uop.roqIdx) && - io.storeIn(i).bits.paddr(PAddrBits - 1, 3) === io.forward(j).paddr(PAddrBits - 1, 3) && - (io.storeIn(i).bits.mask & io.forward(j).mask).orR - })) - val l4Violation = l4ViolationVec.asUInt().orR() - val l4ViolationUop = getOldestInTwo(l4ViolationVec, io.forward.map(_.uop)) - - val rollbackValidVec = Seq(lsroqViolation, 
wbViolation, l4Violation) - val rollbackUopVec = Seq(lsroqViolationUop, wbViolationUop, l4ViolationUop) - rollback(i).valid := Cat(rollbackValidVec).orR - val mask = getAfterMask(rollbackValidVec, rollbackUopVec) - val oneAfterZero = mask(1)(0) - val rollbackUop = Mux(oneAfterZero && mask(2)(0), - rollbackUopVec(0), - Mux(!oneAfterZero && mask(2)(1), rollbackUopVec(1), rollbackUopVec(2))) - rollback(i).bits.roqIdx := rollbackUop.roqIdx - 1.U - - rollback(i).bits.isReplay := true.B - rollback(i).bits.isMisPred := false.B - rollback(i).bits.isException := false.B - rollback(i).bits.isFlushPipe := false.B - - XSDebug( - lsroqViolation, - "need rollback (ld wb before store) pc %x roqidx %d target %x\n", - io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, lsroqViolationUop.roqIdx.asUInt - ) - XSDebug( - wbViolation, - "need rollback (ld/st wb together) pc %x roqidx %d target %x\n", - io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, wbViolationUop.roqIdx.asUInt - ) - XSDebug( - l4Violation, - "need rollback (l4 load) pc %x roqidx %d target %x\n", - io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.roqIdx.asUInt, l4ViolationUop.roqIdx.asUInt - ) - }.otherwise { - rollback(i).valid := false.B - } - }) - - def rollbackSel(a: Valid[Redirect], b: Valid[Redirect]): ValidIO[Redirect] = { - Mux( - a.valid, - Mux( - b.valid, - Mux(isAfter(a.bits.roqIdx, b.bits.roqIdx), b, a), // a,b both valid, sel oldest - a // sel a - ), - b // sel b - ) - } - - io.rollback := ParallelOperation(rollback, rollbackSel) - - // Memory mapped IO / other uncached operations - - // setup misc mem access req - // mask / paddr / data can be get from lsroq.data - val commitType = io.commits(0).bits.uop.ctrl.commitType - io.uncache.req.valid := pending(ringBufferTail) && allocated(ringBufferTail) && - (commitType === CommitType.STORE || commitType === CommitType.LOAD) && - io.roqDeqPtr === uop(ringBufferTail).roqIdx && - !io.commits(0).bits.isWalk - - 
io.uncache.req.bits.cmd := Mux(store(ringBufferTail), MemoryOpConstants.M_XWR, MemoryOpConstants.M_XRD) - io.uncache.req.bits.addr := data(ringBufferTail).paddr - io.uncache.req.bits.data := data(ringBufferTail).data - io.uncache.req.bits.mask := data(ringBufferTail).mask - - io.uncache.req.bits.meta.id := DontCare // TODO: // FIXME - io.uncache.req.bits.meta.vaddr := DontCare - io.uncache.req.bits.meta.paddr := data(ringBufferTail).paddr - io.uncache.req.bits.meta.uop := uop(ringBufferTail) - io.uncache.req.bits.meta.mmio := true.B // data(ringBufferTail).mmio - io.uncache.req.bits.meta.tlb_miss := false.B - io.uncache.req.bits.meta.mask := data(ringBufferTail).mask - io.uncache.req.bits.meta.replay := false.B - - io.uncache.resp.ready := true.B - io.uncache.s1_kill := false.B - - when(io.uncache.req.fire()){ - pending(ringBufferTail) := false.B - } - - when(io.uncache.resp.fire()){ - valid(ringBufferTail) := true.B - data(ringBufferTail).data := io.uncache.resp.bits.data(XLEN-1, 0) - // TODO: write back exception info - } - - when(io.uncache.req.fire()){ - XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n", - uop(ringBufferTail).cf.pc, - io.uncache.req.bits.addr, - io.uncache.req.bits.data, - io.uncache.req.bits.cmd, - io.uncache.req.bits.mask - ) - } - - when(io.uncache.resp.fire()){ - XSDebug("uncache resp: data %x\n", io.dcache.resp.bits.data) - } - - // Read vaddr for mem exception - val mexcLsroqIdx = WireInit(0.U(LsroqIdxWidth.W)) - val memExceptionAddr = WireInit(data(mexcLsroqIdx(InnerLsroqIdxWidth - 1, 0)).vaddr) - ExcitingUtils.addSink(mexcLsroqIdx, "EXECPTION_LSROQIDX") - ExcitingUtils.addSource(memExceptionAddr, "EXECPTION_VADDR") - - // misprediction recovery / exception redirect - // invalidate lsroq term using robIdx - val needCancel = Wire(Vec(LsroqSize, Bool())) - for (i <- 0 until LsroqSize) { - needCancel(i) := uop(i).roqIdx.needFlush(io.brqRedirect) && allocated(i) && !commited(i) - when(needCancel(i)) { - 
when(io.brqRedirect.bits.isReplay){ - valid(i) := false.B - store(i) := false.B - writebacked(i) := false.B - listening(i) := false.B - miss(i) := false.B - pending(i) := false.B - }.otherwise{ - allocated(i) := false.B - } - } - } - when (io.brqRedirect.valid && io.brqRedirect.bits.isMisPred) { - ringBufferHeadExtended := ringBufferHeadExtended - PopCount(needCancel) - } - - // assert(!io.rollback.valid) - when(io.rollback.valid) { - XSDebug("Mem rollback: pc %x roqidx %d\n", io.rollback.bits.pc, io.rollback.bits.roqIdx.asUInt) - } - - // debug info - XSDebug("head %d:%d tail %d:%d scommit %d\n", ringBufferHeadExtended(InnerLsroqIdxWidth), ringBufferHead, ringBufferTailExtended(InnerLsroqIdxWidth), ringBufferTail, scommitPending) - - def PrintFlag(flag: Bool, name: String): Unit = { - when(flag) { - XSDebug(false, true.B, name) - }.otherwise { - XSDebug(false, true.B, " ") - } - } - - for (i <- 0 until LsroqSize) { - if (i % 4 == 0) XSDebug("") - XSDebug(false, true.B, "%x ", uop(i).cf.pc) - PrintFlag(allocated(i), "a") - PrintFlag(allocated(i) && valid(i), "v") - PrintFlag(allocated(i) && writebacked(i), "w") - PrintFlag(allocated(i) && commited(i), "c") - PrintFlag(allocated(i) && store(i), "s") - PrintFlag(allocated(i) && miss(i), "m") - PrintFlag(allocated(i) && listening(i), "l") - PrintFlag(allocated(i) && pending(i), "p") - XSDebug(false, true.B, " ") - if (i % 4 == 3) XSDebug(false, true.B, "\n") - } - - XSPerf("utilization", PopCount(allocated)) - XSPerf("storeWait", PopCount((0 until LsroqSize).map(i => allocated(i) && store(i) && commited(i)))) - XSPerf("enqInstr", PopCount(io.dp1Req.map(_.fire()))) - XSPerf("replayInstr", Mux(io.brqRedirect.valid && io.brqRedirect.bits.isReplay, PopCount(needCancel), 0.U)) -} diff --git a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala index 964131e69926275622d63d949602002535ca5938..ad39f03c5bf71260a5134a6fdc28ebf20f5d5202 100644 --- 
a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala @@ -16,6 +16,7 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ val flush_sbuffer = new SbufferFlushBundle val tlbFeedback = ValidIO(new TlbFeedback) val redirect = Flipped(ValidIO(new Redirect)) + val exceptionAddr = ValidIO(UInt(VAddrBits.W)) }) //------------------------------------------------------- @@ -31,8 +32,8 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ val resp_data = Reg(UInt()) val is_lrsc_valid = Reg(Bool()) - ExcitingUtils.addSource(in.src1, "ATOM_EXECPTION_VADDR") - ExcitingUtils.addSource(atom_override_xtval, "ATOM_OVERRIDE_XTVAL") + io.exceptionAddr.valid := atom_override_xtval + io.exceptionAddr.bits := in.src1 // assign default value to output signals io.in.ready := false.B @@ -41,11 +42,11 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ io.dcache.req.valid := false.B io.dcache.req.bits := DontCare - io.dcache.s1_kill := false.B io.dcache.resp.ready := false.B io.dtlb.req.valid := false.B io.dtlb.req.bits := DontCare + io.dtlb.resp.ready := false.B io.flush_sbuffer.valid := false.B @@ -75,12 +76,12 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ io.dtlb.req.valid := true.B io.dtlb.req.bits.vaddr := in.src1 io.dtlb.req.bits.roqIdx := in.uop.roqIdx + io.dtlb.resp.ready := true.B val is_lr = in.uop.ctrl.fuOpType === LSUOpType.lr_w || in.uop.ctrl.fuOpType === LSUOpType.lr_d io.dtlb.req.bits.cmd := Mux(is_lr, TlbCmd.read, TlbCmd.write) io.dtlb.req.bits.debug.pc := in.uop.cf.pc - io.dtlb.req.bits.debug.lsroqIdx := in.uop.lsroqIdx // FIXME: need update - when(io.dtlb.resp.valid && !io.dtlb.resp.bits.miss){ + when(io.dtlb.resp.fire && !io.dtlb.resp.bits.miss){ // exception handling val addrAligned = LookupTree(in.uop.ctrl.fuOpType(1,0), List( "b00".U -> true.B, //b @@ -144,7 +145,7 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ LSUOpType.amomaxu_d -> M_XA_MAXU )) 
- io.dcache.req.bits.addr := paddr + io.dcache.req.bits.addr := paddr io.dcache.req.bits.data := genWdata(in.src2, in.uop.ctrl.fuOpType(1,0)) // TODO: atomics do need mask: fix mask io.dcache.req.bits.mask := genWmask(paddr, in.uop.ctrl.fuOpType(1,0)) @@ -222,4 +223,4 @@ class AtomicsUnit extends XSModule with MemoryOpConstants{ when(io.redirect.valid){ atom_override_xtval := false.B } -} +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 30bf974215d8c0d52b743149e3429bd59a631794..b8b14c7e942df632a1da3c2b07c878827a50291c 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -4,259 +4,177 @@ import chisel3._ import chisel3.util._ import utils._ import xiangshan._ -import xiangshan.cache.{DCacheWordIO, TlbRequestIO, TlbCmd, MemoryOpConstants} +import xiangshan.cache._ +// import xiangshan.cache.{DCacheWordIO, TlbRequestIO, TlbCmd, MemoryOpConstants, TlbReq, DCacheLoadReq, DCacheWordResp} import xiangshan.backend.LSUOpType +import xiangshan.backend.fu.fpu.boxF32ToF64 -class LoadToLsroqIO extends XSBundle { +class LoadToLsqIO extends XSBundle { val loadIn = ValidIO(new LsPipelineBundle) val ldout = Flipped(DecoupledIO(new ExuOutput)) val forward = new LoadForwardQueryIO } -class LoadUnit extends XSModule { +// Load Pipeline Stage 0 +// Generate addr, use addr to query DCache and DTLB +class LoadUnit_S0 extends XSModule { val io = IO(new Bundle() { - val ldin = Flipped(Decoupled(new ExuInput)) - val ldout = Decoupled(new ExuOutput) + val in = Flipped(Decoupled(new ExuInput)) + val out = Decoupled(new LsPipelineBundle) val redirect = Flipped(ValidIO(new Redirect)) + val dtlbReq = DecoupledIO(new TlbReq) + val dtlbResp = Flipped(DecoupledIO(new TlbResp)) val tlbFeedback = ValidIO(new TlbFeedback) - val dcache = new DCacheWordIO - val dtlb = new TlbRequestIO() - val sbuffer = new 
LoadForwardQueryIO - val lsroq = new LoadToLsroqIO + val dcacheReq = DecoupledIO(new DCacheLoadReq) }) - - when(io.ldin.valid){ - XSDebug("load enpipe %x iw %x fw %x\n", io.ldin.bits.uop.cf.pc, io.ldin.bits.uop.ctrl.rfWen, io.ldin.bits.uop.ctrl.fpWen) - } - //------------------------------------------------------- - // Load Pipeline - //------------------------------------------------------- - - val l2_out = Wire(Decoupled(new LsPipelineBundle)) - val l4_out = Wire(Decoupled(new LsPipelineBundle)) - val l5_in = Wire(Flipped(Decoupled(new LsPipelineBundle))) - - //------------------------------------------------------- - // LD Pipeline Stage 2 - // Generate addr, use addr to query DCache Tag and DTLB - //------------------------------------------------------- - - val l2_dtlb_hit = Wire(new Bool()) - val l2_dtlb_miss = Wire(new Bool()) - val l2_dcache = Wire(new Bool()) - val l2_mmio = Wire(new Bool()) - val isMMIOReq = Wire(new Bool()) - - // send req to dtlb - io.dtlb.req.valid := l2_out.valid - io.dtlb.req.bits.vaddr := l2_out.bits.vaddr - io.dtlb.req.bits.cmd := TlbCmd.read - io.dtlb.req.bits.roqIdx := l2_out.bits.uop.roqIdx - io.dtlb.req.bits.debug.pc := l2_out.bits.uop.cf.pc - io.dtlb.req.bits.debug.lsroqIdx := l2_out.bits.uop.lsroqIdx // FIXME: need update - - l2_dtlb_hit := io.dtlb.resp.valid && !io.dtlb.resp.bits.miss - l2_dtlb_miss := io.dtlb.resp.valid && io.dtlb.resp.bits.miss - isMMIOReq := AddressSpace.isMMIO(io.dtlb.resp.bits.paddr) - l2_dcache := l2_dtlb_hit && !isMMIOReq - l2_mmio := l2_dtlb_hit && isMMIOReq - - // l2_out is used to generate dcache req - l2_out.bits := DontCare - l2_out.bits.vaddr := io.ldin.bits.src1 + io.ldin.bits.uop.ctrl.imm - l2_out.bits.paddr := io.dtlb.resp.bits.paddr - l2_out.bits.mask := genWmask(l2_out.bits.vaddr, io.ldin.bits.uop.ctrl.fuOpType(1,0)) - l2_out.bits.uop := io.ldin.bits.uop - l2_out.bits.miss := false.B - l2_out.bits.mmio := l2_mmio - l2_out.valid := io.ldin.valid && 
!io.ldin.bits.uop.roqIdx.needFlush(io.redirect) - // when we are sure it's a MMIO req, we do not need to wait for cache ready - l2_out.ready := (l2_dcache && io.dcache.req.ready) || l2_mmio || l2_dtlb_miss - io.ldin.ready := l2_out.ready - - // exception check - val addrAligned = LookupTree(io.ldin.bits.uop.ctrl.fuOpType(1,0), List( - "b00".U -> true.B, //b - "b01".U -> (l2_out.bits.vaddr(0) === 0.U), //h - "b10".U -> (l2_out.bits.vaddr(1,0) === 0.U), //w - "b11".U -> (l2_out.bits.vaddr(2,0) === 0.U) //d + val s0_uop = io.in.bits.uop + val s0_vaddr = io.in.bits.src1 + s0_uop.ctrl.imm + val s0_paddr = io.dtlbResp.bits.paddr + val s0_tlb_miss = io.dtlbResp.bits.miss + val s0_mask = genWmask(s0_vaddr, s0_uop.ctrl.fuOpType(1,0)) + + // query DTLB + io.dtlbReq.valid := io.out.valid + io.dtlbReq.bits.vaddr := s0_vaddr + io.dtlbReq.bits.cmd := TlbCmd.read + io.dtlbReq.bits.roqIdx := s0_uop.roqIdx + io.dtlbReq.bits.debug.pc := s0_uop.cf.pc + io.dtlbResp.ready := io.out.ready // TODO: check it: io.out.fire()? 
+ + // feedback tlb result to RS + // Note: can be moved to s1 + io.tlbFeedback.valid := io.out.valid + io.tlbFeedback.bits.hit := !s0_tlb_miss + io.tlbFeedback.bits.roqIdx := s0_uop.roqIdx + + // query DCache + io.dcacheReq.valid := io.in.valid && !s0_uop.roqIdx.needFlush(io.redirect) + io.dcacheReq.bits.cmd := MemoryOpConstants.M_XRD + io.dcacheReq.bits.addr := s0_vaddr + io.dcacheReq.bits.mask := s0_mask + io.dcacheReq.bits.data := DontCare + + // TODO: update cache meta + io.dcacheReq.bits.meta.id := DontCare + io.dcacheReq.bits.meta.vaddr := s0_vaddr + io.dcacheReq.bits.meta.paddr := DontCare + io.dcacheReq.bits.meta.uop := s0_uop + io.dcacheReq.bits.meta.mmio := false.B + io.dcacheReq.bits.meta.tlb_miss := false.B + io.dcacheReq.bits.meta.mask := s0_mask + io.dcacheReq.bits.meta.replay := false.B + + val addrAligned = LookupTree(s0_uop.ctrl.fuOpType(1, 0), List( + "b00".U -> true.B, //b + "b01".U -> (s0_vaddr(0) === 0.U), //h + "b10".U -> (s0_vaddr(1, 0) === 0.U), //w + "b11".U -> (s0_vaddr(2, 0) === 0.U) //d )) - l2_out.bits.uop.cf.exceptionVec(loadAddrMisaligned) := !addrAligned - l2_out.bits.uop.cf.exceptionVec(loadPageFault) := io.dtlb.resp.bits.excp.pf.ld - - // send result to dcache - // never send tlb missed or MMIO reqs to dcache - io.dcache.req.valid := l2_dcache - - io.dcache.req.bits.cmd := MemoryOpConstants.M_XRD - // TODO: vaddr - io.dcache.req.bits.addr := io.dtlb.resp.bits.paddr - io.dcache.req.bits.data := DontCare - io.dcache.req.bits.mask := l2_out.bits.mask - - io.dcache.req.bits.meta.id := DontCare - io.dcache.req.bits.meta.vaddr := l2_out.bits.vaddr - io.dcache.req.bits.meta.paddr := io.dtlb.resp.bits.paddr - io.dcache.req.bits.meta.uop := l2_out.bits.uop - io.dcache.req.bits.meta.mmio := isMMIOReq - io.dcache.req.bits.meta.tlb_miss := io.dtlb.resp.bits.miss - io.dcache.req.bits.meta.mask := l2_out.bits.mask - io.dcache.req.bits.meta.replay := false.B - - - val l2_tlbFeedback = Wire(new TlbFeedback) - l2_tlbFeedback.hit := 
!io.dtlb.resp.bits.miss - l2_tlbFeedback.roqIdx := l2_out.bits.uop.roqIdx - - // dump l2 - XSDebug(l2_out.valid, "L2: pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x dltb_miss %b dcache %b mmio %b\n", - l2_out.bits.uop.cf.pc, l2_out.bits.vaddr, l2_out.bits.paddr, - l2_out.bits.uop.ctrl.fuOpType, l2_out.bits.data, l2_out.bits.mask, - l2_dtlb_miss, l2_dcache, l2_mmio) - - XSDebug(l2_out.fire(), "load req: pc 0x%x addr 0x%x -> 0x%x op %b\n", - l2_out.bits.uop.cf.pc, l2_out.bits.vaddr, l2_out.bits.paddr, l2_out.bits.uop.ctrl.fuOpType) - - XSDebug(io.dcache.req.valid, p"dcache req(${io.dcache.req.valid} ${io.dcache.req.ready}): pc:0x${Hexadecimal(io.dcache.req.bits.meta.uop.cf.pc)} roqIdx:${io.dcache.req.bits.meta.uop.roqIdx} lsroqIdx:${io.dcache.req.bits.meta.uop.lsroqIdx} addr:0x${Hexadecimal(io.dcache.req.bits.addr)} vaddr:0x${Hexadecimal(io.dcache.req.bits.meta.vaddr)} paddr:0x${Hexadecimal(io.dcache.req.bits.meta.paddr)} mmio:${io.dcache.req.bits.meta.mmio} tlb_miss:${io.dcache.req.bits.meta.tlb_miss} mask:${io.dcache.req.bits.meta.mask}\n") - - //------------------------------------------------------- - // LD Pipeline Stage 3 - // Compare tag, use addr to query DCache Data - //------------------------------------------------------- - - val l3_valid = RegNext(l2_out.fire(), false.B) - val l3_dtlb_miss = RegEnable(next = l2_dtlb_miss, enable = l2_out.fire(), init = false.B) - val l3_dcache = RegEnable(next = l2_dcache, enable = l2_out.fire(), init = false.B) - val l3_tlbFeedback = RegEnable(next = l2_tlbFeedback, enable = l2_out.fire()) - val l3_bundle = RegEnable(next = l2_out.bits, enable = l2_out.fire()) - val l3_uop = l3_bundle.uop - // dltb miss reqs ends here - val l3_passdown = l3_valid && !l3_dtlb_miss && !l3_uop.roqIdx.needFlush(io.redirect) - - io.tlbFeedback.valid := l3_valid - io.tlbFeedback.bits := l3_tlbFeedback - io.dcache.s1_kill := l3_valid && l3_dcache && l3_uop.roqIdx.needFlush(io.redirect) - - // dump l3 - XSDebug(l3_valid, "l3: pc 0x%x addr 
0x%x -> 0x%x op %b data 0x%x mask %x dltb_miss %b dcache %b mmio %b\n", - l3_bundle.uop.cf.pc, l3_bundle.vaddr, l3_bundle.paddr, - l3_bundle.uop.ctrl.fuOpType, l3_bundle.data, l3_bundle.mask, - l3_dtlb_miss, l3_dcache, l3_bundle.mmio) - - XSDebug(io.tlbFeedback.valid, "tlbFeedback: hit %b roqIdx %d\n", - io.tlbFeedback.bits.hit, io.tlbFeedback.bits.roqIdx.asUInt) - - XSDebug(io.dcache.s1_kill, "l3: dcache s1_kill\n") - - // Done in Dcache - - //------------------------------------------------------- - // LD Pipeline Stage 4 - // Dcache return result, do tag ecc check and forward check - //------------------------------------------------------- - - val l4_valid = RegNext(l3_passdown, false.B) - val l4_dcache = RegNext(l3_dcache, false.B) - val l4_bundle = RegNext(l3_bundle) - - val fullForward = Wire(Bool()) - - assert(!(io.dcache.resp.ready && !io.dcache.resp.valid), "DCache response got lost") - io.dcache.resp.ready := l4_valid && l4_dcache - when (io.dcache.resp.fire()) { - l4_out.bits := DontCare - l4_out.bits.data := io.dcache.resp.bits.data - l4_out.bits.paddr := io.dcache.resp.bits.meta.paddr - l4_out.bits.uop := io.dcache.resp.bits.meta.uop - l4_out.bits.mmio := io.dcache.resp.bits.meta.mmio - l4_out.bits.mask := io.dcache.resp.bits.meta.mask - // when we can get the data completely from forward - // we no longer need to access dcache - // treat nack as miss - l4_out.bits.miss := Mux(fullForward, false.B, - io.dcache.resp.bits.miss || io.dcache.resp.bits.nack) - XSDebug(io.dcache.resp.fire(), p"DcacheResp(l4): data:0x${Hexadecimal(io.dcache.resp.bits.data)} paddr:0x${Hexadecimal(io.dcache.resp.bits.meta.paddr)} pc:0x${Hexadecimal(io.dcache.resp.bits.meta.uop.cf.pc)} roqIdx:${io.dcache.resp.bits.meta.uop.roqIdx} lsroqIdx:${io.dcache.resp.bits.meta.uop.lsroqIdx} miss:${io.dcache.resp.bits.miss}\n") - } .otherwise { - l4_out.bits := l4_bundle - } - l4_out.valid := l4_valid && !l4_out.bits.uop.roqIdx.needFlush(io.redirect) - - // Store addr forward match - // If 
match, get data / fmask from store queue / store buffer - - // io.lsroq.forward := DontCare - io.lsroq.forward.paddr := l4_out.bits.paddr - io.lsroq.forward.mask := io.dcache.resp.bits.meta.mask - io.lsroq.forward.lsroqIdx := l4_out.bits.uop.lsroqIdx - io.lsroq.forward.sqIdx := l4_out.bits.uop.sqIdx - io.lsroq.forward.uop := l4_out.bits.uop - io.lsroq.forward.pc := l4_out.bits.uop.cf.pc - io.lsroq.forward.valid := io.dcache.resp.valid //TODO: opt timing - - io.sbuffer.paddr := l4_out.bits.paddr - io.sbuffer.mask := io.dcache.resp.bits.meta.mask - io.sbuffer.lsroqIdx := l4_out.bits.uop.lsroqIdx - io.sbuffer.sqIdx := l4_out.bits.uop.sqIdx - io.sbuffer.uop := DontCare - io.sbuffer.pc := l4_out.bits.uop.cf.pc - io.sbuffer.valid := l4_out.valid - - val forwardVec = WireInit(io.sbuffer.forwardData) - val forwardMask = WireInit(io.sbuffer.forwardMask) - // generate XLEN/8 Muxs - (0 until XLEN/8).map(j => { - when(io.lsroq.forward.forwardMask(j)) { - forwardMask(j) := true.B - forwardVec(j) := io.lsroq.forward.forwardData(j) - } + + io.out.valid := io.dcacheReq.fire() && // dcache may not accept load request + !io.in.bits.uop.roqIdx.needFlush(io.redirect) + io.out.bits := DontCare + io.out.bits.vaddr := s0_vaddr + io.out.bits.paddr := s0_paddr + io.out.bits.tlbMiss := io.dtlbResp.bits.miss + io.out.bits.mask := s0_mask + io.out.bits.uop := s0_uop + io.out.bits.uop.cf.exceptionVec(loadAddrMisaligned) := !addrAligned + io.out.bits.uop.cf.exceptionVec(loadPageFault) := io.dtlbResp.bits.excp.pf.ld + + io.in.ready := io.out.fire() + + XSDebug(io.dcacheReq.fire(), "[DCACHE LOAD REQ] pc %x vaddr %x paddr will be %x\n", + s0_uop.cf.pc, s0_vaddr, s0_paddr + ) +} + + +// Load Pipeline Stage 1 +// TLB resp (send paddr to dcache) +class LoadUnit_S1 extends XSModule { + val io = IO(new Bundle() { + val in = Flipped(Decoupled(new LsPipelineBundle)) + val out = Decoupled(new LsPipelineBundle) + val redirect = Flipped(ValidIO(new Redirect)) + val s1_paddr = Output(UInt(PAddrBits.W)) + val 
sbuffer = new LoadForwardQueryIO + val lsq = new LoadForwardQueryIO }) - l4_out.bits.forwardMask := forwardMask - l4_out.bits.forwardData := forwardVec - fullForward := (~l4_out.bits.forwardMask.asUInt & l4_out.bits.mask) === 0.U - PipelineConnect(l4_out, l5_in, io.ldout.fire() || (l5_in.bits.miss || l5_in.bits.mmio) && l5_in.valid, false.B) + val s1_uop = io.in.bits.uop + val s1_paddr = io.in.bits.paddr + val s1_tlb_miss = io.in.bits.tlbMiss + val s1_mmio = !s1_tlb_miss && AddressSpace.isMMIO(s1_paddr) && !io.out.bits.uop.cf.exceptionVec.asUInt.orR + val s1_mask = io.in.bits.mask - XSDebug(l4_valid, "l4: out.valid:%d pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x forwardData: 0x%x forwardMask: %x dcache %b mmio %b miss:%d\n", - l4_out.valid, l4_out.bits.uop.cf.pc, l4_out.bits.vaddr, l4_out.bits.paddr, - l4_out.bits.uop.ctrl.fuOpType, l4_out.bits.data, l4_out.bits.mask, - l4_out.bits.forwardData.asUInt, l4_out.bits.forwardMask.asUInt, l4_dcache, l4_out.bits.mmio, l4_out.bits.miss) + io.out.bits := io.in.bits // forwardXX field will be updated in s1 + io.s1_paddr := s1_paddr - XSDebug(l5_in.valid, "L5(%d %d): pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x mask %x forwardData: 0x%x forwardMask: %x\n", - l5_in.valid, l5_in.ready, l5_in.bits.uop.cf.pc, l5_in.bits.vaddr, l5_in.bits.paddr, - l5_in.bits.uop.ctrl.fuOpType , l5_in.bits.data, l5_in.bits.mask, - l5_in.bits.forwardData.asUInt, l5_in.bits.forwardMask.asUInt) + // load forward query datapath + io.sbuffer.valid := io.in.valid + io.sbuffer.paddr := s1_paddr + io.sbuffer.uop := s1_uop + io.sbuffer.sqIdx := s1_uop.sqIdx + io.sbuffer.mask := s1_mask + io.sbuffer.pc := s1_uop.cf.pc // FIXME: remove it - XSDebug(l4_valid, "l4: sbuffer forwardData: 0x%x forwardMask: %x\n", - io.sbuffer.forwardData.asUInt, io.sbuffer.forwardMask.asUInt) + io.lsq.valid := io.in.valid + io.lsq.paddr := s1_paddr + io.lsq.uop := s1_uop + io.lsq.sqIdx := s1_uop.sqIdx + io.lsq.mask := s1_mask + io.lsq.pc := s1_uop.cf.pc // FIXME: remove it - 
XSDebug(l4_valid, "l4: lsroq forwardData: 0x%x forwardMask: %x\n", - io.lsroq.forward.forwardData.asUInt, io.lsroq.forward.forwardMask.asUInt) + io.out.bits.forwardMask := io.sbuffer.forwardMask + io.out.bits.forwardData := io.sbuffer.forwardData - XSDebug(io.redirect.valid, - p"Redirect: excp:${io.redirect.bits.isException} flushPipe:${io.redirect.bits.isFlushPipe} misp:${io.redirect.bits.isMisPred} " + - p"replay:${io.redirect.bits.isReplay} pc:0x${Hexadecimal(io.redirect.bits.pc)} target:0x${Hexadecimal(io.redirect.bits.target)} " + - p"brTag:${io.redirect.bits.brTag} l2:${io.ldin.bits.uop.roqIdx.needFlush(io.redirect)} l3:${l3_uop.roqIdx.needFlush(io.redirect)} " + - p"l4:${l4_out.bits.uop.roqIdx.needFlush(io.redirect)}\n" - ) - //------------------------------------------------------- - // LD Pipeline Stage 5 - // Do data ecc check, merge result and write back to LS ROQ - // If cache hit, return writeback result to CDB - //------------------------------------------------------- + io.out.valid := io.in.valid && !s1_tlb_miss && !s1_uop.roqIdx.needFlush(io.redirect) + io.out.bits.paddr := s1_paddr + io.out.bits.mmio := s1_mmio + io.out.bits.tlbMiss := s1_tlb_miss + + io.in.ready := io.out.ready || !io.in.valid + +} + + +// Load Pipeline Stage 2 +// DCache resp +class LoadUnit_S2 extends XSModule { + val io = IO(new Bundle() { + val in = Flipped(Decoupled(new LsPipelineBundle)) + val out = Decoupled(new LsPipelineBundle) + val redirect = Flipped(ValidIO(new Redirect)) + val dcacheResp = Flipped(DecoupledIO(new DCacheWordResp)) + val lsq = new LoadForwardQueryIO + }) + + val s2_uop = io.in.bits.uop + val s2_mask = io.in.bits.mask + val s2_paddr = io.in.bits.paddr + val s2_cache_miss = io.dcacheResp.bits.miss + val s2_cache_nack = io.dcacheResp.bits.nack + + + io.dcacheResp.ready := true.B + assert(!(io.in.valid && !io.dcacheResp.valid), "DCache response got lost") - val loadWriteBack = l5_in.fire() + val forwardMask = io.out.bits.forwardMask + val forwardData = 
io.out.bits.forwardData + val fullForward = (~forwardMask.asUInt & s2_mask) === 0.U + + XSDebug(io.out.fire(), "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n", + s2_uop.cf.pc, + io.lsq.forwardData.asUInt, io.lsq.forwardMask.asUInt, + io.in.bits.forwardData.asUInt, io.in.bits.forwardMask.asUInt + ) // data merge - val rdata = VecInit((0 until 8).map(j => { - Mux(l5_in.bits.forwardMask(j), - l5_in.bits.forwardData(j), - l5_in.bits.data(8*(j+1)-1, 8*j) - ) - })).asUInt - val func = l5_in.bits.uop.ctrl.fuOpType - val raddr = l5_in.bits.paddr - val rdataSel = LookupTree(raddr(2, 0), List( + val rdata = VecInit((0 until XLEN / 8).map(j => + Mux(forwardMask(j), forwardData(j), io.dcacheResp.bits.data(8*(j+1)-1, 8*j)))).asUInt + val rdataSel = LookupTree(s2_paddr(2, 0), List( "b000".U -> rdata(63, 0), "b001".U -> rdata(63, 8), "b010".U -> rdata(63, 16), @@ -266,51 +184,137 @@ class LoadUnit extends XSModule { "b110".U -> rdata(63, 48), "b111".U -> rdata(63, 56) )) - val rdataPartialLoad = LookupTree(func, List( + val rdataPartialLoad = LookupTree(s2_uop.ctrl.fuOpType, List( LSUOpType.lb -> SignExt(rdataSel(7, 0) , XLEN), LSUOpType.lh -> SignExt(rdataSel(15, 0), XLEN), LSUOpType.lw -> SignExt(rdataSel(31, 0), XLEN), LSUOpType.ld -> SignExt(rdataSel(63, 0), XLEN), LSUOpType.lbu -> ZeroExt(rdataSel(7, 0) , XLEN), LSUOpType.lhu -> ZeroExt(rdataSel(15, 0), XLEN), - LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN) + LSUOpType.lwu -> ZeroExt(rdataSel(31, 0), XLEN), + LSUOpType.flw -> boxF32ToF64(rdataSel(31, 0)) )) - // ecc check - // TODO + // TODO: ECC check - // if hit, writeback result to CDB - // val ldout = Vec(2, Decoupled(new ExuOutput)) - // when io.loadIn(i).fire() && !io.io.loadIn(i).miss, commit load to cdb - val hitLoadOut = Wire(Decoupled(new ExuOutput)) - hitLoadOut.bits.uop := l5_in.bits.uop - hitLoadOut.bits.data := rdataPartialLoad - hitLoadOut.bits.redirectValid := false.B - hitLoadOut.bits.redirect := DontCare - hitLoadOut.bits.brUpdate := DontCare - 
hitLoadOut.bits.debug.isMMIO := l5_in.bits.mmio - hitLoadOut.valid := l5_in.valid && !l5_in.bits.mmio && !l5_in.bits.miss // MMIO will be done in lsroq - XSDebug(hitLoadOut.fire(), "load writeback: pc %x data %x (%x + %x(%b))\n", - hitLoadOut.bits.uop.cf.pc, rdataPartialLoad, l5_in.bits.data, - l5_in.bits.forwardData.asUInt, l5_in.bits.forwardMask.asUInt + io.out.valid := io.in.valid // && !s2_uop.needFlush(io.redirect) will cause comb. loop + // Inst will be canceled in store queue / lsq, + // so we do not need to care about flush in load / store unit's out.valid + io.out.bits := io.in.bits + io.out.bits.data := rdataPartialLoad + io.out.bits.miss := (s2_cache_miss || s2_cache_nack) && !fullForward + io.out.bits.mmio := io.in.bits.mmio + + io.in.ready := io.out.ready || !io.in.valid + + // merge forward result + io.lsq := DontCare + // generate XLEN/8 Muxs + for (i <- 0 until XLEN / 8) { + when(io.lsq.forwardMask(i)) { + io.out.bits.forwardMask(i) := true.B + io.out.bits.forwardData(i) := io.lsq.forwardData(i) + } + } + + XSDebug(io.out.fire(), "[DCACHE LOAD RESP] pc %x rdata %x <- D$ %x + fwd %x(%b)\n", + s2_uop.cf.pc, rdataPartialLoad, io.dcacheResp.bits.data, + io.out.bits.forwardData.asUInt, io.out.bits.forwardMask.asUInt ) - // writeback to LSROQ - // Current dcache use MSHR +} + +// class LoadUnit_S3 extends XSModule { +// val io = IO(new Bundle() { +// val in = Flipped(Decoupled(new LsPipelineBundle)) +// val out = Decoupled(new LsPipelineBundle) +// val redirect = Flipped(ValidIO(new Redirect)) +// }) + +// io.in.ready := true.B +// io.out.bits := io.in.bits +// io.out.valid := io.in.valid && !io.out.bits.uop.roqIdx.needFlush(io.redirect) +// } + +class LoadUnit extends XSModule { + val io = IO(new Bundle() { + val ldin = Flipped(Decoupled(new ExuInput)) + val ldout = Decoupled(new ExuOutput) + val redirect = Flipped(ValidIO(new Redirect)) + val tlbFeedback = ValidIO(new TlbFeedback) + val dcache = new DCacheLoadIO + val dtlb = new TlbRequestIO() + val 
sbuffer = new LoadForwardQueryIO + val lsq = new LoadToLsqIO + }) - io.lsroq.loadIn.bits := l5_in.bits - io.lsroq.loadIn.bits.data := rdataPartialLoad // for debug - io.lsroq.loadIn.valid := loadWriteBack + val load_s0 = Module(new LoadUnit_S0) + val load_s1 = Module(new LoadUnit_S1) + val load_s2 = Module(new LoadUnit_S2) + // val load_s3 = Module(new LoadUnit_S3) + + load_s0.io.in <> io.ldin + load_s0.io.redirect <> io.redirect + load_s0.io.dtlbReq <> io.dtlb.req + load_s0.io.dtlbResp <> io.dtlb.resp + load_s0.io.dcacheReq <> io.dcache.req + load_s0.io.tlbFeedback <> io.tlbFeedback + + PipelineConnect(load_s0.io.out, load_s1.io.in, true.B, false.B) + + io.dcache.s1_paddr := load_s1.io.out.bits.paddr + load_s1.io.redirect <> io.redirect + io.dcache.s1_kill := DontCare // FIXME + io.sbuffer <> load_s1.io.sbuffer + io.lsq.forward <> load_s1.io.lsq + + PipelineConnect(load_s1.io.out, load_s2.io.in, true.B, false.B) + + load_s2.io.redirect <> io.redirect + load_s2.io.dcacheResp <> io.dcache.resp + load_s2.io.lsq := DontCare + load_s2.io.lsq.forwardData <> io.lsq.forward.forwardData + load_s2.io.lsq.forwardMask <> io.lsq.forward.forwardMask + + // PipelineConnect(load_s2.io.fp_out, load_s3.io.in, true.B, false.B) + // load_s3.io.redirect <> io.redirect + + XSDebug(load_s0.io.out.valid, + p"S0: pc ${Hexadecimal(load_s0.io.out.bits.uop.cf.pc)}, lId ${Hexadecimal(load_s0.io.out.bits.uop.lqIdx.asUInt)}, " + + p"vaddr ${Hexadecimal(load_s0.io.out.bits.vaddr)}, mask ${Hexadecimal(load_s0.io.out.bits.mask)}\n") + XSDebug(load_s1.io.out.valid, + p"S1: pc ${Hexadecimal(load_s1.io.out.bits.uop.cf.pc)}, lId ${Hexadecimal(load_s1.io.out.bits.uop.lqIdx.asUInt)}, tlb_miss ${io.dtlb.resp.bits.miss}, " + + p"paddr ${Hexadecimal(load_s1.io.out.bits.paddr)}, mmio ${load_s1.io.out.bits.mmio}\n") + + // writeback to LSQ + // Current dcache use MSHR + io.lsq.loadIn.valid := load_s2.io.out.valid + io.lsq.loadIn.bits := load_s2.io.out.bits - // pipeline control - l5_in.ready := 
io.ldout.ready + val hitLoadOut = Wire(Valid(new ExuOutput)) + hitLoadOut.valid := load_s2.io.out.valid && (!load_s2.io.out.bits.miss || load_s2.io.out.bits.uop.cf.exceptionVec.asUInt.orR) + hitLoadOut.bits.uop := load_s2.io.out.bits.uop + hitLoadOut.bits.data := load_s2.io.out.bits.data + hitLoadOut.bits.redirectValid := false.B + hitLoadOut.bits.redirect := DontCare + hitLoadOut.bits.brUpdate := DontCare + hitLoadOut.bits.debug.isMMIO := load_s2.io.out.bits.mmio + hitLoadOut.bits.fflags := DontCare - val cdbArb = Module(new Arbiter(new ExuOutput, 2)) - io.ldout <> cdbArb.io.out - hitLoadOut <> cdbArb.io.in(0) - io.lsroq.ldout <> cdbArb.io.in(1) // missLoadOut + // TODO: arbiter + // if hit, writeback result to CDB + // val ldout = Vec(2, Decoupled(new ExuOutput)) + // when io.loadIn(i).fire() && !io.io.loadIn(i).miss, commit load to cdb + // val cdbArb = Module(new Arbiter(new ExuOutput, 2)) + // io.ldout <> cdbArb.io.out + // hitLoadOut <> cdbArb.io.in(0) + // io.lsq.ldout <> cdbArb.io.in(1) // missLoadOut + load_s2.io.out.ready := true.B + io.lsq.ldout.ready := !hitLoadOut.valid + io.ldout.bits := Mux(hitLoadOut.valid, hitLoadOut.bits, io.lsq.ldout.bits) + io.ldout.valid := hitLoadOut.valid || io.lsq.ldout.valid when(io.ldout.fire()){ XSDebug("ldout %x iw %x fw %x\n", io.ldout.bits.uop.cf.pc, io.ldout.bits.uop.ctrl.rfWen, io.ldout.bits.uop.ctrl.fpWen) } -} +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index c1f9296ee027bcfed57013b3159089d784fcf078..d807375bd7cf457727b93c3148b25d3b587cb3f6 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -4,135 +4,154 @@ import chisel3._ import chisel3.util._ import utils._ import xiangshan._ -import xiangshan.cache.{TlbRequestIO, TlbCmd} +import xiangshan.cache._ -class StoreUnit extends XSModule { +// Store Pipeline Stage 0 +// Generate 
addr, use addr to query DCache and DTLB +class StoreUnit_S0 extends XSModule { val io = IO(new Bundle() { - val stin = Flipped(Decoupled(new ExuInput)) + val in = Flipped(Decoupled(new ExuInput)) + val out = Decoupled(new LsPipelineBundle) val redirect = Flipped(ValidIO(new Redirect)) + val dtlbReq = DecoupledIO(new TlbReq) + val dtlbResp = Flipped(DecoupledIO(new TlbResp)) val tlbFeedback = ValidIO(new TlbFeedback) - val dtlb = new TlbRequestIO() - val lsroq = ValidIO(new LsPipelineBundle) }) - //------------------------------------------------------- - // Store Pipeline - //------------------------------------------------------- - val s2_out = Wire(Decoupled(new LsPipelineBundle)) - val s3_in = Wire(Decoupled(new LsPipelineBundle)) - - - private def printPipeLine(pipeline: LsPipelineBundle, cond: Bool, name: String): Unit = { - XSDebug(cond, - p"$name" + p" pc ${Hexadecimal(pipeline.uop.cf.pc)} " + - p"addr ${Hexadecimal(pipeline.vaddr)} -> ${Hexadecimal(pipeline.paddr)} " + - p"op ${Binary(pipeline.uop.ctrl.fuOpType)} " + - p"data ${Hexadecimal(pipeline.data)} " + - p"mask ${Hexadecimal(pipeline.mask)}\n" - ) - } - - printPipeLine(s2_out.bits, s2_out.valid, "S2") - // TODO: is this nesscary ? 
- XSDebug(s2_out.fire(), "store req: pc 0x%x addr 0x%x -> 0x%x op %b data 0x%x\n", - s2_out.bits.uop.cf.pc, - s2_out.bits.vaddr, - s2_out.bits.paddr, - s2_out.bits.uop.ctrl.fuOpType, - s2_out.bits.data - ) - printPipeLine(s3_in.bits, s3_in.valid, "S3") - - - - //------------------------------------------------------- - // ST Pipeline Stage 2 - // Generate addr, use addr to query DTLB - //------------------------------------------------------- - // send req to dtlb - val saddr = io.stin.bits.src1 + io.stin.bits.uop.ctrl.imm - - io.dtlb.req.bits.vaddr := saddr - io.dtlb.req.valid := io.stin.valid - io.dtlb.req.bits.cmd := TlbCmd.write - io.dtlb.req.bits.roqIdx := io.stin.bits.uop.roqIdx - io.dtlb.req.bits.debug.pc := io.stin.bits.uop.cf.pc - io.dtlb.req.bits.debug.lsroqIdx := io.stin.bits.uop.lsroqIdx // FIXME: need update - - s2_out.bits := DontCare - s2_out.bits.vaddr := saddr - s2_out.bits.paddr := io.dtlb.resp.bits.paddr - s2_out.bits.data := genWdata(io.stin.bits.src2, io.stin.bits.uop.ctrl.fuOpType(1,0)) - s2_out.bits.uop := io.stin.bits.uop - s2_out.bits.miss := io.dtlb.resp.bits.miss - s2_out.bits.mask := genWmask(s2_out.bits.vaddr, io.stin.bits.uop.ctrl.fuOpType(1,0)) - s2_out.valid := io.stin.valid && !io.dtlb.resp.bits.miss && !s2_out.bits.uop.roqIdx.needFlush(io.redirect) - io.stin.ready := s2_out.ready + val saddr = io.in.bits.src1 + io.in.bits.uop.ctrl.imm + + io.dtlbReq.bits.vaddr := saddr + io.dtlbReq.valid := io.in.valid + io.dtlbReq.bits.cmd := TlbCmd.write + io.dtlbReq.bits.roqIdx := io.in.bits.uop.roqIdx + io.dtlbReq.bits.debug.pc := io.in.bits.uop.cf.pc + io.dtlbResp.ready := true.B // TODO: why dtlbResp needs a ready? 
+ + io.out.bits := DontCare + io.out.bits.vaddr := saddr + io.out.bits.paddr := io.dtlbResp.bits.paddr + io.out.bits.data := genWdata(io.in.bits.src2, io.in.bits.uop.ctrl.fuOpType(1,0)) + io.out.bits.uop := io.in.bits.uop + io.out.bits.miss := io.dtlbResp.bits.miss + io.out.bits.mask := genWmask(io.out.bits.vaddr, io.in.bits.uop.ctrl.fuOpType(1,0)) + io.out.valid := io.in.valid && !io.dtlbResp.bits.miss && !io.out.bits.uop.roqIdx.needFlush(io.redirect) + io.in.ready := io.out.ready // exception check - val addrAligned = LookupTree(io.stin.bits.uop.ctrl.fuOpType(1,0), List( + val addrAligned = LookupTree(io.in.bits.uop.ctrl.fuOpType(1,0), List( "b00".U -> true.B, //b - "b01".U -> (s2_out.bits.vaddr(0) === 0.U), //h - "b10".U -> (s2_out.bits.vaddr(1,0) === 0.U), //w - "b11".U -> (s2_out.bits.vaddr(2,0) === 0.U) //d + "b01".U -> (io.out.bits.vaddr(0) === 0.U), //h + "b10".U -> (io.out.bits.vaddr(1,0) === 0.U), //w + "b11".U -> (io.out.bits.vaddr(2,0) === 0.U) //d )) - s2_out.bits.uop.cf.exceptionVec(storeAddrMisaligned) := !addrAligned - s2_out.bits.uop.cf.exceptionVec(storePageFault) := io.dtlb.resp.bits.excp.pf.st - - PipelineConnect(s2_out, s3_in, true.B, false.B) - //------------------------------------------------------- - // ST Pipeline Stage 3 - // Write paddr to LSROQ - //------------------------------------------------------- + io.out.bits.uop.cf.exceptionVec(storeAddrMisaligned) := !addrAligned + io.out.bits.uop.cf.exceptionVec(storePageFault) := io.dtlbResp.bits.excp.pf.st // Send TLB feedback to store issue queue - io.tlbFeedback.valid := RegNext(io.stin.valid && s2_out.ready) - io.tlbFeedback.bits.hit := RegNext(!s2_out.bits.miss) - io.tlbFeedback.bits.roqIdx := RegNext(s2_out.bits.uop.roqIdx) - XSDebug(io.tlbFeedback.valid, - "S3 Store: tlbHit: %d roqIdx: %d\n", + // TODO: should be moved to S1 + io.tlbFeedback.valid := RegNext(io.in.valid && io.out.ready) + io.tlbFeedback.bits.hit := RegNext(!io.out.bits.miss) + io.tlbFeedback.bits.roqIdx := 
RegNext(io.out.bits.uop.roqIdx) + XSDebug(io.tlbFeedback.valid, + "S1 Store: tlbHit: %d roqIdx: %d\n", io.tlbFeedback.bits.hit, io.tlbFeedback.bits.roqIdx.asUInt ) +} + +// Store Pipeline Stage 1 +// Mark MMIO stores, send the store to LSQ (out) and write it back (stout) +class StoreUnit_S1 extends XSModule { + val io = IO(new Bundle() { + val in = Flipped(Decoupled(new LsPipelineBundle)) + val out = Decoupled(new LsPipelineBundle) + // val fp_out = Decoupled(new LsPipelineBundle) + val stout = DecoupledIO(new ExuOutput) // writeback store + val redirect = Flipped(ValidIO(new Redirect)) + }) // get paddr from dtlb, check if rollback is needed - // writeback store inst to lsroq - // writeback to LSROQ - s3_in.ready := true.B - io.lsroq.bits := s3_in.bits - io.lsroq.bits.miss := false.B - io.lsroq.bits.mmio := AddressSpace.isMMIO(s3_in.bits.paddr) - io.lsroq.valid := s3_in.fire() + // writeback store inst to lsq + // writeback to LSQ + io.in.ready := true.B + io.out.bits := io.in.bits + io.out.bits.miss := false.B + io.out.bits.mmio := AddressSpace.isMMIO(io.in.bits.paddr) + io.out.valid := io.in.fire() // TODO: && ! FP + + io.stout.bits.uop := io.in.bits.uop + // io.stout.bits.uop.cf.exceptionVec := // TODO: update according to TLB result + io.stout.bits.data := DontCare + io.stout.bits.redirectValid := false.B + io.stout.bits.redirect := DontCare + io.stout.bits.brUpdate := DontCare + io.stout.bits.debug.isMMIO := io.out.bits.mmio + io.stout.bits.fflags := DontCare + + val hasException = io.out.bits.uop.cf.exceptionVec.asUInt.orR + io.stout.valid := io.in.fire() && (!io.out.bits.mmio || hasException) // mmio inst will be writebacked immediately + + // if fp + // io.fp_out.valid := ... + // io.fp_out.bits := ... 
- //------------------------------------------------------- - // ST Pipeline Stage 4 - // Store writeback, send store request to store buffer - //------------------------------------------------------- +} - // Writeback to CDB - // (0 until LoadPipelineWidth).map(i => { - // io.ldout <> hitLoadOut - // }) +// class StoreUnit_S2 extends XSModule { +// val io = IO(new Bundle() { +// val in = Flipped(Decoupled(new LsPipelineBundle)) +// val out = Decoupled(new LsPipelineBundle) +// val redirect = Flipped(ValidIO(new Redirect)) +// }) - //------------------------------------------------------- - // ST Pipeline Async Stage 1 - // Read paddr from store buffer, query DTAG in DCache - //------------------------------------------------------- +// io.in.ready := true.B +// io.out.bits := io.in.bits +// io.out.valid := io.in.valid && !io.out.bits.uop.roqIdx.needFlush(io.redirect) +// } +class StoreUnit extends XSModule { + val io = IO(new Bundle() { + val stin = Flipped(Decoupled(new ExuInput)) + val redirect = Flipped(ValidIO(new Redirect)) + val tlbFeedback = ValidIO(new TlbFeedback) + val dtlb = new TlbRequestIO() + val lsq = ValidIO(new LsPipelineBundle) + val stout = DecoupledIO(new ExuOutput) // writeback store + }) - //------------------------------------------------------- - // ST Pipeline Async Stage 2 - // DTAG compare, write data to DCache - //------------------------------------------------------- + val store_s0 = Module(new StoreUnit_S0) + val store_s1 = Module(new StoreUnit_S1) + // val store_s2 = Module(new StoreUnit_S2) - // Done in DCache + store_s0.io.in <> io.stin + store_s0.io.redirect <> io.redirect + store_s0.io.dtlbReq <> io.dtlb.req + store_s0.io.dtlbResp <> io.dtlb.resp + store_s0.io.tlbFeedback <> io.tlbFeedback - //------------------------------------------------------- - // ST Pipeline Async Stage 2 - // DCache miss / Shared cache wirte - //------------------------------------------------------- + PipelineConnect(store_s0.io.out, store_s1.io.in, 
true.B, false.B) + // PipelineConnect(store_s1.io.fp_out, store_s2.io.in, true.B, false.B) - // update store buffer according to store fill buffer + store_s1.io.redirect <> io.redirect + store_s1.io.stout <> io.stout + // send result to sq + io.lsq.valid := store_s1.io.out.valid + io.lsq.bits := store_s1.io.out.bits -} + store_s1.io.out.ready := true.B + + private def printPipeLine(pipeline: LsPipelineBundle, cond: Bool, name: String): Unit = { + XSDebug(cond, + p"$name" + p" pc ${Hexadecimal(pipeline.uop.cf.pc)} " + + p"addr ${Hexadecimal(pipeline.vaddr)} -> ${Hexadecimal(pipeline.paddr)} " + + p"op ${Binary(pipeline.uop.ctrl.fuOpType)} " + + p"data ${Hexadecimal(pipeline.data)} " + + p"mask ${Hexadecimal(pipeline.mask)}\n" + ) + } + + printPipeLine(store_s0.io.out.bits, store_s0.io.out.valid, "S0") + printPipeLine(store_s1.io.out.bits, store_s1.io.out.valid, "S1") + +} \ No newline at end of file diff --git a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala index faa96b5381cea9684d9d5850f03d2b9ec28916da..b3b0143bcf3b7da169a53725bcb219bf5be08acd 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/NewSbuffer.scala @@ -70,7 +70,8 @@ class NewSbuffer extends XSModule with HasSbufferCst { val buffer = Mem(StoreBufferSize, new SbufferLine) val stateVec = RegInit(VecInit(Seq.fill(StoreBufferSize)(s_invalid))) - val lru = new TrueLRU(StoreBufferSize) + //val lru = new SbufferLRU(StoreBufferSize) + val lru = new SbufferLRU(StoreBufferSize) // 2 * enq + 1 * deq val lruAccessWays = Wire(Vec(io.in.getWidth+1, new Valid(UInt(SbufferIndexWidth.W)))) for(w <- lruAccessWays){ @@ -217,7 +218,7 @@ class NewSbuffer extends XSModule with HasSbufferCst { val do_eviction = Wire(Bool()) val empty = Cat(stateVec.map(s => s===s_invalid)).andR() && !Cat(io.in.map(_.valid)).orR() - val replaceIdx = lru.way + val replaceIdx = lru.way(stateVec.map(s => s===s_valid)) val 
firstValidEntry = PriorityEncoder(stateVec.map(s => s===s_valid)) val evictor = Module(new NaiveEvictor(StoreBufferSize-4)) @@ -248,6 +249,8 @@ class NewSbuffer extends XSModule with HasSbufferCst { } XSDebug(p"sbuffer state:${sbuffer_state} do eviction:${do_eviction} empty:${empty}\n") + //XSDebug(p"replaceIdx:${replaceIdx}\n") + //val evictionIdxWire = replaceIdx val evictionIdxWire = Mux(stateVec(replaceIdx)===s_valid, replaceIdx, firstValidEntry) val evictionIdxEnqReq = Wire(DecoupledIO(UInt(SbufferIndexWidth.W))) val evictionIdxQueue = Module(new Queue(UInt(SbufferIndexWidth.W), StoreBufferSize, pipe = true, flow = false)) @@ -368,6 +371,9 @@ class NewSbuffer extends XSModule with HasSbufferCst { XSDebug(valid_tag_match, p"valid tag match: forward [$i] <> buf[$valid_forward_idx]\n" ) + XSDebug(inflight_tag_match || valid_tag_match, + p"[$i] forward paddr:${Hexadecimal(forward.paddr)}\n" + ) } } diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala index c7c52090cdbe2553478b634fc4d65ff34a8d26b3..b67388c11589a7bff675c3f26b60459fb0b36e13 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala @@ -10,7 +10,6 @@ import utils.TrueLRU class SbufferUserBundle extends XSBundle { val pc = UInt(VAddrBits.W) //for debug - val lsroqId = UInt(log2Up(LsroqSize).W) } trait HasSBufferConst extends HasXSParameter { @@ -87,7 +86,7 @@ class Sbuffer extends XSModule with HasSBufferConst { val lru_accessed = WireInit(VecInit(Seq.fill(StorePipelineWidth)(false.B))) - // Get retired store from lsroq + // Get retired store from lsq //-------------------------------------------------------------------------------------------------------------------- for (storeIdx <- 0 until StorePipelineWidth) { io.in(storeIdx).ready := false.B // when there is empty line or target address already in this buffer, assert true @@ -457,7 +456,7 @@ class Sbuffer extends XSModule 
with HasSBufferConst { // send data with mask in this line // this mask is not 'mask for cache line' and we need to check low bits of paddr // to get certain part of one line - // P.S. data in io.in will be manipulated by lsroq + // P.S. data in io.in will be manipulated by lsq (0 until XLEN / 8).foreach(i => { when (cache(sBufIdx).mask(i.U + getByteOffset(io.forward(loadIdx).paddr)) && io.forward(loadIdx).mask(i)) { io.forward(loadIdx).forwardData(i) := cache(sBufIdx).data(i.U + getByteOffset(io.forward(loadIdx).paddr)) diff --git a/src/main/scala/xstransforms/ShowPrintTransform.scala b/src/main/scala/xstransforms/ShowPrintTransform.scala index a41b2ee15bf7203db1b163a35b15b6c28dcc3050..0a7cc3ae7fb3e610c12700ac67e9aea5085671d2 100644 --- a/src/main/scala/xstransforms/ShowPrintTransform.scala +++ b/src/main/scala/xstransforms/ShowPrintTransform.scala @@ -8,9 +8,7 @@ import scala.collection.mutable class ShowPrintTransform extends Transform with DependencyAPIMigration { - // The first transform to run - override def prerequisites = firrtl.stage.Forms.ChirrtlForm - // Invalidates everything + override def optionalPrerequisiteOf = firrtl.stage.Forms.MinimalHighForm override def invalidates(a: Transform) = true override protected def execute(state: CircuitState): CircuitState = { @@ -26,6 +24,10 @@ class ShowPrintTransform extends Transform with DependencyAPIMigration { case DisableAllPrintAnnotation() => true }.nonEmpty + val removeAssert = state.annotations.collectFirst{ + case RemoveAssertAnnotation() => true + }.nonEmpty + assert( !(whiteList.nonEmpty && (disableAll || blackList.nonEmpty)), "'white list' can't be used with 'disable all' or 'black list'!" 
@@ -56,26 +58,37 @@ class ShowPrintTransform extends Transform with DependencyAPIMigration { } def processModule(m: DefModule): DefModule = { - def disableModulePrint = { + def disableModulePrint(mod: DefModule) = { def disableStmtPrint(s: Statement): Statement = s match { case _: Print => EmptyStmt case other => other.mapStmt(disableStmtPrint) } - m.mapStmt(disableStmtPrint) + mod.mapStmt(disableStmtPrint) } + def removeModuleAssert(mod: DefModule)= { + def removeStmtAssert(s: Statement): Statement = s match { + case _: Stop => + EmptyStmt + case other => + other.mapStmt(removeStmtAssert) + } + mod.mapStmt(removeStmtAssert) + } + val isInBlackList = blackList.nonEmpty && ( blackList.contains(m.name) || blackList.map( b => ancestors(m.name).contains(b)).reduce(_||_) ) val isInWhiteList = whiteList.isEmpty || ( whiteList.nonEmpty && (whiteList.contains(m.name) || whiteList.map( x => ancestors(m.name).contains(x)).reduce(_||_)) ) - if( disableAll || isInBlackList || !isInWhiteList ){ - disableModulePrint + val tmpMod = if(disableAll || isInBlackList || !isInWhiteList){ + disableModulePrint(m) } else { m } + if(removeAssert) removeModuleAssert(tmpMod) else tmpMod } state.copy(c.mapModule(processModule)) diff --git a/src/test/csrc/common.h b/src/test/csrc/common.h index 2fffc9952c236c93e854ef043cef9d249b9fc706..c7158c524ee9e185850bc3209dd456f3ec4f1007 100644 --- a/src/test/csrc/common.h +++ b/src/test/csrc/common.h @@ -17,4 +17,8 @@ #define eprintf(...) 
fprintf(stdout, ## __VA_ARGS__) +#ifdef WITH_DRAMSIM3 +#include "cosimulation.h" +#endif + #endif // __COMMON_H diff --git a/src/test/csrc/difftest.cpp b/src/test/csrc/difftest.cpp index 01a48ed227358ed0cec5c0624b7afb9d472783ca..74db9a59ccacca71894a54defa9f14d23a1ce975 100644 --- a/src/test/csrc/difftest.cpp +++ b/src/test/csrc/difftest.cpp @@ -204,7 +204,7 @@ int difftest_step(DiffState *s) { ds.exceptionNo = s->cause; ds.mtval = s->reg_scala[DIFFTEST_MTVAL]; ds.stval = s->reg_scala[DIFFTEST_STVAL]; - ref_disambiguate_exec(&s->cause); + ref_disambiguate_exec(&ds); }else{ ref_difftest_exec(1); } diff --git a/src/test/csrc/emu.cpp b/src/test/csrc/emu.cpp index 733039f645473faa0e18c35fa2d948dd4dd0731d..292bd82627442eace5bf859f2d1ee34ab3462100 100644 --- a/src/test/csrc/emu.cpp +++ b/src/test/csrc/emu.cpp @@ -2,6 +2,7 @@ #include "sdcard.h" #include "difftest.h" #include +#include "ram.h" void* get_ram_start(); long get_ram_size(); @@ -14,6 +15,7 @@ static inline void print_help(const char *file) { printf("\n"); printf(" -s, --seed=NUM use this seed\n"); printf(" -C, --max-cycles=NUM execute at most NUM cycles\n"); + printf(" -I, --max-instr=NUM execute at most NUM instructions\n"); printf(" -i, --image=FILE run with this image file\n"); printf(" -b, --log-begin=NUM display log from NUM th cycle\n"); printf(" -e, --log-end=NUM stop display log at NUM th cycle\n"); @@ -31,6 +33,7 @@ inline EmuArgs parse_args(int argc, const char *argv[]) { { "dump-wave", 0, NULL, 0 }, { "seed", 1, NULL, 's' }, { "max-cycles", 1, NULL, 'C' }, + { "max-instr", 1, NULL, 'I' }, { "image", 1, NULL, 'i' }, { "log-begin", 1, NULL, 'b' }, { "log-end", 1, NULL, 'e' }, @@ -40,7 +43,7 @@ inline EmuArgs parse_args(int argc, const char *argv[]) { int o; while ( (o = getopt_long(argc, const_cast(argv), - "-s:C:hi:m:b:e:", long_options, &long_index)) != -1) { + "-s:C:I:hi:m:b:e:", long_options, &long_index)) != -1) { switch (o) { case 0: switch (long_index) { @@ -58,6 +61,7 @@ inline EmuArgs 
parse_args(int argc, const char *argv[]) { } break; case 'C': args.max_cycles = atoll(optarg); break; + case 'I': args.max_instr = atoll(optarg); break; case 'i': args.image = optarg; break; case 'b': args.log_begin = atoll(optarg); break; case 'e': args.log_end = atoll(optarg); break; @@ -70,7 +74,7 @@ inline EmuArgs parse_args(int argc, const char *argv[]) { Emulator::Emulator(int argc, const char *argv[]): - dut_ptr(new VXSSimTop), + dut_ptr(new VXSSimSoC), cycles(0), hascommit(0), trapCode(STATE_RUNNING) { args = parse_args(argc, argv); @@ -85,7 +89,6 @@ Emulator::Emulator(int argc, const char *argv[]): reset_ncycles(10); // init ram - extern void init_ram(const char *img); init_ram(args.image); // init device @@ -107,11 +110,14 @@ Emulator::Emulator(int argc, const char *argv[]): enable_waveform = false; #endif +#ifdef VM_SAVABLE if (args.snapshot_path != NULL) { printf("loading from snapshot `%s`...\n", args.snapshot_path); snapshot_load(args.snapshot_path); + printf("model cycleCnt = %" PRIu64 "\n", dut_ptr->io_trap_cycleCnt); hascommit = 1; } +#endif // set log time range and log level dut_ptr->io_logCtrl_log_begin = args.log_begin; @@ -119,9 +125,13 @@ Emulator::Emulator(int argc, const char *argv[]): } Emulator::~Emulator() { + ram_finish(); + +#ifdef VM_SAVABLE snapshot_slot[0].save(); snapshot_slot[1].save(); printf("Please remove unused snapshots manually\n"); +#endif } inline void Emulator::read_emu_regs(uint64_t *r) { @@ -180,6 +190,13 @@ inline void Emulator::reset_ncycles(size_t cycles) { inline void Emulator::single_cycle() { dut_ptr->clock = 0; +#ifdef WITH_DRAMSIM3 + axi_channel axi; + axi_copy_from_dut_ptr(dut_ptr, axi); + dramsim3_helper(axi); + axi_set_dut_ptr(dut_ptr, axi); +#endif + dut_ptr->eval(); dut_ptr->clock = 1; @@ -207,12 +224,13 @@ inline void Emulator::single_cycle() { cycles ++; } -uint64_t Emulator::execute(uint64_t n) { +uint64_t Emulator::execute(uint64_t max_cycle, uint64_t max_instr) { extern void poll_event(void); extern 
uint32_t uptime(void); uint32_t lasttime_poll = 0; uint32_t lasttime_snapshot = 0; - uint64_t lastcommit = n; + uint64_t lastcommit = max_cycle; + uint64_t instr_left_last_cycle = max_instr; const int stuck_limit = 2000; uint32_t wdst[DIFFTEST_WIDTH]; @@ -225,14 +243,19 @@ uint64_t Emulator::execute(uint64_t n) { diff.wdata = wdata; diff.wdst = wdst; - while (trapCode == STATE_RUNNING && n > 0) { + while (!Verilated::gotFinish() && trapCode == STATE_RUNNING) { + if (!(max_cycle > 0 && max_instr > 0 && instr_left_last_cycle >= max_instr /* handle overflow */)) { + trapCode = STATE_LIMIT_EXCEEDED; + break; + } + single_cycle(); - n --; + max_cycle --; if (dut_ptr->io_trap_valid) trapCode = dut_ptr->io_trap_code; if (trapCode != STATE_RUNNING) break; - if (lastcommit - n > stuck_limit && hascommit) { + if (lastcommit - max_cycle > stuck_limit && hascommit) { eprintf("No instruction commits for %d cycles, maybe get stuck\n" "(please also check whether a fence.i instruction requires more than %d cycles to flush the icache)\n", stuck_limit, stuck_limit); @@ -268,7 +291,11 @@ uint64_t Emulator::execute(uint64_t n) { if (difftest_step(&diff)) { trapCode = STATE_ABORT; } - lastcommit = n; + lastcommit = max_cycle; + + // update instr_cnt + instr_left_last_cycle = max_instr; + max_instr -= diff.commit; } uint32_t t = uptime(); @@ -276,8 +303,9 @@ uint64_t Emulator::execute(uint64_t n) { poll_event(); lasttime_poll = t; } +#ifdef VM_SAVABLE static int snapshot_count = 0; - if (t - lasttime_snapshot > 1000 * SNAPSHOT_INTERVAL) { + if (trapCode != STATE_GOODTRAP && t - lasttime_snapshot > 1000 * SNAPSHOT_INTERVAL) { // save snapshot every 10s time_t now = time(NULL); snapshot_save(snapshot_filename(now)); @@ -289,6 +317,12 @@ uint64_t Emulator::execute(uint64_t n) { snapshot_count = 0; } } +#endif + } + + if (Verilated::gotFinish()) { + eprintf("The simulation stopped. 
There might be some assertion failed.\n"); + trapCode = STATE_ABORT; } #if VM_TRACE == 1 @@ -307,12 +341,14 @@ inline char* Emulator::timestamp_filename(time_t t, char *buf) { return buf + len; } +#ifdef VM_SAVABLE inline char* Emulator::snapshot_filename(time_t t) { static char buf[1024]; char *p = timestamp_filename(t, buf); strcpy(p, ".snapshot"); return buf; } +#endif inline char* Emulator::waveform_filename(time_t t) { static char buf[1024]; @@ -336,6 +372,9 @@ void Emulator::display_trapinfo() { case STATE_ABORT: eprintf(ANSI_COLOR_RED "ABORT at pc = 0x%" PRIx64 "\n" ANSI_COLOR_RESET, pc); break; + case STATE_LIMIT_EXCEEDED: + eprintf(ANSI_COLOR_YELLOW "EXCEEDING CYCLE/INSTR LIMIT at pc = 0x%" PRIx64 "\n" ANSI_COLOR_RESET, pc); + break; default: eprintf(ANSI_COLOR_RED "Unknown trap code: %d\n", trapCode); } @@ -346,6 +385,7 @@ void Emulator::display_trapinfo() { instrCnt, cycleCnt, ipc); } +#ifdef VM_SAVABLE void Emulator::snapshot_save(const char *filename) { static int last_slot = 0; VerilatedSaveMem &stream = snapshot_slot[last_slot]; @@ -425,3 +465,4 @@ void Emulator::snapshot_load(const char *filename) { if(fp) fseek(fp, sdcard_offset, SEEK_SET); } +#endif diff --git a/src/test/csrc/emu.h b/src/test/csrc/emu.h index 444b038b62591c13c39ecb568dc57926de8e9749..4031a85ab1a58f84d76c093b2bbca9655f40f657 100644 --- a/src/test/csrc/emu.h +++ b/src/test/csrc/emu.h @@ -1,6 +1,6 @@ #include "common.h" #include "snapshot.h" -#include "VXSSimTop.h" +#include "VXSSimSoC.h" #include // Trace file format header #define DIFFTEST_WIDTH 6 @@ -9,6 +9,7 @@ struct EmuArgs { uint32_t seed; uint64_t max_cycles; + uint64_t max_instr; uint64_t log_begin, log_end; const char *image; const char *snapshot_path; @@ -17,6 +18,7 @@ struct EmuArgs { EmuArgs() { seed = 0; max_cycles = -1; + max_instr = -1; log_begin = 1; log_end = -1; snapshot_path = NULL; @@ -26,16 +28,19 @@ struct EmuArgs { }; class Emulator { - VXSSimTop *dut_ptr; + VXSSimSoC *dut_ptr; VerilatedVcdC* tfp; bool 
enable_waveform; +#ifdef VM_SAVABLE VerilatedSaveMem snapshot_slot[2]; +#endif EmuArgs args; enum { STATE_GOODTRAP = 0, STATE_BADTRAP, STATE_ABORT, + STATE_LIMIT_EXCEEDED, STATE_RUNNING = -1 }; @@ -58,7 +63,7 @@ class Emulator { public: Emulator(int argc, const char *argv[]); ~Emulator(); - uint64_t execute(uint64_t n); + uint64_t execute(uint64_t max_cycle, uint64_t max_instr); uint64_t get_cycles() const { return cycles; } EmuArgs get_args() const { return args; } bool is_good_trap() { return trapCode == STATE_GOODTRAP; }; diff --git a/src/test/csrc/main.cpp b/src/test/csrc/main.cpp index 38d0af4dc59b809e40106659e2bb3cb213e51e69..1d9299ac96d2e053544d9f98eca46bf7f7eec416 100644 --- a/src/test/csrc/main.cpp +++ b/src/test/csrc/main.cpp @@ -8,6 +8,8 @@ std::function get_sc_time_stamp = []() -> double { return 0; }; double sc_time_stamp() { return get_sc_time_stamp(); } int main(int argc, const char** argv) { + printf("Emu compiled at %s, %s\n", __DATE__, __TIME__); + setbuf(stderr, mybuf); auto emu = new Emulator(argc, argv); @@ -17,7 +19,7 @@ int main(int argc, const char** argv) { }; auto args = emu->get_args(); - uint64_t cycles = emu->execute(args.max_cycles); + uint64_t cycles = emu->execute(args.max_cycles, args.max_instr); bool is_good_trap = emu->is_good_trap(); delete emu; diff --git a/src/test/csrc/ram.cpp b/src/test/csrc/ram.cpp index a7155630b5b6706bd92d19d4856e5c0e48320a4f..7340c7940d0ea0d5b2d387d42b5c39a8896fcf55 100644 --- a/src/test/csrc/ram.cpp +++ b/src/test/csrc/ram.cpp @@ -1,25 +1,35 @@ +#include +#include + #include "common.h" +#include "ram.h" + +#define RAMSIZE (256 * 1024 * 1024UL) -#define RAMSIZE (128 * 1024 * 1024) +#ifdef WITH_DRAMSIM3 +#include "cosimulation.h" +CoDRAMsim3 *dram = NULL; +#endif -static uint64_t ram[RAMSIZE / sizeof(uint64_t)]; +static uint64_t *ram; static long img_size = 0; void* get_img_start() { return &ram[0]; } long get_img_size() { return img_size; } void* get_ram_start() { return &ram[0]; } long get_ram_size() { 
return RAMSIZE; } +#ifdef TLB_UNITTEST void addpageSv39() { //three layers //addr range: 0x0000000080000000 - 0x0000000088000000 for 128MB from 2GB - 2GB128MB //the first layer: one entry for 1GB. (512GB in total by 512 entries). need the 2th entries //the second layer: one entry for 2MB. (1GB in total by 512 entries). need the 0th-63rd entries -//the third layer: one entry for 4KB (2MB in total by 512 entries). need 64 with each one all - +//the third layer: one entry for 4KB (2MB in total by 512 entries). need 64 with each one all +#define TOPSIZE (128 * 1024 * 1024) #define PAGESIZE (4 * 1024) // 4KB = 2^12B #define ENTRYNUM (PAGESIZE / 8) //512 2^9 #define PTEVOLUME (PAGESIZE * ENTRYNUM) // 2MB -#define PTENUM (RAMSIZE / PTEVOLUME) // 128MB / 2MB = 64 +#define PTENUM (TOPSIZE / PTEVOLUME) // 128MB / 2MB = 64 #define PDDENUM 1 #define PDENUM 1 #define PDDEADDR (0x88000000 - (PAGESIZE * (PTENUM + 2))) //0x88000000 - 0x1000*66 @@ -33,7 +43,7 @@ void addpageSv39() { uint64_t pdde[ENTRYNUM]; uint64_t pde[ENTRYNUM]; uint64_t pte[PTENUM][ENTRYNUM]; - + // special addr for mmio 0x40000000 - 0x4fffffff uint64_t pdemmio[ENTRYNUM]; uint64_t ptemmio[PTEMMIONUM][ENTRYNUM]; @@ -61,13 +71,13 @@ void addpageSv39() { for(int i = 0; i < PTEMMIONUM; i++) { pdemmio[i] = (((PDDEADDR-PAGESIZE*(PTEMMIONUM+PDEMMIONUM-i)) & 0xfffff000) >> 2) | 0x1; } - + for(int outidx = 0; outidx < PTEMMIONUM; outidx++) { for(int inidx = 0; inidx < ENTRYNUM; inidx++) { ptemmio[outidx][inidx] = (((0x40000000 + outidx*PTEVOLUME + inidx*PAGESIZE) & 0xfffff000) >> 2) | 0xf; } } - + //0x800000000 - 0x87ffffff pdde[2] = ((PDEADDR & 0xfffff000) >> 2) | 0x1; //pdde[2] = ((0x80000000&0xc0000000) >> 2) | 0xf; @@ -83,50 +93,322 @@ void addpageSv39() { } } - memcpy((char *)ram+(RAMSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM+PDEDEVNUM+PTEDEVNUM)),ptedev,PAGESIZE*PTEDEVNUM); - memcpy((char *)ram+(RAMSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM+PDEDEVNUM)),pdedev,PAGESIZE*PDEDEVNUM); - 
memcpy((char *)ram+(RAMSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM)),ptemmio, PAGESIZE*PTEMMIONUM); - memcpy((char *)ram+(RAMSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM)), pdemmio, PAGESIZE*PDEMMIONUM); - memcpy((char *)ram+(RAMSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM)), pdde, PAGESIZE*PDDENUM); - memcpy((char *)ram+(RAMSIZE-PAGESIZE*(PTENUM+PDENUM)), pde, PAGESIZE*PDENUM); - memcpy((char *)ram+(RAMSIZE-PAGESIZE*PTENUM), pte, PAGESIZE*PTENUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM+PDEDEVNUM+PTEDEVNUM)),ptedev,PAGESIZE*PTEDEVNUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM+PDEDEVNUM)),pdedev,PAGESIZE*PDEDEVNUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM+PTEMMIONUM)),ptemmio, PAGESIZE*PTEMMIONUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM+PDEMMIONUM)), pdemmio, PAGESIZE*PDEMMIONUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDDENUM+PDENUM)), pdde, PAGESIZE*PDDENUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*(PTENUM+PDENUM)), pde, PAGESIZE*PDENUM); + memcpy((char *)ram+(TOPSIZE-PAGESIZE*PTENUM), pte, PAGESIZE*PTENUM); +} +#endif + +// Return whether the file is a gz file +int isGzFile(const char *img) { + assert(img != NULL && strlen(img) >= 4); + return !strcmp(img + (strlen(img) - 3), ".gz"); +} + +// Read binary from .gz file +int readFromGz(void* ptr, const char *file_name) { + gzFile compressed_mem = gzopen(file_name, "rb"); + + if(compressed_mem == NULL) { + printf("Can't open compressed binary file '%s'", file_name); + return -1; + } + + uint64_t curr_size = 0; + // read 16KB each time + const uint32_t chunk_size = 16384; + if ((RAMSIZE % chunk_size) != 0) { + printf("RAMSIZE must be divisible by chunk_size\n"); + assert(0); + } + uint64_t *temp_page = new uint64_t[chunk_size]; + uint64_t *pmem_current = (uint64_t *)ptr; + + while (curr_size < RAMSIZE) { + uint32_t bytes_read = gzread(compressed_mem, 
temp_page, chunk_size); + if (bytes_read == 0) { break; } + assert(bytes_read % sizeof(uint64_t) == 0); + for (uint32_t x = 0; x < bytes_read / sizeof(uint64_t); x++) { + if (*(temp_page + x) != 0) { + pmem_current = (uint64_t*)((uint8_t*)ptr + curr_size + x * sizeof(uint64_t)); + *pmem_current = *(temp_page + x); + } + } + curr_size += bytes_read; + } + // printf("Read 0x%lx bytes from gz stream in total.\n", curr_size); + + delete [] temp_page; + + if(gzclose(compressed_mem)) { + printf("Error closing '%s'\n", file_name); + return -1; + } + return curr_size; } void init_ram(const char *img) { assert(img != NULL); - FILE *fp = fopen(img, "rb"); - if (fp == NULL) { - printf("Can not open '%s'\n", img); + + printf("The image is %s\n", img); + + // initialize memory using Linux mmap + printf("Using simulated %luMB RAM\n", RAMSIZE / (1024 * 1024)); + ram = (uint64_t *)mmap(NULL, RAMSIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + if (ram == (uint64_t *)MAP_FAILED) { + printf("Cound not mmap 0x%lx bytes\n", RAMSIZE); assert(0); } - printf("The image is %s\n", img); + int ret; + if (isGzFile(img)) { + printf("Gzip file detected and loading image from extracted gz file\n"); + img_size = readFromGz(ram, img); + assert(img_size >= 0); + } + else { + FILE *fp = fopen(img, "rb"); + if (fp == NULL) { + printf("Can not open '%s'\n", img); + assert(0); + } - fseek(fp, 0, SEEK_END); - img_size = ftell(fp); + fseek(fp, 0, SEEK_END); + img_size = ftell(fp); + if (img_size > RAMSIZE) { + img_size = RAMSIZE; + } - fseek(fp, 0, SEEK_SET); - int ret = fread(ram, img_size, 1, fp); - assert(ret == 1); - fclose(fp); + fseek(fp, 0, SEEK_SET); + ret = fread(ram, img_size, 1, fp); + assert(ret == 1); + fclose(fp); + } + +#ifdef TLB_UNITTEST //new add addpageSv39(); //new end +#endif + +#ifdef WITH_DRAMSIM3 + #if !defined(DRAMSIM3_CONFIG) || !defined(DRAMSIM3_OUTDIR) + #error DRAMSIM3_CONFIG or DRAMSIM3_OUTDIR is not defined + #endif + assert(dram == NULL); + dram = new 
CoDRAMsim3(DRAMSIM3_CONFIG, DRAMSIM3_OUTDIR); +#endif + +} + +void ram_finish() { + munmap(ram, RAMSIZE); +#ifdef WITH_DRAMSIM3 + dramsim3_finish(); +#endif } -extern "C" void ram_helper( - uint64_t rIdx, uint64_t *rdata, uint64_t wIdx, uint64_t wdata, uint64_t wmask, uint8_t wen) { - if (rIdx >= RAMSIZE / sizeof(uint64_t)) { - printf("ERROR: ram idx = 0x%x out of bound!\n", rIdx); - // TODO: don't allow out of bound when crossbar is ready - //assert(rIdx < RAMSIZE / sizeof(uint64_t)); - *rdata = 0xabcd12345678dcbaUL; - return; +extern "C" uint64_t ram_read_helper(uint8_t en, uint64_t rIdx) { + if (en && rIdx >= RAMSIZE / sizeof(uint64_t)) { + printf("WARN: ram rIdx = 0x%lx out of bound!\n", rIdx); + // assert(rIdx < RAMSIZE / sizeof(uint64_t)); + return 0x12345678deadbeafULL; } - *rdata = ram[rIdx]; + return (en) ? ram[rIdx] : 0; +} + +extern "C" void ram_write_helper(uint64_t wIdx, uint64_t wdata, uint64_t wmask, uint8_t wen) { if (wen) { - assert(wIdx < RAMSIZE / sizeof(uint64_t)); + if (wIdx >= RAMSIZE / sizeof(uint64_t)) { + printf("ERROR: ram wIdx = 0x%lx out of bound!\n", wIdx); + assert(wIdx < RAMSIZE / sizeof(uint64_t)); + } ram[wIdx] = (ram[wIdx] & ~wmask) | (wdata & wmask); } } + +#ifdef WITH_DRAMSIM3 +#include + +void dramsim3_finish() { + delete dram; +} + +#define MAX_AXI_DATA_LEN 8 + +// currently does not support masked read or write +struct dramsim3_meta { + uint8_t len; + uint8_t size; + uint8_t offset; + uint8_t id; + uint64_t data[MAX_AXI_DATA_LEN]; +}; + +void axi_read_data(const axi_ar_channel &ar, dramsim3_meta *meta) { + uint64_t address = ar.addr % RAMSIZE; + uint64_t beatsize = 1 << ar.size; + uint8_t beatlen = ar.len + 1; + uint64_t transaction_size = beatsize * beatlen; + assert((transaction_size % sizeof(uint64_t)) == 0); + // axi burst FIXED + if (ar.burst == 0x0) { + std::cout << "axi burst FIXED not supported!" 
<< std::endl; + assert(0); + } + // axi burst INCR + else if (ar.burst == 1) { + assert(transaction_size / sizeof(uint64_t) <= MAX_AXI_DATA_LEN); + for (int i = 0; i < transaction_size / sizeof(uint64_t); i++) { + meta->data[i] = ram[address / sizeof(uint64_t)]; + address += sizeof(uint64_t); + } + } + // axi burst WRAP + else if (ar.burst == 2) { + uint64_t low = (address / transaction_size) * transaction_size; + uint64_t high = low + transaction_size; + assert(transaction_size / sizeof(uint64_t) <= MAX_AXI_DATA_LEN); + for (int i = 0; i < transaction_size / sizeof(uint64_t); i++) { + if (address == high) { + address = low; + } + meta->data[i] = ram[address / sizeof(uint64_t)]; + address += sizeof(uint64_t); + } + } + else { + std::cout << "reserved arburst!" << std::endl; + assert(0); + } + meta->len = beatlen; + meta->size = beatsize; + meta->offset = 0; + meta->id = ar.id; +} + +CoDRAMRequest *dramsim3_request(const axi_channel &axi, bool is_write) { + uint64_t address = (is_write) ? 
axi.aw.addr : axi.ar.addr; + dramsim3_meta *meta = new dramsim3_meta; + // WRITE + if (is_write) { + meta->len = axi.aw.len + 1; + meta->offset = 0; + meta->id = axi.aw.id; + } + else { + axi_read_data(axi.ar, meta); + } + CoDRAMRequest *req = new CoDRAMRequest(); + req->address = address; + req->is_write = is_write; + req->meta = meta; + return req; +} + +void dramsim3_helper(axi_channel &axi) { + // ticks DRAMsim3 according to CPU_FREQ:DRAM_FREQ + dram->tick(); + + static CoDRAMResponse *wait_resp_r = NULL; + static CoDRAMResponse *wait_resp_b = NULL; + static CoDRAMRequest *wait_req_w = NULL; + // currently only accept one in-flight read + one in-flight write + static uint64_t raddr, roffset = 0, rlen; + static uint64_t waddr, woffset = 0, wlen; + + // default branch to avoid wrong handshake + axi.aw.ready = 0; + axi.w.ready = 1; + axi.b.valid = 0; + axi.ar.ready = 0; + // axi.r.valid = 0; + + // AXI read + // first, check rdata in the last cycle + if (axi.r.ready && axi.r.valid) { + // printf("axi r channel fired data = %lx\n", axi.r.data[0]); + dramsim3_meta *meta = static_cast(wait_resp_r->req->meta); + meta->offset++; + axi.r.valid = 0; + } + if (wait_resp_r) { + dramsim3_meta *meta = static_cast(wait_resp_r->req->meta); + if (meta->offset == meta->len) { + delete meta; + delete wait_resp_r->req; + delete wait_resp_r; + wait_resp_r = NULL; + } + } + // second, check whether we response data in this cycle + if (!wait_resp_r) + wait_resp_r = dram->check_read_response(); + if (wait_resp_r) { + dramsim3_meta *meta = static_cast(wait_resp_r->req->meta); + // axi.r.data = meta->data[meta->offset]; + // printf("meta->size %d offset %d\n", meta->size, meta->offset*meta->size/sizeof(uint64_t)); + memcpy(axi.r.data, meta->data + meta->offset*meta->size/sizeof(uint64_t), meta->size); + axi.r.valid = 1; + axi.r.last = (meta->offset == meta->len - 1) ? 
1 : 0; + axi.r.id = meta->id; + } + // third, check ar for next request's address + // put ar in the last since it should be at least one-cycle latency + if (axi.ar.valid && dram->will_accept(axi.ar.addr, false)) { + // printf("axi ar channel fired %lx\n", axi.ar.addr); + dram->add_request(dramsim3_request(axi, false)); + axi.ar.ready = 1; + } + + // AXI write + // first, check wdata in the last cycle + // aw channel + if (axi.aw.valid && dram->will_accept(axi.aw.addr, true)) { + assert(wait_req_w == NULL); // the last request has not finished + wait_req_w = dramsim3_request(axi, true); + axi.aw.ready = 1; + // printf("axi aw channel fired %lx\n", axi.aw.addr); + assert(axi.aw.burst == 1 || (axi.aw.burst == 2 && ((axi.aw.addr & 0x3f) == 0))); + } + + // w channel: ack write data + if (axi.w.valid && axi.w.ready) { + // printf("axi w channel fired\n"); + assert(wait_req_w); + dramsim3_meta *meta = static_cast(wait_req_w->meta); + // meta->data[meta->offset] = axi.w.data; + meta->offset++; + if (meta->offset == meta->len) { + assert(dram->will_accept(wait_req_w->address, true)); + dram->add_request(wait_req_w); + wait_req_w = NULL; + } + } + + // b channel: ack write + if (!wait_resp_b) + wait_resp_b = dram->check_write_response(); + if (wait_resp_b) { + dramsim3_meta *meta = static_cast(wait_resp_b->req->meta); + axi.b.valid = 1; + axi.b.id = meta->id; + // assert(axi.b.ready == 1); + for (int i = 0; i < meta->len; i++) { + uint64_t address = wait_resp_b->req->address % RAMSIZE; + ram[address / sizeof(uint64_t) + i] = meta->data[i]; + } + // printf("axi b channel fired\n"); + delete meta; + delete wait_resp_b->req; + delete wait_resp_b; + wait_resp_b = NULL; + } +} + +#endif diff --git a/src/test/csrc/ram.h b/src/test/csrc/ram.h new file mode 100644 index 0000000000000000000000000000000000000000..9217f45d62d921130a3e9dea1d88c341c2ce55fb --- /dev/null +++ b/src/test/csrc/ram.h @@ -0,0 +1,197 @@ +#ifndef __RAM_H +#define __RAM_H + +#include "common.h" + +void 
init_ram(const char *img); +void ram_finish(); + +#ifdef WITH_DRAMSIM3 +// 4*64 bits +#define AXI_DATA_WIDTH_64 4 + +typedef uint64_t axi_addr_t; +typedef uint64_t axi_data_t[AXI_DATA_WIDTH_64]; +#define axi_copy_data(dest, src) \ + memcpy(dest, src, sizeof(uint64_t)*AXI_DATA_WIDTH_64); + +struct axi_aw_channel { + uint8_t ready; + uint8_t valid; + axi_addr_t addr; + uint8_t prot; + uint8_t id; + uint8_t user; + uint8_t len; + uint8_t size; + uint8_t burst; + uint8_t lock; + uint8_t cache; + uint8_t qos; +}; + +struct axi_w_channel { + uint8_t ready; + uint8_t valid; + axi_data_t data; + uint8_t strb; + uint8_t last; +}; + +struct axi_b_channel { + uint8_t ready; + uint8_t valid; + uint8_t resp; + uint8_t id; + uint8_t user; +}; + +struct axi_ar_channel { + uint8_t ready; + uint8_t valid; + axi_addr_t addr; + uint8_t prot; + uint8_t id; + uint8_t user; + uint8_t len; + uint8_t size; + uint8_t burst; + uint8_t lock; + uint8_t cache; + uint8_t qos; +}; + +struct axi_r_channel { + uint8_t ready; + uint8_t valid; + uint8_t resp; + axi_data_t data; + uint8_t last; + uint8_t id; + uint8_t user; +}; + +struct axi_channel { + struct axi_aw_channel aw; + struct axi_w_channel w; + struct axi_b_channel b; + struct axi_ar_channel ar; + struct axi_r_channel r; +}; + +// dut helper for AXI + +// NOTE: change this when migrating between different hardware designs +#define DUT_AXI(name) auto_axi_mem_out_##name + +#define axi_aw_copy_from_dut_ptr(dut_ptr, aw) \ + do { \ + aw.ready = dut_ptr->DUT_AXI(aw_ready); \ + aw.valid = dut_ptr->DUT_AXI(aw_valid); \ + aw.addr = dut_ptr->DUT_AXI(aw_bits_addr); \ + aw.prot = dut_ptr->DUT_AXI(aw_bits_prot); \ + aw.id = dut_ptr->DUT_AXI(aw_bits_id); \ + aw.len = dut_ptr->DUT_AXI(aw_bits_len); \ + aw.size = dut_ptr->DUT_AXI(aw_bits_size); \ + aw.burst = dut_ptr->DUT_AXI(aw_bits_burst); \ + aw.lock = dut_ptr->DUT_AXI(aw_bits_lock); \ + aw.cache = dut_ptr->DUT_AXI(aw_bits_cache); \ + aw.qos = dut_ptr->DUT_AXI(aw_bits_qos); \ + } while (0); + +#define 
axi_aw_set_dut_ptr(dut_ptr, aw) \ + do { \ + dut_ptr->DUT_AXI(aw_ready) = aw.ready; \ + } while (0); + +#define axi_w_copy_from_dut_ptr(dut_ptr, w) \ + do { \ + w.ready = dut_ptr->DUT_AXI(w_ready); \ + w.valid = dut_ptr->DUT_AXI(w_valid); \ + axi_copy_data(w.data, dut_ptr->DUT_AXI(w_bits_data)) \ + w.strb = dut_ptr->DUT_AXI(w_bits_strb); \ + w.last = dut_ptr->DUT_AXI(w_bits_last); \ + } while (0); + +#define axi_w_set_dut_ptr(dut_ptr, w) \ + do { \ + dut_ptr->DUT_AXI(w_ready) = w.ready; \ + } while (0); + +#define axi_b_copy_from_dut_ptr(dut_ptr, b) \ + do { \ + b.ready = dut_ptr->DUT_AXI(b_valid); \ + b.valid = dut_ptr->DUT_AXI(b_valid); \ + b.resp = dut_ptr->DUT_AXI(b_bits_resp); \ + b.id = dut_ptr->DUT_AXI(b_bits_id); \ + } while (0); + +#define axi_b_set_dut_ptr(dut_ptr, b) \ + do { \ + dut_ptr->DUT_AXI(b_valid) = b.valid; \ + dut_ptr->DUT_AXI(b_bits_resp) = b.resp; \ + dut_ptr->DUT_AXI(b_bits_id) = b.id; \ + } while (0); + +#define axi_ar_copy_from_dut_ptr(dut_ptr, ar) \ + do { \ + ar.ready = dut_ptr->DUT_AXI(ar_ready); \ + ar.valid = dut_ptr->DUT_AXI(ar_valid); \ + ar.addr = dut_ptr->DUT_AXI(ar_bits_addr); \ + ar.prot = dut_ptr->DUT_AXI(ar_bits_prot); \ + ar.id = dut_ptr->DUT_AXI(ar_bits_id); \ + ar.len = dut_ptr->DUT_AXI(ar_bits_len); \ + ar.size = dut_ptr->DUT_AXI(ar_bits_size); \ + ar.burst = dut_ptr->DUT_AXI(ar_bits_burst); \ + ar.lock = dut_ptr->DUT_AXI(ar_bits_lock); \ + ar.cache = dut_ptr->DUT_AXI(ar_bits_cache); \ + ar.qos = dut_ptr->DUT_AXI(ar_bits_qos); \ + } while (0); + +#define axi_ar_set_dut_ptr(dut_ptr, ar) \ + do { \ + dut_ptr->DUT_AXI(ar_ready) = ar.ready; \ + } while (0); + +#define axi_r_copy_from_dut_ptr(dut_ptr, r) \ + do { \ + r.ready = dut_ptr->DUT_AXI(r_ready); \ + r.valid = dut_ptr->DUT_AXI(r_valid); \ + r.resp = dut_ptr->DUT_AXI(r_bits_resp); \ + axi_copy_data(r.data, dut_ptr->DUT_AXI(r_bits_data)) \ + r.last = dut_ptr->DUT_AXI(r_bits_last); \ + r.id = dut_ptr->DUT_AXI(r_bits_id); \ + } while (0); + +#define 
axi_r_set_dut_ptr(dut_ptr, r) \ + do { \ + dut_ptr->DUT_AXI(r_valid) = r.valid; \ + dut_ptr->DUT_AXI(r_bits_resp) = r.resp; \ + axi_copy_data(dut_ptr->DUT_AXI(r_bits_data), r.data) \ + dut_ptr->DUT_AXI(r_bits_last) = r.last; \ + dut_ptr->DUT_AXI(r_bits_id) = r.id; \ + } while (0); + +#define axi_copy_from_dut_ptr(dut_ptr, axi) \ + do { \ + axi_aw_copy_from_dut_ptr(dut_ptr, axi.aw) \ + axi_w_copy_from_dut_ptr(dut_ptr, axi.w) \ + axi_b_copy_from_dut_ptr(dut_ptr, axi.b) \ + axi_ar_copy_from_dut_ptr(dut_ptr, axi.ar) \ + axi_r_copy_from_dut_ptr(dut_ptr, axi.r) \ + } while (0); + +#define axi_set_dut_ptr(dut_ptr, axi) \ + do { \ + axi_aw_set_dut_ptr(dut_ptr, axi.aw) \ + axi_w_set_dut_ptr(dut_ptr, axi.w) \ + axi_b_set_dut_ptr(dut_ptr, axi.b) \ + axi_ar_set_dut_ptr(dut_ptr, axi.ar) \ + axi_r_set_dut_ptr(dut_ptr, axi.r) \ + } while (0); + +void dramsim3_finish(); +void dramsim3_helper(struct axi_channel &axi); +#endif + +#endif diff --git a/src/test/csrc/snapshot.cpp b/src/test/csrc/snapshot.cpp index ea2145f1b67b93e035f0ede7eba8574e54fd2269..15aafafbfbfeaae769cd445a8f75330d4823c289 100644 --- a/src/test/csrc/snapshot.cpp +++ b/src/test/csrc/snapshot.cpp @@ -1,5 +1,6 @@ #include "snapshot.h" +#ifdef VM_SAVABLE void VerilatedSaveMem::flush() { long flush_size = m_cp - m_bufp; assert(buf_size - size > flush_size); @@ -19,3 +20,4 @@ void VerilatedSaveMem::save() { size = 0; printf("save snapshot to %s...\n", m_filename.c_str()); } +#endif diff --git a/src/test/csrc/snapshot.h b/src/test/csrc/snapshot.h index 076e4649b83b71c24252dac344d6eeaaf61612c0..5bebdceda71a0f9f15a5d954277acbaa44a4ce10 100644 --- a/src/test/csrc/snapshot.h +++ b/src/test/csrc/snapshot.h @@ -1,7 +1,11 @@ -#include "VXSSimTop.h" +#ifndef SNAPSHOT_H +#define SNAPSHOT_H + +#ifdef VM_SAVABLE +#include "VXSSimSoC.h" #include -class VerilatedSaveMem : public VerilatedSave { +class VerilatedSaveMem : public VerilatedSerialize { const static long buf_size = 1024 * 1024 * 1024; uint8_t *buf; long size; @@ -28,3 
+32,6 @@ public: void flush(); void save(); }; +#endif + +#endif diff --git a/src/test/scala/cache/CacheTest.scala b/src/test/scala/cache/CacheTest.scala index 17dea507eca5c504915b7133a526c15adca6abdf..22c26bf3afc9802448a2e4369c49b1dda19dbadc 100644 --- a/src/test/scala/cache/CacheTest.scala +++ b/src/test/scala/cache/CacheTest.scala @@ -1,7 +1,5 @@ package top -import noop._ -import bus.simplebus._ import device._ import utils._ diff --git a/src/test/scala/cache/L1plusCacheTest.scala b/src/test/scala/cache/L1plusCacheTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..e02af4e5e6c48cdc00d8b841da6daccb0a1a40e3 --- /dev/null +++ b/src/test/scala/cache/L1plusCacheTest.scala @@ -0,0 +1,446 @@ +package cache + +import scala.collection.mutable.ArrayBuffer +import chipsalliance.rocketchip.config.{Field, Parameters} +import chisel3._ +import chisel3.util._ +import chiseltest.experimental.TestOptionBuilder._ +import chiseltest.internal.VerilatorBackendAnnotation +import chiseltest._ +import chisel3.experimental.BundleLiterals._ +import firrtl.stage.RunFirrtlTransformAnnotation +import chiseltest.ChiselScalatestTester +import device.AXI4RAM +import freechips.rocketchip.amba.axi4.AXI4UserYanker +import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} +import freechips.rocketchip.tilelink.{TLBuffer, TLCacheCork, TLToAXI4, TLXbar} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers +import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} +import utils.{DebugIdentityNode, HoldUnless, XSDebug} +import xiangshan.{HasXSLog, MicroOp} +import xiangshan.cache.{DCache, DCacheLineIO, L1plusCache, L1plusCacheIO, MemoryOpConstants} +import xiangshan.testutils.AddSinks +import xstransforms.PrintModuleName + +import scala.util.Random + +case object L1plusCacheTestKey extends Field[Long] + +class L1plusTestTopIO extends Bundle { + val l1plus = new L1plusCacheIO() 
+ val dcacheStore = new DCacheLineIO() +} + +class L1plusTestTop()(implicit p: Parameters) extends LazyModule{ + + val dcache = LazyModule(new DCache()) + val l1plusCache = LazyModule(new L1plusCache()) + val l2 = LazyModule(new InclusiveCache( + CacheParameters( + level = 2, + ways = 4, + sets = 4 * 1024 / (64 * 4 * 4), + blockBytes = 64, + beatBytes = 32, + cacheName = s"L2" + ), + InclusiveCacheMicroParameters( + writeBytes = 8 + ) + )) + + val ram = LazyModule(new AXI4RAM( + Seq(AddressSet(0x0L, 0xffffffffffL)), + memByte = 128 * 1024 * 1024, + useBlackBox = false + )) + + val xbar = TLXbar() + + xbar := TLBuffer() := dcache.clientNode + xbar := TLBuffer() := l1plusCache.clientNode + + l2.node := xbar + + ram.node := + AXI4UserYanker() := + TLToAXI4() := + TLBuffer() := + TLCacheCork() := + l2.node + + lazy val module = new LazyModuleImp(this) with HasXSLog { + + val io = IO(Flipped(new L1plusTestTopIO)) + + AddSinks() + + dcache.module.io <> DontCare + + dcache.module.io.lsu.store <> io.dcacheStore + l1plusCache.module.io <> io.l1plus + } + +} + +class L1plusCacheTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { + behavior of "L1plusCache" + + val mem_size = 128 * 1024 * 1024 + val block_size = 64 + // val nblocks = mem_size / block_size + val nblocks = 100 + + // data structures + // our golden version cache + val cache_blocks = new Array[BigInt](nblocks) + for (i <- 0 until nblocks) { + cache_blocks(i) = BigInt(0) + } + + // ---------------------------------------- + // useful request parameter values + val CMD_READ = MemoryOpConstants.M_XRD + val r = scala.util.Random + + top.Parameters.set(top.Parameters.debugParameters) + + val annos = Seq( + VerilatorBackendAnnotation, + RunFirrtlTransformAnnotation(new PrintModuleName) + ) + + it should "run" in { + + implicit val p = Parameters((site, up, here) => { + case L1plusCacheTestKey => 0 + }) + + + test(LazyModule(new L1plusTestTop()).module) + .withAnnotations(annos){ c => + + 
c.clock.step(100) + + val sq = new StoreQueue(8) + val lq = new LoadQueue(8) + + def init() = { + sq.init() + lq.init() + + // initialize DUT inputs + c.io.dcacheStore.req.valid.poke(false.B) + c.io.dcacheStore.resp.ready.poke(false.B) + c.io.l1plus.req.valid.poke(false.B) + c.io.l1plus.resp.ready.poke(false.B) + c.io.l1plus.flush.poke(false.B) + } + + def flush_l1plus() = { + c.io.l1plus.flush.poke(true.B) + while (!c.io.l1plus.empty.peek().litToBoolean) { + c.clock.step() + } + c.io.l1plus.flush.poke(false.B) + } + + def evaluate() = { + while (!sq.isFinished() || !lq.isFinished()) { + sq.tick(c.io.dcacheStore) + lq.tick(c.io.l1plus) + c.clock.step() + } + } + + // ---------------------------------------- + // scan test + // write every memory block and then read out every memory cell + def scan_test() = { + println(s"scan test") + init() + // first, initialize every memory block with random numbers + for (i <- 0 until nblocks) { + val addr = i * 64 + val words = (0 until 8) map { _ => + (BigInt(r.nextLong() & 0x7fffffffffffffffL)) + } + val data = words.foldLeft(BigInt(0))((sum, i) => sum << 64 | i) + cache_blocks(i) = data + println(f"enq store addr: $addr%x data: $data%x") + sq.enq(Req(addr, data)) + } + // execute reqs + evaluate() + + // read them out + for (i <- 0 until nblocks) { + val addr = i * 64 + val data = cache_blocks(i) + println(f"enq load addr: $addr%x data: $data%x") + lq.enq(Req(addr, data)) + } + // execute reqs + evaluate() + } + + scan_test() + + // self_modify_test + def self_modify_test() = { + println(s"self_modify_test") + for (i <- 0 until 10) { + flush_l1plus() + scan_test() + } + } + + self_modify_test() + } + } +} + +// emulated queue +class IdPool(val nReqIds: Int, name: String) { + val freeIds = new Array[Boolean](nReqIds) + + def allocate(): Int = { + for (i <- 0 until freeIds.size) { + if (freeIds(i)) { + println(f"$name allocate: $i") + freeIds(i) = false + return i + } + } + // no free id to allocate + println(f"$name allocate 
failed") + return -1 + } + + def free(id: Int): Unit = { + println(f"$name free: $id") + assert(!freeIds(id)) + freeIds(id) = true + } + + def init(): Unit = { + for (i <- 0 until freeIds.size) { + freeIds(i) = true + } + } +} + +case class Req( + addr: Long, + data: BigInt +) { + override def toString() : String = { + return f"addr: $addr%x data: $data%x" + } +} + +case class QueueEntry( + var id: Int, // it's transaction id + req: Req +) { + override def toString() : String = { + return f"id: $id%d req: $req" + } +} + +class Queue(nEntries: Int, name: String) { + // Queue + // --------------------------------------- + val idPool = new IdPool(nEntries, name + "IdPool") + val queue = new ArrayBuffer[QueueEntry]() + def enq(req: Req) = { + // for unissued reqs, they have id = -1 + queue += new QueueEntry(-1, req) + } + + // select a req to issue + // req with id == -1 are not issued + def select(): Int = { + for (i <- 0 until queue.size) { + if (queue(i).id == -1) + return i + } + return -1 + } + + // retire the req with transaction id tId + def retire(tId: Int): Unit = { + println(f"$name retire transaction: $tId%d") + for (i <- 0 until queue.size) { + if (queue(i).id == tId) { + // remove this request + queue.remove(i) + println(f"$name retire req: $i%d transaction: $tId%d") + return + } + } + assert(false) + } + + def issue(idx: Int, tId: Int) = { + println(f"$name issue req: $idx%d transaction: $tId%d") + assert(queue(idx).id == -1) + queue(idx).id = tId + } + + // look up req by transaction id tId + def lookUp(tId: Int): Req = { + for (i <- 0 until queue.size) { + if (queue(i).id == tId) { + // remove this request + return queue(i).req + } + } + // we must return a value + // just to make scala happy + return Req(0, 0) + } + + var reqWaiting = false + + def init(): Unit = { + idPool.init() + queue.clear() + reqWaiting = false + } + + def isFinished() = queue.isEmpty +} + +class StoreQueue(nEntries: Int) extends Queue(nEntries, "StoreQueue") { + def 
sendReq(port: DCacheLineIO): Unit = { + val req = port.req + // has last cycle's req been fired? + // can we send a new request in this cycle + if (!reqWaiting) { + val reqIdx = select() + if (reqIdx == -1) { + // no more request to send + req.valid.poke(false.B) + return + } + + val tId = idPool.allocate() + if (tId == -1) { + // no more request to send + req.valid.poke(false.B) + return + } + + // try sending a new request in this cycle + // select a req to issue + + reqWaiting = true + + issue(reqIdx, tId) + + val CMD_WRITE = MemoryOpConstants.M_XWR + val FULL_MASK = BigInt("ffffffffffffffff", 16).U + + val r = queue(reqIdx).req + req.valid.poke(true.B) + req.bits.cmd.poke(CMD_WRITE) + req.bits.addr.poke(r.addr.U) + req.bits.data.poke(r.data.U) + req.bits.mask.poke(FULL_MASK) + req.bits.meta.id.poke(tId.U) + req.bits.meta.vaddr.poke(r.addr.U) + req.bits.meta.paddr.poke(r.addr.U) + // req.bits.meta.uop.poke(0.U.asTypeOf(new MicroOp)) + req.bits.meta.mmio.poke(false.B) + req.bits.meta.tlb_miss.poke(false.B) + req.bits.meta.mask.poke(FULL_MASK) + req.bits.meta.replay.poke(false.B) + } + + if (req.valid.peek().litToBoolean && req.ready.peek().litToBoolean) { + reqWaiting = false + } + } + + def handleResp(port: DCacheLineIO) = { + val resp = port.resp + // always ready + resp.ready.poke(true.B) + if (resp.valid.peek().litToBoolean) { + val id = resp.bits.meta.id.peek().litValue.longValue.toInt + idPool.free(id) + retire(id) + } + } + + def tick(port: DCacheLineIO) = { + // first, try to send reqs + sendReq(port) + // then, receive responses + handleResp(port) + } +} + +class LoadQueue(nEntries: Int) extends Queue(nEntries, "LoadQueue") { + def sendReq(port: L1plusCacheIO): Unit = { + val req = port.req + // has last cycle's req been fired? 
+ // can we send a new request in this cycle + if (!reqWaiting) { + val reqIdx = select() + if (reqIdx == -1) { + // no more request to send + req.valid.poke(false.B) + return + } + + val tId = idPool.allocate() + if (tId == -1) { + // no more request to send + req.valid.poke(false.B) + return + } + + // try sending a new request in this cycle + // select a req to issue + + reqWaiting = true + issue(reqIdx, tId) + + val CMD_READ = MemoryOpConstants.M_XRD + + val r = queue(reqIdx).req + req.valid.poke(true.B) + req.bits.cmd.poke(CMD_READ) + req.bits.addr.poke(r.addr.U) + req.bits.id.poke(tId.U) + } + + if (req.valid.peek().litToBoolean && req.ready.peek().litToBoolean) { + reqWaiting = false + } + } + + def handleResp(port: L1plusCacheIO) = { + val resp = port.resp + // always ready + resp.ready.poke(true.B) + if (resp.valid.peek().litToBoolean) { + val id = resp.bits.id.peek().litValue.longValue.toInt + val rdata = resp.bits.data.peek().litValue + val r = lookUp(id) + assert(r.data == rdata) + idPool.free(id) + retire(id) + } + } + + def tick(port: L1plusCacheIO) = { + // first, try to send reqs + sendReq(port) + // then, receive responses + handleResp(port) + } +} diff --git a/src/test/scala/cache/L2CacheNonInclusiveGetTest.scala b/src/test/scala/cache/L2CacheNonInclusiveGetTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..b0026f72307aa3c2545e2694e29a5cff9b720508 --- /dev/null +++ b/src/test/scala/cache/L2CacheNonInclusiveGetTest.scala @@ -0,0 +1,326 @@ +package cache + +import scala.collection.mutable.ArrayBuffer +import chipsalliance.rocketchip.config.{Field, Parameters} +import chisel3._ +import chisel3.util._ +import chiseltest.experimental.TestOptionBuilder._ +import chiseltest.internal.VerilatorBackendAnnotation +import chiseltest._ +import chisel3.experimental.BundleLiterals._ +import firrtl.stage.RunFirrtlTransformAnnotation +import chiseltest.ChiselScalatestTester +import device.AXI4RAM +import 
freechips.rocketchip.amba.axi4.AXI4UserYanker +import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} +import freechips.rocketchip.tilelink.{TLBuffer, TLCacheCork, TLToAXI4, TLXbar} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers +import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheControlParameters, InclusiveCacheMicroParameters} +import utils.{DebugIdentityNode, HoldUnless, XSDebug} +import xiangshan.{HasXSLog, MicroOp} +import xiangshan.cache.{DCache, DCacheLineIO, DCacheWordIO, L1plusCache, L1plusCacheIO, MemoryOpConstants, Uncache} +import xiangshan.testutils.AddSinks +import xstransforms.PrintModuleName + +import scala.util.Random + +class L2NonInclusiveGetTestTopIO extends Bundle { + val l1plus = new L1plusCacheIO() + val dcacheStore = new DCacheLineIO() + val l2Flush = new DCacheWordIO +} + +class L2NonInclusiveGetTestTop()(implicit p: Parameters) extends LazyModule { + val uncache = LazyModule(new Uncache()) + val dcache = LazyModule(new DCache()) + val l1plusCache = LazyModule(new L1plusCache()) + val l2 = LazyModule(new InclusiveCache( + CacheParameters( + level = 2, + ways = 4, + sets = 4 * 1024 / (64 * 4 * 4), + blockBytes = 64, + beatBytes = 32, + cacheName = s"L2" + ), + InclusiveCacheMicroParameters( + writeBytes = 8 + ), + Some(InclusiveCacheControlParameters( + address = 0x8000000L, + beatBytes = 8)))) + + val ram = LazyModule(new AXI4RAM( + Seq(AddressSet(0x0L, 0x7ffffffL)), + memByte = 128 * 1024 * 1024, + useBlackBox = false + )) + + val xbar = TLXbar() + + xbar := TLBuffer() := DebugIdentityNode() := dcache.clientNode + xbar := TLBuffer() := DebugIdentityNode() := l1plusCache.clientNode + + l2.node := DebugIdentityNode() := xbar + + ram.node := + AXI4UserYanker() := + TLToAXI4() := + TLBuffer() := + TLCacheCork() := + DebugIdentityNode() := + l2.node + + // connect uncache access to l2 control node + l2.ctlnode.get := DebugIdentityNode() := 
uncache.clientNode + + lazy val module = new LazyModuleImp(this) with HasXSLog { + + val io = IO(Flipped(new L2NonInclusiveGetTestTopIO)) + + AddSinks() + + dcache.module.io <> DontCare + + dcache.module.io.lsu.store <> io.dcacheStore + l1plusCache.module.io <> io.l1plus + uncache.module.io.lsq <> io.l2Flush + } +} + +class L2NonInclusiveGetTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { + behavior of "L2Cache" + + val mem_size = 128 * 1024 * 1024 + val block_size = 64 + val block_bits = log2Up(block_size) + // val nblocks = mem_size / block_size + val nblocks = 100 + + // data structures + // our golden version cache + val cache_blocks = new Array[BigInt](nblocks) + for (i <- 0 until nblocks) { + cache_blocks(i) = BigInt(0) + } + + // ---------------------------------------- + // useful request parameter values + val CMD_READ = MemoryOpConstants.M_XRD + val CMD_WRITE = MemoryOpConstants.M_XWR + // 64bit full mask + val FULL_MASK_64 = BigInt("ffffffffffffffff", 16).U + val L2_FLUSH_BASE_ADDR = 0x8000000L + val CONFIG_ADDR = L2_FLUSH_BASE_ADDR + 0x0 + val FLUSH64_ADDR = L2_FLUSH_BASE_ADDR + 0x200 + val FLUSH32_ADDR = L2_FLUSH_BASE_ADDR + 0x240 + + val r = scala.util.Random + + top.Parameters.set(top.Parameters.debugParameters) + + val annos = Seq( + VerilatorBackendAnnotation, + RunFirrtlTransformAnnotation(new PrintModuleName) + ) + + it should "run" in { + + implicit val p = Parameters((site, up, here) => { + case L1plusCacheTestKey => 0 + }) + + + test(LazyModule(new L2NonInclusiveGetTestTop()).module) + .withAnnotations(annos){ c => + + c.clock.step(100) + + val sq = new StoreQueue(8) + val lq = new LoadQueue(8) + + def init() = { + sq.init() + lq.init() + + // initialize DUT inputs + c.io.dcacheStore.req.valid.poke(false.B) + c.io.dcacheStore.resp.ready.poke(false.B) + c.io.l1plus.req.valid.poke(false.B) + c.io.l1plus.resp.ready.poke(false.B) + c.io.l1plus.flush.poke(false.B) + c.io.l2Flush.req.valid.poke(false.B) + 
c.io.l2Flush.resp.ready.poke(false.B) + } + + def mmio_read(addr: BigInt): BigInt = { + // send req + val req = c.io.l2Flush.req + req.valid.poke(true.B) + req.bits.cmd.poke(CMD_READ) + req.bits.addr.poke(addr.U) + req.bits.data.poke(0.U) + req.bits.mask.poke(FULL_MASK_64) + req.bits.meta.id.poke(0.U) + req.bits.meta.vaddr.poke(addr.U) + req.bits.meta.paddr.poke(addr.U) + // req.bits.meta.uop.poke(0.U.asTypeOf(new MicroOp)) + req.bits.meta.mmio.poke(true.B) + req.bits.meta.tlb_miss.poke(false.B) + req.bits.meta.mask.poke(FULL_MASK_64) + req.bits.meta.replay.poke(false.B) + + while (!req.ready.peek().litToBoolean) { + c.clock.step() + } + // actually send the req + c.clock.step() + + // lower valid + req.valid.poke(false.B) + + // recv resp + val resp = c.io.l2Flush.resp + resp.ready.poke(true.B) + while (!resp.valid.peek().litToBoolean) { + c.clock.step() + } + val data = resp.bits.data.peek().litValue + // actually recv the response + c.clock.step() + + // lower ready + resp.ready.poke(false.B) + + return data + } + + def mmio_write(addr: BigInt, data: BigInt) = { + // send req + val req = c.io.l2Flush.req + req.valid.poke(true.B) + req.bits.cmd.poke(CMD_WRITE) + req.bits.addr.poke(addr.U) + req.bits.data.poke(data.U) + req.bits.mask.poke(FULL_MASK_64) + req.bits.meta.id.poke(0.U) + req.bits.meta.vaddr.poke(addr.U) + req.bits.meta.paddr.poke(addr.U) + // req.bits.meta.uop.poke(0.U.asTypeOf(new MicroOp)) + req.bits.meta.mmio.poke(true.B) + req.bits.meta.tlb_miss.poke(false.B) + req.bits.meta.mask.poke(FULL_MASK_64) + req.bits.meta.replay.poke(false.B) + + while (!req.ready.peek().litToBoolean) { + c.clock.step() + } + // actually send the req + c.clock.step() + + // lower valid + req.valid.poke(false.B) // recv resp + val resp = c.io.l2Flush.resp + resp.ready.poke(true.B) + while (!resp.valid.peek().litToBoolean) { + c.clock.step() + } + // actually recv the response + c.clock.step() + + // lower ready + resp.ready.poke(false.B) + } + + def get_l2_configurations() 
= { + val config = mmio_read(CONFIG_ADDR) + val nBank = config & 0xf + val nWay = config >> 8 & 0xf + val nSet = 1 << (config.toInt >> 16 & 0xf) + val nBlock = 1 << (config.toInt >> 24 & 0xf) + println(f"L2 configuration: nBank: $nBank nWay: $nWay nSet: $nSet nBlock: $nBlock") + } + + def flush_l2_block(addr: BigInt) = { + mmio_write(FLUSH64_ADDR, addr) + println(f"L2 flush block: $addr%x") + } + + def flush_l1plus() = { + c.io.l1plus.flush.poke(true.B) + while (!c.io.l1plus.empty.peek().litToBoolean) { + c.clock.step() + } + c.io.l1plus.flush.poke(false.B) + } + + def flush_l2_range(begin: BigInt, end: BigInt) = { + var addr = begin >> block_bits << block_bits + while (addr < end) { + flush_l2_block(addr) + addr += block_size + } + } + + def evaluate() = { + while (!sq.isFinished() || !lq.isFinished()) { + sq.tick(c.io.dcacheStore) + lq.tick(c.io.l1plus) + c.clock.step() + } + } + + get_l2_configurations() + + // ---------------------------------------- + // scan test + def populate_memory() = { + println(s"scan test") + init() + // first, initialize every memory block with random numbers + for (i <- 0 until nblocks) { + val addr = i * 64 + val words = (0 until 8) map { _ => + (BigInt(r.nextLong() & 0x7fffffffffffffffL)) + } + val data = words.foldLeft(BigInt(0))((sum, i) => sum << 64 | i) + cache_blocks(i) = data + println(f"enq store addr: $addr%x data: $data%x") + sq.enq(Req(addr, data)) + } + // execute reqs + evaluate() + } + + def flush_memory() = { + flush_l2_range(0, (nblocks - 1)* block_size) + } + + def read_memory() = { + // read them out + for (i <- 0 until nblocks) { + val addr = i * 64 + val data = cache_blocks(i) + println(f"enq load addr: $addr%x data: $data%x") + lq.enq(Req(addr, data)) + } + // execute reqs + evaluate() + } + + for (i <- 0 until 10) { + populate_memory() + flush_memory() + // these loads should cause get miss + flush_l1plus() + read_memory() + + populate_memory() + // these loads should not miss + flush_l1plus() + read_memory() + 
} + } + } +} diff --git a/src/test/scala/cache/L2CacheTest.scala b/src/test/scala/cache/L2CacheTest.scala index 936e133f12c8cf916969503d72100c095b7d128d..5119792635bc828d111bceb269823e2652285224 100644 --- a/src/test/scala/cache/L2CacheTest.scala +++ b/src/test/scala/cache/L2CacheTest.scala @@ -4,8 +4,7 @@ import chipsalliance.rocketchip.config.{Field, Parameters} import chisel3._ import chisel3.util._ import chiseltest.experimental.TestOptionBuilder._ -import chiseltest.internal.VerilatorBackendAnnotation -import chiseltest.internal.LineCoverageAnnotation +import chiseltest.internal.{VerilatorBackendAnnotation, LineCoverageAnnotation, ToggleCoverageAnnotation, UserCoverageAnnotation, StructuralCoverageAnnotation} import chiseltest._ import chisel3.experimental.BundleLiterals._ import firrtl.stage.RunFirrtlTransformAnnotation @@ -14,7 +13,8 @@ import device.AXI4RAM import freechips.rocketchip.amba.axi4.AXI4UserYanker import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp} import freechips.rocketchip.tilelink.{TLBuffer, TLCacheCork, TLToAXI4, TLXbar} -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters} import utils.{DebugIdentityNode, HoldUnless, XSDebug} import xiangshan.HasXSLog @@ -57,7 +57,7 @@ case object L3CacheTestKey extends Field[L3CacheTestParams] class L2TestTopIO extends Bundle { val in = Flipped(DecoupledIO(new Bundle() { val wdata = Input(UInt(64.W)) - val waddr = Input(UInt(20.W)) + val waddr = Input(UInt(40.W)) val hartId = Input(UInt(1.W)) })) val out = DecoupledIO(new Bundle() { @@ -100,7 +100,7 @@ class L2TestTop()(implicit p: Parameters) extends LazyModule{ )) val ram = LazyModule(new AXI4RAM( - AddressSet(0x0L, 0xffffffffffL), + Seq(AddressSet(0x0L, 0xffffffffffL)), memByte = 128 * 1024 * 1024, useBlackBox = false )) @@ -252,13 +252,16 @@ class 
L2TestTopWrapper()(implicit p: Parameters) extends LazyModule { } } -class L2CacheTest extends FlatSpec with ChiselScalatestTester with Matchers{ +class L2CacheTest extends AnyFlatSpec with ChiselScalatestTester with Matchers{ top.Parameters.set(top.Parameters.debugParameters) val annos = Seq( VerilatorBackendAnnotation, LineCoverageAnnotation, + ToggleCoverageAnnotation, + UserCoverageAnnotation, + StructuralCoverageAnnotation, RunFirrtlTransformAnnotation(new PrintModuleName) ) @@ -280,7 +283,9 @@ class L2CacheTest extends FlatSpec with ChiselScalatestTester with Matchers{ c.clock.step(100) for(i <- 0 until 100000){ - val addr = Random.nextInt(0xfffff) & 0xffe00 // align to block size + // DRAM AddressSet is above 0x80000000L + // also, note that, + has higher priority than & !!! + val addr = (Random.nextInt(0x7fffffff).toLong & 0xfffffe00L) + 0x80000000L // align to block size val data = Random.nextLong() & 0x7fffffffffffffffL c.io.in.enqueue(chiselTypeOf(c.io.in.bits).Lit( _.waddr -> addr.U, diff --git a/src/test/scala/cache/UnalignedGetTest.scala b/src/test/scala/cache/UnalignedGetTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..0a561de19d21cecf60b8ba316e22fd7712b5662e --- /dev/null +++ b/src/test/scala/cache/UnalignedGetTest.scala @@ -0,0 +1,472 @@ +package cache + +import chipsalliance.rocketchip.config.{Field, Parameters} +import chisel3._ +import chisel3.util._ +import chiseltest.experimental.TestOptionBuilder._ +import chiseltest.internal.VerilatorBackendAnnotation +import chiseltest._ +import chisel3.experimental.BundleLiterals._ +import firrtl.stage.RunFirrtlTransformAnnotation +import chiseltest.ChiselScalatestTester +import device.AXI4RAM +import freechips.rocketchip.amba.axi4.AXI4UserYanker +import freechips.rocketchip.diplomacy.{AddressSet, LazyModule, LazyModuleImp, IdRange} +import freechips.rocketchip.tilelink.{TLBuffer, TLCacheCork, TLToAXI4, TLXbar, TLMasterParameters, TLMasterPortParameters, TLClientNode} +import 
org.scalatest.matchers.should.Matchers +import org.scalatest.flatspec.AnyFlatSpec +import sifive.blocks.inclusivecache.{CacheParameters, InclusiveCache, InclusiveCacheMicroParameters, InclusiveCacheControlParameters} +import utils.{DebugIdentityNode, HoldUnless, XSDebug} +import xiangshan.{HasXSLog, XSBundle, HasXSParameter} +import xiangshan.cache.{DCache, Uncache, DCacheLineReq, DCacheWordReq, MemoryOpConstants} +import xiangshan.testutils.AddSinks +import xstransforms.PrintModuleName +import utils.MaskExpand + + +import scala.util.Random + +// GetGenerator: a tilelink module that generate get of different addr and sizes +class GetGeneratorReq extends XSBundle +{ + val address = Output(UInt(PAddrBits.W)) + val size = Output(UInt(8.W)) +} + +class GetGeneratorResp extends XSBundle +{ + val data = Output(UInt((64 * 8).W)) +} + +class GetGeneratorIO extends XSBundle +{ + val req = DecoupledIO(new GetGeneratorReq) + val resp = Flipped(DecoupledIO(new GetGeneratorResp)) +} + +class GetGenerator()(implicit p: Parameters) extends LazyModule with HasXSParameter { + + val clientParameters = TLMasterPortParameters.v1( + Seq(TLMasterParameters.v1( + name = "GetGenerator", + sourceId = IdRange(0, 1) + )) + ) + + val clientNode = TLClientNode(Seq(clientParameters)) + + lazy val module = new GetGeneratorImp(this) +} + +class GetGeneratorImp(outer: GetGenerator) extends LazyModuleImp(outer) + with HasXSParameter + with HasXSLog +{ + + val io = IO(Flipped(new GetGeneratorIO)) + + val (bus, edge) = outer.clientNode.out.head + require(bus.d.bits.data.getWidth == l1BusDataWidth, "GetGenerator: tilelink width does not match") + + // assign default values to output signals + io.req.ready := false.B + io.resp.valid := false.B + io.resp.bits := DontCare + + bus.a.valid := false.B + bus.a.bits := DontCare + bus.b.ready := false.B + bus.c.valid := false.B + bus.c.bits := DontCare + bus.d.ready := false.B + bus.e.valid := false.B + bus.e.bits := DontCare + + val mem_acquire = bus.a + val 
mem_grant = bus.d + + // tilelink req/resp state machine + val s_invalid :: s_refill_req :: s_refill_resp :: s_send_resp :: Nil = Enum(4) + + val state = RegInit(s_invalid) + + val req = Reg(new GetGeneratorReq) + + val (_, _, refill_done, refill_address_inc) = edge.addr_inc(mem_grant) + + val refillCycles = 2 + val refill_ctr = Reg(UInt(log2Up(refillCycles).W)) + + val blockSize = 64 + val beatBytes = l1BusDataWidth / 8 + val nBeats = blockSize / beatBytes + val refill_data = Reg(Vec(nBeats, UInt(l1BusDataWidth.W))) + + when (state =/= s_invalid) { + XSDebug("state: %d\n", state) + } + + // -------------------------------------------- + // s_invalid: receive requests + when (state === s_invalid) { + io.req.ready := true.B + + when (io.req.fire()) { + refill_ctr := 0.U + req := io.req.bits + state := s_refill_req + + (0 until nBeats) map { i => refill_data(i) := 0.U } + } + } + + // -------------------------------------------- + // refill + when (state === s_refill_req) { + mem_acquire.valid := true.B + mem_acquire.bits := edge.Get( + fromSource = 0.U, + toAddress = req.address, + lgSize = req.size)._2 + when (mem_acquire.fire()) { + state := s_refill_resp + } + } + + when (state === s_refill_resp) { + mem_grant.ready := true.B + + when (edge.hasData(mem_grant.bits)) { + when (mem_grant.fire()) { + refill_ctr := refill_ctr + 1.U + val beatIdx = (req.address(log2Up(blockSize) - 1, 0) >> log2Up(beatBytes)) + refill_ctr + val mask = MaskExpand(edge.mask(req.address, req.size)) + // zero out unneeded data, so that, we can use expect to compare data outputs + XSDebug("beatIdx: %d data: %x mask: %x\n", beatIdx, mem_grant.bits.data, mask) + refill_data(beatIdx) := mem_grant.bits.data & mask + + when (refill_done) { + state := s_send_resp + } + } + } + } + + // -------------------------------------------- + when (state === s_send_resp) { + + val resp_data = Cat((0 until nBeats).reverse map { r => refill_data(r) }) + io.resp.valid := true.B + io.resp.bits.data := resp_data 
+ + when (io.resp.fire()) { + state := s_invalid + } + } + + // debug output + when (io.req.fire()) { + XSDebug("address: %x size: %d\n", io.req.bits.address, io.req.bits.size) + } + + when (io.resp.fire()) { + XSDebug("data: %x\n", io.resp.bits.data) + } +} + +case object UnalignedGetTestKey extends Field[Long] + +class UnalignedGetTestTopIO extends Bundle { + val in = Flipped(DecoupledIO(new Bundle() { + val wdata = Input(UInt(512.W)) + val waddr = Input(UInt(40.W)) // 40 bits (same as L2TestTopIO): the test scans 128 MiB, 20 bits only reach 1 MiB + val raddr = Input(UInt(40.W)) // raddr = waddr + offset, so it needs the same range as waddr + val rsize = Input(UInt(8.W)) + })) + val out = DecoupledIO(new Bundle() { + val rdata = Output(UInt(512.W)) + }) +} + +class UnalignedGetTestTop()(implicit p: Parameters) extends LazyModule{ + + // use uncache to force L2 eviction + // so that we can test uncached get + val uncache = LazyModule(new Uncache()) + val dcache = LazyModule(new DCache()) + val getGenerator = LazyModule(new GetGenerator()) + val l2 = LazyModule(new InclusiveCache( + CacheParameters( + level = 2, + ways = 4, + sets = 4 * 1024 / (64 * 4 * 4), + blockBytes = 64, + beatBytes = 32, + cacheName = s"L2" + ), + InclusiveCacheMicroParameters( + writeBytes = 8 + ), + Some(InclusiveCacheControlParameters( + address = 0x8000000L, + beatBytes = 8)))) + + val ram = LazyModule(new AXI4RAM( + Seq(AddressSet(0x0L, 0xffffffffffL)), + memByte = 128 * 1024 * 1024, + useBlackBox = false + )) + + val xbar = TLXbar() + + xbar := TLBuffer() := DebugIdentityNode() := dcache.clientNode + xbar := TLBuffer() := DebugIdentityNode() := getGenerator.clientNode + + l2.node := DebugIdentityNode() := xbar + + ram.node := + AXI4UserYanker() := + TLToAXI4() := + TLBuffer() := + TLCacheCork() := + DebugIdentityNode() := + l2.node + + // connect uncache access to l2 control node + l2.ctlnode.get := DebugIdentityNode() := uncache.clientNode + + lazy val module = new LazyModuleImp(this) with HasXSLog { + + val io = IO(new UnalignedGetTestTopIO) + + val in = HoldUnless(io.in.bits, io.in.fire()) + + dcache.module.io <> DontCare +
uncache.module.io <> DontCare + + val flushPort = uncache.module.io.lsq + val storePort = dcache.module.io.lsu.store + val loadPort = getGenerator.module.io + + // 64bit full mask + val FULL_MASK_64 = BigInt("ffffffffffffffff", 16).U + val L2_FLUSH_BASE_ADDR = 0x8000000L + val CONFIG_ADDR = L2_FLUSH_BASE_ADDR + 0x0 + val FLUSH64_ADDR = L2_FLUSH_BASE_ADDR + 0x200 + val FLUSH32_ADDR = L2_FLUSH_BASE_ADDR + 0x240 + + def sendFlushReq(addr: UInt): DCacheWordReq = { + val req = Wire(new DCacheWordReq) + req.cmd := MemoryOpConstants.M_XWR + req.addr := FLUSH64_ADDR.U + req.data := addr + req.mask := FULL_MASK_64 + req.meta.id := 0.U + req.meta.vaddr := FLUSH64_ADDR.U + req.meta.paddr := FLUSH64_ADDR.U + req.meta.uop := DontCare + req.meta.mmio := true.B + req.meta.tlb_miss := false.B + req.meta.mask := FULL_MASK_64 + req.meta.replay := false.B + req + } + + def sendStoreReq(addr: UInt, data: UInt): DCacheLineReq = { + val req = Wire(new DCacheLineReq) + req.cmd := MemoryOpConstants.M_XWR + req.addr := addr + req.data := data + req.mask := Fill(req.mask.getWidth, true.B) + req.meta := DontCare + req + } + + def sendLoadReq(addr: UInt, size: UInt): GetGeneratorReq = { + val req = Wire(new GetGeneratorReq) + req.address := addr + req.size := size + req + } + + val s_idle :: s_write_req :: s_write_resp :: s_flush_req :: s_flush_resp :: s_read_req :: s_read_resp :: s_finish :: Nil = Enum(8) + val state = RegInit(s_idle) + + switch(state){ + is(s_idle){ + when(io.in.fire()){ + state := s_write_req + } + } + is(s_write_req){ + when(storePort.req.fire()) { + state := s_write_resp + } + } + is(s_write_resp){ + when(storePort.resp.fire()) { + state := s_flush_req + } + } + is(s_flush_req){ + when(flushPort.req.fire()) { + state := s_flush_resp + } + } + is(s_flush_resp){ + when(flushPort.resp.fire()) { + state := s_read_req + } + } + is(s_read_req){ + when(loadPort.req.fire()) { + state := s_read_resp + } + } + is(s_read_resp){ + when(loadPort.resp.fire()) { + state := s_finish + } 
+ } + } + + io.in.ready := state === s_idle + + XSDebug(p"state: $state\n") + + val storeReq = Wire(new DCacheLineReq) + + storeReq := sendStoreReq(in.waddr, in.wdata) + + storePort.req.bits := storeReq + storePort.req.valid := state === s_write_req + storePort.resp.ready := true.B + XSDebug( + storePort.req.fire(), + "write data %x to dcache\n", + storePort.req.bits.data, + ) + + val flushReq = Wire(new DCacheWordReq) + + flushReq := sendFlushReq(in.waddr) + + flushPort.req.bits := flushReq + flushPort.req.valid := state === s_flush_req + flushPort.resp.ready := true.B + XSDebug( + flushPort.req.fire(), + "flush address %x to memory\n", + flushPort.req.bits.data, // sendFlushReq puts the block address in data; addr is always the FLUSH64 register + ) + + val loadReq = sendLoadReq(in.raddr, in.rsize) + + loadPort.req.bits := loadReq + loadPort.req.valid := state === s_read_req + loadPort.resp.ready := true.B + XSDebug( + loadPort.resp.fire(), + "read data %x form getGenerator\n", + loadPort.resp.bits.data, + ) + + val rdata = Reg(UInt(512.W)) + + when(loadPort.resp.fire()) { + state := s_finish + rdata := loadPort.resp.bits.data + } + + io.out.bits.rdata := rdata + io.out.valid := state === s_finish + + when(io.out.fire()){ + state := s_idle + } + } + +} + +class UnalignedGetTestTopWrapper()(implicit p: Parameters) extends LazyModule { + + val testTop = LazyModule(new UnalignedGetTestTop()) + + lazy val module = new LazyModuleImp(this){ + val io = IO(new UnalignedGetTestTopIO) + + AddSinks() + + io <> testTop.module.io + } +} + +class UnalignedGetTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { + + top.Parameters.set(top.Parameters.debugParameters) + + val annos = Seq( + VerilatorBackendAnnotation, + RunFirrtlTransformAnnotation(new PrintModuleName) + ) + + it should "run" in { + + implicit val p = Parameters((site, up, here) => { + case UnalignedGetTestKey => 0 + }) + + + test(LazyModule(new UnalignedGetTestTopWrapper()).module) + .withAnnotations(annos){ c => + + + c.io.in.initSource().setSourceClock(c.clock) +
c.io.out.initSink().setSinkClock(c.clock) + + c.clock.step(100) + + val mem_size = 128 * 1024 * 1024 + val block_size = 64 + val nblocks = mem_size / block_size + // val nblocks = 100 + for(i <- 0 until nblocks) { + // we do not support l1plus flush for now + // so we could only scan the whole memory, + // and write every block for only once. + // if we rewrite the same block multiple times + // GetGenerator could not give correct data since it hasn't been flushed + // val addr = Random.nextInt(0xfffff) & 0xffe00 // align to block size + val waddr = i * block_size + val words = (0 until 8) map { _ => + (BigInt(Random.nextLong() & 0x7fffffffffffffffL)) + } + val wdata = words.foldLeft(BigInt(0))((sum, i) => sum << 64 | i) + + val maxSize = block_size + val lgMaxSize = log2Up(maxSize) + val lgRsize = Random.nextInt(lgMaxSize + 1) + val rsize = 1 << lgRsize + + // addr must be aligned to size + val offset = (Random.nextInt(maxSize) >> lgRsize) << lgRsize + val raddr = waddr + offset + // generate mask from raddr and rsize + val mask = (BigInt(1) << (rsize * 8)) - 1 + val rmask = mask << (offset * 8) + val rdata = wdata & rmask + + println(f"UnalignedGetTest: waddr: $waddr%x wdata: $wdata%x offset: $offset%x rsize: $rsize%d rmask: $rmask%x rdata: $rdata%x") + c.io.in.enqueue(chiselTypeOf(c.io.in.bits).Lit( + _.waddr -> waddr.U, + _.wdata -> wdata.U, + _.raddr -> raddr.U, + _.rsize -> lgRsize.U + )) + c.io.out.expectDequeue(chiselTypeOf(c.io.out.bits).Lit( + _.rdata -> rdata.U + )) + } + } + } +} diff --git a/src/test/scala/device/AXI4BurstMaster.scala b/src/test/scala/device/AXI4BurstMaster.scala index b71f2ca606d2460ab097e0a46c018e4c1c8dd21b..5aaa293850983450a5ea6645a7903d264c59d9b6 100644 --- a/src/test/scala/device/AXI4BurstMaster.scala +++ b/src/test/scala/device/AXI4BurstMaster.scala @@ -27,6 +27,8 @@ class AXI4BurstMaster }) val (out, edge) = node.out.head + // do not let dma AXI signals optimized out + chisel3.dontTouch(out) val cnt = RegInit(nOp.U) val addr = 
RegInit(startAddr.U) val s_idle :: s_addr :: s_data :: Nil = Enum(3) diff --git a/src/test/scala/device/AXI4RamTest.scala b/src/test/scala/device/AXI4RamTest.scala index 3270ff475f33076f61b0beeb883f96a423d53ee8..9c203fa56329119f38957ce88ab0386008c97d93 100644 --- a/src/test/scala/device/AXI4RamTest.scala +++ b/src/test/scala/device/AXI4RamTest.scala @@ -4,9 +4,10 @@ import chipsalliance.rocketchip.config._ import chisel3._ import chiseltest._ import freechips.rocketchip.amba.axi4.{AXI4Deinterleaver, AXI4UserYanker} -import org.scalatest.{FlatSpec, Matchers} import freechips.rocketchip.tilelink._ import freechips.rocketchip.diplomacy._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import utils.DebugIdentityNode class AXI4RamFuzzTest()(implicit p: Parameters) extends LazyModule { @@ -19,7 +20,7 @@ class AXI4RamFuzzTest()(implicit p: Parameters) extends LazyModule { inFlight = 10 )) val ident = LazyModule(new DebugIdentityNode()) - val axiRam = LazyModule(new AXI4RAM(addressSet, memByte = 1024)) + val axiRam = LazyModule(new AXI4RAM(Seq(addressSet), memByte = 1024)) axiRam.node := AXI4UserYanker() := @@ -38,7 +39,7 @@ class AXI4RamBurstTest()(implicit p: Parameters) extends LazyModule { val addressSet = AddressSet(0x38000000L, 0x0000ffffL) val burst = LazyModule(new AXI4BurstMaster(startAddr = addressSet.base.toLong, nOp = 3)) - val axiRam = LazyModule(new AXI4RAM(addressSet, memByte = 1024)) + val axiRam = LazyModule(new AXI4RAM(Seq(addressSet), memByte = 1024)) axiRam.node := burst.node @@ -55,7 +56,7 @@ class AXI4RamTLBurstTest()(implicit p: Parameters) extends LazyModule { val tlburst = LazyModule(new TLBurstMaster(startAddr = addressSet.base.toLong, nOp = 1, burstLen = 32)) val ident = LazyModule(new DebugIdentityNode()) - val axiRam = LazyModule(new AXI4RAM(addressSet, memByte = 1024)) + val axiRam = LazyModule(new AXI4RAM(Seq(addressSet), memByte = 1024)) axiRam.node := AXI4UserYanker() := @@ -70,7 +71,7 @@ class 
AXI4RamTLBurstTest()(implicit p: Parameters) extends LazyModule { } } -class AXI4RamTest extends FlatSpec with ChiselScalatestTester with Matchers { +class AXI4RamTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { it should "run with fuzz" in { implicit val p = Parameters.empty test(LazyModule(new AXI4RamFuzzTest()).module){ c => diff --git a/src/test/scala/device/AXI4TimerTest.scala b/src/test/scala/device/AXI4TimerTest.scala index f2287e69e258f8d71c94c0709745fac5abd42b1a..baa5cf91b0465f0a488cbe6a390f90ab8984dab8 100644 --- a/src/test/scala/device/AXI4TimerTest.scala +++ b/src/test/scala/device/AXI4TimerTest.scala @@ -4,9 +4,10 @@ import chipsalliance.rocketchip.config._ import chisel3._ import chiseltest._ import freechips.rocketchip.amba.axi4.{AXI4Deinterleaver, AXI4UserYanker} -import org.scalatest.{FlatSpec, Matchers} import freechips.rocketchip.tilelink._ import freechips.rocketchip.diplomacy._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import utils.DebugIdentityNode @@ -15,7 +16,7 @@ class AXI4TimerTestTop(implicit p: Parameters) extends LazyModule { val addressSet = AddressSet(0x38000000L, 0x0000ffffL) val fuzz = LazyModule(new TLFuzzer(nOperations = 10, overrideAddress = Some(addressSet), inFlight = 1)) val ident = LazyModule(new DebugIdentityNode()) - val axiTimer = LazyModule(new AXI4Timer(sim = true, addressSet)) + val axiTimer = LazyModule(new AXI4Timer(sim = true, Seq(addressSet))) axiTimer.node := AXI4UserYanker() := @@ -30,7 +31,7 @@ class AXI4TimerTestTop(implicit p: Parameters) extends LazyModule { } -class AXI4TimerTest extends FlatSpec with Matchers with ChiselScalatestTester { +class AXI4TimerTest extends AnyFlatSpec with Matchers with ChiselScalatestTester { it should "run" in { implicit val p = Parameters.empty test(LazyModule(new AXI4TimerTestTop()).module){ c => diff --git a/src/test/scala/device/SimMMIOTest.scala b/src/test/scala/device/SimMMIOTest.scala index 
1e47fc0a6bd509092f17ca6ff8987b9f047bd33a..6b70cedb00d61a9074156bd3657066eaf7e5291b 100644 --- a/src/test/scala/device/SimMMIOTest.scala +++ b/src/test/scala/device/SimMMIOTest.scala @@ -4,9 +4,10 @@ import chipsalliance.rocketchip.config._ import chisel3._ import chiseltest._ import freechips.rocketchip.amba.axi4.{AXI4Deinterleaver, AXI4UserYanker, AXI4Xbar} -import org.scalatest.{FlatSpec, Matchers} import freechips.rocketchip.tilelink._ import freechips.rocketchip.diplomacy._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import top.SimMMIO import utils.DebugIdentityNode @@ -29,7 +30,7 @@ class SimMMIOTestTop()(implicit p: Parameters) extends LazyModule { } } -class SimMMIOTest extends FlatSpec with ChiselScalatestTester with Matchers { +class SimMMIOTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { it should "run" in { implicit val p = Parameters.empty test(LazyModule(new SimMMIOTestTop()).module){c => diff --git a/src/test/scala/device/TLTimerTest.scala b/src/test/scala/device/TLTimerTest.scala index f8f580699c61007cc69589a23be971a5956a3b14..ece177705e9bd845023e3949a6df93505f069dfb 100644 --- a/src/test/scala/device/TLTimerTest.scala +++ b/src/test/scala/device/TLTimerTest.scala @@ -3,9 +3,10 @@ package device import chipsalliance.rocketchip.config._ import chisel3._ import chiseltest._ -import org.scalatest.{FlatSpec, Matchers} import freechips.rocketchip.tilelink._ import freechips.rocketchip.diplomacy._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import utils.DebugIdentityNode class TLTimerTestTop()(implicit p: Parameters) extends LazyModule { @@ -30,7 +31,7 @@ class TLTimerTestTop()(implicit p: Parameters) extends LazyModule { } } -class TLTimerTest extends FlatSpec with ChiselScalatestTester with Matchers { +class TLTimerTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { it should "run" in { implicit val p = Parameters.empty diff --git 
a/src/test/scala/top/SimMMIO.scala b/src/test/scala/top/SimMMIO.scala index 57423e1144f202cb092287c3b57a6553ceba0768..4b6437ea3c483639dbaa500571e203a99eee509d 100644 --- a/src/test/scala/top/SimMMIO.scala +++ b/src/test/scala/top/SimMMIO.scala @@ -9,14 +9,14 @@ import freechips.rocketchip.tilelink.{TLErrorEvaluator, TLMasterParameters, TLXb class SimMMIO()(implicit p: config.Parameters) extends LazyModule { - val uart = LazyModule(new AXI4UART(AddressSet(0x40600000L, 0xf))) + val uart = LazyModule(new AXI4UART(Seq(AddressSet(0x40600000L, 0xf)))) val vga = LazyModule(new AXI4VGA( sim = false, - fbAddress = AddressSet(0x50000000L, 0x3fffffL), - ctrlAddress = AddressSet(0x40001000L, 0x7L) + fbAddress = Seq(AddressSet(0x50000000L, 0x3fffffL)), + ctrlAddress = Seq(AddressSet(0x40001000L, 0x7L)) )) - val flash = LazyModule(new AXI4Flash(AddressSet(0x40000000L, 0xfff))) - val sd = LazyModule(new AXI4DummySD(AddressSet(0x40002000L, 0xfff))) + val flash = LazyModule(new AXI4Flash(Seq(AddressSet(0x40000000L, 0xfff)))) + val sd = LazyModule(new AXI4DummySD(Seq(AddressSet(0x40002000L, 0xfff)))) val axiBus = AXI4Xbar() diff --git a/src/test/scala/top/XSSim.scala b/src/test/scala/top/XSSim.scala index d52bc2a8fe9f2fe0a519dafdb5a0acaf1a23690a..aa21be4d042c805c5afb2eec0222075efd4408d5 100644 --- a/src/test/scala/top/XSSim.scala +++ b/src/test/scala/top/XSSim.scala @@ -3,17 +3,15 @@ package top import system._ import chisel3._ import chisel3.util._ -import chisel3.util.experimental.BoringUtils import chipsalliance.rocketchip.config import chisel3.stage.ChiselGeneratorAnnotation import device._ -import freechips.rocketchip.amba.axi4.{AXI4Fragmenter, AXI4UserYanker} +import freechips.rocketchip.amba.axi4.{AXI4UserYanker, AXI4Xbar, AXI4IdentityNode} import freechips.rocketchip.diplomacy.{AddressSet, BufferParams, LazyModule, LazyModuleImp} -import freechips.rocketchip.tilelink.{TLBuffer, TLCacheCork, TLFragmenter, TLFuzzer, TLToAXI4, TLXbar} +import 
freechips.rocketchip.tilelink.{TLToAXI4} import xiangshan._ import utils._ -import firrtl.stage.RunFirrtlTransformAnnotation -import xstransforms.ShowPrintTransform +import ExcitingUtils.Debug class DiffTestIO extends XSBundle { val r = Output(Vec(64, UInt(XLEN.W))) @@ -65,30 +63,47 @@ class TrapIO extends XSBundle { val instrCnt = Output(UInt(XLEN.W)) } - -class XSSimTop()(implicit p: config.Parameters) extends LazyModule { - - val memAddressSet = AddressSet(0x0L, 0xffffffffffL) +class XSSimSoC(axiSim: Boolean)(implicit p: config.Parameters) extends LazyModule with HasXSParameter { + // address space[0G - 1024G) + val fullRange = AddressSet(0x0L, 0xffffffffffL) + // MMIO address space[0G - 2G) + val mmioRange = AddressSet(base = 0x0000000000L, mask = 0x007fffffffL) + // DRAM address range[2G - 1024G) + val dramRange = fullRange.subtract(mmioRange) val soc = LazyModule(new XSSoc()) - val axiRam = LazyModule(new AXI4RAM( - memAddressSet, - memByte = 128 * 1024 * 1024, - useBlackBox = true - )) - val axiMMIO = LazyModule(new SimMMIO()) - - axiRam.node := - AXI4UserYanker() := - TLToAXI4() := - TLBuffer(BufferParams.default) := - DebugIdentityNode() := - soc.mem - axiMMIO.axiBus := - AXI4UserYanker() := - TLToAXI4() := - soc.extDev + // 4x1 crossbar + val xbar = AXI4Xbar() + soc.mem.map{mem => xbar := mem} + + // AXIRam + // ----------------------------------- + val axiMem = { + if (axiSim) + AXI4IdentityNode() + else + LazyModule(new AXI4RAM( + dramRange, + memByte = 64L * 1024 * 1024 * 1024, + useBlackBox = true, + beatBytes = L3BusWidth / 8 + )).node + } + axiMem := xbar + + // AXI DMA + // ----------------------------------- + val burst = LazyModule(new AXI4BurstMaster( + startAddr = 0x80000000L, + nOp = 0, + beatBytes = L3BusWidth / 8)) + soc.dma := burst.node + + // AXI MMIO + // ----------------------------------- + val axiMMIO = LazyModule(new SimMMIO()) + axiMMIO.axiBus := soc.extDev lazy val module = new LazyModuleImp(this) { val io = IO(new Bundle { @@ 
-98,74 +113,135 @@ class XSSimTop()(implicit p: config.Parameters) extends LazyModule { val uart = new UARTIO }) + dontTouch(io.difftest) + dontTouch(io.logCtrl) + dontTouch(io.trap) + dontTouch(io.uart) + io.uart <> axiMMIO.module.io.uart soc.module.io.meip := false.B val difftest = WireInit(0.U.asTypeOf(new DiffTestIO)) - BoringUtils.addSink(difftest.commit, "difftestCommit") - BoringUtils.addSink(difftest.thisPC, "difftestThisPC") - BoringUtils.addSink(difftest.thisINST, "difftestThisINST") - BoringUtils.addSink(difftest.skip, "difftestSkip") - BoringUtils.addSink(difftest.isRVC, "difftestIsRVC") - BoringUtils.addSink(difftest.wen, "difftestWen") - BoringUtils.addSink(difftest.wdata, "difftestWdata") - BoringUtils.addSink(difftest.wdst, "difftestWdst") - BoringUtils.addSink(difftest.wpc, "difftestWpc") - BoringUtils.addSink(difftest.intrNO, "difftestIntrNO") - BoringUtils.addSink(difftest.cause, "difftestCause") - BoringUtils.addSink(difftest.r, "difftestRegs") - BoringUtils.addSink(difftest.priviledgeMode, "difftestMode") - BoringUtils.addSink(difftest.mstatus, "difftestMstatus") - BoringUtils.addSink(difftest.sstatus, "difftestSstatus") - BoringUtils.addSink(difftest.mepc, "difftestMepc") - BoringUtils.addSink(difftest.sepc, "difftestSepc") - BoringUtils.addSink(difftest.mtval, "difftestMtval") - BoringUtils.addSink(difftest.stval, "difftestStval") - BoringUtils.addSink(difftest.mtvec, "difftestMtvec") - BoringUtils.addSink(difftest.stvec, "difftestStvec") - BoringUtils.addSink(difftest.mcause, "difftestMcause") - BoringUtils.addSink(difftest.scause, "difftestScause") - BoringUtils.addSink(difftest.satp, "difftestSatp") - BoringUtils.addSink(difftest.mip, "difftestMip") - BoringUtils.addSink(difftest.mie, "difftestMie") - BoringUtils.addSink(difftest.mscratch, "difftestMscratch") - BoringUtils.addSink(difftest.sscratch, "difftestSscratch") - BoringUtils.addSink(difftest.mideleg, "difftestMideleg") - BoringUtils.addSink(difftest.medeleg, "difftestMedeleg") - 
BoringUtils.addSink(difftest.scFailed, "difftestScFailed") + if (!env.FPGAPlatform) { + ExcitingUtils.addSink(difftest.commit, "difftestCommit", Debug) + ExcitingUtils.addSink(difftest.thisPC, "difftestThisPC", Debug) + ExcitingUtils.addSink(difftest.thisINST, "difftestThisINST", Debug) + ExcitingUtils.addSink(difftest.skip, "difftestSkip", Debug) + ExcitingUtils.addSink(difftest.isRVC, "difftestIsRVC", Debug) + ExcitingUtils.addSink(difftest.wen, "difftestWen", Debug) + ExcitingUtils.addSink(difftest.wdata, "difftestWdata", Debug) + ExcitingUtils.addSink(difftest.wdst, "difftestWdst", Debug) + ExcitingUtils.addSink(difftest.wpc, "difftestWpc", Debug) + ExcitingUtils.addSink(difftest.intrNO, "difftestIntrNO", Debug) + ExcitingUtils.addSink(difftest.cause, "difftestCause", Debug) + ExcitingUtils.addSink(difftest.r, "difftestRegs", Debug) + ExcitingUtils.addSink(difftest.priviledgeMode, "difftestMode", Debug) + ExcitingUtils.addSink(difftest.mstatus, "difftestMstatus", Debug) + ExcitingUtils.addSink(difftest.sstatus, "difftestSstatus", Debug) + ExcitingUtils.addSink(difftest.mepc, "difftestMepc", Debug) + ExcitingUtils.addSink(difftest.sepc, "difftestSepc", Debug) + ExcitingUtils.addSink(difftest.mtval, "difftestMtval", Debug) + ExcitingUtils.addSink(difftest.stval, "difftestStval", Debug) + ExcitingUtils.addSink(difftest.mtvec, "difftestMtvec", Debug) + ExcitingUtils.addSink(difftest.stvec, "difftestStvec", Debug) + ExcitingUtils.addSink(difftest.mcause, "difftestMcause", Debug) + ExcitingUtils.addSink(difftest.scause, "difftestScause", Debug) + ExcitingUtils.addSink(difftest.satp, "difftestSatp", Debug) + ExcitingUtils.addSink(difftest.mip, "difftestMip", Debug) + ExcitingUtils.addSink(difftest.mie, "difftestMie", Debug) + ExcitingUtils.addSink(difftest.mscratch, "difftestMscratch", Debug) + ExcitingUtils.addSink(difftest.sscratch, "difftestSscratch", Debug) + ExcitingUtils.addSink(difftest.mideleg, "difftestMideleg", Debug) + 
ExcitingUtils.addSink(difftest.medeleg, "difftestMedeleg", Debug) + ExcitingUtils.addSink(difftest.scFailed, "difftestScFailed", Debug) + } + // BoringUtils.addSink(difftest.lrscAddr, "difftestLrscAddr") io.difftest := difftest val trap = WireInit(0.U.asTypeOf(new TrapIO)) - ExcitingUtils.addSink(trap.valid, "trapValid") - ExcitingUtils.addSink(trap.code, "trapCode") - ExcitingUtils.addSink(trap.pc, "trapPC") - ExcitingUtils.addSink(trap.cycleCnt, "trapCycleCnt") - ExcitingUtils.addSink(trap.instrCnt, "trapInstrCnt") + if (!env.FPGAPlatform) { + ExcitingUtils.addSink(trap.valid, "trapValid") + ExcitingUtils.addSink(trap.code, "trapCode") + ExcitingUtils.addSink(trap.pc, "trapPC") + ExcitingUtils.addSink(trap.cycleCnt, "trapCycleCnt") + ExcitingUtils.addSink(trap.instrCnt, "trapInstrCnt") + } + io.trap := trap - val timer = GTimer() - val logEnable = (timer >= io.logCtrl.log_begin) && (timer < io.logCtrl.log_end) - ExcitingUtils.addSource(logEnable, "DISPLAY_LOG_ENABLE") - ExcitingUtils.addSource(timer, "logTimestamp") + if (env.EnableDebug) { + val timer = GTimer() + val logEnable = (timer >= io.logCtrl.log_begin) && (timer < io.logCtrl.log_end) + ExcitingUtils.addSource(logEnable, "DISPLAY_LOG_ENABLE") + ExcitingUtils.addSource(timer, "logTimestamp") + } // Check and dispaly all source and sink connections + ExcitingUtils.fixConnections() ExcitingUtils.checkAndDisplay() } } +class XSSimTop(axiSim: Boolean)(implicit p: config.Parameters) extends LazyModule with HasXSParameter { + println(axiSim) + val dut = LazyModule(new XSSimSoC(axiSim)) + val axiSimRam = { + if (axiSim) LazyModule(new AXI4RAM( + dut.dramRange, + memByte = 128 * 1024 * 1024, + useBlackBox = true, + beatBytes = L3BusWidth / 8 + )) + else null + } + if (axiSim) { + axiSimRam.node := dut.axiMem + } + + lazy val module = new LazyModuleImp(this) { + val io = IO(new Bundle { + val difftest = new DiffTestIO + val logCtrl = new LogCtrlIO + val trap = new TrapIO + val uart = new UARTIO + val memAXI = if 
(axiSim) chiselTypeOf(axiSimRam.module.io) else Input(Bool()) + }) + + io.difftest <> dut.module.io.difftest + io.logCtrl <> dut.module.io.logCtrl + io.trap <> dut.module.io.trap + io.uart <> dut.module.io.uart + if (axiSim) { + io.memAXI <> axiSimRam.module.io + } + else { + io.memAXI <> DontCare + } + } +} + object TestMain extends App { - // set parameters + val axiSim = args.contains("--with-dramsim3") + + // set soc parameters + val socArgs = args.filterNot(_ == "--with-dramsim3") Parameters.set( - if(args.contains("--disable-log")) Parameters.simParameters // sim only, disable log + if(socArgs.contains("--fpga-platform")) { + if (socArgs.contains("--dual-core")) Parameters.dualCoreParameters + else Parameters() + } + else if(socArgs.contains("--disable-log")) Parameters.simParameters // sim only, disable log else Parameters.debugParameters // open log ) + + val otherArgs = socArgs.filterNot(_ == "--disable-log").filterNot(_ == "--fpga-platform").filterNot(_ == "--dual-core") implicit val p = config.Parameters.empty // generate verilog XiangShanStage.execute( - args.filterNot(_ == "--disable-log"), + otherArgs, Seq( - ChiselGeneratorAnnotation(() => LazyModule(new XSSimTop).module) + ChiselGeneratorAnnotation(() => LazyModule(new XSSimTop(axiSim)).module) ) ) } diff --git a/src/test/scala/xiangshan/backend/brq/BrqTest.scala b/src/test/scala/xiangshan/backend/brq/BrqTest.scala index 0bb427a46c056b4ea4837bf0ee0e949756712841..afc8aea46465cfaec786d5ac2a17d51afaafb54c 100644 --- a/src/test/scala/xiangshan/backend/brq/BrqTest.scala +++ b/src/test/scala/xiangshan/backend/brq/BrqTest.scala @@ -7,6 +7,8 @@ import chisel3.experimental.BundleLiterals._ import chisel3.util._ import chiseltest.experimental.TestOptionBuilder._ import chiseltest.internal.VerilatorBackendAnnotation +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import top.Parameters import utils.XSLog import xiangshan._ @@ -15,7 +17,7 @@ import 
xiangshan.testutils.TestCaseGenerator._ import scala.util.Random -class BrqTest extends FlatSpec +class BrqTest extends AnyFlatSpec with ChiselScalatestTester with Matchers with ParallelTestExecution diff --git a/src/test/scala/xiangshan/backend/exu/AluTest.scala b/src/test/scala/xiangshan/backend/exu/AluTest.scala deleted file mode 100644 index 284dcb1a341d1bbf9bfdf9ac118a66c951e7301a..0000000000000000000000000000000000000000 --- a/src/test/scala/xiangshan/backend/exu/AluTest.scala +++ /dev/null @@ -1,68 +0,0 @@ -package xiangshan.backend.exu - -import org.scalatest._ -import chiseltest._ -import chisel3._ -import chisel3.experimental.BundleLiterals._ -import chiseltest.experimental.TestOptionBuilder._ -import chiseltest.internal.VerilatorBackendAnnotation -import xiangshan._ -import xiangshan.testutils._ -import xiangshan.testutils.TestCaseGenerator._ - -import scala.util.Random - -class AluTest extends FlatSpec - with ChiselScalatestTester - with Matchers - with ParallelTestExecution - with HasPartialDecoupledDriver -{ - it should "do simple test corrcetly" in { - test(new AluExeUnit){c => - - c.io.in.initSource().setSourceClock(c.clock) - c.io.out.initSink().setSinkClock(c.clock) - - parallel( - c.io.in.enqueuePartial(genAluAdd(c.io.in.bits, 0, 0)), - c.io.out.expectDequeuePartial(chiselTypeOf(c.io.out.bits).Lit(_.data -> 0.U)) - ) - } - } - - it should "do random add correctly" in { - test(new AluExeUnit){c => - - c.io.in.initSource().setSourceClock(c.clock) - c.io.out.initSink().setSinkClock(c.clock) - - def TEST_SIZE = 10 - - val src1, src2, res = Array.fill(TEST_SIZE)(0) - for(i <- 0 until TEST_SIZE){ - // avoid neg add res - src1(i) = Random.nextInt(0x3fffffff) - src2(i) = Random.nextInt(0x3fffffff) - res(i) = src1(i) + src2(i) - } - - val inputSeq = (0 until TEST_SIZE).map(i => - genAluAdd(c.io.in.bits, src1(i), src2(i)) - ) - - val outputSeq = (0 until TEST_SIZE).map(i => - chiselTypeOf(c.io.out.bits).Lit( - _.data -> res(i).U - ) - ) - - parallel( - 
c.io.in.enqueuePartialSeq(inputSeq), - c.io.out.expectDequeuePartialSeq(outputSeq) - ) - } - } - - -} diff --git a/src/test/scala/xiangshan/backend/exu/MduTest.scala b/src/test/scala/xiangshan/backend/exu/MduTest.scala deleted file mode 100644 index 9d8f83470734b6dca22b6c06aee5d6d0288adfd0..0000000000000000000000000000000000000000 --- a/src/test/scala/xiangshan/backend/exu/MduTest.scala +++ /dev/null @@ -1,152 +0,0 @@ -package xiangshan.backend.exu - -import org.scalatest._ -import chiseltest._ -import chisel3._ -import chisel3.experimental.BundleLiterals._ -import top.Parameters -import utils.XSLog -import xiangshan.testutils._ -import xiangshan.testutils.TestCaseGenerator._ - -import scala.util.Random - - - - -class MduTest extends FlatSpec - with ChiselScalatestTester - with Matchers - with ParallelTestExecution - with HasPartialDecoupledDriver -{ - - Parameters.set(Parameters.debugParameters) - - "MUL" should "random enq and deq correctly" in { - test(new MulExeUnit{ - AddSinks() - }){ c => - - c.io.in.initSource().setSourceClock(c.clock) - c.io.out.initSink().setSinkClock(c.clock) - - def TEST_SIZE = 100 - val pcSeq = (0 until TEST_SIZE).map(_ => Random.nextInt(0x7fffffff)) - - fork{ - c.io.in.enqueuePartialSeq(pcSeq.map(pc => genMul(c.io.in.bits, pc))) - }.fork{ - c.io.out.expectDequeuePartialSeq(pcSeq.map( - pc => chiselTypeOf(c.io.out.bits).Lit( - _.uop.cf.pc -> pc.U - ) - )) - }.join() - - } - } - - - "MUL" should "only flush instrs newer than the redirect instr" in { - test(new MulExeUnit{ - AddSinks() - }){ c => - - c.io.in.initSource().setSourceClock(c.clock) - c.io.out.initSink().setSinkClock(c.clock) - - fork{ - // 29 - c.io.in.enqueuePartial(chiselTypeOf(c.io.in.bits).Lit( - _.uop.cf.pc -> 666.U, - _.uop.brTag.flag -> true.B, - _.uop.brTag.value -> 12.U - )) - // 30 - c.io.redirect.pokePartial(chiselTypeOf(c.io.redirect).Lit( - _.valid -> true.B, - _.bits.isException -> false.B, - _.bits.brTag.flag -> true.B, - _.bits.brTag.value -> 11.U - )) - 
c.io.in.enqueuePartial(chiselTypeOf(c.io.in.bits).Lit( - _.uop.cf.pc -> 777.U, - _.uop.brTag.flag -> true.B, - _.uop.brTag.value -> 10.U - )) - c.io.redirect.pokePartial(chiselTypeOf(c.io.redirect).Lit(_.valid -> false.B)) - }.fork{ - c.io.out.expectDequeuePartial(chiselTypeOf(c.io.out.bits).Lit(_.uop.cf.pc -> 777.U)) - }.join() - } - } - - - - "MUL" should "dont flush same br tag" in { - test(new MulExeUnit{ - AddSinks() - }){ c => - - c.io.in.initSource().setSourceClock(c.clock) - c.io.out.initSink().setSinkClock(c.clock) - - def TEST_SIZE = 100 - val pcSeq = (0 until TEST_SIZE).map(_ => Random.nextInt(0x7fffffff)) - - fork{ - // 53 - c.io.in.enqueuePartial(chiselTypeOf(c.io.in.bits).Lit( - _.uop.cf.pc -> 666.U, - _.uop.brTag.flag -> true.B, - _.uop.brTag.value -> 15.U - )) - // 54 - c.clock.step(1) - // 55 - c.io.redirect.valid.poke(true.B) - c.io.redirect.bits.pokePartial(chiselTypeOf(c.io.redirect.bits).Lit( - _.isException -> false.B, - _.brTag.flag -> true.B, - _.brTag.value -> 15.U - )) - c.clock.step(1) - // 56 - c.io.redirect.valid.poke(false.B) - }.fork{ - c.io.out.expectDequeuePartial(chiselTypeOf(c.io.out.bits).Lit(_.uop.cf.pc -> 666.U)) - }.join() - - } - } - - - "MDU" should "random enq and deq correctly" in { - test(new MulDivExeUnit{ - AddSinks() - }){ c => - - c.io.in.initSource().setSourceClock(c.clock) - c.io.out.initSink().setSinkClock(c.clock) - - def TEST_SIZE = 50 - val pcSeq = (0 until TEST_SIZE).map(_ => Random.nextInt(0x7fffffff)) - - fork{ - c.io.in.enqueuePartialSeq(pcSeq.map(pc => { - genDiv(c.io.in.bits, pc) - })) - }.fork{ - c.io.out.expectDequeuePartialSeq(pcSeq.map( - pc => chiselTypeOf(c.io.out.bits).Lit( - _.uop.cf.pc -> pc.U - ) - )) - }.join() - - } - } - - -} diff --git a/src/test/scala/xiangshan/backend/issue/IssueQueueTest.scala b/src/test/scala/xiangshan/backend/issue/IssueQueueTest.scala deleted file mode 100644 index 85310a8987d4f633cee6a50c2be797649dd30306..0000000000000000000000000000000000000000 --- 
a/src/test/scala/xiangshan/backend/issue/IssueQueueTest.scala +++ /dev/null @@ -1,129 +0,0 @@ -package xiangshan.backend.issue - -import org.scalatest._ -import chiseltest._ -import chisel3._ -import chisel3.util._ -import chisel3.experimental.BundleLiterals._ -import top.Parameters -import utils.XSLog -import xiangshan._ -import xiangshan.backend.exu.Exu -import xiangshan.testutils._ - -import scala.util.Random - -class IssueQueueTest extends FlatSpec - with ChiselScalatestTester - with Matchers - with ParallelTestExecution - with HasPartialDecoupledDriver -{ - - it should "enq and deq correctly" in { - test(new IssueQueue(Exu.ldExeUnitCfg, 1, 1){ - AddSinks() - }){ c => - - def genEnqRdyReq(x: => DecoupledIO[MicroOp], roqIdx: Long) = { - chiselTypeOf(x.bits).Lit( - _.src1State -> SrcState.rdy, - _.src2State -> SrcState.rdy, - _.src3State -> SrcState.rdy, - _.roqIdx -> roqIdx.U, - _.cf.pc -> roqIdx.U - ) - } - - c.io.enq.initSource().setSourceClock(c.clock) - c.io.deq.initSink().setSinkClock(c.clock) - - fork { - c.io.enq.enqueuePartialSeq((0 until c.qsize).map(i => genEnqRdyReq(c.io.enq, i))) - }.fork { -// c.clock.step(10) - c.io.deq.expectDequeuePartialSeq((0 until c.qsize).map( - i => chiselTypeOf(c.io.deq.bits).Lit( - _.uop.roqIdx -> i.U, - _.uop.cf.pc -> i.U - ) - )) - }.join() - } - } - - - it should "only deq ready inst" in { - test(new IssueQueue(Exu.ldExeUnitCfg, 1, 1){ - AddSinks() - }){ c => - - def genEnqRdyReq(x: => DecoupledIO[MicroOp], pc: Long, ready: Boolean) = { - chiselTypeOf(x.bits).Lit( - _.src1State -> (if(ready) SrcState.rdy else SrcState.busy), - _.src2State -> SrcState.rdy, - _.src3State -> SrcState.rdy, - _.cf.pc -> pc.U - ) - } - - c.io.enq.initSource().setSourceClock(c.clock) - c.io.deq.initSink().setSinkClock(c.clock) - - fork { - c.io.enq.enqueuePartialSeq((0 until c.qsize).map(i => genEnqRdyReq(c.io.enq, i, i%2==0))) - }.fork { - // c.clock.step(10) - c.io.deq.expectDequeuePartialSeq((0 until c.qsize).filter(i => i%2==0).map( - i => 
chiselTypeOf(c.io.deq.bits).Lit( - _.uop.cf.pc -> i.U - ) - )) - }.join() - } - } - - it should "enq and deq bubble correctly" in { - test(new IssueQueue(Exu.ldExeUnitCfg, 1, 1){ - AddSinks() - }){ c => - - def genEnqRdyReq(x: => DecoupledIO[MicroOp], pc: Long) = { - chiselTypeOf(x.bits).Lit( - _.src1State -> SrcState.rdy, - _.src2State -> SrcState.rdy, - _.src3State -> SrcState.rdy, - _.cf.pc -> pc.U - ) - } - - c.io.enq.initSource().setSourceClock(c.clock) - c.io.deq.initSink().setSinkClock(c.clock) - - def TEST_SIZE = 100 - - fork { - c.io.enq.enqueuePartialSeq((0 until TEST_SIZE).map(i => genEnqRdyReq(c.io.enq, i))) - }.fork { - c.io.deq.expectDequeuePartialSeq((0 until TEST_SIZE).map( - i => chiselTypeOf(c.io.deq.bits).Lit( - _.uop.cf.pc -> i.U - ) - )) - }.fork{ - c.clock.step(10) - var cnt = 0 - while (cnt != TEST_SIZE){ - c.io.tlbFeedback.valid.poke(true.B) - c.io.tlbFeedback.bits.hit.poke(true.B) - c.clock.step(1) - cnt += 1 - c.io.tlbFeedback.valid.poke(false.B) - c.clock.step(1 + Random.nextInt(10)) - } - }.join() - } - } - - -} \ No newline at end of file diff --git a/src/test/scala/xiangshan/backend/issue/ReservationStationTest.scala b/src/test/scala/xiangshan/backend/issue/ReservationStationTest.scala deleted file mode 100644 index 1c3caa2d760c1a06dcf34c514a2398e6f9053d0e..0000000000000000000000000000000000000000 --- a/src/test/scala/xiangshan/backend/issue/ReservationStationTest.scala +++ /dev/null @@ -1,50 +0,0 @@ -package xiangshan.backend.issue - -import org.scalatest._ -import chiseltest._ -import chisel3._ -import chisel3.util._ -import chisel3.experimental.BundleLiterals._ -import utils.XSLog -import xiangshan._ -import xiangshan.backend.exu.Exu -import xiangshan.testutils._ - -class ReservationStationTest extends FlatSpec - with ChiselScalatestTester - with Matchers - with ParallelTestExecution - with HasPartialDecoupledDriver -{ - it should "do enq issue with no delay correctly" in { - test(new ReservationStation(Exu.aluExeUnitCfg, wakeupCnt 
= 1, bypassCnt = 1, fifo = false) { - AddSinks() - }) { c => - - def genEnqRdyReq(x: => DecoupledIO[MicroOp], roq: Long) = { - chiselTypeOf(x.bits).Lit( - _.src1State -> SrcState.rdy, - _.src2State -> SrcState.rdy, - _.src3State -> SrcState.rdy, - _.roqIdx -> roq.U - ) - } - - c.io.enqCtrl.initSource().setSourceClock(c.clock) - c.io.deq.initSink().setSinkClock(c.clock) - - def TEST_SIZE = 2 - val roqSeq = 0 until TEST_SIZE - val enqPort = c.io.enqCtrl - fork { - c.io.enqCtrl.enqueuePartialSeq(roqSeq.map(roq => genEnqRdyReq(enqPort, roq))) - }.fork { - c.io.deq.expectDequeuePartialSeq(roqSeq.map( - roq => chiselTypeOf(c.io.deq.bits).Lit( - _.uop.roqIdx -> roq.U - ) - )) - }.join() - } - } -} \ No newline at end of file diff --git a/src/test/scala/xiangshan/frontend/IFUTest.scala b/src/test/scala/xiangshan/frontend/IFUTest.scala index e4cce4726fe22755b2f1726e17048d5bee747421..4ede8b2356630898759e9e190bf69c5bfb67c54b 100644 --- a/src/test/scala/xiangshan/frontend/IFUTest.scala +++ b/src/test/scala/xiangshan/frontend/IFUTest.scala @@ -2,9 +2,10 @@ package xiangshan.frontend import chisel3._ import chiseltest._ -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers -class IFUTest extends FlatSpec with ChiselScalatestTester with Matchers { +class IFUTest extends AnyFlatSpec with ChiselScalatestTester with Matchers { behavior of "IFU Test" it should "test IFU pipeline" in { @@ -40,7 +41,7 @@ class IFUTest extends FlatSpec with ChiselScalatestTester with Matchers { // Cycle 5 //----------------- c.io.redirect.valid.poke(true.B) - c.io.redirect.bits.target.poke("h80002800".U) + c.io.redirect.bits.poke("h80002800".U) c.clock.step() //----------------- // Cycle 6 diff --git a/src/test/scala/xiangshan/frontend/PDtest.scala b/src/test/scala/xiangshan/frontend/PDtest.scala index 59115aee4836c927c32e6d7c9be6bcd6805d75e3..d2fd0ade88a5d8e24adfee972b1d7e23fa6eff07 100644 --- 
a/src/test/scala/xiangshan/frontend/PDtest.scala +++ b/src/test/scala/xiangshan/frontend/PDtest.scala @@ -3,8 +3,10 @@ package xiangshan.frontend import chisel3._ import chiseltest._ import org.scalatest._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers -class PDtest extends FlatSpec with ChiselScalatestTester with Matchers{ +class PDtest extends AnyFlatSpec with ChiselScalatestTester with Matchers{ val cacheLine2 = ("b" + "100_1_00001_00000_10" + //rvc jalr "100_0_00001_00000_10" + //rvc jr diff --git a/src/test/scala/xiangshan/frontend/RASTest.scala b/src/test/scala/xiangshan/frontend/RASTest.scala index 8a3720a94675296315a003d51dbfcf572fe46862..fbb4e71e1fe3331dc9e89563f5bffe4c4566b9e7 100644 --- a/src/test/scala/xiangshan/frontend/RASTest.scala +++ b/src/test/scala/xiangshan/frontend/RASTest.scala @@ -3,10 +3,12 @@ package xiangshan.frontend import chisel3._ import chiseltest._ import org.scalatest._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import xiangshan.testutils._ -class RASTest extends FlatSpec +class RASTest extends AnyFlatSpec with ChiselScalatestTester with Matchers with ParallelTestExecution diff --git a/src/test/scala/xiangshan/frontend/uBTBTest.scala b/src/test/scala/xiangshan/frontend/uBTBTest.scala index 0ca397bb6dfd7d2e4ebbb0cce563e17a8a2ae2be..3bb35ac9d4ef7c5c76dac830eb3b14d8ae71923c 100644 --- a/src/test/scala/xiangshan/frontend/uBTBTest.scala +++ b/src/test/scala/xiangshan/frontend/uBTBTest.scala @@ -3,10 +3,12 @@ package xiangshan.frontend import chisel3._ import chiseltest._ import org.scalatest._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers import xiangshan.testutils._ -class uBTBTest extends FlatSpec +class uBTBTest extends AnyFlatSpec with ChiselScalatestTester with Matchers with ParallelTestExecution diff --git a/src/test/scala/xiangshan/memend/SbufferTest.scala 
b/src/test/scala/xiangshan/memend/SbufferTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..95bbfbc11f057bdaa3d91413c9c4e8cb90b68d54 --- /dev/null +++ b/src/test/scala/xiangshan/memend/SbufferTest.scala @@ -0,0 +1,125 @@ +package xiangshan.memend + +import org.scalatest._ +import chiseltest._ +import chisel3._ +import chisel3.experimental.BundleLiterals._ +import chisel3.util._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers +import xiangshan._ +import xiangshan.cache.{DCacheLineIO, DCacheWordReq} +import xiangshan.mem.{LoadForwardQueryIO, NewSbuffer} +import xiangshan.testutils._ + +import scala.util.Random + +class SbufferWapper extends XSModule { + val io = IO(new Bundle() { + val in = Vec(StorePipelineWidth, Flipped(Decoupled(new DCacheWordReq))) + val dcache = new DCacheLineIO + val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) + val flush = new Bundle { + val valid = Input(Bool()) + val empty = Output(Bool()) + } // sbuffer flush + }) + val sbuffer = Module(new NewSbuffer) + io <> sbuffer.io + + // fake dcache + sbuffer.io.dcache.req.ready := true.B + sbuffer.io.dcache.resp.valid := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.valid)))) + sbuffer.io.dcache.resp.bits.meta.id := RegNext(RegNext(RegNext(RegNext(sbuffer.io.dcache.req.bits.meta.id)))) +} + +class SbufferTest extends AnyFlatSpec + with ChiselScalatestTester + with Matchers + with HasXSParameter + with ParallelTestExecution + with HasPartialDecoupledDriver { + + + top.Parameters.set(top.Parameters.debugParameters) + + + it should "random req" in { + test(new SbufferWapper{AddSinks()}){ c => + + def store_enq(addr: Seq[UInt], data: Seq[UInt], mask: Seq[UInt]) ={ + (0 until StorePipelineWidth).map { i => + c.io.in(i).valid.poke(true.B) + c.io.in(i).bits.pokePartial(chiselTypeOf(c.io.in(i).bits).Lit( + _.mask -> mask(i), + _.addr -> addr(i), + _.data -> data(i) + )) + } + c.clock.step(1) + for (in <- 
c.io.in){ in.valid.poke(false.B)} + } + + def forward_req_and_resp(addr: Seq[UInt], data: Seq[UInt], mask:Seq[UInt]) = { + (0 until LoadPipelineWidth).map{ i => + c.io.forward(i).paddr.poke(addr(i)) + c.io.forward(i).mask.poke(mask(i)) + if(c.io.in(i).ready.peek() == true.B) { + (0 until 8).map { j => + c.io.forward(i).forwardData(j).expect(data(i)(j * 8 + 7, j * 8)) + } + } + } + } + + val TEST_SIZE = 100 + for(i <- 0 until TEST_SIZE) { + val addr = Seq.fill(StorePipelineWidth)((Random.nextLong() & 0x7ffffffff8L).U)// align to block size + val data = Seq.fill(StorePipelineWidth)((Random.nextLong() & 0x7fffffffffffffffL).U) + val mask = Seq.fill(StorePipelineWidth)(0xff.U) + store_enq(addr, data, mask) + forward_req_and_resp(addr, data, mask) + } + } + } + + it should "sequence req" in { + test(new SbufferWapper{AddSinks()}){ c => + + def store_enq(addr: Seq[UInt], data: Seq[UInt], mask: Seq[UInt]) = { + (0 until StorePipelineWidth).map { i => + c.io.in(i).valid.poke(true.B) + c.io.in(i).bits.pokePartial(chiselTypeOf(c.io.in(i).bits).Lit( + _.mask -> mask(i), + _.addr -> addr(i), + _.data -> data(i) + )) + } + c.clock.step(1) + for (in <- c.io.in){ in.valid.poke(false.B)} + } + + def forward_req_and_resp(addr: Seq[UInt], data: Seq[UInt], mask:Seq[UInt]) = { + (0 until LoadPipelineWidth).map{ i => + c.io.forward(i).paddr.poke(addr(i)) + c.io.forward(i).mask.poke(mask(i)) + if(c.io.in(i).ready.peek() == true.B) { + (0 until 8).map { j => + c.io.forward(i).forwardData(j).expect(data(i)(j * 8 + 7, j * 8)) + } + } + } + } + + val TEST_SIZE = 100 + val start_addr = Random.nextLong() & 0x7ffffffff8L + for(i <- 0 until TEST_SIZE) { + val addr = Seq(((i<<4) + start_addr).U,((i<<4)+8+start_addr).U) + val data = Seq.fill(StorePipelineWidth)((Random.nextLong() & 0x7fffffffffffffffL).U) + val mask = Seq.fill(StorePipelineWidth)(0xff.U) + store_enq(addr, data, mask) + forward_req_and_resp(addr, data, mask) + } + } + } +} diff --git 
a/src/test/scala/xiangshan/testutils/AddSinks.scala b/src/test/scala/xiangshan/testutils/AddSinks.scala index b140f9bc7f38d73cbeef17f547112ecb8c909b11..e5599121e33cd296cfe529a0bde34186ccd8a80e 100644 --- a/src/test/scala/xiangshan/testutils/AddSinks.scala +++ b/src/test/scala/xiangshan/testutils/AddSinks.scala @@ -30,11 +30,17 @@ object AddSinks { "perfCntCondMbpIRight", "perfCntCondMbpIWrong", "perfCntCondMbpRRight", - "perfCntCondMbpRWrong" + "perfCntCondMbpRWrong"//, + // "CntFetchFromICache", + // "CntFetchFromLoopBuffer", + // "CntExitLoop1", + // "CntExitLoop2", + // "CntExitLoop3" ) for (s <- sinks){ BoringUtils.addSink(tmp, s) } - val disp_enable = WireInit(dispBegin.S(64.W).asUInt() < dispEnd.S(64.W).asUInt()) + // val disp_enable = WireInit(dispBegin.S(64.W).asUInt() < dispEnd.S(64.W).asUInt()) + val disp_enable = WireInit(true.B) val time = GTimer() BoringUtils.addSource(disp_enable, "DISPLAY_LOG_ENABLE") BoringUtils.addSource(time, "logTimestamp") diff --git a/src/test/scala/xiangshan/testutils/TestCaseGenerator.scala b/src/test/scala/xiangshan/testutils/TestCaseGenerator.scala index ae2f339db541ee6b8fb052050cd99d598614705b..990fa974370f9810561cf4004fe1e096480ba46f 100644 --- a/src/test/scala/xiangshan/testutils/TestCaseGenerator.scala +++ b/src/test/scala/xiangshan/testutils/TestCaseGenerator.scala @@ -4,7 +4,6 @@ import chisel3._ import chisel3.util._ import chisel3.experimental.BundleLiterals._ import chiseltest._ -import noop.MDUOpType import xiangshan._ import xiangshan.backend._ diff --git a/src/test/vsrc/ram.v b/src/test/vsrc/ram.v index da28d257c6eaa4bb06519f8feecf7ba72f3a87e2..6905066d5d53740830d14ce8331a92ae7d0570ea 100644 --- a/src/test/vsrc/ram.v +++ b/src/test/vsrc/ram.v @@ -1,15 +1,20 @@ -import "DPI-C" function void ram_helper +import "DPI-C" function void ram_write_helper ( - input longint rIdx, - output longint rdata, input longint wIdx, input longint wdata, input longint wmask, - input bit wen + input bit wen +); + +import "DPI-C" 
function longint ram_read_helper +( + input bit en, + input longint rIdx ); module RAMHelper( input clk, + input en, input [63:0] rIdx, output [63:0] rdata, input [63:0] wIdx, @@ -18,8 +23,10 @@ module RAMHelper( input wen ); + assign rdata = ram_read_helper(en, rIdx); + always @(posedge clk) begin - ram_helper(rIdx, rdata, wIdx, wdata, wmask, wen); + ram_write_helper(wIdx, wdata, wmask, wen && en); end endmodule