提交 869a0327 编写于 作者: M Megvii Engine Team

perf(mgb): disable FoldingConvBiasDimshufflePass for CUDA versions below 10.2 (CUDA_VERSION < 10020) for performance

GitOrigin-RevId: d1b95a6f01ba73f98c0094e00fee3e61e9139628
上级 0baf6b0d
......@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
param::ConvBias::Format::CHWN4);
}
// Benchmarks the quantized-int8 conv-bias kernel in NCHW4_NCHW format
// (int8 NCHW4 src/filter, float32 bias/output) on a few representative
// shapes, pinned to the INT8_NCHW4_DOTPROD_IMPLICIT_GEMM algorithm so the
// timing is not affected by heuristic algorithm selection.
// Operand order for execs is {src, filter, bias, z, dst}; shapes[2] is the
// per-channel bias ({1, C, 1, 1}).
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    // Fail the run if any algorithm other than the dot-product
    // implicit-GEMM kernel is picked.
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    // Runs one {src, filter, bias} triple and reports the average time per
    // execution in milliseconds.
    auto run = [&](const TensorShapeArray& shapes) {
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
                RUNS;
        // NOTE(review): shapes[2] is the bias operand, so label it as such
        // (the original printed it as "dst", which was misleading).
        printf("src=%s, filter=%s, bias=%s, time=%.2f\n",
               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
               shapes[2].to_string().c_str(), time_in_ms);
    };
    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
......
......@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass<RemoveRedundantTypeCvtPass>();
add_pass(FuseNCHW4Int8Preprocess::make());
add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
add_pass<FoldingConvBiasDimshufflePass>();
#endif
});
cb(chwn4, {
add_pass<FuseConvBiasNonlinPass>();
......@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass<RemoveRedundantTypeCvtPass>();
add_pass(FuseNCHW4Int8Preprocess::make());
add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
add_pass<FoldingConvBiasDimshufflePass>();
#endif
});
cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
......
......@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
MIDOUT_E
}
#if CUDA_VERSION >= 10020
/* ==================== FoldingConvBiasDimshufflePass ================= */
const char* FoldingConvBiasDimshufflePass::name() const {
return mgb_cstr_log("folding conv bias dimshuffle pass");
......@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return true;
};
MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
&try_conv_reformat_nchw42nchw32,
&try_conv_reformat_nchw42nhwc,
#if CUDA_VERSION >= 10020
&try_conv_reformat_nchw322nchw4,
#endif
&rewriter](OperatorNodeBase* opr) {
if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
!try_conv_reformat_nchw42nchw32(opr) &&
!try_conv_reformat_nchw42nhwc(opr)
#if CUDA_VERSION >= 10020
&& !try_conv_reformat_nchw322nchw4(opr)
#endif
) {
rewriter.auto_replace_outputs(opr);
}
......@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
MIDOUT_E
}
#endif
/* ==================== PaddingChannelPass ================= */
const char* PaddingChannelPass::name() const {
......
......@@ -16,6 +16,10 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#if MGB_CUDA
#include <cuda.h>
#endif
namespace mgb {
namespace gopt {
......@@ -427,11 +431,13 @@ namespace gopt {
void apply(OptState& opt) const override;
};
#if CUDA_VERSION >= 10020
/*!
 * \brief Graph pass that folds dimshuffle/reshape/typecvt operators adjacent
 * to a ConvBias into the conv itself by reformatting its tensor layout
 * (e.g. NCHW4 <-> NCHW32 / NHWC, judging by the helper names in apply()).
 *
 * Only declared for CUDA >= 10.2 (CUDA_VERSION encodes 10.2 as 10020); on
 * older toolkits the pass is compiled out entirely — presumably because the
 * folded kernels regress performance there (see the commit message).
 */
class FoldingConvBiasDimshufflePass final : public Pass {
public:
const char* name() const override;
void apply(OptState& opt) const override;
};
#endif
/*!
* \brief padding channel to enable fast int8/int4 support
......
......@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffle) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
......@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册