提交 4e9be159 编写于 作者: M Megvii Engine Team

feat(mgb/gopt): add opt pass for fusing convolution and reformat

GitOrigin-RevId: d0c5deace2e860cb62002a6cfedd4b32a8ca24df
上级 c3a4b222
......@@ -759,6 +759,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass<RemoveRedundantTypeCvtPass>();
add_pass(FuseNCHW4Int8Preprocess::make());
add_pass<FuseWarpPerspectiveDimshufflePass>();
add_pass<FoldingConvBiasDimshufflePass>();
});
cb(chwn4, {
add_pass<FuseConvBiasNonlinPass>();
......
......@@ -2825,27 +2825,26 @@ public:
MGB_DEFINE_OPR_CLASS(ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr,
cg::SingleCNOperatorNodeBase) // {
public:
AbstractShuffleOpr(VarNode* inpvar, TensorFormat inp_format,
TensorFormat out_format);
static SymbolVar make(VarNode* inpvar, TensorFormat inp_format,
TensorFormat out_format);
TensorFormat inp_format() const {
return m_inp_format;
}
TensorFormat out_format() const {
return m_out_format;
}
AbstractShuffleOpr(VarNode* inpvar, TensorFormat inp_format,
TensorFormat out_format);
static SymbolVar make(VarNode* inpvar, TensorFormat inp_format,
TensorFormat out_format);
TensorFormat inp_format() const {
return m_inp_format;
}
TensorFormat out_format() const {
return m_out_format;
}
private:
void init_output_static_infer_desc() override;
void scn_do_execute() override;
const TensorFormat m_inp_format;
const TensorFormat m_out_format;
}
;
void init_output_static_infer_desc() override;
void scn_do_execute() override;
const TensorFormat m_inp_format;
const TensorFormat m_out_format;
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(ShuffleShuffleRemovePass::Impl::AbstractShuffleOpr);
......@@ -3228,4 +3227,353 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
MIDOUT_E
}
/* ==================== FoldingConvBiasDimshufflePass ================= */
//! human-readable pass name used by the optimizer's logging
const char* FoldingConvBiasDimshufflePass::name() const {
return mgb_cstr_log("folding conv bias dimshuffle pass");
}
/*!
 * \brief fold a ConvBias opr together with the layout-reformat opr chain
 * that follows it, by switching the conv's output format instead.
 *
 * Three patterns are matched (each matcher walks the chain bottom-up,
 * starting from the last opr):
 *   1. conv_bias(NCHW4) + dimshuffle{0,1,4,2,3} + reshape + typecvt(qs8->f32)
 *      -> conv_bias with format NCHW4_NCHW and float32 output
 *   2. conv_bias(NCHW4) + reshape + dimshuffle{0,1,3,4,2,5} + reshape
 *      -> conv_bias with format NCHW4_NCHW32
 *   3. conv_bias(NCHW32) + reshape + dimshuffle{0,1,4,2,3,5} + reshape
 *      -> conv_bias with format NCHW32_NCHW4 (compiled in only when
 *      CUDA_VERSION >= 10020)
 * A match is rewritten only when every intermediate result is consumed
 * exclusively inside the matched chain.
 */
void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
MIDOUT_B("FoldingConvBiasDimshufflePass::apply");
using DepType = cg::OperatorNodeProp::DepType;
// producer opr -> all (reader opr, dep type) pairs; only populated for the
// opr types the matchers below inspect
ThinHashMap<OperatorNodeBase*,
SmallVector<std::pair<OperatorNodeBase*, DepType>>>
readers;
static const ThinHashSet<Typeinfo*> opr_type_list = {
opr::TypeCvt::typeinfo(), opr::Dimshuffle::typeinfo(),
opr::Reshape::typeinfo(), opr::ConvBias::typeinfo()};
// first graph traversal: collect the reader map
opt.graph().iter([&readers](OperatorNodeBase* opr) {
for (auto&& i : opr->node_prop().dep_map()) {
if (opr_type_list.count(i.first->owner_opr()->dyn_typeinfo())) {
readers[i.first->owner_opr()].emplace_back(opr, i.second);
}
}
});
auto rewriter = opt.graph().make_rewriter();
// reformat helper: NCHW4 (n,c/4,h,w,4) -> plain NCHW, cast to float32;
// used to rebuild the bias for the NCHW4_NCHW fused conv
auto nchw42nchw = [](VarNode* inp) -> VarNode* {
mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 4);
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
auto cv = [&x](int v) { return x.make_scalar(v); };
auto sub = [&xshp, &cv](int idx) {
return opr::IndexAt::make(xshp, {{0, cv(idx)}});
};
auto tshp = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
auto y1 = opr::Reshape::make(y0, tshp);
auto y2 = opr::TypeCvt::make(y1, dtype::Float32());
return y2.node();
};
// reformat helper: NCHW4 (n,c/4,h,w,4) -> NCHW32 (n,c/32,h,w,32)
auto nchw42nchw32 = [](VarNode* inp) -> VarNode* {
mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 4);
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
auto cv = [&x](int v) { return x.make_scalar(v); };
auto sub = [&xshp, &cv](int idx) {
return opr::IndexAt::make(xshp, {{0, cv(idx)}});
};
auto tshp0 = opr::Concat::make(
{sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)}, 0),
tshp1 = opr::Concat::make(
{sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
auto y0 = opr::Reshape::make(x, tshp0);
auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
auto y2 = opr::Reshape::make(y1, tshp1);
return y2.node();
};
// reformat helper: NCHW32 (n,c/32,h,w,32) -> NCHW4 (n,c/4,h,w,4)
auto nchw322nchw4 = [](VarNode* inp) -> VarNode* {
mgb_assert(inp->shape().ndim == 5 && inp->shape()[4] == 32);
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
auto cv = [&x](int v) { return x.make_scalar(v); };
auto sub = [&xshp, &cv](int idx) {
return opr::IndexAt::make(xshp, {{0, cv(idx)}});
};
auto tshp0 = opr::Concat::make(
{sub(0), sub(1), sub(2), sub(3), cv(8), sub(4) / 8}, 0),
tshp1 = opr::Concat::make(
{sub(0), sub(1) * 8, sub(2), sub(3), sub(4) / 8}, 0);
auto y0 = opr::Reshape::make(x, tshp0);
auto y1 = opr::Dimshuffle::make(y0, {0, 1, 4, 2, 3, 5});
auto y2 = opr::Reshape::make(y1, tshp1);
return y2.node();
};
// pattern 1: typecvt <- reshape <- dimshuffle{0,1,4,2,3} <- conv_bias(NCHW4)
auto try_conv_dimshuffle_reshape_typecvt = [&rewriter, &readers,
&nchw42nchw](
OperatorNodeBase* opr) {
ThinHashSet<OperatorNodeBase*> opr_set;
ThinHashSet<OperatorNodeBase*> reader_set;
// check typecvt
auto typecvt = try_cast_as_op<opr::TypeCvt>(opr);
if (typecvt == nullptr)
return false;
auto inp_dtype = typecvt->input(0)->dtype(),
out_dtype = typecvt->output(0)->dtype();
bool is_s82f32 = inp_dtype.enumv() == DTypeEnum::QuantizedS8 &&
out_dtype.enumv() == DTypeEnum::Float32;
if (!is_s82f32)
return false;
opr_set.insert(opr);
// check reshape
auto reshape =
try_cast_as_op<opr::Reshape>(typecvt->input(0)->owner_opr());
if (reshape == nullptr)
return false;
opr_set.insert(reshape);
// record every device-value consumer of the intermediate result
for (auto&& i : readers[reshape]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check shuffle
auto shuffle =
try_cast_as_op<opr::Dimshuffle>(reshape->input(0)->owner_opr());
if (shuffle == nullptr)
return false;
auto&& param = shuffle->param();
if (param.pattern_len != 5)
return false;
bool is_nchw42nchw = param.pattern[0] == 0 && param.pattern[1] == 1 &&
param.pattern[2] == 4 && param.pattern[3] == 2 &&
param.pattern[4] == 3 &&
shuffle->input(0)->shape()[4] == 4;
if (!is_nchw42nchw)
return false;
opr_set.insert(shuffle);
for (auto&& i : readers[shuffle]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check conv bias
auto conv_bias =
try_cast_as_op<opr::ConvBias>(shuffle->input(0)->owner_opr());
if (conv_bias == nullptr)
return false;
inp_dtype = conv_bias->input(0)->dtype();
bool is_s8nchw4 = inp_dtype.enumv() == DTypeEnum::QuantizedS8 &&
conv_bias->param().format ==
megdnn::param::ConvBias::Format::NCHW4;
if (!is_s8nchw4)
return false;
// only the (src, filter, bias) form is handled (no extra z input)
if (conv_bias->input().size() != 3)
return false;
opr_set.insert(conv_bias);
for (auto&& i : readers[conv_bias]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// abort if any intermediate var is consumed outside the matched chain
for (auto reader : reader_set) {
if (opr_set.count(reader) <= 0) {
return false;
}
}
// rebuild the conv with NCHW4_NCHW format; bias is reformatted to match
auto src = rewriter.get_var(conv_bias->input(0)),
filter = rewriter.get_var(conv_bias->input(1)),
bias = rewriter.get_var(conv_bias->input(2));
auto new_bias = nchw42nchw(bias);
auto new_param = conv_bias->param();
new_param.format = megdnn::param::ConvBias::Format::NCHW4_NCHW;
auto conv_bias_shuffle = opr::ConvBias::make(
src, filter, new_bias, new_param, conv_bias->execution_policy(),
OperatorNodeConfig{dtype::Float32()});
rewriter.replace_var(opr->output(0), conv_bias_shuffle.node(),
mgb_cstr_log("replace conv_bias + typecvt + "
"dimshuffle + "
"reshape to conv_bias(NCHW4_NCHW)"));
return true;
};
// pattern 2: reshape <- dimshuffle{0,1,3,4,2,5} <- reshape <-
// conv_bias(NCHW4)
auto try_conv_reformat_nchw42nchw32 = [&rewriter, &nchw42nchw32,
&readers](OperatorNodeBase* opr) {
ThinHashSet<OperatorNodeBase*> opr_set;
ThinHashSet<OperatorNodeBase*> reader_set;
// check reshape
auto reshape1 = try_cast_as_op<opr::Reshape>(opr);
if (reshape1 == nullptr)
return false;
opr_set.insert(opr);
// check dimshuffle
auto shuffle = try_cast_as_op<opr::Dimshuffle>(
reshape1->input(0)->owner_opr());
if (shuffle == nullptr)
return false;
auto&& param = shuffle->param();
if (param.pattern_len != 6)
return false;
bool is_nchw42nchw32 = param.pattern[0] == 0 && param.pattern[1] == 1 &&
param.pattern[2] == 3 && param.pattern[3] == 4 &&
param.pattern[4] == 2 && param.pattern[5] == 5 &&
shuffle->output(0)->shape()[5] == 4 &&
shuffle->output(0)->shape()[4] == 8;
if (!is_nchw42nchw32)
return false;
opr_set.insert(shuffle);
for (auto&& i : readers[shuffle]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check reshape
auto reshape2 =
try_cast_as_op<opr::Reshape>(shuffle->input(0)->owner_opr());
if (reshape2 == nullptr)
return false;
opr_set.insert(reshape2);
for (auto&& i : readers[reshape2]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check conv bias
auto conv_bias =
try_cast_as_op<opr::ConvBias>(reshape2->input(0)->owner_opr());
if (conv_bias == nullptr)
return false;
auto inp_dtype = conv_bias->input(0)->dtype();
bool is_s8nchw4 = inp_dtype.enumv() == DTypeEnum::QuantizedS8 &&
conv_bias->param().format ==
megdnn::param::ConvBias::Format::NCHW4;
if (!is_s8nchw4)
return false;
if (conv_bias->input().size() != 3)
return false;
opr_set.insert(conv_bias);
for (auto&& i : readers[conv_bias]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// abort if any intermediate var is consumed outside the matched chain
for (auto reader : reader_set) {
if (opr_set.count(reader) <= 0) {
return false;
}
}
auto src = rewriter.get_var(conv_bias->input(0)),
filter = rewriter.get_var(conv_bias->input(1)),
bias = rewriter.get_var(conv_bias->input(2));
auto new_bias = nchw42nchw32(bias);
auto new_param = conv_bias->param();
new_param.format = megdnn::param::ConvBias::Format::NCHW4_NCHW32;
auto conv_bias_shuffle = opr::ConvBias::make(
src, filter, new_bias, new_param, conv_bias->execution_policy(),
conv_bias->config());
rewriter.replace_var(
opr->output(0), conv_bias_shuffle.node(),
mgb_cstr_log("replace conv_bias + "
"reformat to conv_bias(NCHW4_NCHW32)"));
return true;
};
// pattern 3: reshape <- dimshuffle{0,1,4,2,3,5} <- reshape <-
// conv_bias(NCHW32)
auto try_conv_reformat_nchw322nchw4 = [&rewriter, &readers, &nchw322nchw4](
OperatorNodeBase* opr) {
ThinHashSet<OperatorNodeBase*> opr_set;
ThinHashSet<OperatorNodeBase*> reader_set;
// check reshape
auto reshape1 = try_cast_as_op<opr::Reshape>(opr);
if (reshape1 == nullptr)
return false;
opr_set.insert(opr);
// check dimshuffle
auto shuffle = try_cast_as_op<opr::Dimshuffle>(
reshape1->input(0)->owner_opr());
if (shuffle == nullptr)
return false;
auto&& param = shuffle->param();
if (param.pattern_len != 6)
return false;
bool is_nchw322nchw4 = param.pattern[0] == 0 && param.pattern[1] == 1 &&
param.pattern[2] == 4 && param.pattern[3] == 2 &&
param.pattern[4] == 3 && param.pattern[5] == 5 &&
shuffle->input(0)->shape()[5] == 4 &&
shuffle->input(0)->shape()[4] == 8;
if (!is_nchw322nchw4)
return false;
opr_set.insert(shuffle);
for (auto&& i : readers[shuffle]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check reshape
auto reshape2 =
try_cast_as_op<opr::Reshape>(shuffle->input(0)->owner_opr());
if (reshape2 == nullptr)
return false;
opr_set.insert(reshape2);
for (auto&& i : readers[reshape2]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check conv bias
auto conv_bias =
try_cast_as_op<opr::ConvBias>(reshape2->input(0)->owner_opr());
if (conv_bias == nullptr)
return false;
auto inp_dtype = conv_bias->input(0)->dtype();
bool is_s8nchw32 = inp_dtype.enumv() == DTypeEnum::QuantizedS8 &&
conv_bias->param().format ==
megdnn::param::ConvBias::Format::NCHW32;
if (!is_s8nchw32)
return false;
if (conv_bias->input().size() != 3)
return false;
opr_set.insert(conv_bias);
for (auto&& i : readers[conv_bias]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// abort if any intermediate var is consumed outside the matched chain
for (auto reader : reader_set) {
if (opr_set.count(reader) <= 0) {
return false;
}
}
auto src = rewriter.get_var(conv_bias->input(0)),
filter = rewriter.get_var(conv_bias->input(1)),
bias = rewriter.get_var(conv_bias->input(2));
auto new_bias = nchw322nchw4(bias);
auto new_param = conv_bias->param();
new_param.format = megdnn::param::ConvBias::Format::NCHW32_NCHW4;
auto conv_bias_shuffle = opr::ConvBias::make(
src, filter, new_bias, new_param, conv_bias->execution_policy(),
conv_bias->config());
rewriter.replace_var(
opr->output(0), conv_bias_shuffle.node(),
mgb_cstr_log("replace conv_bias + "
"reformat to conv_bias(NCHW32_NCHW4)"));
return true;
};
// silence the unused-variable warning when CUDA_VERSION < 10020 compiles
// out the only use of this matcher below
MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
// second graph traversal: try the matchers in order; fall back to a plain
// output replacement when none of them fires
auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
&try_conv_reformat_nchw42nchw32,
#if CUDA_VERSION >= 10020
&try_conv_reformat_nchw322nchw4,
#endif
&rewriter](OperatorNodeBase* opr) {
if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
!try_conv_reformat_nchw42nchw32(opr)
#if CUDA_VERSION >= 10020
&& !try_conv_reformat_nchw322nchw4(opr)
#endif
) {
rewriter.auto_replace_outputs(opr);
}
};
opt.graph().iter(on_opr);
rewriter.apply_inplace();
MIDOUT_E
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -402,6 +402,12 @@ namespace gopt {
void apply(OptState& opt) const override;
};
/*!
 * \brief fold a ConvBias opr together with a following layout-reformat
 * chain (dimshuffle / reshape / typecvt) by switching the conv's output
 * format (e.g. NCHW4 -> NCHW4_NCHW, NCHW4_NCHW32, NCHW32_NCHW4)
 */
class FoldingConvBiasDimshufflePass final : public Pass {
public:
const char* name() const override;
void apply(OptState& opt) const override;
};
} // namespace gopt
} // namespace mgb
......
......@@ -3632,7 +3632,6 @@ TEST(TestGoptInference, ConvertFormatCD4GroupOneConv) {
}
#if MGB_CUDA
TEST(TestGoptInference, PreProcessCase0) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM>
......@@ -3783,5 +3782,247 @@ TEST(TestGoptInference, WarpAndPreProcessCase) {
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
//! Verifies that FoldingConvBiasDimshufflePass folds the chain
//! conv_bias(NCHW4) + typecvt(qs8->f32) + dimshuffle{0,1,4,2,3} + reshape
//! into a single conv_bias with format NCHW4_NCHW, leaving no Dimshuffle.
TEST(TestGoptInference, FoldingConvDimshuffle) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    // int8 NCHW4 conv kernels require compute capability >= 6.1
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase ignored due to insufficient cuda cap(got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // quantized graph input
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    // quantized constant (filter / bias)
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    // NCHW4 (n,c/4,h,w,4) -> NCHW reformat built from dimshuffle + reshape,
    // exactly the pattern the pass is expected to fold
    auto nchw42nchw = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make({sub(0), sub(1) * 4, sub(2), sub(3)}, 0);
        auto y0 = opr::Dimshuffle::make(x, {0, 1, 4, 2, 3});
        auto y1 = opr::Reshape::make(y0, tshp0);
        return y1;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    y = nchw42nchw(y);
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::ShuffleShuffleRemovePass>()
                          .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_fuse);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffle.json"));
    // structural checks: conv format switched, all dimshuffles removed
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4_NCHW,
              find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    // NOTE(review): unlike the sibling tests there is no tensor comparison
    // here, presumably because the fused path emits float directly while the
    // reference path requantizes to int8 before the typecvt -- confirm
    // before adding an exact-equality assertion.
}
//! Verifies that FoldingConvBiasDimshufflePass folds the chain
//! conv_bias(NCHW4) + reshape + dimshuffle{0,1,3,4,2,5} + reshape into a
//! single conv_bias with format NCHW4_NCHW32, and that the fused graph
//! computes the same result as the unfused one.
TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    // int8 NCHW4 conv kernels require compute capability >= 6.1
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase ignored due to insufficient cuda cap(got: %d, "
               "expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // quantized graph input
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    // quantized constant (filter / bias)
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    // NCHW4 -> NCHW32 reformat built from reshape + dimshuffle + reshape,
    // exactly the pattern the pass is expected to fold
    auto nchw42nchw32 = [](SymbolVar x) {
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make(
                {sub(0), sub(1) / 8, cv(8), sub(2), sub(3), sub(4)}, 0),
             tshp1 = opr::Concat::make(
                {sub(0), sub(1) / 8, sub(2), sub(3), sub(4) * 8}, 0);
        auto y0 = opr::Reshape::make(x, tshp0);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 4, 2, 5});
        auto y2 = opr::Reshape::make(y1, tshp1);
        return y2;
    };
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = nchw42nchw32(y);
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::FoldingConvBiasDimshufflePass>()
                          .add_pass<gopt::ParamFusePass>()
                          .apply({{y}})
                          .endpoint_vars(),
                  y_fuse);
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW4NCHW32.json"));
    // structural checks: conv format switched, all dimshuffles removed
    ASSERT_EQ(opr::ConvBias::Param::Format::NCHW4_NCHW32,
              find_opr<opr::ConvBias>(y_fuse).param().format);
    ASSERT_EQ(0u, find_opr_num<opr::Dimshuffle>(y_fuse));
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    // numeric check: fusion must not change the result
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#if CUDA_VERSION >= 10020
//! Verifies that the NCHW32 optimize-for-inference pipeline produces a
//! conv_bias with format NCHW32_NCHW4 (i.e. FoldingConvBiasDimshufflePass
//! folded the NCHW32 -> NCHW4 reformat into the second conv) and that the
//! optimized graph computes the same result as the original one.
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    // int8 NCHW32 tensor-core conv kernels require compute capability >= 7.5
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 75) {
        printf("This testcase ignored due to insufficient cuda cap(got: %d, "
               "expected: %d)\n",
               sm_ver, 75);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // quantized graph input
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    // quantized constant (filter / bias)
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    // two stacked convs: the nchw32 pass converts the first one to NCHW32,
    // so a reformat back to NCHW4 appears in front of the second one
    auto x = mkvar("x", {32, 16, 4, 8, 4}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {64, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 16, 1, 1, 4}, dtype::QuantizedS32(6.25f)),
         w1 = mkcvar("w1", {16, 16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 4, 1, 1, 4}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW4;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    param.stride_h = param.stride_w = 1;
    y = opr::ConvBias::make(y, w1, b1, param, {},
                            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y = opr::TypeCvt::make(y, dtype::Float32());
    SymbolVar y_fuse, y_non_fuse;
    {
        auto options = gopt::OptimizeForInferenceOptions{};
        options.enable_nchw32().enable_fuse_conv_bias_nonlinearity();
        unpack_vector(gopt::optimize_for_inference({y}, options), y_fuse);
    }
    graph->compile({{y_fuse, {}}})
            ->to_json()
            ->writeto_fpath(output_file(
                    "TestGoptInference.FoldingConvDimshuffleNCHW32NCHW4.json"));
    // one dimshuffle remains (the initial NCHW4 -> NCHW32 input reformat)
    ASSERT_EQ(1u, find_opr_num<opr::Dimshuffle>(y_fuse));
    // the fused NCHW32_NCHW4 conv must exist somewhere in the graph
    bool found = false;
    cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
        if (!found && opr->same_type<opr::ConvBias>()) {
            opr::ConvBias* cb = &opr->cast_final_safe<opr::ConvBias>();
            if (cb->param().format ==
                opr::ConvBias::Param::Format::NCHW32_NCHW4)
                found = true;
        }
    }}
            .add(y_fuse.node()->owner_opr());
    EXPECT_TRUE(found);
    unpack_vector(gopt::GraphOptimizer{}.apply({{y}}).endpoint_vars(),
                  y_non_fuse);
    HostTensorND host_y_fuse, host_y_non_fuse;
    auto func =
            graph->compile({make_callback_copy(y_fuse, host_y_fuse),
                            make_callback_copy(y_non_fuse, host_y_non_fuse)});
    func->execute();
    // numeric check: optimization must not change the result
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -131,8 +131,10 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 32 /
group * 2;
}
mgb_assert(param.format == Param::Format::NCHW4,
"format should be NCHW4/NCHW32");
mgb_assert(param.format == Param::Format::NCHW4 ||
param.format == Param::Format::NCHW4_NCHW ||
param.format == Param::Format::NCHW4_NCHW32,
"format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32");
return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 4 / group *
2;
};
......@@ -154,6 +156,8 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
2;
};
if (param.format == Param::Format::NCHW4 ||
param.format == Param::Format::NCHW4_NCHW ||
param.format == Param::Format::NCHW4_NCHW32 ||
param.format == Param::Format::NCHW88 ||
param.format == Param::Format::NCHW44 ||
param.format == Param::Format::NCHW44_DOT ||
......
Subproject commit 41426ea4074dcfc448b1c9979ea7617407590c04
Subproject commit 9f7431672c17d4a731f84ca9d8f3f4e741e267b1
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册