Commit a3cd3fc7 authored by Megvii Engine Team

test(mgb/gopt): add testcase for global layout transform

GitOrigin-RevId: f9669e1ba0d4c46ca8aab3161870d17d7762bf8b
Parent af576e9a
......@@ -28,7 +28,10 @@ public:
private:
using TensorFormatsBitSet = uint32_t;
using State = SmallVector<TensorFormatsBitSet>;
static constexpr uint32_t MAX_TENSOR_FORMATS = sizeof(TensorFormatsBitSet);
    /// each bit represents one kind of tensor format
static constexpr uint32_t BITS_PER_BYTE = 8;
static constexpr uint32_t MAX_TENSOR_FORMATS =
sizeof(TensorFormatsBitSet) * BITS_PER_BYTE;
TensorFormatsBitSet add(TensorFormatsBitSet& set, TensorFormats fmt) {
mgb_assert(static_cast<uint32_t>(fmt) < MAX_TENSOR_FORMATS);
set |= (1 << static_cast<uint32_t>(fmt));
......
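The hunk above fixes a capacity bug: sizeof() counts bytes, not bits, so the old bound limited a 32-bit set to 4 tensor formats. A minimal, self-contained sketch of the corrected arithmetic (illustrative names, not taken from the codebase):

    #include <cassert>
    #include <cstdint>

    using FormatBitSet = std::uint32_t;
    constexpr std::uint32_t BITS_PER_BYTE = 8;
    // sizeof(FormatBitSet) == 4 bytes, so 4 * 8 == 32 representable formats;
    // the old bound, sizeof(FormatBitSet) alone, admitted only formats 0..3
    constexpr std::uint32_t MAX_FORMATS = sizeof(FormatBitSet) * BITS_PER_BYTE;

    FormatBitSet add_format(FormatBitSet set, std::uint32_t fmt) {
        assert(fmt < MAX_FORMATS);  // would have fired for fmt >= 4 with the old bound
        return set | (FormatBitSet{1} << fmt);
    }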
......@@ -111,8 +111,6 @@ void LayoutTransformPass::apply(OptState& opt) const {
}
new_var = reformat({new_var});
}
if (from != to && !new_var->shape().is_scalar())
new_var = reformat({new_var});
new_inp[i] = new_var;
}
VarNode* new_out;
......@@ -164,7 +162,9 @@ void LayoutTransformPass::apply(OptState& opt) const {
}
} else {
auto new_opr = rewriter.auto_replace_outputs(opr);
var2fmts[new_opr->output(0)] = base_fmt;
for (auto&& ov : new_opr->usable_output()) {
var2fmts[ov] = base_fmt;
}
}
};
opt.graph().iter(on_opr);
......
......@@ -245,19 +245,26 @@ struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> {
if (i == 2)
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS32;
else
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::Quantized4Asymm ||
opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS4;
else {
bool i4_config = opr->input(i)->dtype().enumv() ==
DTypeEnum::Quantized4Asymm ||
opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS4;
bool i8_config = opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS8;
available &= (i4_config || i8_config);
}
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &=
bool i4_config =
opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
bool i8_config =
opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
available &= (i4_config || i8_config);
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC,
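For reference, a condensed, self-contained restatement of the availability rule this hunk introduces for the NHWC ConvBias config (the enum below stands in for DTypeEnum; helper names are hypothetical): the bias input must be QuantizedS32, while the other inputs and the output may now be either 4-bit or 8-bit quantized.

    enum class DT { QuantizedS4, Quantized4Asymm, QuantizedS8, QuantizedS32, Other };

    // feature/weight inputs and outputs: int4 and int8 quantized types are accepted
    bool nhwc_io_dtype_ok(DT dt) {
        bool i4_config = dt == DT::QuantizedS4 || dt == DT::Quantized4Asymm;
        bool i8_config = dt == DT::QuantizedS8;
        return i4_config || i8_config;
    }

    // the bias input (input(2)) must hold 32-bit quantized accumulators
    bool nhwc_bias_dtype_ok(DT dt) { return dt == DT::QuantizedS32; }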
......@@ -496,6 +503,38 @@ struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
}
};
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
OprFormat::NHWC> {
using Opr = opr::ConvolutionBackwardData;
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NHWC;
bool available = true;
for (size_t i = 0; i < opr->input().size(); ++i) {
available &=
opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE;
config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC,
TensorFormats::NHWC,
TensorFormats::NHWC};
config.output_tensor_formats = {TensorFormats::NHWC};
if (available)
return config;
return None;
}
};
struct StaticData {
struct KeyHash {
size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const {
......@@ -543,6 +582,7 @@ StaticData::StaticData() {
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NHWC);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW);
......
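The OPR_TENSOR_FORMATS_CONFIG_REG lines above populate a static table keyed by (operator type, OprFormat), so each dispatcher specialization can be looked up at transform time. A simplified, self-contained sketch of that registry pattern (hypothetical names, not the library's API):

    #include <cstddef>
    #include <functional>
    #include <unordered_map>
    #include <utility>

    struct Typeinfo {};            // stands in for megbrain's RTTI tag
    enum class OprFormat { NCHW, NHWC, NCHW4 };
    using Dispatcher = int (*)();  // stands in for the dispatch function type

    struct KeyHash {
        std::size_t operator()(const std::pair<Typeinfo*, OprFormat>& v) const {
            std::size_t h1 = std::hash<const void*>{}(v.first);
            std::size_t h2 = std::hash<int>{}(static_cast<int>(v.second));
            return h1 ^ (h2 << 1);  // simple hash combine
        }
    };

    // each registration macro inserts one (typeinfo, format) -> dispatcher entry
    std::unordered_map<std::pair<Typeinfo*, OprFormat>, Dispatcher, KeyHash> registry;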
......@@ -17,7 +17,6 @@
#include "megbrain/graph/event.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/plugin/base.h"
......@@ -167,11 +166,12 @@ private:
static constexpr float PROFILE_TIME_OUT = 1e7;
using ReformatAttribute = ReformatKey::Attribute;
/*!
* \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.)
* \brief profile opr format agnostic operators (like elemwise, elemwise
* multi type, typecvt etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_format the original tensor format of the operator node.
* \param available_tensor_formats the available tensor formats
* \param available_tensor_formats the available tensor formats
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
......@@ -220,7 +220,7 @@ private:
ReformatAttribute::DEFAULT) const;
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const;
int m_runs; /// sample times of the profiler
int m_runs; /// sample times of the profiler
};
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
......@@ -281,10 +281,6 @@ ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
record.opr = opr;
auto& costs = record.costs;
for (auto&& i : available_configs) {
/// XXXX remove later
if (i.opr_format == OprFormat::NCHW &&
opr->input(0)->dtype().enumv() != DTypeEnum::Float32)
continue;
costs[i.opr_format] =
profile_operator(opr, base_config, i, extra_attribute);
}
......@@ -403,8 +399,8 @@ float ProfilerImpl::profile_var_node(const VarNode* var,
auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
var, base_format, key);
auto y = builder({aligned_var.node()});
if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(),
TensorFormat{}))
if (!m_var_node_filter(var, aligned_tensor_shape, y->shape(), key))
return PROFILE_TIME_OUT;
ThinHashSet<OperatorNodeBase*> set;
DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); });
......@@ -533,6 +529,17 @@ ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold)
m_var_node_threshold{var_node_threshold} {
m_opr_filter = [this](const OperatorNodeBase* opr,
OperatorNodeBase* new_opr) {
        /// \note: for performance considerations, we skip the nchw (naive)
        /// kernels for conv bias on the CUDA platform. To be removed later.
if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) {
if (conv->output(0)->comp_node().device_type() ==
CompNode::DeviceType::CUDA &&
conv->input(0)->dtype().category() ==
DTypeCategory::QUANTIZED &&
conv->param().format == OprFormat::NCHW) {
return false;
}
}
float comp1 = m_opr_footprint.get_computation(
const_cast<OperatorNodeBase*>(opr));
float comp2 = m_opr_footprint.get_computation(new_opr);
......@@ -541,18 +548,27 @@ ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold)
return true;
};
m_var_node_filter = [this](const VarNode* var, TensorShape from,
TensorShape to, TensorFormat format) {
TensorFormat default_;
TensorLayout orig_ly, from_ly, to_ly;
if (format == default_) {
orig_ly = {var->shape(), var->dtype()};
from_ly = {from, var->dtype()};
to_ly = {to, var->dtype()};
} else {
orig_ly = {var->shape(), var->dtype(), format};
from_ly = {from, var->dtype(), format};
to_ly = {to, var->dtype(), format};
TensorShape to, ReformatKey key) {
        /// \note: due to the alignment requirements of low-bit tensors, we
        /// skip some layout transforms for them. The skipped layout
        /// transforms have no corresponding dnn kernels and cannot be
        /// implemented with tensor manip operators (like reshape, dimshuffle,
        /// subtensor, etc.).
if (var->dtype().enumv() == DTypeEnum::QuantizedS4 ||
var->dtype().enumv() == DTypeEnum::Quantized4Asymm) {
if (key.input_format == TensorFormats::NCHW &&
key.output_format != TensorFormats::NHWC &&
key.output_format != TensorFormats::NCHWc64) {
return false;
}
if (key.output_format == TensorFormats::NCHW &&
key.input_format != TensorFormats::NHWC &&
key.input_format != TensorFormats::NCHWc64) {
return false;
}
}
TensorLayout orig_ly = {var->shape(), var->dtype()},
from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()};
float orig_memory = orig_ly.span().dist_byte() * 2.f;
float reformat_memory =
from_ly.span().dist_byte() + to_ly.span().dist_byte();
......
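A condensed restatement of the low-bit rule added to m_var_node_filter above (self-contained sketch; the enum stands in for TensorFormats): a 4-bit tensor may enter or leave plain NCHW only via NHWC or NCHWc64, since the remaining reformats have no dnn kernel and cannot be emulated with tensor manip operators.

    enum class Fmt { NCHW, NHWC, NCHWc4, NCHWc32, NCHWc64, CHWNc4 };

    bool low_bit_reformat_supported(Fmt in, Fmt out) {
        auto nchw_peer_ok = [](Fmt f) {
            return f == Fmt::NHWC || f == Fmt::NCHWc64;
        };
        if (in == Fmt::NCHW && !nchw_peer_ok(out)) return false;
        if (out == Fmt::NCHW && !nchw_peer_ok(in)) return false;
        return true;  // every other pair is left for the profiler to rank
    }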
......@@ -329,10 +329,21 @@ ReformatManager::ReformatImpl ReformatManager::get(
const ReformatKey& key) const {
using Attribute = ReformatKey::Attribute;
MGB_TRY {
auto find = m_cache.find(key);
if (find != m_cache.end()) {
auto rst = find->second;
return rst;
{
auto find = m_cache.find(key);
if (find != m_cache.end()) {
auto rst = find->second;
return rst;
}
}
if (key.attribute == Attribute::AUTO_PADDING_NHWC) {
auto key_ = key;
key_.attribute = Attribute::DEFAULT;
auto find = m_cache.find(key_);
if (find != m_cache.end()) {
auto rst = find->second;
return rst;
}
}
mgb_assert(!(key.attribute & Attribute::IMAGE2D) &&
!(key.attribute & Attribute::IC_SMALL));
......
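The hunk above makes the cache lookup two-step: try the exact key first, then retry an AUTO_PADDING_NHWC key with the DEFAULT attribute before falling through to building the reformat from scratch. A self-contained sketch of that pattern (hypothetical types, not ReformatManager's real API):

    #include <cstddef>
    #include <functional>
    #include <optional>
    #include <unordered_map>

    enum class Attr { DEFAULT, AUTO_PADDING_NHWC };
    struct Key {
        int formats;  // stands in for the (input_format, output_format) pair
        Attr attribute;
        bool operator==(const Key& rhs) const {
            return formats == rhs.formats && attribute == rhs.attribute;
        }
    };
    struct KeyHash {
        std::size_t operator()(const Key& k) const {
            return std::hash<int>{}(k.formats) * 31 +
                   static_cast<std::size_t>(k.attribute);
        }
    };
    using Cache = std::unordered_map<Key, int, KeyHash>;  // int stands in for ReformatImpl

    std::optional<int> lookup(const Cache& cache, Key key) {
        if (auto it = cache.find(key); it != cache.end())
            return it->second;  // exact hit
        if (key.attribute == Attr::AUTO_PADDING_NHWC) {
            key.attribute = Attr::DEFAULT;  // retry the plain variant
            if (auto it = cache.find(key); it != cache.end())
                return it->second;
        }
        return std::nullopt;  // caller falls through to building the reformat
    }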
......@@ -222,8 +222,9 @@ public:
};
using OprFilter = thin_function<bool(const cg::OperatorNodeBase*,
cg::OperatorNodeBase*)>;
using VarNodeFilter = thin_function<bool(const VarNode*, TensorShape,
TensorShape, TensorFormat)>;
using VarNodeFilter =
thin_function<bool(const VarNode*, TensorShape, TensorShape,
ReformatManager::ReformatKey)>;
ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f);
ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {})
......
......@@ -146,18 +146,6 @@ private:
};
MGB_DEF_ENUM_CLASS_BIT_OPR(ReformatManager::ReformatKey::Attribute);
//
//TensorShape make_aligned_tensor_shape(
// const VarNode* var, TensorFormats orig_formats,
// TensorFormats target_formats,
// ReformatManager::ReformatKey::Attribute extra_attribute =
// ReformatManager::ReformatKey::Attribute::DEFAULT);
//
//TensorShape make_aligned_weight_shape(
// const VarNode* var, TensorFormats orig_formats,
// TensorFormats target_formats, TensorFormats extra_formats,
// ReformatManager::ReformatKey::Attribute extra_attribute =
// ReformatManager::ReformatKey::Attribute::DEFAULT);
} // namespace gopt
} // namespace mgb
......
......@@ -4104,6 +4104,79 @@ TEST(TestGoptInference, PreProcessCaseAutopadNCHW64) {
opr::RelayoutFormat::Param::Mode::NCHW_NCHW4);
}
TEST(TestGoptInference, PreProcessCaseAutopadNHWC) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
auto cn = CompNode::load("gpu0");
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
size_t n = 2;
size_t c = 3;
size_t h = 32;
size_t w = 32;
auto host_x1 = gen({n, c, h, w}, cn);
auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
auto x_u8_fp32 = opr::TypeCvt::make(x, dtype::Float32(), cn);
auto x_s8_fp32 = x_u8_fp32 - 128;
auto x_s8 = opr::TypeCvt::make(x_s8_fp32, dtype::QuantizedS8(2.5f), cn);
auto host_val =
std::make_shared<HostTensorND>(cn, dtype::QuantizedS8(2.5f));
TensorShape scalar{1, 1, 1, 1};
host_val->resize(scalar);
auto ptr = host_val->raw_ptr();
size_t size_bytes =
TensorLayout{scalar, dtype::QuantizedS8(2.5f)}.span().dist_byte();
std::memset(ptr, 0, size_bytes);
auto padding = opr::ImmutableTensor::make(*graph, *host_val);
padding = opr::Broadcast::make(padding, {n, 1, h, w});
auto padded_x = opr::Concat::make({x_s8, padding}, 1);
auto nhwc_x = opr::Dimshuffle::make(padded_x, {0, 2, 3, 1});
auto weight = mkcvar("weight", {16, 3, 3, 4}, dtype::QuantizedS8(2.5f)),
bias = mkcvar("bias", {1, 1, 1, 16}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NHWC;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 1;
auto result =
opr::ConvBias::make(nhwc_x, weight, bias, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto y = opr::TypeCvt::make(result, dtype::Float32());
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_fuse_preprocess();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
graph->compile({{y_opt, {}}})
->to_json()
->writeto_fpath(output_file(
"TestGoptInference.PreProcessCaseAutopadNHWC.json"));
HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
ASSERT_TRUE(find_opr<opr::RelayoutFormat>(y_opt).param().mode ==
opr::RelayoutFormat::Param::Mode::NCHW_NCHW4);
}
TEST(TestGoptInference, WarpAndPreProcessCase1) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
......
......@@ -10,7 +10,8 @@
* implied.
*/
#include "./helper.h"
#include "./network.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
......@@ -24,23 +25,145 @@ using namespace gopt;
using namespace serialization;
#if MGB_CUDA
TEST(TestLayoutTransform, Feature) {
auto inp_file = InputFile::make_fs("./feat.mdl");
namespace {
//! find the first operator of a specific type; raise an exception if not found
template <typename T>
T& find_opr(SymbolVar endpoint) {
T* found = nullptr;
auto cb = [&found](cg::OperatorNodeBase* opr) {
if (!found && opr->same_type<T>()) {
found = &opr->cast_final_safe<T>();
}
};
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
    mgb_assert(found, "opr not found from %s", endpoint.node()->name().c_str());
return *found;
}
auto format = GraphLoader::identify_graph_dump_format(*inp_file);
ASSERT_TRUE(format.valid());
auto loader = GraphLoader::make(std::move(inp_file), format.val());
template <typename T>
size_t find_opr_num(SymbolVar endpoint) {
size_t opr_num = 0;
auto cb = [&opr_num](cg::OperatorNodeBase* opr) {
if (opr->same_type<T>()) {
printf("%s, %s\n", opr->cname(), opr->dyn_typeinfo()->name);
opr_num++;
}
};
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
return opr_num;
}
} // namespace
TEST(TestLayoutTransform, Resnet18_QS8) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
Network network(cn);
    /// use a small batch size to reduce test time
auto output = make_resnet18(network, 16, dtype::QuantizedS8{1.f});
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy);
GraphLoader::LoadConfig load_config;
load_config.comp_graph = ComputingGraph::make();
auto&& graph_opt = load_config.comp_graph->options();
graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity();
graph_opt.graph_opt.enable_fuse_conv_bias_with_z();
auto ret = loader->load(load_config, false);
HostTensorND t1;
auto func1 = network.graph->compile({make_callback_copy(output, t1)});
func1->execute();
using OprFormat = LayoutTransformContext::OprFormat;
using OprList = LayoutTransformContext::OprList;
using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
using Attribute = LayoutTransformContext::Attribute;
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
opr::PoolingForward::typeinfo(),
opr::WarpPerspectiveForward::typeinfo(),
};
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NCHW, TensorFormats::NHWC, TensorFormats::NCHWc4,
TensorFormats::NCHWc32, TensorFormats::CHWNc4};
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW,
ReformatAttribute::AUTO_PADDING_NHWC};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
OprFormat::NHWC})
.add_opr_config(opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_output = gopt::GraphOptimizer{}
.add_pass<FuseConvBiasNonlinPass>()
.add_pass<FuseConvBiasZPass>()
.add_pass<LayoutTransformPass>(std::move(ctx),
std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass(FuseNCHW4Int8Preprocess::make())
.add_pass<FoldingConvBiasDimshufflePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply({{output}})
.endpoint_vars();
auto new_out_var = new_output[0];
/// check global layout transform pass
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var);
ASSERT_EQ(nr_dimshuffle, 3u);
/// check pass fuse conv bias with z
auto nr_elemwise_mult_type =
find_opr_num<opr::ElemwiseMultiType>(new_out_var);
ASSERT_EQ(nr_elemwise_mult_type, 4u);
    /// 21 convolutions with 21 weights and 21 biases, 42 parameters in total
const auto& param_merge =
find_opr<opr::MultipleDeviceTensorHolder>(new_out_var);
ASSERT_EQ(param_merge.output().size(), 42u);
/// check first conv format
const auto& first_conv = find_opr<opr::ConvBiasForward>(new_out_var);
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>();
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4);
GraphProfiler gprof{network.graph.get()};
HostTensorND t2;
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)});
func2->execute();
gprof.to_json_full(func2.get())
->writeto_fpath(output_file("resnet18_qs8.json"));
    /// check correctness
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestLayoutTransform, Resnet18_QS4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
Network network(cn);
auto output = make_resnet18(network, 16, dtype::QuantizedS4{1.f});
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({ret.output_var_list}, strategy);
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy);
HostTensorND t1;
auto func1 = network.graph->compile({make_callback_copy(output, t1)});
func1->execute();
using OprFormat = LayoutTransformContext::OprFormat;
using OprList = LayoutTransformContext::OprList;
......@@ -55,74 +178,113 @@ TEST(TestLayoutTransform, Feature) {
opr::WarpPerspectiveForward::typeinfo(),
};
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NCHWc4, TensorFormats::NCHWc32,
TensorFormats::CHWNc4};
Attribute attribute = {OprFormat::NCHW4, TensorFormats::NCHWc4,
ReformatAttribute::DEFAULT};
TensorFormats::NCHW, TensorFormats::NHWC,
TensorFormats::NCHWc4, TensorFormats::NCHWc32,
TensorFormats::NCHWc64, TensorFormats::CHWNc4};
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW,
ReformatAttribute::AUTO_PADDING_NHWC};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4})
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
OprFormat::NHWC, OprFormat::NCHW64})
.add_opr_config(
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4})
.add_opr_config(opr::WarpPerspectiveForward::typeinfo(),
OprFormat::NCHW4);
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
auto filter = [](const GraphPartition& partition) {
auto has_nchw4_conv = false;
for (auto&& opr : partition.all_oprs()) {
if (opr->dyn_typeinfo() == opr::ConvBiasForward::typeinfo()) {
auto& conv = opr->cast_final_safe<opr::ConvBiasForward>();
if (conv.param().format ==
LayoutTransformContext::OprFormat::NCHW4) {
has_nchw4_conv = true;
break;
}
}
}
return has_nchw4_conv;
};
std::unique_ptr<SolverBase> solver{new DynamicProgrammingSolver(
std::move(profiler), std::move(filter))};
auto new_out_vars = gopt::GraphOptimizer{}
.add_pass<FuseConvBiasNonlinPass>()
.add_pass<FuseConvBiasZPass>()
.add_pass<LayoutTransformPass>(
std::move(ctx), std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass(FuseNCHW4Int8Preprocess::make())
.add_pass<FoldingConvBiasDimshufflePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply(ret.output_var_list)
.endpoint_vars();
auto dumper = GraphDumper::make(OutputFile::make_fs("model_opt.mgb"));
dumper->dump({new_out_vars});
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_output = gopt::GraphOptimizer{}
.add_pass<FuseConvBiasNonlinPass>()
.add_pass<FuseConvBiasZPass>()
.add_pass<LayoutTransformPass>(std::move(ctx),
std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass(FuseNCHW4Int8Preprocess::make())
.add_pass<FoldingConvBiasDimshufflePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply({{output}})
.endpoint_vars();
auto new_out_var = new_output[0];
/// check global layout transform pass
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var);
ASSERT_EQ(nr_dimshuffle, 3u);
/// check pass fuse conv bias with z
auto nr_elemwise_mult_type =
find_opr_num<opr::ElemwiseMultiType>(new_out_var);
ASSERT_EQ(nr_elemwise_mult_type, 4u);
    /// 21 convolutions with 21 weights and 21 biases, 42 parameters in total
const auto& param_merge =
find_opr<opr::MultipleDeviceTensorHolder>(new_out_var);
ASSERT_EQ(param_merge.output().size(), 42u);
/// check first conv format
const auto& first_conv = find_opr<opr::ConvBiasForward>(new_out_var);
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>();
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NHWC);
GraphProfiler gprof{network.graph.get()};
HostTensorND t2;
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)});
func2->execute();
gprof.to_json_full(func2.get())
->writeto_fpath(output_file("resnet18_qs4.json"));
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestLayoutTransform, Detection) {
auto inp_file = InputFile::make_fs("./det.mdl");
static const char* magic = "mgbteset0";
size_t skip_size = sizeof(magic) + sizeof(uint32_t);
char skip[skip_size];
inp_file->read(skip, skip_size);
TEST(TestLayoutTransform, Resnet18_NCHW64) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
Network network(cn);
auto output = make_resnet18(network, 64, dtype::QuantizedS4{1.f});
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy);
auto format = GraphLoader::identify_graph_dump_format(*inp_file);
ASSERT_TRUE(format.valid());
auto loader = GraphLoader::make(std::move(inp_file), format.val());
HostTensorND t1;
auto func1 = network.graph->compile({make_callback_copy(output, t1)});
func1->execute();
GraphLoader::LoadConfig load_config;
load_config.comp_graph = ComputingGraph::make();
auto&& graph_opt = load_config.comp_graph->options();
graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity();
graph_opt.graph_opt.enable_fuse_conv_bias_with_z();
auto ret = loader->load(load_config, false);
SymbolVar new_out_var;
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_nchw64();
unpack_vector(gopt::optimize_for_inference({output}, options), new_out_var);
GraphProfiler gprof{network.graph.get()};
HostTensorND t2;
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)});
func2->execute();
gprof.to_json_full(func2.get())
->writeto_fpath(output_file("resnet18_nchw64.json"));
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestLayoutTransform, Detection_QS8) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
Network network(cn);
auto outputs = make_det(network, 16, dtype::QuantizedS8{1.f});
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({ret.output_var_list}, strategy);
gopt::modify_opr_algo_strategy_inplace({outputs}, strategy);
using OprFormat = LayoutTransformContext::OprFormat;
using OprList = LayoutTransformContext::OprList;
......@@ -130,8 +292,6 @@ TEST(TestLayoutTransform, Detection) {
using Attribute = LayoutTransformContext::Attribute;
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ConvolutionForward::typeinfo(),
opr::ConvolutionBackwardData::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
......@@ -143,51 +303,228 @@ TEST(TestLayoutTransform, Detection) {
TensorFormats::NCHWc4, TensorFormats::NCHWc32,
TensorFormats::NCHWc64, TensorFormats::CHWNc4};
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW,
ReformatAttribute::DEFAULT};
ReformatAttribute::AUTO_PADDING_NHWC};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(
opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW, OprFormat::NHWC, OprFormat::NCHW4,
OprFormat::NCHW32, OprFormat::NCHW64, OprFormat::CHWN4})
.add_opr_config(opr::ConvolutionForward::typeinfo(),
{OprFormat::NCHW, OprFormat::NCHW4})
.add_opr_config(opr::ConvolutionBackwardData::typeinfo(),
{OprFormat::NCHW, OprFormat::NCHW4})
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
OprFormat::NHWC, OprFormat::NCHW64})
.add_opr_config(
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC,
OprFormat::NCHW64, OprFormat::CHWN4})
.add_opr_config(
opr::WarpPerspectiveForward::typeinfo(),
{OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64});
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_outputs = gopt::GraphOptimizer{}
.add_pass<FuseConvBiasNonlinPass>()
.add_pass<FuseConvBiasZPass>()
.add_pass<LayoutTransformPass>(std::move(ctx),
std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass(FuseNCHW4Int8Preprocess::make())
.add_pass<FoldingConvBiasDimshufflePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply({{outputs}})
.endpoint_vars();
GraphProfiler gprof{network.graph.get()};
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem;
std::vector<OutputSpecItem> output_spec;
for (const auto& i : new_outputs) {
output_spec.emplace_back(OutputSpecItem{i, {}});
}
auto func = network.graph->compile(output_spec);
func->execute();
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs8.json"));
}
TEST(TestLayoutTransform, Detection_QS4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
Network network(cn);
auto outputs = make_det(network, 16, dtype::QuantizedS4{1.f});
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({outputs}, strategy);
using OprFormat = LayoutTransformContext::OprFormat;
using OprList = LayoutTransformContext::OprList;
using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
using Attribute = LayoutTransformContext::Attribute;
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
opr::PoolingForward::typeinfo(),
opr::WarpPerspectiveForward::typeinfo(),
};
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NCHW, TensorFormats::NHWC,
TensorFormats::NCHWc4, TensorFormats::NCHWc32,
TensorFormats::NCHWc64, TensorFormats::CHWNc4};
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW,
ReformatAttribute::AUTO_PADDING_NHWC};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
OprFormat::NHWC, OprFormat::NCHW64})
.add_opr_config(
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_out_vars = gopt::GraphOptimizer{}
.add_pass<LayoutTransformPass>(
std::move(ctx), std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass(FuseNCHW4Int8Preprocess::make())
.add_pass<FoldingConvBiasDimshufflePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply(ret.output_var_list)
.endpoint_vars();
auto new_outputs = gopt::GraphOptimizer{}
.add_pass<FuseConvBiasNonlinPass>()
.add_pass<FuseConvBiasZPass>()
.add_pass<LayoutTransformPass>(std::move(ctx),
std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass(FuseNCHW4Int8Preprocess::make())
.add_pass<FoldingConvBiasDimshufflePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply({{outputs}})
.endpoint_vars();
GraphProfiler gprof{network.graph.get()};
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem;
std::vector<OutputSpecItem> outs(new_out_vars.size());
for (size_t i = 0; i < new_out_vars.size(); ++i) {
auto cb = [](DeviceTensorND& /* d */) {};
outs[i] = std::make_pair(new_out_vars[i], cb);
std::vector<OutputSpecItem> output_spec;
for (const auto& i : new_outputs) {
output_spec.emplace_back(OutputSpecItem{i, {}});
}
GraphProfiler gprof{load_config.comp_graph.get()};
auto func = load_config.comp_graph->compile(outs);
for (size_t i = 0; i < 10; ++i)
func->execute();
func->wait();
gprof.to_json_full(func.get())->writeto_fpath(output_file("det.json"));
auto func = network.graph->compile(output_spec);
func->execute();
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs4.json"));
}
/*!
 * test the performance of the solver when the network is wide.
*/
TEST(TestLayoutTransform, Wide) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
Network network(cn);
auto data = network.add_var("data", {16, 3, 64, 64});
auto f = network.add_conv(data, 16, {3, 3}, dtype::Float32(), true, {2, 2},
{1, 1});
f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1});
f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1});
SymbolVarArray stages;
for (size_t i = 0; i < 8; ++i) {
f = f * f + f;
stages.push_back(f);
}
auto y = stages[0];
for (size_t i = 1; i < stages.size(); ++i) {
y = y + stages[i];
}
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({y}, strategy);
using OprFormat = LayoutTransformContext::OprFormat;
using OprList = LayoutTransformContext::OprList;
using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
using Attribute = LayoutTransformContext::Attribute;
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::Elemwise::typeinfo(),
};
SmallVector<TensorFormats> available_tensor_formats = {TensorFormats::NCHW,
TensorFormats::NHWC};
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW,
ReformatAttribute::DEFAULT};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW, OprFormat::NHWC});
auto profiler = ProfilerBase::make_profiler();
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto v = gopt::GraphOptimizer{}
.add_pass<FuseConvBiasNonlinPass>()
.add_pass<FuseConvBiasZPass>()
.add_pass<LayoutTransformPass>(std::move(ctx),
std::move(solver))
.add_pass<ShuffleShuffleRemovePass>()
.add_pass<ParamFusePass>()
.add_pass<ParamMergePass>()
.apply({{y}})
.endpoint_vars();
const auto& sym_o = v[0];
GraphProfiler gprof{network.graph.get()};
auto func = network.graph->compile({{sym_o, {}}});
func->execute();
gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json"));
/// check global layout transform pass, no dimshuffle
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o);
ASSERT_EQ(nr_dimshuffle, 0u);
auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o);
ASSERT_EQ(nr_param_merge, 1u);
/// check first conv format
const auto& first_conv = find_opr<opr::ConvBiasForward>(sym_o);
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>();
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW);
}
TEST(TestLayoutTransform, ElemwiseMultiType) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
Network network(cn);
auto x = network.add_var("x", {64, 64, 1, 2});
auto y = network.add_var("y", {64, 64, 1, 2});
x = network.add_type_cvt(x, dtype::QuantizedS4{1.f});
y = network.add_type_cvt(y, dtype::QuantizedS4{1.f});
auto x_ = network.add_type_cvt(x, dtype::Float32());
auto y_ = network.add_type_cvt(y, dtype::Float32());
auto z = network.add_elemwise({x_, y_}, dtype::Float32(),
opr::Elemwise::Mode::FUSE_ADD_RELU);
z = network.add_type_cvt(z, dtype::QuantizedS4{1.f});
z = network.add_type_cvt(z, dtype::Float32());
auto z2 = network.add_elemwise({x, y}, dtype::QuantizedS4{1.f},
opr::Elemwise::Mode::FUSE_ADD_RELU);
z2 = network.add_type_cvt(z2, dtype::Float32());
HostTensorND t1;
auto func1 = network.graph->compile({make_callback_copy(z, t1)});
func1->execute();
HostTensorND t3;
auto func3 = network.graph->compile({make_callback_copy(z2, t3)});
func3->execute();
auto alter_x = opr::RelayoutFormat::make(
x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64);
auto alter_y = opr::RelayoutFormat::make(
y, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64);
auto alter_z =
network.add_elemwise({alter_x, alter_y}, dtype::QuantizedS4{1.f},
opr::Elemwise::Mode::FUSE_ADD_RELU);
alter_z = opr::RelayoutFormat::make(
alter_z, megdnn::param::RelayoutFormat::Mode::NCHW64_NCHW);
alter_z = network.add_type_cvt(alter_z, dtype::Float32());
HostTensorND t2;
auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)});
func2->execute();
// MGB_ASSERT_TENSOR_EQ(t1, t3);
MGB_ASSERT_TENSOR_EQ(t2, t3);
}
TEST(TestLayoutTransform, DetectionHead) {
......@@ -196,7 +533,7 @@ TEST(TestLayoutTransform, DetectionHead) {
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
constexpr size_t N = 16, C = 3, H = 768, W = 1280;
constexpr size_t N = 16, C = 3, H = 736, W = 1280;
HostTensorGenerator<dtype::Uint8> gen;
auto graph = ComputingGraph::make();
......@@ -284,20 +621,71 @@ TEST(TestLayoutTransform, DetectionHead) {
.add_pass<ParamMergePass>()
.apply(SymbolVarArray{y})
.endpoint_vars();
const auto& v = new_out_vars[0];
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem;
std::vector<OutputSpecItem> outs(new_out_vars.size());
for (size_t i = 0; i < new_out_vars.size(); ++i) {
auto cb = [](DeviceTensorND& /* d */) {};
outs[i] = std::make_pair(new_out_vars[i], cb);
std::vector<OutputSpecItem> outs;
for (const auto& i : new_out_vars) {
outs.emplace_back(OutputSpecItem{i, {}});
}
GraphProfiler gprof{graph.get()};
auto func = graph->compile(outs);
for (size_t i = 0; i < 10; ++i)
func->execute();
func->wait();
func->execute();
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_head.json"));
/// check reformat
auto nr_reformat = find_opr_num<opr::RelayoutFormat>(v);
ASSERT_EQ(nr_reformat, 2u);
/// check dimshuffle
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(v);
ASSERT_EQ(nr_dimshuffle, 0u);
/// check conv_bias
auto nr_conv = find_opr_num<opr::ConvBiasForward>(v);
ASSERT_EQ(nr_conv, 2u);
/// check first conv format
const auto& first_conv = find_opr<opr::ConvBiasForward>(v);
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>();
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4_NHWC);
}
#endif
TEST(TestLayoutTransform, CanonicalizeLayoutTransform) {
constexpr size_t N = 64, C = 64, H = 1, W = 1;
auto cn = CompNode::load("xpu0");
Network network(cn);
auto x = network.add_var("x", {N, C / 4, H, W, 4});
x = network.add_type_cvt(x, dtype::QuantizedS4{1.f});
using NamedTensorShape = megdnn::NamedTensorShape;
auto src = NamedTensorShape::make_named_tensor_shape(
NamedTensorShape::Format::NCHW4);
auto dst = NamedTensorShape::make_named_tensor_shape(
NamedTensorShape::Format::NHWC);
auto [builder, _] = gopt::ReformatEmitter(src, dst).emit();
MGB_MARK_USED_VAR(_);
x = SymbolVar(builder({x.node()}));
x = opr::Reshape::make(x, {N, H, W, C});
x = network.add_type_cvt(x, dtype::Float32());
SymbolVar another_x;
unpack_vector(gopt::GraphOptimizer{}
.add_pass<gopt::ShuffleShuffleRemovePass>()
.apply({{x}})
.endpoint_vars(),
another_x);
const auto& astype = find_opr<opr::TypeCvt>(x);
EXPECT_TRUE(astype.input(0)->owner_opr()->dyn_typeinfo() ==
opr::Host2DeviceCopy::typeinfo());
const auto& another_astype = find_opr<opr::TypeCvt>(another_x);
EXPECT_TRUE(another_astype.input(0)->owner_opr()->dyn_typeinfo() ==
opr::Reshape::typeinfo());
HostTensorND t1;
auto func1 = network.graph->compile({make_callback_copy(x, t1)});
func1->execute();
HostTensorND t2;
auto func2 = network.graph->compile({make_callback_copy(another_x, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file src/gopt/test/network.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./network.h"
using namespace mgb;
SymbolVar Network::add_conv(SymbolVar f, size_t output_channels,
KernSize kern_size, DType out_dtype, bool has_relu,
Stride stride, Padding padding) {
static int weight_idx = 0;
static int bias_idx = 0;
size_t input_channels = f.node()->shape()[1];
auto weight = add_cvar(
ssprintf("w%d", weight_idx).c_str(),
{output_channels, input_channels, kern_size[0], kern_size[1]});
auto bias = add_cvar(ssprintf("b%d", bias_idx).c_str(),
{1, output_channels, 1, 1});
if (out_dtype.category() == DTypeCategory::QUANTIZED) {
weight = add_type_cvt(weight, out_dtype);
bias = add_type_cvt(bias, dtype::QuantizedS32{1.f});
}
opr::ConvBias::Param param;
param.stride_h = stride[0], param.stride_w = stride[1];
param.pad_h = padding[0], param.pad_w = padding[1];
if (has_relu) {
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
} else {
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
}
auto conv = opr::ConvBias::make(f, weight, bias, param, {},
OperatorNodeConfig{out_dtype});
weight_idx++;
bias_idx++;
return conv;
}
SymbolVar Network::add_deconv(SymbolVar f, size_t ratio, size_t output_channels,
DType out_dtype) {
static int weight_idx = 0;
size_t kernel = ratio * 2 - ratio % 2;
size_t pad = ratio / 2;
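    // e.g. ratio == 2 gives kernel = 4, pad = 1, stride = 2, which exactly
    // doubles the spatial size (the usual bilinear-upsampling deconv setup)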
size_t input_channels = f.node()->shape()[1];
auto weight = add_cvar(ssprintf("w%d", weight_idx).c_str(),
{input_channels, output_channels, kernel, kernel});
if (out_dtype.category() == DTypeCategory::QUANTIZED) {
weight = add_type_cvt(weight, out_dtype);
}
opr::ConvolutionBackwardData::Param param;
param.stride_h = param.stride_w = ratio;
param.pad_h = param.pad_w = pad;
auto deconv = opr::ConvolutionBackwardData::make(
weight, f, param, {}, OperatorNodeConfig{out_dtype});
weight_idx++;
return deconv;
}
SymbolVar Network::add_elemwise(const SymbolVarArray inps, DType out_dtype,
opr::Elemwise::Param::Mode mode) {
using ElemMode = opr::Elemwise::Param::Mode;
using MultiMode = opr::ElemwiseMultiType::Param::Mode;
static const ThinHashMap<ElemMode, MultiMode> map = {
{ElemMode::ADD, MultiMode::QADD},
{ElemMode::FUSE_ADD_RELU, MultiMode::QFUSE_ADD_RELU}};
if (out_dtype.category() == DTypeCategory::QUANTIZED) {
MultiMode alter_mode = map.at(mode);
return opr::ElemwiseMultiType::make(inps, {alter_mode},
OperatorNodeConfig{out_dtype});
} else {
return opr::Elemwise::make(inps, mode);
}
}
SymbolVar Network::add_pooling(SymbolVar f, Window window, Stride stride,
Padding padding,
opr::Pooling::Param::Mode mode) {
opr::Pooling::Param param;
param.window_h = window[0], param.window_w = window[1];
param.stride_h = stride[0], param.stride_w = stride[1];
param.pad_h = padding[0], param.pad_w = padding[1];
param.mode = mode;
return opr::Pooling::make(f, param);
}
SymbolVar Network::add_type_cvt(SymbolVar f, DType out_dtype) {
return opr::TypeCvt::make(f, out_dtype);
}
SymbolVar mgb::create_block(Network& network, SymbolVar f_in, size_t stride,
size_t num_outputs1, bool has_proj,
DType out_dtype) {
auto proj = f_in;
if (has_proj) {
proj = network.add_conv(f_in, num_outputs1, {1, 1}, out_dtype, false,
{stride, stride});
}
auto f = network.add_conv(f_in, num_outputs1, {3, 3}, out_dtype, true,
{stride, stride}, {1, 1});
f = network.add_conv(f, num_outputs1, {3, 3}, out_dtype, true, {1, 1},
{1, 1});
f = network.add_elemwise({f, proj}, out_dtype,
opr::Elemwise::Mode::FUSE_ADD_RELU);
return f;
}
SymbolVar mgb::make_resnet18(Network& network, size_t batch, DType out_dtype) {
auto data = network.add_var("data", {batch, 4, 224, 224});
if (out_dtype.category() == DTypeCategory::QUANTIZED)
data = network.add_type_cvt(data, dtype::QuantizedS8{1.f});
auto first = out_dtype;
if (out_dtype.category() == DTypeCategory::QUANTIZED)
first = dtype::QuantizedS8{1.f};
auto f = network.add_conv(data, 64, {7, 7}, first, true, {2, 2}, {3, 3});
if (out_dtype.enumv() == DTypeEnum::QuantizedS4 ||
out_dtype.enumv() == DTypeEnum::Quantized4Asymm) {
f = network.add_type_cvt(f, out_dtype);
}
f = network.add_pooling(f, {3, 3}, {2, 2}, {1, 1});
using Vector = SmallVector<size_t, 4>;
Vector stages = {2, 2, 2, 2};
Vector mid_outputs = {64, 128, 256, 512};
Vector enable_stride = {0, 1, 1, 1};
for (size_t i = 0; i < 4; ++i) {
auto s = stages[i];
auto o = mid_outputs[i];
auto es = enable_stride[i];
for (size_t j = 0; j < s; ++j) {
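            // only the first block of a stage downsamples, and only when
            // enable_stride is set for that stage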
size_t stride = !es || j > 0 ? 1 : 2;
bool has_proj = j > 0 ? false : true;
f = create_block(network, f, stride, o, has_proj, out_dtype);
}
}
f = network.add_pooling(f, {7, 7}, {7, 7}, {0, 0},
opr::Pooling::Param::Mode::AVERAGE);
f = network.add_type_cvt(f, dtype::Float32());
return f;
}
namespace {
SymbolVarArray make_pyramids(Network& network, size_t batch, DType out_dtype) {
SymbolVarArray pyramids;
auto data = network.add_var("data", {batch, 3, 256, 256});
data = data + (-128.f);
if (out_dtype.category() == DTypeCategory::QUANTIZED)
data = network.add_type_cvt(data, dtype::QuantizedS8{1.f});
auto first = out_dtype;
if (out_dtype.category() == DTypeCategory::QUANTIZED)
first = dtype::QuantizedS8{1.f};
auto f = network.add_conv(data, 16, {3, 3}, first, true, {2, 2}, {1, 1});
f = network.add_conv(f, 16, {3, 3}, first, true, {1, 1}, {1, 1});
f = network.add_conv(f, 32, {3, 3}, first, true, {2, 2}, {1, 1});
if (out_dtype.enumv() == DTypeEnum::QuantizedS4 ||
out_dtype.enumv() == DTypeEnum::Quantized4Asymm) {
f = network.add_type_cvt(f, out_dtype);
}
using Vector = SmallVector<size_t, 4>;
Vector stages = {3, 6, 6, 3};
Vector mid_outputs = {32, 64, 128, 256};
Vector enable_stride = {0, 1, 1, 1};
for (size_t i = 0; i < 4; ++i) {
auto s = stages[i];
auto o = mid_outputs[i];
auto es = enable_stride[i];
for (size_t j = 0; j < s; ++j) {
size_t stride = !es || j > 0 ? 1 : 2;
bool has_proj = j > 0 ? false : true;
f = create_block(network, f, stride, o, has_proj, out_dtype);
}
pyramids.push_back(f);
}
for (size_t i = 0; i < pyramids.size(); ++i) {
pyramids[i] = network.add_type_cvt(pyramids[i], first);
}
return pyramids;
}
SymbolVarArray fusion_pyramids_feature(Network& network,
SymbolVarArray pyramids,
size_t fpn_conv_channels) {
bool touch = false;
SymbolVar x;
SymbolVarArray fpn;
for (int i = 5; i >= 3; --i) {
auto f = network.add_conv(pyramids[i - 2], fpn_conv_channels, {1, 1},
dtype::QuantizedS8{1.f}, false, {1, 1},
{0, 0});
        if (!touch) {
            x = f;
            touch = true;  // the deepest level just projects; later levels fuse
        } else {
x = network.add_deconv(x, 2, 16, dtype::QuantizedS8{1.f});
x = network.add_elemwise({x, f}, dtype::QuantizedS8{1.f},
opr::Elemwise::Mode::ADD);
}
fpn.push_back(x);
}
x = fpn[0];
for (int i = 6; i < 8; ++i) {
x = network.add_conv(x, fpn_conv_channels, {3, 3},
dtype::QuantizedS8{1.f}, true, {2, 2}, {1, 1});
}
return fpn;
}
} // namespace
SymbolVarArray mgb::make_det(Network& network, size_t batch, DType out_dtype) {
SymbolVarArray outputs;
auto pyramids = make_pyramids(network, batch, out_dtype);
auto fpn_hv = fusion_pyramids_feature(network, pyramids, 16);
auto fpn_plate = fusion_pyramids_feature(network, pyramids, 16);
outputs.insert(outputs.end(), fpn_hv.begin(), fpn_hv.end());
outputs.insert(outputs.end(), fpn_plate.begin(), fpn_plate.end());
return outputs;
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file src/gopt/test/network.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/test/helper.h"
#include "megbrain/gopt/framework.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
namespace mgb {
class Network {
private:
HostTensorGenerator<> gen;
CompNode cn;
public:
std::shared_ptr<ComputingGraph> graph = ComputingGraph::make();
Network(CompNode cn_) : cn{cn_} {}
~Network() noexcept = default;
using KernSize = SmallVector<size_t, 2>;
using Stride = SmallVector<size_t, 2>;
using Padding = SmallVector<size_t, 2>;
SymbolVar add_var(const char* name, const TensorShape& shp = {1}) {
return opr::Host2DeviceCopy::make(*graph, gen(shp), cn).rename(name);
}
SymbolVar add_cvar(const char* name, const TensorShape& shp = {1}) {
return opr::SharedDeviceTensor::make(*graph, *gen(shp), cn)
.rename(name);
}
SymbolVar add_conv(SymbolVar f, size_t output_channels, KernSize kern_size,
DType out_dtype = dtype::Float32(), bool has_relu = true,
Stride stride = {1, 1}, Padding padding = {0, 0});
SymbolVar add_deconv(SymbolVar f, size_t ratio, size_t output_channels,
DType out_dtype);
SymbolVar add_elemwise(
const SymbolVarArray inps, DType out_dtype = dtype::Float32(),
opr::Elemwise::Param::Mode mode = opr::Elemwise::Param::Mode::ADD);
using Window = SmallVector<size_t, 2>;
SymbolVar add_pooling(
SymbolVar f, Window window, Stride stride = {1, 1},
Padding padding = {0, 0},
opr::Pooling::Param::Mode mode = opr::Pooling::Param::Mode::MAX);
SymbolVar add_type_cvt(SymbolVar f, DType out_dtype = dtype::Float32());
};
SymbolVar create_block(Network& network, SymbolVar f, size_t stride,
size_t num_outputs1, bool has_proj = false,
DType out_dtype = dtype::Float32());
SymbolVar make_resnet18(Network& network, size_t batch = 16,
DType out_dtype = dtype::Float32());
SymbolVarArray make_det(Network& network, size_t batch = 16,
DType out_dtype = dtype::Float32());
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}