提交 c14e5719 编写于 作者: M Megvii Engine Team

feat(mgb/gopt): add profile impl for global layout transform pass

GitOrigin-RevId: 8ef62baf792c97c7a226dd791af167ab2e8707b4
上级 9c0a17d0
...@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, ...@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0; TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout); check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param; relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
...@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, ...@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0; TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout); check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param; relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
/**
* \file src/gopt/impl/opr_format_modifier.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_format_modifier.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/serialization/sereg.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_format_modifier)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace opr;
namespace {
/*!
 * \brief maker that handles convolution-like operators taking exactly two
 *      input vars
 *
 * make() returns the output var of the newly created operator when
 * \p inputs holds two vars, and nullptr otherwise so that the caller can try
 * another maker.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller2 {
    template <typename Opr>
    static VarNode* make(const cg::VarNodeArray& inputs,
                         const typename MegDNNConv::Param& param,
                         const megdnn::param::ExecutionPolicy& execution_policy,
                         const OperatorNodeConfig& config) {
        // arity mismatch: let the next maker in the chain have a go
        if (inputs.size() != 2) {
            return nullptr;
        }
        auto result = Opr::make(inputs[0], inputs[1], param, execution_policy,
                                config);
        return result.node();
    }
};
/*!
 * \brief maker that handles convolution-like operators taking exactly three
 *      input vars
 *
 * make() returns the output var of the newly created operator when
 * \p inputs holds three vars, and nullptr otherwise so that the caller can try
 * another maker.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller3 {
    template <typename Opr>
    static VarNode* make(const cg::VarNodeArray& inputs,
                         const typename MegDNNConv::Param& param,
                         const megdnn::param::ExecutionPolicy& execution_policy,
                         const OperatorNodeConfig& config) {
        // arity mismatch: let the next maker in the chain have a go
        if (inputs.size() != 3) {
            return nullptr;
        }
        auto result = Opr::make(inputs[0], inputs[1], inputs[2], param,
                                execution_policy, config);
        return result.node();
    }
};
/*!
 * \brief maker that handles convolution-like operators taking exactly four
 *      input vars
 *
 * make() returns the output var of the newly created operator when
 * \p inputs holds four vars, and nullptr otherwise so that the caller can try
 * another maker.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller4 {
    template <typename Opr>
    static VarNode* make(const cg::VarNodeArray& inputs,
                         const typename MegDNNConv::Param& param,
                         const megdnn::param::ExecutionPolicy& execution_policy,
                         const OperatorNodeConfig& config) {
        // arity mismatch: let the next maker in the chain have a go
        if (inputs.size() != 4) {
            return nullptr;
        }
        auto result = Opr::make(inputs[0], inputs[1], inputs[2], inputs[3],
                                param, execution_policy, config);
        return result.node();
    }
};
/*!
 * \brief maker that handles convolution-like operators taking exactly five
 *      input vars
 *
 * make() returns the output var of the newly created operator when
 * \p inputs holds five vars, and nullptr otherwise so that the caller can try
 * another maker.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller5 {
    template <typename Opr>
    static VarNode* make(const cg::VarNodeArray& inputs,
                         const typename MegDNNConv::Param& param,
                         const megdnn::param::ExecutionPolicy& execution_policy,
                         const OperatorNodeConfig& config) {
        // arity mismatch: let the next maker in the chain have a go
        if (inputs.size() != 5) {
            return nullptr;
        }
        auto result = Opr::make(inputs[0], inputs[1], inputs[2], inputs[3],
                                inputs[4], param, execution_policy, config);
        return result.node();
    }
};
/*!
 * \brief placeholder maker that never creates an operator
 *
 * It is the default for the optional maker slots of ConvMakerImpl; make()
 * ignores all of its arguments and always yields nullptr.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCallerEmpty {
    template <typename Opr>
    static VarNode* make(const cg::VarNodeArray& /* inputs */,
                         const typename MegDNNConv::Param& /* param */,
                         const megdnn::param::ExecutionPolicy& /* policy */,
                         const OperatorNodeConfig& /* config */) {
        return nullptr;
    }
};
/*!
 * \brief builds an operator of type \p Opr by trying up to three makers in
 *      order (Maker0, Maker1, Maker2)
 *
 * Each maker returns nullptr when it does not handle the given inputs; the
 * first non-null result wins. It is an assertion failure if none of the makers
 * produces a var.
 */
template <class Opr, class Maker0, class MegDNNConv,
          class Maker1 = MakeConvCallerEmpty<MegDNNConv>,
          class Maker2 = MakeConvCallerEmpty<MegDNNConv>,
          typename ConvParam = megdnn::param::Convolution>
struct ConvMakerImpl {
    static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param,
                         const megdnn::param::ExecutionPolicy& execution_policy,
                         const OperatorNodeConfig& config) {
        VarNode* ret = Maker0::template make<Opr>(inputs, param,
                                                  execution_policy, config);
        // fall back to the later makers only while nothing has been built
        if (ret == nullptr)
            ret = Maker1::template make<Opr>(inputs, param, execution_policy,
                                             config);
        if (ret == nullptr)
            ret = Maker2::template make<Opr>(inputs, param, execution_policy,
                                             config);
        mgb_assert(ret);
        return ret;
    }
};
/*!
 * \brief per-operator selection of the makers used to rebuild a
 *      convolution-like operator
 *
 * Each specialization derives from ConvMakerImpl and lists which
 * MakeConvCallerN makers (i.e. which input counts) are tried for that operator
 * type, together with the megdnn operator and param type that go with it.
 */
template <typename Opr>
struct ConvMaker;
// Convolution: two inputs only
template <>
struct ConvMaker<opr::Convolution>
: public ConvMakerImpl<opr::Convolution,
MakeConvCaller2<megdnn::Convolution>,
megdnn::Convolution> {};
// ConvolutionBackwardData: two or three inputs
template <>
struct ConvMaker<opr::ConvolutionBackwardData>
: public ConvMakerImpl<opr::ConvolutionBackwardData,
MakeConvCaller2<megdnn::Convolution>,
megdnn::Convolution,
MakeConvCaller3<megdnn::Convolution>> {};
// ConvBiasForward: two, three or four inputs; uses the ConvBias param type
template <>
struct ConvMaker<opr::ConvBiasForward>
: public ConvMakerImpl<opr::ConvBiasForward,
MakeConvCaller2<megdnn::ConvBiasForward>,
megdnn::ConvBiasForward,
MakeConvCaller3<megdnn::ConvBiasForward>,
MakeConvCaller4<megdnn::ConvBiasForward>,
megdnn::param::ConvBias> {};
// BatchConvBiasForward: two, three or four inputs; uses the BatchConvBias
// param type
template <>
struct ConvMaker<opr::BatchConvBiasForward>
: public ConvMakerImpl<opr::BatchConvBiasForward,
MakeConvCaller2<megdnn::BatchConvBiasForward>,
megdnn::BatchConvBiasForward,
MakeConvCaller3<megdnn::BatchConvBiasForward>,
MakeConvCaller4<megdnn::BatchConvBiasForward>,
megdnn::param::BatchConvBias> {};
// Disabled code (compiled out by `#if 0`): a MultiAlgoOprTrait<Opr> trait
// whose has_available_algo() builds one TensorLayout per input var plus one
// for the last var in the array, and forwards them to
// ::megdnn::has_available_algo on the operator's megdnn opr.
// NOTE(review): the #include below sits inside the anonymous namespace of
// this file; if this section is ever re-enabled, confirm that this is
// intended.
#if 0
#include "../../opr/impl/internal/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
#define INST(_Opr) \
template <> \
struct MultiAlgoOprTrait<_Opr> { \
static constexpr bool has_algo = true; \
using MegDNNOpr = megdnn::_Opr; \
static constexpr int arity = OprArityTrait<MegDNNOpr>::arity; \
using FixedTensorLayouts = std::array<TensorLayout, arity>; \
static bool has_available_algo(const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)), \
midout_iv(MGB_HASH_STR("has_available_algo"))) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto&& megdnn_opr = \
reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr()); \
FixedTensorLayouts array_layouts; \
size_t in = i.size() - 1; \
for (size_t idx = 0; idx < in; idx++) { \
const auto& v = i[idx]; \
array_layouts[idx] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
} \
const auto& v = i[in]; \
array_layouts[arity - 1] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \
array_layouts); \
MIDOUT_E \
} \
};
INST(Convolution)
INST(ConvBiasForward)
INST(ConvolutionBackwardData)
INST(PoolingForward)
#undef APPLY
#undef INST
#endif
} // namespace
namespace mgb {
namespace gopt {
namespace intl {
/*!
 * \brief rebuilds a format aware operator with a different opr format
 *
 * Specializations provide a static make(opr_format, inputs, opr) that copies
 * the param of \p opr, overrides its `format` field and creates a new operator
 * on the given input vars, returning the new output var.
 */
template <typename Opr>
struct OprFormatModifier;
// Specialization for the convolution family: the new operator is created
// through ConvMaker<_Opr>, reusing the execution policy and config of the
// original operator.
#define INST(_Opr) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return ConvMaker<_Opr>::make(i, param, opr.execution_policy(), \
opr.config()); \
MIDOUT_E \
} \
};
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(BatchConvBiasForward);
#undef INST
/*!
 * \brief OprFormatModifier for WarpPerspective
 *
 * make() copies the param of the original operator, replaces its format with
 * \p opr_format and rebuilds the operator on the given inputs. Three or four
 * input vars are accepted; any other count is an assertion failure.
 */
template <>
struct OprFormatModifier<WarpPerspective> {
    using Opr = opr::WarpPerspective;
    using OprFormat = typename Opr::Param::Format;
    static VarNode* make(OprFormat opr_format, const VarNodeArray& i,
                         const cg::OperatorNodeBase* opr_) {
        MIDOUT_B(Opr)
        auto&& warp = opr_->cast_final_safe<Opr>();
        auto new_param = warp.param();
        new_param.format = opr_format;
        // three-input form
        if (i.size() == 3) {
            auto out = Opr::make(i[0], i[1], i[2], new_param, warp.config());
            return out.node();
        }
        // otherwise only the four-input form is valid
        mgb_assert(i.size() == 4);
        auto out = Opr::make(i[0], i[1], i[2], i[3], new_param, warp.config());
        return out.node();
        MIDOUT_E
    }
};
// Specialization for operators with a fixed number of inputs (_arity): the
// param of the original operator is copied, its format is replaced by
// opr_format, and the new operator is created through
// serialization::OprMaker<_Opr, _arity> in the graph that owns the first input
// var. The first output of the new operator is returned.
#define INST(_Opr, _arity) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return serialization::OprMaker<_Opr, _arity>::make( \
param, i, *i[0]->owner_graph(), opr.config()) \
->output(0); \
MIDOUT_E \
} \
};
INST(PoolingForward, 1);
INST(Resize, 2);
#undef INST
/*!
 * \brief rebuild a format aware operator on new inputs with another opr format
 *
 * The dynamic type of \p opr is compared against every operator type listed in
 * FOREACH_FORMAT_AWARE_OPR; on the first match the work is delegated to the
 * corresponding OprFormatModifier specialization.
 *
 * \param opr_format the format to be written into the operator param
 * \param i input vars of the new operator
 * \param opr the original operator
 * \return the output var produced by the matching OprFormatModifier
 * \throw InternalError if the type of \p opr is not in the list
 *
 * NOTE(review): this file also specializes OprFormatModifier for
 * BatchConvBiasForward, but dispatch only reaches types named in
 * FOREACH_FORMAT_AWARE_OPR (see opr_format_modifier.h) -- confirm whether
 * BatchConvBiasForward is meant to be handled here.
 */
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
// each cb(_Opr) expands to `if (...) { return ...; } else`, so the braces that
// follow the macro list form the final else branch
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
} else
FOREACH_FORMAT_AWARE_OPR(cb) {
mgb_throw(InternalError, "invalid format aware operator(got:%s)",
opr->dyn_typeinfo()->name);
}
#undef cb
}
// Disabled code (compiled out by `#if 0`): dispatches on the dynamic type of
// opr, appends opr->output(0) to a copy of the input vars and forwards the
// array to MultiAlgoOprTrait<_Opr>::has_available_algo; throws InternalError
// for any other operator type.
#if 0
bool has_available_algo(const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \
VarNodeArray _ = i; \
_.emplace_back(opr->output(0)); \
return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
} else
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
cb(PoolingForward) {
mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
opr->dyn_typeinfo()->name);
}
}
#endif
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/opr_format_modifier.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/graph.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
namespace intl {
/*!
 * \brief X-macro that applies \p cb to every operator type treated as format
 *      aware: Convolution, ConvBiasForward, ConvolutionBackwardData,
 *      PoolingForward, WarpPerspective and Resize
 */
#define FOREACH_FORMAT_AWARE_OPR(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
cb(PoolingForward) cb(WarpPerspective) cb(Resize)
// declaration of the currently disabled has_available_algo helper
#if 0
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
#endif
/*!
 * \brief rebuild the format aware operator \p opr on the input vars \p i with
 *      its param format set to \p opr_format
 * \return the output var of the newly created operator
 */
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr);
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
此差异已折叠。
此差异已折叠。
...@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const { ...@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
/* ============== PaddingEmitter ================= */ /* ============== PaddingEmitter ================= */
PaddingEmitter::EmitResult PaddingEmitter::emit() const { PaddingEmitter::EmitResult PaddingEmitter::emit() const {
auto&& padshp = m_padshp;
auto&& const_extent = m_const_extent; auto&& const_extent = m_const_extent;
auto&& axis = m_axis; auto&& axis = m_axis;
auto builder = [const_extent, axis](const VarNodeArray& vars) { auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) {
auto i = vars[0]; auto i = vars[0];
auto padding_shp_var = vars[1]; auto padding_shp_var = vars[1];
TensorShape shape; TensorShape shape;
shape.ndim = i->shape().ndim; shape.ndim = i->shape().ndim;
for (size_t ax = 0; ax < shape.ndim; ++ax) for (size_t ax = 0; ax < shape.ndim; ++ax)
shape[ax] = 1; shape[ax] = 1;
// avoid making a scalar lowbit tensor
if (!i->dtype().is_low_bit() || const_extent != 1)
shape[axis] = const_extent; shape[axis] = const_extent;
else {
size_t const_axis = 0;
size_t new_const_extent = const_extent;
for (size_t i = 0; i < padshp.ndim; ++i) {
const auto& dim = padshp[i];
if (dim.extent() != Dimension::UNDETERMINED_EXTENT &&
dim.extent() != 1) {
new_const_extent = dim.extent();
const_axis = i;
break;
}
}
mgb_assert(new_const_extent != 1,
"cannot make an scalar lowbit tensor(got:%s)",
i->dtype().name());
shape[const_axis] = new_const_extent;
}
auto host_val = auto host_val =
std::make_shared<HostTensorND>(i->comp_node(), i->dtype()); std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
host_val->resize(shape); host_val->resize(shape);
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_manager.h" #include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h" #include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h" #include "megbrain/utils/arith_helper.h"
#include "./utils.h"
using namespace mgb; using namespace mgb;
using namespace gopt; using namespace gopt;
...@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) { ...@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) {
} }
return x; return x;
} }
/*!
 * \brief map a TensorFormats value to the named tensor shape describing it
 *
 * Every dimension is given by a name expression such as "C", "C//4" or "C%4".
 * An AssertionError is thrown for a format that is not listed below.
 */
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
switch (format) {
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
}; // namespace }; // namespace
// =================== ReformatManager::ReformatKey ====================*/ // =================== ReformatManager::ReformatKey ====================*/
...@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( ...@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
tensor_formats_to_named_tensor_shape(key.input_format); tensor_formats_to_named_tensor_shape(key.input_format);
NamedTensorShape output_shape = NamedTensorShape output_shape =
tensor_formats_to_named_tensor_shape(key.output_format); tensor_formats_to_named_tensor_shape(key.output_format);
size_t input_alignment, output_alignment; size_t input_alignment = 0;
size_t input_channel_idx, output_channel_idx; size_t output_alignment = 0;
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) { for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C && if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
...@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( ...@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
break; break;
} }
} }
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
mgb_assert(input_alignment > 0 && output_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
input_alignment, output_alignment,
input_shape.to_string().c_str());
NamedTensorShape orig_shape = NamedTensorShape orig_shape =
tensor_formats_to_named_tensor_shape(orig_format); tensor_formats_to_named_tensor_shape(orig_format);
size_t orig_channel = 0; size_t orig_channel = 0;
...@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( ...@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
auto make_shape = std::get<0>( auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit()); MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x}); auto padding_shp_var = make_shape({x});
auto padding = std::get<0>( auto padding = std::get<0>(PaddingEmitter{
PaddingEmitter{const_extent, input_channel_idx}.emit()); padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var}); cur = padding({cur, padding_shp_var});
} }
cur = ReformatManager::instance().get(key)({cur}); cur = ReformatManager::instance().get(key)({cur});
...@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
const VarNode* orig_var, const ReformatKey& key, const VarNode* orig_var, const ReformatKey& key,
const AlignmentDesc& extra_alignment) const { const AlignmentDesc& extra_alignment) const {
size_t in_channels = 0, out_channels = 0; size_t in_channels = 0, out_channels = 0;
size_t input_channel_idx, output_channel_idx; Dimension::Name out_channel_name = Dimension::Name::C;
Dimension::Name out_channel_name;
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) { for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C && if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
...@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
input_shape.to_string().c_str()); input_shape.to_string().c_str());
} }
} }
size_t in_channel_alignment, out_channel_alignment = 1; mgb_assert(out_channel_name == Dimension::Name::K ||
out_channel_name == Dimension::Name::N,
"invalid out channel(shp:%s)", input_shape.to_string().c_str());
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
size_t in_channel_alignment = 0, out_channel_alignment = 0;
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
for (size_t i = 0; i < output_shape.ndim; ++i) { for (size_t i = 0; i < output_shape.ndim; ++i) {
if (output_shape[i].name() == Dimension::Name::C && if (output_shape[i].name() == Dimension::Name::C &&
...@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
out_channel_alignment = output_shape[i].stride(); out_channel_alignment = output_shape[i].stride();
} }
} }
mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
in_channel_alignment, out_channel_alignment,
output_shape.to_string().c_str());
size_t aligned_in_channel = size_t aligned_in_channel =
divup(in_channels, in_channel_alignment) * in_channel_alignment; divup(in_channels, in_channel_alignment) * in_channel_alignment;
if (extra_alignment.name == out_channel_name) { if (extra_alignment.name == out_channel_name) {
...@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>( auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit()); MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x}); auto padding_shp_var = make_shape({x});
auto padding = std::get<0>( auto padding = std::get<0>(PaddingEmitter{
PaddingEmitter{const_extent, input_channel_idx}.emit()); padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var}); cur = padding({cur, padding_shp_var});
} }
if (aligned_out_channel > out_channels) { if (aligned_out_channel > out_channels) {
...@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>( auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit()); MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({cur}); auto padding_shp_var = make_shape({cur});
auto padding = std::get<0>( auto padding = std::get<0>(PaddingEmitter{
PaddingEmitter{const_extent, output_channel_idx}.emit()); padding_shape, const_extent, output_channel_idx}
.emit());
cur = padding({cur, padding_shp_var}); cur = padding({cur, padding_shp_var});
} }
cur = ReformatManager::instance().get(key)({cur}); cur = ReformatManager::instance().get(key)({cur});
...@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() { ...@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() {
static ReformatManager inst; static ReformatManager inst;
return inst; return inst;
} }
/*!
 * \brief compute the shape of \p var after converting it from \p orig_formats
 *      to \p target_formats
 *
 * For every dimension of the target format whose extent is undetermined, the
 * dimension of the original format with the same name and an undetermined
 * extent is looked up. When the original dimension compares less than the
 * target one, the extent of \p var on that axis is multiplied by their ratio;
 * otherwise it is divided by the ratio, rounding up. Dimensions of the target
 * format with a determined extent take that extent directly.
 *
 * \param var the var whose shape is to be converted
 * \param orig_formats tensor formats that \p var currently has
 * \param target_formats tensor formats to convert to
 * \return the shape in \p target_formats; a scalar shape is returned unchanged
 *
 * It is an assertion failure if the shape of \p var is neither a scalar nor of
 * the rank required by \p orig_formats, if \p orig_formats has two
 * undetermined dimensions with the same name, or if \p target_formats has an
 * undetermined dimension whose name does not occur in \p orig_formats.
 */
TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var,
                                                 TensorFormats orig_formats,
                                                 TensorFormats target_formats) {
    using Dimension = megdnn::Dimension;
    static constexpr uint32_t UNDETERMINED_EXTENT =
            Dimension::UNDETERMINED_EXTENT;
    auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats);
    auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);

    TensorShape oshp = var->shape();
    mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim,
               "orig shape of var node is not compatible with tensor "
               "formats(var:%s;shp:%s;fmt:%s)",
               var->cname(), oshp.to_string().c_str(),
               orig_shape.to_string().c_str());
    if (oshp.is_scalar())
        return oshp;

    // name -> axis of the original format that carries the undetermined
    // extent of that name
    ThinHashMap<Dimension::Name, int> name2dominant;
    for (size_t i = 0; i < orig_shape.ndim; ++i) {
        if (orig_shape[i].extent() != UNDETERMINED_EXTENT)
            continue;
        auto insert = name2dominant.insert(
                std::make_pair(orig_shape[i].name(), static_cast<int>(i)));
        mgb_assert(insert.second,
                   "more than one undetermined dimension shares the same "
                   "name in tensor formats(fmt:%s)",
                   orig_shape.to_string().c_str());
    }

    TensorShape tshp;
    tshp.ndim = target_shape.ndim;
    for (size_t i = 0; i < target_shape.ndim; ++i) {
        if (target_shape[i].extent() != UNDETERMINED_EXTENT) {
            tshp[i] = target_shape[i].extent();
            continue;
        }
        // checked lookup, so that incompatible formats are reported with
        // both named shapes instead of failing inside the container
        auto iter = name2dominant.find(target_shape[i].name());
        mgb_assert(iter != name2dominant.end(),
                   "dimension of target tensor formats is not found in orig "
                   "tensor formats(var:%s;orig:%s;target:%s)",
                   var->cname(), orig_shape.to_string().c_str(),
                   target_shape.to_string().c_str());
        const int idx = iter->second;
        bool mul = orig_shape[idx] < target_shape[i];
        size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent()
                            : (orig_shape[idx] / target_shape[i]).extent();
        if (mul)
            tshp[i] = oshp[idx] * factor;
        else
            tshp[i] = divup(oshp[idx], factor);
    }
    return tshp;
}
/*!
 * \brief compute the shape of a weight var in \p target_formats, with its
 *      output channels additionally aligned as required by \p extra_formats
 *
 * The shape is first computed by make_aligned_tensor_shape. The alignment is
 * the stride of the undetermined C dimension of \p extra_formats (1 if there
 * is none). Every undetermined K or N dimension of \p target_formats is then
 * rounded up so that the channel count it stands for is a multiple of that
 * alignment.
 */
TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var,
                                                 TensorFormats orig_formats,
                                                 TensorFormats target_formats,
                                                 TensorFormats extra_formats) {
    using Dimension = megdnn::Dimension;
    static constexpr uint32_t UNDETERMINED_EXTENT =
            Dimension::UNDETERMINED_EXTENT;

    auto aligned = make_aligned_tensor_shape(var, orig_formats, target_formats);

    // alignment demanded by the extra format; the last matching dimension wins
    auto extra_named = tensor_formats_to_named_tensor_shape(extra_formats);
    size_t oc_alignment = 1;
    for (size_t ax = 0; ax < extra_named.ndim; ++ax) {
        const auto& dim = extra_named[ax];
        if (dim.name() != Dimension::Name::C)
            continue;
        if (dim.extent() == UNDETERMINED_EXTENT)
            oc_alignment = dim.stride();
    }

    auto target_named = tensor_formats_to_named_tensor_shape(target_formats);
    for (size_t ax = 0; ax < target_named.ndim; ++ax) {
        const auto& dim = target_named[ax];
        const auto dim_name = dim.name();
        const bool is_out_channel = dim_name == Dimension::Name::K ||
                                    dim_name == Dimension::Name::N;
        if (!is_out_channel || dim.extent() != UNDETERMINED_EXTENT)
            continue;
        // round the channel count up, then convert back to units of stride
        size_t channels = aligned[ax] * dim.stride();
        aligned[ax] = divup(channels, oc_alignment) * oc_alignment /
                      dim.stride();
    }
    return aligned;
}
// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/utils.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/global_layout_transform.h"
namespace mgb {
namespace gopt {
/*!
 * \brief name of an opr format as a C string
 *
 * Only NCHW, NHWC, NCHW4, NCHW32, NCHW64 and CHWN4 are handled; any other
 * value is an assertion failure.
 */
static inline const char* opr_format_to_string(
        OprTensorFormatsConfiguration::OprFormat opr_format) {
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    switch (opr_format) {
        case OprFormat::NCHW:
            return "NCHW";
        case OprFormat::NHWC:
            return "NHWC";
        case OprFormat::NCHW4:
            return "NCHW4";
        case OprFormat::NCHW32:
            return "NCHW32";
        case OprFormat::NCHW64:
            return "NCHW64";
        case OprFormat::CHWN4:
            return "CHWN4";
        default:
            mgb_assert(false, "Invalid opr format(got:%u)",
                       static_cast<uint32_t>(opr_format));
    }
}
/*!
 * \brief named tensor shape that describes the layout of a TensorFormats value
 *
 * Every dimension is given by a name expression such as "C", "C//4" or "C%4".
 * An AssertionError is thrown for a format that is not listed below.
 */
static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape(
        TensorFormats format) {
    using Fmt = TensorFormats;
    switch (format) {
        // feature map formats
        case Fmt::NCHW:
            return {{"N"}, {"C"}, {"H"}, {"W"}};
        case Fmt::NHWC:
            return {{"N"}, {"H"}, {"W"}, {"C"}};
        case Fmt::NCHWc4:
            return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
        case Fmt::NCHWc8:
            return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
        case Fmt::NCHWc32:
            return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
        case Fmt::NCHWc64:
            return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
        case Fmt::CHWNc4:
            return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
        case Fmt::NHCWc4:
            return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
        // formats named with K/C/R/S (and G) dimensions
        case Fmt::KRSCk4:
            return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
        case Fmt::GKRSCk4:
            return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
        case Fmt::C1RSc4:
            return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
        case Fmt::KRSCk4c4:
            return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
        case Fmt::GKRSCk4c4:
            return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
        case Fmt::KCRSk4c4:
            return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
        case Fmt::GKCRSk4c4:
            return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
        case Fmt::KCRSc4k4:
            return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
        case Fmt::GKCRSc4k4:
            return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
        case Fmt::C11RSc4:
            return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
        case Fmt::KCRSc8k8:
            return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
        case Fmt::GKCRSc8k8:
            return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
        case Fmt::C11RSc8:
            return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
        case Fmt::KRSCk8:
            return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
        case Fmt::KCRSc4:
            return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
        case Fmt::GKCRSc4:
            return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
        case Fmt::KCRS:
            return {{"K"}, {"C"}, {"R"}, {"S"}};
        case Fmt::GKCRS:
            return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
        case Fmt::C11RS:
            return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
        default:
            mgb_throw(AssertionError, "invalid tensor formats(%u)",
                      static_cast<uint32_t>(format));
    }
}
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/include/megbrain/gopt/global_layout_transform.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/gopt/subgraph_extractor.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
/*!
* \brief A structure that describes the data types and tensor formats
* configuration of an operator under a given opr format
*/
struct OprTensorFormatsConfiguration {
using OprFormat = opr::ConvBias::Param::Format;
/// callable that takes an operator and may return a configuration for it
/// (the result is a Maybe, i.e. it can be empty)
using OprTensorFormatsDispatcher =
thin_function<Maybe<OprTensorFormatsConfiguration>(
const cg::OperatorNodeBase*)>;
/// type of the operator that this configuration belongs to
Typeinfo* typeinfo;
/// opr format of this configuration
OprFormat opr_format;
/// data type enums of the inputs
SmallVector<DTypeEnum> input_dtypes;
/// data type enums of the outputs
SmallVector<DTypeEnum> output_dtypes;
/// tensor formats of the inputs
SmallVector<TensorFormats> input_tensor_formats;
/// tensor types of the inputs (e.g. TensorType::FEATURE)
SmallVector<TensorType> input_tensor_types;
/// tensor formats of the outputs
SmallVector<TensorFormats> output_tensor_formats;
/// NOTE(review): presumably looks up the dispatcher registered for the given
/// operator type and opr format -- the definition is not visible here, so
/// confirm what is returned when no dispatcher is registered
static OprTensorFormatsDispatcher* find_dispatcher_by_type_format(
Typeinfo* type, OprFormat opr_format);
};
/*!
 * \brief Description of one global layout transform problem.
 *
 * A problem bundles the graph partition to optimize, the candidate tensor
 * formats, the candidate opr format configurations and the base formats the
 * network was originally built in.
 *
 * \warning The graph partition, the available tensor formats and the opr
 * configurations are held by reference, not copied: the objects passed to the
 * constructor must outlive the Problem.
 */
class Problem {
public:
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using OprTensorFormatsDispatcher =
            OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
    /// operator type -> (opr format -> dispatcher) lookup table
    using OprConfigTrait =
            ThinHashMap<Typeinfo*,
                        ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>;

    struct Attribute {
        /// The base opr format indicates that the network to be optimized is
        /// constructed in the base opr format, i.e. all the format aware
        /// operators (conv, conv_bias, deconv, pooling etc.) are built in
        /// this format.
        OprFormat base_opr_format;
        /// The base tensor format indicates that all the format agnostic
        /// operators (like elemwise, elemwise multi type, typecvt etc.) are
        /// built in the base tensor format.
        TensorFormats base_tensor_formats;
    };

    /*!
     * \param graph_partition the graph partition to optimize (stored by
     *      reference)
     * \param available_tensor_formats candidate tensor formats (stored by
     *      reference)
     * \param opr_config candidate opr format configurations (stored by
     *      reference)
     * \param attribute base formats of the network (copied)
     */
    Problem(const GraphPartition& graph_partition,
            const SmallVector<TensorFormats>& available_tensor_formats,
            const OprConfigTrait& opr_config, const Attribute& attribute)
            : m_graph_partition{graph_partition},
              m_available_tensor_formats{available_tensor_formats},
              m_opr_configs{opr_config},
              m_attribute{attribute} {}
    ~Problem() noexcept = default;

    const GraphPartition& graph_partition() const { return m_graph_partition; }
    const OprConfigTrait& opr_configs() const { return m_opr_configs; }
    const SmallVector<TensorFormats>& available_tensor_formats() const {
        return m_available_tensor_formats;
    }
    TensorFormats base_format() const {
        return m_attribute.base_tensor_formats;
    }

    /*!
     * \brief Configuration of \p opr in the base formats of the network.
     *
     * The dispatcher found for the operator type and the base opr format is
     * asked first. When it yields no configuration, a default one is
     * synthesized: every input is tagged as FEATURE in the base tensor
     * format, and only output(0) is described.
     */
    OprTensorFormatsConfiguration base_config(
            const cg::OperatorNodeBase* opr) const {
        auto dispatcher =
                OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
                        opr->dyn_typeinfo(), m_attribute.base_opr_format);
        auto dispatched = (*dispatcher)(opr);
        if (dispatched.valid()) {
            return dispatched.val();
        }

        // No dedicated configuration: describe the operator generically.
        OprTensorFormatsConfiguration fallback;
        fallback.typeinfo = opr->dyn_typeinfo();
        fallback.opr_format = m_attribute.base_opr_format;
        for (auto&& in_var : opr->input()) {
            fallback.input_dtypes.emplace_back(in_var->dtype().enumv());
            fallback.input_tensor_formats.emplace_back(
                    m_attribute.base_tensor_formats);
            fallback.input_tensor_types.emplace_back(TensorType::FEATURE);
        }
        fallback.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        fallback.output_tensor_formats.emplace_back(
                m_attribute.base_tensor_formats);
        return fallback;
    }

private:
    /// the graph partition
    const GraphPartition& m_graph_partition;
    /// the available tensor formats, used for format agnostic operators
    /// (like elemwise, elemwise multi type, typecvt, etc.)
    const SmallVector<TensorFormats>& m_available_tensor_formats;
    /// the available opr format configurations, used for format aware
    /// operators (like conv, deconv, conv_bias, etc.)
    const OprConfigTrait& m_opr_configs;
    /// the extra attributes to describe the problem
    Attribute m_attribute;
};
/*!
 * \brief Interface of a profiler that collects all the performance data
 * needed to describe the global layout transform problem.
 */
class ProfilerBase {
public:
    using OprFormat = Problem::OprFormat;

    /// Profiling record of one operator node.
    struct OperatorNodeRecord {
        /// pointer to operator node
        const cg::OperatorNodeBase* opr;
        /// costs of operator node, i.e. the elapsed device time of the
        /// operator node on different opr format (layout configuration).
        ThinHashMap<OprFormat, float> costs;
        std::string to_string() const;
    };

    /// Profiling record of one var node.
    struct VarNodeRecord {
        /// Hash functor for a (source format, target format) pair: both
        /// enum values are hashed as uint32_t and the hashes are combined.
        struct KeyHash {
            size_t operator()(
                    const std::pair<TensorFormats, TensorFormats>& val) const {
                const std::hash<uint32_t> hasher{};
                const size_t src_hash =
                        hasher(static_cast<uint32_t>(val.first));
                const size_t dst_hash =
                        hasher(static_cast<uint32_t>(val.second));
                return mgb::hash_pair_combine(src_hash, dst_hash);
            }
        };
        /// pointer to var node
        const VarNode* var;
        /// costs of var node, i.e. the elapsed device time of the layout
        /// transform. Key of the hashmap indicates the source tensor format
        /// and the target tensor format.
        std::unordered_map<std::pair<TensorFormats, TensorFormats>, float,
                           KeyHash>
                costs;
        std::string to_string() const;
    };

    /*!
     * \note the profiler assumes all the input and output var node are stored
     * in contiguous layout in memory
     */
    struct ProfilingResult {
        /// A hashmap, that maps the operator node to the costs (device elapsed
        /// time) of different layouts configuration
        ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record;
        /// A hashmap, that maps the var node to the costs of layout transform
        ThinHashMap<VarNode*, VarNodeRecord> var_record;
    };

    ProfilerBase() = default;
    virtual ~ProfilerBase() = default;

    /// Profiles \p problem and returns the collected operator and var node
    /// records.
    virtual ProfilingResult profile(const Problem& problem) const = 0;

    /// Factory returning an owning pointer to a profiler implementation
    /// (defined elsewhere).
    static std::unique_ptr<ProfilerBase> make_profiler();
};
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -80,11 +80,13 @@ private: ...@@ -80,11 +80,13 @@ private:
class PaddingEmitter final : public Emitter { class PaddingEmitter final : public Emitter {
public: public:
PaddingEmitter(size_t const_extent, size_t axis) PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent,
: m_const_extent{const_extent}, m_axis{axis} {} size_t axis)
: m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {}
EmitResult emit() const override; EmitResult emit() const override;
private: private:
megdnn::NamedTensorShape m_padshp;
size_t m_const_extent, m_axis; size_t m_const_extent, m_axis;
}; };
......
...@@ -17,6 +17,11 @@ ...@@ -17,6 +17,11 @@
namespace mgb { namespace mgb {
namespace gopt { namespace gopt {
/// Role tag attached to each input tensor of an operator configuration.
enum class TensorType : uint32_t {
// NOTE(review): presumably an activation / feature-map input — confirm
FEATURE = 0,
// NOTE(review): presumably a filter / weight input — confirm
WEIGHT = 1,
};
enum class TensorFormats : uint32_t { enum class TensorFormats : uint32_t {
// input tensor formats // input tensor formats
NCHW = 0, ///< [N, C, H, W] NCHW = 0, ///< [N, C, H, W]
...@@ -116,6 +121,15 @@ public: ...@@ -116,6 +121,15 @@ public:
private: private:
ReformatCache m_cache; ReformatCache m_cache;
}; };
// NOTE(review): implementation is not visible here. Presumably returns the
// shape `var` would have when moved from `orig_formats` to `target_formats`,
// with extents padded to the target format's alignment — confirm.
TensorShape make_aligned_tensor_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats);
// NOTE(review): implementation is not visible here. Presumably the weight
// tensor counterpart of make_aligned_tensor_shape; the role of
// `extra_formats` cannot be determined from this declaration — confirm.
TensorShape make_aligned_weight_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats,
TensorFormats extra_formats);
} // namespace gopt } // namespace gopt
} // namespace mgb } // namespace mgb
......
...@@ -20,6 +20,7 @@ class GraphPartition { ...@@ -20,6 +20,7 @@ class GraphPartition {
public: public:
using VarNodeSet = ThinHashSet<VarNode*>; using VarNodeSet = ThinHashSet<VarNode*>;
using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>;
class InputPlaceholder; class InputPlaceholder;
GraphPartition() = default; GraphPartition() = default;
...@@ -45,13 +46,13 @@ private: ...@@ -45,13 +46,13 @@ private:
class SubGraphExtractor { class SubGraphExtractor {
public: public:
using OprList = ThinHashSet<Typeinfo*>; using OprList = ThinHashSet<Typeinfo*>;
SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {}; SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {};
std::vector<GraphPartition> extract( std::vector<GraphPartition> extract(
const SymbolVarArray& endpoint_vars) const; const SymbolVarArray& endpoint_vars) const;
private: private:
class Impl; class Impl;
OprList m_opr_list; const OprList& m_opr_list;
}; };
} // namespace gopt } // namespace gopt
......
/**
* \file src/gopt/test/profiler.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"
using namespace mgb;
using namespace gopt;
using namespace serialization;
namespace {
class LayoutTransformContext : public NonCopyableObj {
public:
using OprList = SubGraphExtractor::OprList;
using OprFormat = Problem::OprFormat;
using OprConfigTrait = Problem::OprConfigTrait;
LayoutTransformContext() = delete;
LayoutTransformContext(OprList opr_list,
SmallVector<TensorFormats> available_tensor_formats,
OprConfigTrait opr_configs)
: m_opr_list{std::move(opr_list)},
m_available_tensor_formats{std::move(available_tensor_formats)},
m_opr_configs{std::move(opr_configs)} {}
const OprList& opr_list() const { return m_opr_list; }
const SmallVector<TensorFormats>& available_tensor_formats() const {
return m_available_tensor_formats;
}
const OprConfigTrait& opr_configs() const { return m_opr_configs; }
static std::unique_ptr<LayoutTransformContext> make() {
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ConvolutionForward::typeinfo(),
opr::ConvolutionBackwardData::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
opr::PoolingForward::typeinfo(),
opr::WarpPerspectiveForward::typeinfo(),
};
OprConfigTrait opr_configs;
{
auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvBias::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionBackwardData::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionBackwardData::typeinfo(), \
OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::PoolingForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::WarpPerspectiveForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
cb(NHWC);
cb(NCHW4);
cb(NCHW64);
#undef cb
}
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NHWC, TensorFormats::NCHWc4,
TensorFormats::NCHWc32, TensorFormats::NCHWc64};
return std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
std::move(opr_configs));
}
private:
OprList m_opr_list;
SmallVector<TensorFormats> m_available_tensor_formats;
OprConfigTrait m_opr_configs;
};
}; // namespace
#if MGB_CUDA
#if CUDA_VERSION >= 10020
// Profiles a chain ConvBias(4-bit) -> TypeCvt -> ConvBias(8-bit) and checks
// which operators / vars end up in the profiling result.
TEST(TestProfiler, Conv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // Graph input: Host2DeviceCopy of generated data, cast to `dt`.
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };
    // Constant: SharedDeviceTensor of generated data, cast to `dt`.
    auto make_const = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                              .rename(name);
        return opr::TypeCvt::make(shared, dt);
    };

    auto data = make_input(
            "x", {64, 48, 14, 14},
            dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto w1 = make_const("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b1 = make_const("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));

    opr::ConvBias::Param conv_param;
    conv_param.format = opr::ConvBias::Param::Format::NCHW;
    conv_param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    conv_param.stride_h = conv_param.stride_w = 1;
    conv_param.pad_h = conv_param.pad_w = 1;

    auto c1 = opr::ConvBias::make(
            data, w1, b1, conv_param, {},
            OperatorNodeConfig(dtype::Quantized4Asymm(
                    12.345f, static_cast<uint8_t>(5))));
    auto c1_q8 = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
    auto w2 = make_const("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = make_const("b2", {1, 48, 1, 1},
                         dtype::QuantizedS32(12.345f * 2.5f));
    auto c2 = opr::ConvBias::make(
            c1_q8, w2, b2, conv_param, {},
            OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    auto profile_strategy =
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, profile_strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);

    Problem::Attribute attribute{Problem::OprFormat::NCHW,
                                 TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_records = result.opr_record;
    const auto& var_records = result.var_record;

    // Both convolutions and the TypeCvt in between must be profiled ...
    EXPECT_GT(opr_records.count(c1.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(c2.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(c1_q8.node()->owner_opr()), 0u);
    // ... while weights and biases must not get a var record.
    EXPECT_EQ(var_records.count(w1.node()), 0u);
    EXPECT_EQ(var_records.count(b1.node()), 0u);
    EXPECT_EQ(var_records.count(w2.node()), 0u);
    EXPECT_EQ(var_records.count(b2.node()), 0u);
}
#endif
// Profiles two chained ConvolutionBackwardData operators on QuantizedS8 data.
TEST(TestProfiler, Deconv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // Graph input: Host2DeviceCopy of generated data, cast to `dt`.
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };
    // Constant: SharedDeviceTensor of generated data, cast to `dt`.
    auto make_const = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                              .rename(name);
        return opr::TypeCvt::make(shared, dt);
    };

    auto data = make_input("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
    auto w1 = make_const("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));

    opr::ConvolutionBackwardData::Param deconv_param;
    deconv_param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
    deconv_param.stride_h = deconv_param.stride_w = 2;
    deconv_param.pad_h = deconv_param.pad_w = 0;

    auto c1 = opr::ConvolutionBackwardData::make(
            w1, data, deconv_param, {},
            OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    auto w2 = make_const("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    auto c2 = opr::ConvolutionBackwardData::make(
            w2, c1, deconv_param, {},
            OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    auto profile_strategy =
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, profile_strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);

    Problem::Attribute attribute{Problem::OprFormat::NCHW,
                                 TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_records = result.opr_record;
    const auto& var_records = result.var_record;

    // Both deconvolutions and the input TypeCvt must be profiled ...
    EXPECT_GT(opr_records.count(c1.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(c2.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(data.node()->owner_opr()), 0u);
    // ... while the weights must not get a var record.
    EXPECT_EQ(var_records.count(w1.node()), 0u);
    EXPECT_EQ(var_records.count(w2.node()), 0u);
}
// Profiles a single WarpPerspectiveForward on 4-bit asymmetric quantized data.
TEST(TestProfiler, Warp) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    constexpr size_t INP_H = 10, INP_W = 10, N = 16;

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // Graph input: Host2DeviceCopy of generated data, cast to `dt`.
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };

    auto data = make_input(
            "x", {N, 48, INP_H, INP_W},
            dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));

    float value1 = M_PI, value2 = 0.6;
    // Fills `mat` with N identical 3x3 matrices (9 floats each, row major).
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i, ptr += 9) {
            const float rot = value1, scale = value2, sheer = value1;
            const float dy = value2, dx = value2;
            const float ky = value2, kx = value2, kb = value2;
            const float cos_scaled = cos(rot) * scale;
            const float sin_scaled = sin(rot) * scale;
            ptr[0] = cos_scaled;
            ptr[1] = -sin_scaled;
            ptr[2] = dx;
            ptr[3] = sin_scaled * sheer;
            ptr[4] = cos_scaled * sheer;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            data.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");

    TensorShape out_shp{20, 20};
    auto warp = opr::WarpPerspectiveForward::make(data, mat, out_shp);

    auto profile_strategy =
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({warp}, profile_strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({warp});
    ASSERT_EQ(partitions.size(), 1u);

    Problem::Attribute attribute{Problem::OprFormat::NCHW,
                                 TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_records = result.opr_record;
    const auto& var_records = result.var_record;

    auto* warp_opr = warp.node()->owner_opr();
    EXPECT_GT(opr_records.count(warp_opr), 0u);
    // Only the data input (input 0) gets a var record; the matrix and
    // input(2) must not.
    EXPECT_EQ(var_records.count(mat.node()), 0u);
    EXPECT_EQ(var_records.count(warp_opr->input(2)), 0u);
    EXPECT_GT(var_records.count(warp_opr->input(0)), 0u);
}
// Profiles a chain Pooling(4-bit) -> TypeCvt -> Pooling(8-bit).
TEST(TestProfiler, Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // Graph input: Host2DeviceCopy of generated data, cast to `dt`.
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };

    auto data = make_input(
            "x", {64, 64, 55, 55},
            dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));

    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    auto p1 = opr::Pooling::make(data, pool_param);
    auto p1_q8 = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
    auto p2 = opr::Pooling::make(p1_q8, pool_param);

    auto profile_strategy =
            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({p2}, profile_strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({p2});
    ASSERT_EQ(partitions.size(), 1u);

    Problem::Attribute attribute{Problem::OprFormat::NCHW,
                                 TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_records = result.opr_record;

    // Both poolings and the TypeCvt in between must be profiled.
    EXPECT_GT(opr_records.count(p1.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(p2.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(p1_q8.node()->owner_opr()), 0u);
}
// Profiles a graph of Elemwise / ElemwiseMultiType / TypeCvt operators that
// mixes float32, QuantizedS8 and Quantized4Asymm tensors.
TEST(TestProfiler, Elemwise) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // Graph input: Host2DeviceCopy of generated data, cast to `dt`.
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };

    // float branch: FUSE_ADD_RELU(a, b) -> 4-bit
    auto fa = make_input("a", {64, 48, 14, 14}, dtype::Float32());
    auto fb = make_input("b", {1, 48, 1, 1}, dtype::Float32());
    auto fsum = opr::Elemwise::make(
            {fa, fb}, {opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
    auto q4c = opr::TypeCvt::make(
            fsum, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));

    // 8-bit branch: QFUSE_ADD_RELU(q8a, q8b) -> 4-bit
    auto q8a = make_input("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
    auto q8b = make_input("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
    auto q8d = opr::ElemwiseMultiType::make(
            {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
            OperatorNodeConfig(dtype::QuantizedS8(12.f)));
    auto q4d = opr::TypeCvt::make(
            q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));

    // join of the two branches
    auto q4e = opr::ElemwiseMultiType::make(
            {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig(
                    dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({q4e});
    ASSERT_EQ(partitions.size(), 1u);

    Problem::Attribute attribute{Problem::OprFormat::NCHW,
                                 TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_records = result.opr_record;
    const auto& var_records = result.var_record;

    // Every elemwise operator must be profiled ...
    EXPECT_GT(opr_records.count(fsum.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(q8d.node()->owner_opr()), 0u);
    EXPECT_GT(opr_records.count(q4e.node()->owner_opr()), 0u);
    // ... and every graph input must get a var record.
    EXPECT_GT(var_records.count(fa.node()), 0u);
    EXPECT_GT(var_records.count(fb.node()), 0u);
    EXPECT_GT(var_records.count(q8a.node()), 0u);
    EXPECT_GT(var_records.count(q8b.node()), 0u);
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) { ...@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
for (size_t i = 0; i < RUNS; ++i) for (size_t i = 0; i < RUNS; ++i)
func->execute(); func->execute();
double time_profiler = profiler->duration() * 1e6; double time_profiler = profiler->duration() * 1e6;
printf("%f, %f\n", time_profiler, time_cuda_evt);
ASSERT_EQ(time_cuda_evt, time_profiler);
MGB_CUDA_CHECK(cudaEventDestroy(evt0)); MGB_CUDA_CHECK(cudaEventDestroy(evt0));
MGB_CUDA_CHECK(cudaEventDestroy(evt1)); MGB_CUDA_CHECK(cudaEventDestroy(evt1));
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册