提交 c14e5719 编写于 作者: M Megvii Engine Team

feat(mgb/gopt): add profile impl for global layout transform pass

GitOrigin-RevId: 8ef62baf792c97c7a226dd791af167ab2e8707b4
上级 9c0a17d0
...@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, ...@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0; TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout); check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param; relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
...@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, ...@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0; TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout); check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param; relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
/**
* \file src/gopt/impl/opr_format_modifier.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_format_modifier.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/serialization/sereg.h"
#include "midout.h"
// MIDOUT_DECL introduces the tag megbrain_opr_format_modifier that the two
// helper macros below refer to.
MIDOUT_DECL(megbrain_opr_format_modifier)
// MIDOUT_B(...) expands to MIDOUT_BEGIN for that tag, forwarding the caller's
// arguments, and opens a brace; MIDOUT_E closes the brace again and expands
// MIDOUT_END(). They must therefore always be used as a matching pair.
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace opr;
namespace {
/*!
 * \brief maker that forwards exactly two input vars to Opr::make
 *
 * make() yields nullptr when \p inputs does not contain two vars;
 * ConvMakerImpl interprets a null result as "try the next maker".
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller2 {
    template <typename Opr>
    static VarNode* make(
            const cg::VarNodeArray& inputs,
            const typename MegDNNConv::Param& param,
            const megdnn::param::ExecutionPolicy& execution_policy,
            const OperatorNodeConfig& config) {
        if (inputs.size() != 2) {
            return nullptr;
        }
        auto out = Opr::make(inputs[0], inputs[1], param, execution_policy,
                             config);
        return out.node();
    }
};
/*!
 * \brief maker that forwards exactly three input vars to Opr::make
 *
 * make() yields nullptr when \p inputs does not contain three vars;
 * ConvMakerImpl interprets a null result as "try the next maker".
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller3 {
    template <typename Opr>
    static VarNode* make(
            const cg::VarNodeArray& inputs,
            const typename MegDNNConv::Param& param,
            const megdnn::param::ExecutionPolicy& execution_policy,
            const OperatorNodeConfig& config) {
        if (inputs.size() != 3) {
            return nullptr;
        }
        auto out = Opr::make(inputs[0], inputs[1], inputs[2], param,
                             execution_policy, config);
        return out.node();
    }
};
/*!
 * \brief maker that forwards exactly four input vars to Opr::make
 *
 * make() yields nullptr when \p inputs does not contain four vars;
 * ConvMakerImpl interprets a null result as "try the next maker".
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller4 {
    template <typename Opr>
    static VarNode* make(
            const cg::VarNodeArray& inputs,
            const typename MegDNNConv::Param& param,
            const megdnn::param::ExecutionPolicy& execution_policy,
            const OperatorNodeConfig& config) {
        if (inputs.size() != 4) {
            return nullptr;
        }
        auto out = Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param,
                             execution_policy, config);
        return out.node();
    }
};
/*!
 * \brief maker that forwards exactly five input vars to Opr::make
 *
 * make() yields nullptr when \p inputs does not contain five vars.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller5 {
    template <typename Opr>
    static VarNode* make(
            const cg::VarNodeArray& inputs,
            const typename MegDNNConv::Param& param,
            const megdnn::param::ExecutionPolicy& execution_policy,
            const OperatorNodeConfig& config) {
        if (inputs.size() != 5) {
            return nullptr;
        }
        auto out = Opr::make(inputs[0], inputs[1], inputs[2], inputs[3],
                             inputs[4], param, execution_policy, config);
        return out.node();
    }
};
/*!
 * \brief placeholder maker that never creates an operator
 *
 * It is the default for the optional maker slots of ConvMakerImpl; make()
 * ignores all of its arguments and always reports failure with nullptr.
 */
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCallerEmpty {
    template <typename Opr>
    static VarNode* make(
            const cg::VarNodeArray& /* inputs */,
            const typename MegDNNConv::Param& /* param */,
            const megdnn::param::ExecutionPolicy& /* execution_policy */,
            const OperatorNodeConfig& /* config */) {
        return nullptr;
    }
};
/*!
 * \brief creates an operator of type Opr by trying up to three makers in turn
 *
 * Maker0, Maker1 and Maker2 are tried in that order; the first one that
 * returns a non-null var wins. Maker1 and Maker2 default to
 * MakeConvCallerEmpty, which always returns nullptr.
 *
 * \param inputs input vars of the operator to be created
 * \param param operator param forwarded to the makers
 * \param execution_policy execution policy forwarded to the makers
 * \param config operator node config forwarded to the makers
 * \return the var produced by the first maker that accepted \p inputs; an
 *      assertion fails when no maker accepts them
 */
template <class Opr, class Maker0, class MegDNNConv,
          class Maker1 = MakeConvCallerEmpty<MegDNNConv>,
          class Maker2 = MakeConvCallerEmpty<MegDNNConv>,
          typename ConvParam = megdnn::param::Convolution>
struct ConvMakerImpl {
    static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param,
                         const megdnn::param::ExecutionPolicy& execution_policy,
                         const OperatorNodeConfig& config) {
        VarNode* ret = Maker0::template make<Opr>(inputs, param,
                                                  execution_policy, config);
        if (!ret) {
            ret = Maker1::template make<Opr>(inputs, param, execution_policy,
                                             config);
        }
        if (!ret) {
            ret = Maker2::template make<Opr>(inputs, param, execution_policy,
                                             config);
        }
        // report the offending input count instead of a bare assertion so
        // that a mismatch between operator and maker arity is easy to diagnose
        mgb_assert(ret,
                   "cannot make operator: unsupported number of inputs"
                   "(got:%zu)",
                   inputs.size());
        return ret;
    }
};
//! Maps an operator type to the ConvMakerImpl instantiation that lists the
//! input arities accepted for it; only the specialisations below exist.
template <typename Opr>
struct ConvMaker;

//! Convolution: two inputs only.
template <>
struct ConvMaker<opr::Convolution>
        : ConvMakerImpl<opr::Convolution, MakeConvCaller2<megdnn::Convolution>,
                        megdnn::Convolution> {};

//! ConvolutionBackwardData: two or three inputs.
template <>
struct ConvMaker<opr::ConvolutionBackwardData>
        : ConvMakerImpl<opr::ConvolutionBackwardData,
                        MakeConvCaller2<megdnn::Convolution>,
                        megdnn::Convolution,
                        MakeConvCaller3<megdnn::Convolution>> {};

//! ConvBiasForward: two, three or four inputs, using the ConvBias param.
template <>
struct ConvMaker<opr::ConvBiasForward>
        : ConvMakerImpl<opr::ConvBiasForward,
                        MakeConvCaller2<megdnn::ConvBiasForward>,
                        megdnn::ConvBiasForward,
                        MakeConvCaller3<megdnn::ConvBiasForward>,
                        MakeConvCaller4<megdnn::ConvBiasForward>,
                        megdnn::param::ConvBias> {};

//! BatchConvBiasForward: two, three or four inputs, using the BatchConvBias
//! param.
template <>
struct ConvMaker<opr::BatchConvBiasForward>
        : ConvMakerImpl<opr::BatchConvBiasForward,
                        MakeConvCaller2<megdnn::BatchConvBiasForward>,
                        megdnn::BatchConvBiasForward,
                        MakeConvCaller3<megdnn::BatchConvBiasForward>,
                        MakeConvCaller4<megdnn::BatchConvBiasForward>,
                        megdnn::param::BatchConvBias> {};
// NOTE: everything up to the matching #endif is compiled out by `#if 0`.
// The disabled code defines MultiAlgoOprTrait, a per-operator trait whose
// has_available_algo() forwards to ::megdnn::has_available_algo.
#if 0
#include "../../opr/impl/internal/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;
// APPLY(statement, ...) concatenates the given tuples and evaluates
// `statement` with the tuple elements available as the pack `args`.
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
// INST(_Opr) specialises MultiAlgoOprTrait for _Opr. has_available_algo()
// builds one TensorLayout (shape, dtype, format) per var in `i`: every var
// but the last fills the slot of its own index, the last var fills slot
// arity - 1. The layouts are then passed, together with the megdnn operator
// of opr_, to ::megdnn::has_available_algo.
#define INST(_Opr) \
template <> \
struct MultiAlgoOprTrait<_Opr> { \
static constexpr bool has_algo = true; \
using MegDNNOpr = megdnn::_Opr; \
static constexpr int arity = OprArityTrait<MegDNNOpr>::arity; \
using FixedTensorLayouts = std::array<TensorLayout, arity>; \
static bool has_available_algo(const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)), \
midout_iv(MGB_HASH_STR("has_available_algo"))) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto&& megdnn_opr = \
reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr()); \
FixedTensorLayouts array_layouts; \
size_t in = i.size() - 1; \
for (size_t idx = 0; idx < in; idx++) { \
const auto& v = i[idx]; \
array_layouts[idx] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
} \
const auto& v = i[in]; \
array_layouts[arity - 1] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \
array_layouts); \
MIDOUT_E \
} \
};
INST(Convolution)
INST(ConvBiasForward)
INST(ConvolutionBackwardData)
INST(PoolingForward)
#undef APPLY
#undef INST
#endif
} // namespace
namespace mgb {
namespace gopt {
namespace intl {
//! Re-creates a format-aware operator with a different operator format.
//! Only the explicit specialisations that follow are defined.
template <typename Opr>
struct OprFormatModifier;
// INST(_Opr) specialises OprFormatModifier for a convolution-like operator.
// make() takes a copy of the param of opr_, overwrites its `format` field
// with opr_format and builds a new operator on the inputs `i` through
// ConvMaker<_Opr>, reusing the execution policy and config of opr_.
#define INST(_Opr) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return ConvMaker<_Opr>::make(i, param, opr.execution_policy(), \
opr.config()); \
MIDOUT_E \
} \
};
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(BatchConvBiasForward);
#undef INST
/*!
 * \brief OprFormatModifier for WarpPerspective
 *
 * make() copies the param of \p opr_, replaces its format by \p opr_format and
 * creates a new WarpPerspective on the vars in \p i with the config of
 * \p opr_. \p i must hold either three or four vars; any other size trips the
 * assertion.
 */
template <>
struct OprFormatModifier<WarpPerspective> {
    using Opr = opr::WarpPerspective;
    using OprFormat = typename Opr::Param::Format;
    static VarNode* make(OprFormat opr_format, const VarNodeArray& i,
                         const cg::OperatorNodeBase* opr_) {
        MIDOUT_B(Opr)
        auto&& warp = opr_->cast_final_safe<Opr>();
        auto new_param = warp.param();
        new_param.format = opr_format;
        if (i.size() != 3) {
            // the only other supported form takes four inputs
            mgb_assert(i.size() == 4);
            auto out = Opr::make(i[0], i[1], i[2], i[3], new_param,
                                 warp.config());
            return out.node();
        }
        auto out = Opr::make(i[0], i[1], i[2], new_param, warp.config());
        return out.node();
        MIDOUT_E
    }
};
// INST(_Opr, _arity) specialises OprFormatModifier for an operator that is
// created through serialization::OprMaker<_Opr, _arity>. make() takes a copy
// of the param of opr_, overwrites its `format` field with opr_format, builds
// the operator on the inputs `i` in the owner graph of i[0] with the config of
// opr_, and returns output(0) of the new operator.
#define INST(_Opr, _arity) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return serialization::OprMaker<_Opr, _arity>::make( \
param, i, *i[0]->owner_graph(), opr.config()) \
->output(0); \
MIDOUT_E \
} \
};
INST(PoolingForward, 1);
INST(Resize, 2);
#undef INST
/*!
 * \brief re-create a format-aware operator with another operator format
 *
 * The dynamic type of \p opr is compared against every operator type listed by
 * FOREACH_FORMAT_AWARE_OPR (see opr_format_modifier.h); the matching
 * OprFormatModifier specialisation then builds the new operator.
 *
 * \param opr_format operator format to be written into the new param
 * \param i input vars of the operator to be created
 * \param opr operator whose param and config are reused
 * \return the var returned by the matching OprFormatModifier::make
 * \throw InternalError if \p opr is none of the listed operator types
 */
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
// cb(_Opr) emits one `if (...) { return ...; } else` link; chaining all links
// leaves a dangling `else` that is completed by the throwing block below.
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
} else
FOREACH_FORMAT_AWARE_OPR(cb) {
mgb_throw(InternalError, "invalid format aware operator(got:%s)",
opr->dyn_typeinfo()->name);
}
#undef cb
}
// NOTE: compiled out by `#if 0`. The disabled function dispatches on the
// dynamic type of `opr`, appends opr->output(0) to a copy of the input vars and
// forwards the resulting array to MultiAlgoOprTrait<_Opr>::has_available_algo;
// operator types outside the four listed ones raise InternalError.
#if 0
bool has_available_algo(const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \
VarNodeArray _ = i; \
_.emplace_back(opr->output(0)); \
return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
} else
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
cb(PoolingForward) {
mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
opr->dyn_typeinfo()->name);
}
}
#endif
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/opr_format_modifier.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/graph.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
namespace intl {
// X-macro: applies `cb` once to each of the operator type names Convolution,
// ConvBiasForward, ConvolutionBackwardData, PoolingForward, WarpPerspective and
// Resize. The expansions are emitted back to back with no separator.
#define FOREACH_FORMAT_AWARE_OPR(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
cb(PoolingForward) cb(WarpPerspective) cb(Resize)
// The has_available_algo declaration is currently compiled out.
#if 0
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
#endif
/*!
 * \brief re-create the format-aware operator \p opr on the inputs \p i with
 * the operator format \p opr_format
 *
 * \param opr_format operator format for the new operator
 * \param i input vars of the new operator
 * \param opr the operator to be re-created
 * \return a var of the newly created operator
 */
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr);
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/opr_tensor_formats_config.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./utils.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "midout.h"
// MIDOUT_DECL introduces the tag megbrain_opr_tensor_formats_config that the
// two helper macros below refer to.
MIDOUT_DECL(megbrain_opr_tensor_formats_config)
// MIDOUT_B(...) expands to MIDOUT_BEGIN for that tag, forwarding the caller's
// arguments, and opens a brace; MIDOUT_E closes the brace again and expands
// MIDOUT_END(). They must therefore always be used as a matching pair.
#define MIDOUT_B(...) \
MIDOUT_BEGIN(megbrain_opr_tensor_formats_config, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace cg;
using namespace gopt;
using OprFormat = opr::ConvBias::Param::Format;
namespace {
//! Compile-time description of where a convolution-like operator keeps its
//! weight and bias inputs; only the specialisations generated below exist.
template <typename Opr>
struct ConvParamTrait;

// INST(_conv, _weight_idx, _bias_idx, _has_bias) specialises ConvParamTrait
// for opr::_conv, recording the given input indices and the bias flag.
#define INST(_conv, _weight_idx, _bias_idx, _has_bias)   \
    template <>                                          \
    struct ConvParamTrait<opr::_conv> {                  \
        static constexpr int weight_idx = _weight_idx;   \
        static constexpr int bias_idx = _bias_idx;       \
        static constexpr bool has_bias = _has_bias;      \
    }
INST(ConvBias, 1, 2, true);
INST(ConvolutionForward, 1, 0, false);
INST(ConvolutionBackwardData, 0, 0, false);
// keep the helper macro from leaking into the rest of the translation unit
#undef INST
/*!
 * \brief tell whether \p opr is a non-dense NCHW convolution whose weight
 * shape has extent 1 on both axis 1 and axis 2
 *
 * The two axes are called ocpg and icpg in the code (presumably output and
 * input channels per group -- confirm against the weight layout definition).
 * Dense convolutions and formats other than NCHW always yield false.
 *
 * \tparam Opr operator type \p opr is cast to
 * \tparam weight_idx index of the weight input, taken from ConvParamTrait
 */
template <typename Opr, size_t weight_idx = ConvParamTrait<Opr>::weight_idx>
static bool is_channel_wise_conv(const OperatorNodeBase* opr) {
    MGB_MARK_USED_VAR(ConvParamTrait<Opr>::has_bias);
    MGB_MARK_USED_VAR(ConvParamTrait<Opr>::bias_idx);
    auto&& conv_opr = opr->cast_final_safe<Opr>();
    const auto conv_format = conv_opr.param().format;
    const auto filter_shape = opr->input(weight_idx)->shape();
    if (conv_opr.param().sparse == Opr::Param::Sparse::DENSE) {
        return false;
    }
    if (conv_format != Opr::Param::Format::NCHW) {
        return false;
    }
    const size_t ocpg = filter_shape[1];
    const size_t icpg = filter_shape[2];
    return ocpg == 1 && icpg == 1;
}
//! Builds the tensor formats configuration of an operator that is described
//! by its first input and first output only; specialised per operator format.
template <OprFormat opr_format_>
struct OprSingleInOutTensorFormatsDispatcherImpl;

/*!
 * \brief NCHW variant: never rejects an operator; input 0 and output 0 keep
 * their dtypes and both use TensorFormats::NCHW
 */
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto src_dtype = opr->input(0)->dtype().enumv();
        const auto dst_dtype = opr->output(0)->dtype().enumv();

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW;
        cfg.input_dtypes = {src_dtype};
        cfg.input_tensor_types = {TensorType::FEATURE};
        cfg.output_dtypes = {dst_dtype};
        cfg.input_tensor_formats = {TensorFormats::NCHW};
        cfg.output_tensor_formats = {TensorFormats::NCHW};
        return cfg;
    }
};
/*!
 * \brief NCHW4 variant: requires input 0 and output 0 to be QuantizedS8 and
 * maps both to TensorFormats::NCHWc4; returns None otherwise
 */
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto src_dtype = opr->input(0)->dtype().enumv();
        const auto dst_dtype = opr->output(0)->dtype().enumv();
        if (src_dtype != DTypeEnum::QuantizedS8 ||
            dst_dtype != DTypeEnum::QuantizedS8) {
            return None;
        }

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW4;
        cfg.input_dtypes = {src_dtype};
        cfg.input_tensor_types = {TensorType::FEATURE};
        cfg.output_dtypes = {dst_dtype};
        cfg.input_tensor_formats = {TensorFormats::NCHWc4};
        cfg.output_tensor_formats = {TensorFormats::NCHWc4};
        return cfg;
    }
};
/*!
 * \brief CHWN4 variant: requires input 0 and output 0 to be QuantizedS8 and
 * maps both to TensorFormats::CHWNc4; returns None otherwise
 */
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::CHWN4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto src_dtype = opr->input(0)->dtype().enumv();
        const auto dst_dtype = opr->output(0)->dtype().enumv();
        if (src_dtype != DTypeEnum::QuantizedS8 ||
            dst_dtype != DTypeEnum::QuantizedS8) {
            return None;
        }

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::CHWN4;
        cfg.input_dtypes = {src_dtype};
        cfg.input_tensor_types = {TensorType::FEATURE};
        cfg.output_dtypes = {dst_dtype};
        cfg.input_tensor_formats = {TensorFormats::CHWNc4};
        cfg.output_tensor_formats = {TensorFormats::CHWNc4};
        return cfg;
    }
};
/*!
 * \brief NCHW32 variant: requires input 0 and output 0 to be QuantizedS8 and
 * maps both to TensorFormats::NCHWc32; returns None otherwise
 */
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW32> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto src_dtype = opr->input(0)->dtype().enumv();
        const auto dst_dtype = opr->output(0)->dtype().enumv();
        if (src_dtype != DTypeEnum::QuantizedS8 ||
            dst_dtype != DTypeEnum::QuantizedS8) {
            return None;
        }

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW32;
        cfg.input_dtypes = {src_dtype};
        cfg.input_tensor_types = {TensorType::FEATURE};
        cfg.output_dtypes = {dst_dtype};
        cfg.input_tensor_formats = {TensorFormats::NCHWc32};
        cfg.output_tensor_formats = {TensorFormats::NCHWc32};
        return cfg;
    }
};
/*!
 * \brief NHWC variant: requires input 0 to be Quantized4Asymm or QuantizedS4
 * and output 0 to have the same dtype as input 0; both map to
 * TensorFormats::NHWC. Returns None otherwise.
 */
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NHWC> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto src_dtype = opr->input(0)->dtype().enumv();
        const auto dst_dtype = opr->output(0)->dtype().enumv();
        const bool src_is_4bit = src_dtype == DTypeEnum::Quantized4Asymm ||
                                 src_dtype == DTypeEnum::QuantizedS4;
        if (!src_is_4bit || dst_dtype != src_dtype) {
            return None;
        }

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NHWC;
        cfg.input_dtypes = {src_dtype};
        cfg.input_tensor_types = {TensorType::FEATURE};
        cfg.output_dtypes = {dst_dtype};
        cfg.input_tensor_formats = {TensorFormats::NHWC};
        cfg.output_tensor_formats = {TensorFormats::NHWC};
        return cfg;
    }
};
/*!
 * \brief NCHW64 variant: requires input 0 to be Quantized4Asymm or
 * QuantizedS4 and output 0 to have the same dtype as input 0; both map to
 * TensorFormats::NCHWc64. Returns None otherwise.
 */
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW64> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto src_dtype = opr->input(0)->dtype().enumv();
        const auto dst_dtype = opr->output(0)->dtype().enumv();
        const bool src_is_4bit = src_dtype == DTypeEnum::Quantized4Asymm ||
                                 src_dtype == DTypeEnum::QuantizedS4;
        if (!src_is_4bit || dst_dtype != src_dtype) {
            return None;
        }

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW64;
        cfg.input_dtypes = {src_dtype};
        cfg.input_tensor_types = {TensorType::FEATURE};
        cfg.output_dtypes = {dst_dtype};
        cfg.input_tensor_formats = {TensorFormats::NCHWc64};
        cfg.output_tensor_formats = {TensorFormats::NCHWc64};
        return cfg;
    }
};
//! Builds the tensor formats configuration of a convolution-like operator;
//! specialised per operator type and operator format.
template <typename Opr, OprFormat opr_format_>
struct ConvTensorFormatsDispatcherImpl;

/*!
 * \brief NCHW variant: never rejects an operator
 *
 * Every input keeps its dtype; input 1 is tagged WEIGHT and all others
 * FEATURE. The weight format is NCHW for dense convolutions and, for group
 * convolutions, C11RS when is_channel_wise_conv() holds and GKCRS otherwise.
 */
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW;

        // record dtype and tensor type of every input
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            cfg.input_dtypes.emplace_back(opr->input(idx)->dtype().enumv());
            cfg.input_tensor_types.emplace_back(
                    idx == 1 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        cfg.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());

        // only the weight format depends on the sparsity of the convolution
        TensorFormats weight_fmt = TensorFormats::NCHW;
        if (conv.param().sparse != Opr::Param::Sparse::DENSE) {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            weight_fmt = is_channel_wise_conv<Opr>(opr) ? TensorFormats::C11RS
                                                        : TensorFormats::GKCRS;
        }
        cfg.input_tensor_formats = {TensorFormats::NCHW, weight_fmt,
                                    TensorFormats::NCHW, TensorFormats::NCHW};
        cfg.output_tensor_formats = {TensorFormats::NCHW};
        return cfg;
    }
};
/*!
 * \brief NHWC variant
 *
 * Available only for dense convolutions whose input 2 (if present) is
 * QuantizedS32 and whose remaining inputs and output 0 are Quantized4Asymm or
 * QuantizedS4. Input 1 is tagged WEIGHT, all others FEATURE; every tensor uses
 * TensorFormats::NHWC. Returns None when the operator does not qualify.
 */
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        auto is_qint4 = [](DTypeEnum dt) {
            return dt == DTypeEnum::Quantized4Asymm ||
                   dt == DTypeEnum::QuantizedS4;
        };

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NHWC;

        bool supported = conv.param().sparse == Opr::Param::Sparse::DENSE;
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            const auto dt = opr->input(idx)->dtype().enumv();
            const bool dtype_ok =
                    idx == 2 ? dt == DTypeEnum::QuantizedS32 : is_qint4(dt);
            supported = supported && dtype_ok;
            cfg.input_dtypes.emplace_back(dt);
            cfg.input_tensor_types.emplace_back(
                    idx == 1 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        const auto out_dt = opr->output(0)->dtype().enumv();
        supported = supported && is_qint4(out_dt);
        cfg.output_dtypes.emplace_back(out_dt);

        cfg.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC,
                                    TensorFormats::NHWC, TensorFormats::NHWC};
        cfg.output_tensor_formats = {TensorFormats::NHWC};
        if (!supported) {
            return None;
        }
        return cfg;
    }
};
/*!
 * \brief NCHW4 variant
 *
 * Available only when input 2 (if present) is QuantizedS32 and all remaining
 * inputs and output 0 are QuantizedS8. Input 1 is tagged WEIGHT, all others
 * FEATURE. Features use NCHWc4; the weight uses NCHWc4 for dense
 * convolutions and, for group convolutions, C11RSc4 when
 * is_channel_wise_conv() holds and GKCRSc4 otherwise. Returns None when the
 * dtype requirements are not met.
 */
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW4;

        // record dtypes / tensor types and check the dtype requirements
        bool supported = true;
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            const auto dt = opr->input(idx)->dtype().enumv();
            const auto required = idx == 2 ? DTypeEnum::QuantizedS32
                                           : DTypeEnum::QuantizedS8;
            supported = supported && dt == required;
            cfg.input_dtypes.emplace_back(dt);
            cfg.input_tensor_types.emplace_back(
                    idx == 1 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        const auto out_dt = opr->output(0)->dtype().enumv();
        supported = supported && out_dt == DTypeEnum::QuantizedS8;
        cfg.output_dtypes.emplace_back(out_dt);

        // only the weight format depends on the sparsity of the convolution
        TensorFormats weight_fmt = TensorFormats::NCHWc4;
        if (conv.param().sparse != Opr::Param::Sparse::DENSE) {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            weight_fmt = is_channel_wise_conv<Opr>(opr)
                                 ? TensorFormats::C11RSc4
                                 : TensorFormats::GKCRSc4;
        }
        cfg.input_tensor_formats = {TensorFormats::NCHWc4, weight_fmt,
                                    TensorFormats::NCHWc4,
                                    TensorFormats::NCHWc4};
        cfg.output_tensor_formats = {TensorFormats::NCHWc4};
        if (!supported) {
            return None;
        }
        return cfg;
    }
};
/*!
 * \brief NCHW32 variant
 *
 * Available only for dense convolutions whose input 2 (if present) is
 * QuantizedS32 and whose remaining inputs and output 0 are QuantizedS8.
 * Input 1 is tagged WEIGHT, all others FEATURE; every tensor uses
 * TensorFormats::NCHWc32. Returns None when the operator does not qualify.
 */
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW32> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW32;

        bool supported = conv.param().sparse == Opr::Param::Sparse::DENSE;
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            const auto dt = opr->input(idx)->dtype().enumv();
            const auto required = idx == 2 ? DTypeEnum::QuantizedS32
                                           : DTypeEnum::QuantizedS8;
            supported = supported && dt == required;
            cfg.input_dtypes.emplace_back(dt);
            cfg.input_tensor_types.emplace_back(
                    idx == 1 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        const auto out_dt = opr->output(0)->dtype().enumv();
        supported = supported && out_dt == DTypeEnum::QuantizedS8;
        cfg.output_dtypes.emplace_back(out_dt);

        cfg.input_tensor_formats = {
                TensorFormats::NCHWc32, TensorFormats::NCHWc32,
                TensorFormats::NCHWc32, TensorFormats::NCHWc32};
        cfg.output_tensor_formats = {TensorFormats::NCHWc32};
        if (!supported) {
            return None;
        }
        return cfg;
    }
};
/*!
 * \brief NCHW64 variant
 *
 * Available only for dense convolutions whose input 2 (if present) is
 * QuantizedS32 and whose remaining inputs and output 0 are Quantized4Asymm or
 * QuantizedS4. Input 1 is tagged WEIGHT, all others FEATURE; every tensor uses
 * TensorFormats::NCHWc64. Returns None when the operator does not qualify.
 */
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW64> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        auto is_qint4 = [](DTypeEnum dt) {
            return dt == DTypeEnum::Quantized4Asymm ||
                   dt == DTypeEnum::QuantizedS4;
        };

        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW64;

        bool supported = conv.param().sparse == Opr::Param::Sparse::DENSE;
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            const auto dt = opr->input(idx)->dtype().enumv();
            const bool dtype_ok =
                    idx == 2 ? dt == DTypeEnum::QuantizedS32 : is_qint4(dt);
            supported = supported && dtype_ok;
            cfg.input_dtypes.emplace_back(dt);
            cfg.input_tensor_types.emplace_back(
                    idx == 1 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        const auto out_dt = opr->output(0)->dtype().enumv();
        supported = supported && is_qint4(out_dt);
        cfg.output_dtypes.emplace_back(out_dt);

        cfg.input_tensor_formats = {
                TensorFormats::NCHWc64, TensorFormats::NCHWc64,
                TensorFormats::NCHWc64, TensorFormats::NCHWc64};
        cfg.output_tensor_formats = {TensorFormats::NCHWc64};
        if (!supported) {
            return None;
        }
        return cfg;
    }
};
/*!
 * \brief CHWN4 variant
 *
 * Available only for dense convolutions whose input 2 (if present) is
 * QuantizedS32 and whose remaining inputs and output 0 are QuantizedS8.
 * Input 1 is tagged WEIGHT, all others FEATURE; every tensor uses
 * TensorFormats::CHWNc4. Returns None when the operator does not qualify.
 */
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::CHWN4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::CHWN4;

        bool supported = conv.param().sparse == Opr::Param::Sparse::DENSE;
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            const auto dt = opr->input(idx)->dtype().enumv();
            const auto required = idx == 2 ? DTypeEnum::QuantizedS32
                                           : DTypeEnum::QuantizedS8;
            supported = supported && dt == required;
            cfg.input_dtypes.emplace_back(dt);
            cfg.input_tensor_types.emplace_back(
                    idx == 1 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        const auto out_dt = opr->output(0)->dtype().enumv();
        supported = supported && out_dt == DTypeEnum::QuantizedS8;
        cfg.output_dtypes.emplace_back(out_dt);

        cfg.input_tensor_formats = {
                TensorFormats::CHWNc4, TensorFormats::CHWNc4,
                TensorFormats::CHWNc4, TensorFormats::CHWNc4};
        cfg.output_tensor_formats = {TensorFormats::CHWNc4};
        if (!supported) {
            return None;
        }
        return cfg;
    }
};
/*!
 * \brief NCHW variant for ConvolutionBackwardData: never rejects an operator
 *
 * Unlike the generic variant the weight is input 0, so input 0 is tagged
 * WEIGHT and all others FEATURE. The weight format is NCHW for dense
 * convolutions and, for group convolutions, C11RS when
 * is_channel_wise_conv() holds and GKCRS otherwise.
 */
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
                                       OprFormat::NCHW> {
    using Opr = opr::ConvolutionBackwardData;
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration cfg;
        cfg.typeinfo = opr->dyn_typeinfo();
        cfg.opr_format = OprFormat::NCHW;

        // record dtype and tensor type of every input
        const size_t nr_inputs = opr->input().size();
        for (size_t idx = 0; idx < nr_inputs; ++idx) {
            cfg.input_dtypes.emplace_back(opr->input(idx)->dtype().enumv());
            cfg.input_tensor_types.emplace_back(
                    idx == 0 ? TensorType::WEIGHT : TensorType::FEATURE);
        }
        cfg.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());

        // only the weight format depends on the sparsity of the convolution
        TensorFormats weight_fmt = TensorFormats::NCHW;
        if (conv.param().sparse != Opr::Param::Sparse::DENSE) {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            weight_fmt = is_channel_wise_conv<Opr>(opr) ? TensorFormats::C11RS
                                                        : TensorFormats::GKCRS;
        }
        cfg.input_tensor_formats = {weight_fmt, TensorFormats::NCHW,
                                    TensorFormats::NCHW, TensorFormats::NCHW};
        cfg.output_tensor_formats = {TensorFormats::NCHW};
        return cfg;
    }
};
/*!
 * \brief NCHW4 variant for ConvolutionBackwardData
 *
 * Available only for dense convolutions whose inputs and output 0 are all
 * QuantizedS8. Input 0 is tagged WEIGHT, all others FEATURE; every tensor uses
 * TensorFormats::NCHWc4. Returns None when the operator does not qualify.
 */
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
                                       OprFormat::NCHW4> {
    using Opr = opr::ConvolutionBackwardData;
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            available &=
                    opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // compare against this operator's own Sparse enum, as every other
        // specialisation does, rather than borrowing the one of ConvBias
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc4, TensorFormats::NCHWc4,
                TensorFormats::NCHWc4, TensorFormats::NCHWc4};
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};
/*!
 * \brief registry that maps an (operator type, operator format) pair to the
 * dispatcher producing the matching OprTensorFormatsConfiguration
 *
 * The table is filled by the constructor.
 */
struct StaticData {
    //! hashes the pair by combining the hashes of its two members
    struct KeyHash {
        size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const {
            const auto format_id = static_cast<uint32_t>(val.second);
            const size_t type_hash = mgb::hash<Typeinfo*>(val.first);
            const size_t format_hash = std::hash<uint32_t>()(format_id);
            return mgb::hash_pair_combine(type_hash, format_hash);
        }
    };
    using OprTensorFormatsDispatcher =
            OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
    std::unordered_map<std::pair<Typeinfo*, OprFormat>,
                       OprTensorFormatsDispatcher, KeyHash>
            typefmt2dispatcher;
    StaticData();
};
// Fills typefmt2dispatcher with one dispatcher per supported
// (operator type, operator format) pair.
StaticData::StaticData() {
// Registers a lambda that forwards to
// ConvTensorFormatsDispatcherImpl<opr::_Opr, OprFormat::_fmt>::dispatch.
#define OPR_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \
typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
[](const OperatorNodeBase* opr) { \
MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \
return ConvTensorFormatsDispatcherImpl< \
opr::_Opr, OprFormat::_fmt>::dispatch(opr); \
MIDOUT_E \
}
// Registers a lambda that forwards to
// OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::_fmt>::dispatch; the
// operator type is only used for the map key and the midout region.
#define OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \
typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
[](const OperatorNodeBase* opr) { \
MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \
return OprSingleInOutTensorFormatsDispatcherImpl< \
OprFormat::_fmt>::dispatch(opr); \
MIDOUT_E \
}
// convolution-like operators
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NHWC);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4);
// operators handled through the single-input/single-output dispatcher
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NHWC);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW64);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NHWC);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, CHWN4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW32);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW64);
#undef OPR_TENSOR_FORMATS_CONFIG_REG
#undef OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG
}
//! Accessor of the process-wide StaticData object, which is a function-local
//! static and hence constructed the first time this function is called.
StaticData& static_data() {
    static StaticData instance;
    return instance;
}
} // namespace
/*!
 * \brief look up the dispatcher registered for an operator type and format
 *
 * \param type type info of the operator
 * \param opr_format operator format
 * \return pointer to the dispatcher stored in the static registry; an
 *      assertion fails when nothing has been registered for the pair
 */
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher*
OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
        Typeinfo* type, OprFormat opr_format) {
    auto& registry = static_data().typefmt2dispatcher;
    const std::pair<Typeinfo*, OprFormat> key{type, opr_format};
    auto entry = registry.find(key);
    mgb_assert(entry != registry.end(),
               "cannot find OprTensorFormatsDispatcher for opr type(%s) and "
               "opr format(%s)",
               type->name, opr_format_to_string(opr_format));
    return &entry->second;
}
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/profiler_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_format_modifier.h"
#include "./utils.h"
#include "megbrain/gopt/framework.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/plugin/base.h"
#include "megbrain/serialization/sereg.h"
using namespace mgb;
using namespace cg;
using namespace opr;
using namespace gopt;
using ReformatKey = ReformatManager::ReformatKey;
namespace {
using OprFormat = Problem::OprFormat;
/*!
 * \brief map a feature-map tensor format to the corresponding opr format
 *
 * \param tensor_format the tensor format to translate
 * \return the matching opr format; a MegBrainError is thrown for every tensor
 *      format that is not listed below
 */
OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
    if (tensor_format == TensorFormats::NCHW)
        return OprFormat::NCHW;
    if (tensor_format == TensorFormats::NCHWc4)
        return OprFormat::NCHW4;
    if (tensor_format == TensorFormats::NCHWc8)
        return OprFormat::NCHW8;
    if (tensor_format == TensorFormats::NCHWc32)
        return OprFormat::NCHW32;
    if (tensor_format == TensorFormats::NCHWc64)
        return OprFormat::NCHW64;
    if (tensor_format == TensorFormats::NHWC)
        return OprFormat::NHWC;
    if (tensor_format == TensorFormats::CHWNc4)
        return OprFormat::CHWN4;
    mgb_throw(MegBrainError, "tensor format(%u) is not supported",
              static_cast<uint32_t>(tensor_format));
}
/*!
 * \brief graph plugin that records a pair of comp node events around the
 *      kernels of selected operators
 *
 * Which operators are recorded is decided by the OprFilter given to the
 * constructor. The recorded event pairs are converted into a time by
 * duration_in_usec().
 */
class GraphPartitionProfiler final : public PluginBase {
using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;
public:
//! predicate telling whether the kernel of an operator should be recorded
using OprFilter = thin_function<bool(OperatorNodeBase*)>;
//! events recorded before (start) and after (end) the kernel of an operator
struct OprKernEvent {
CompNodeEventPtr start, end;
};
GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter);
//! waits on every recorded (non-null) event
~GraphPartitionProfiler() noexcept;
//! sum of the start-to-end elapsed time of all recorded operators,
//! multiplied by 1e6
float duration_in_usec() const;
private:
//! record \p dest on \p cn; the event is created with the NEED_TIMER flag
//! the first time it is needed and re-recorded afterwards
void record_event(CompNodeEventPtr& dest, CompNode cn) {
if (dest == nullptr)
dest = cn.create_event(CompNode::Event::NEED_TIMER);
dest->record();
}
//! event pair of every operator accepted by m_opr_filter so far
ThinHashMap<OperatorNodeBase*, OprKernEvent> m_kern_event;
//! filter supplied at construction
OprFilter m_opr_filter;
};
/*!
 * \brief attach the profiler to \p graph
 *
 * Registers receivers for the BeforeKernel and AfterKernel graph events. For
 * every operator accepted by \p opr_filter the receivers record the start and
 * the end event of that operator on the comp node carried by the graph event.
 */
GraphPartitionProfiler::GraphPartitionProfiler(ComputingGraph* graph,
                                               OprFilter opr_filter)
        : PluginBase(graph), m_opr_filter(opr_filter) {
    using namespace event;
    auto mark_start = [this](BeforeKernel const& kern) {
        if (m_opr_filter(kern.opr)) {
            // operator[] creates the event pair on first use
            record_event(m_kern_event[kern.opr].start, kern.comp_node);
        }
    };
    auto mark_end = [this](AfterKernel const& kern) {
        if (m_opr_filter(kern.opr)) {
            record_event(m_kern_event[kern.opr].end, kern.comp_node);
        }
    };
    auto&& graph_event = graph->event();
    add_event_handler(graph_event.register_receiver<BeforeKernel>(mark_start));
    add_event_handler(graph_event.register_receiver<AfterKernel>(mark_end));
}
//! Wait on the host for every event that was actually created, start event
//! first and end event second for each recorded operator.
GraphPartitionProfiler::~GraphPartitionProfiler() noexcept {
    for (auto&& item : m_kern_event) {
        auto&& pair = item.second;
        if (pair.start)
            pair.start->host_wait();
        if (pair.end)
            pair.end->host_wait();
    }
}
float GraphPartitionProfiler::duration_in_usec() const {
float device_duration = 0.f;
for (auto&& kern_ev : m_kern_event) {
auto&& event = kern_ev.second;
event.end->host_wait();
device_duration += 1e6 * event.start->elapsed_time_until(*event.end);
}
return device_duration;
}
/*!
* \brief An operator that requires its input var node to be contiguous
*
* Executing the operator does nothing (scn_do_execute is empty); it only adds
* a contiguous layout constraint on its single input. The shape of the output
* is inferred as identical to the shape of the input.
*/
// clang-format off
MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{
//! intentionally empty: the operator performs no computation
void scn_do_execute() override {};
void init_output_static_infer_desc() override;
//! request a contiguous layout for input(0)
void add_input_layout_constraint() override {
input(0)->add_layout_constraint_contiguous();
}
public:
MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config);
//! insert a MarkInputContiguous operator reading \p input and return its
//! output var
static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {});
};
// clang-format on
MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous);
/*!
 * \brief construct the operator in the graph that owns \p input
 *
 * The operator is given the default name "mark_contiguous", takes \p input as
 * its only input and declares one output var.
 */
MarkInputContiguous::MarkInputContiguous(VarNode* input,
const OperatorNodeConfig& config)
: Super(input->owner_graph(), config, "mark_contiguous", {input}) {
add_input({input});
add_output(None);
}
/*!
 * \brief factory of MarkInputContiguous
 *
 * \param input the var to be read by the new operator
 * \param config operator node config forwarded to the constructor
 * \return the single output var of the inserted operator
 */
SymbolVar MarkInputContiguous::make(SymbolVar input,
const OperatorNodeConfig& config) {
return input.insert_single_output_opr<MarkInputContiguous>(input.node(),
config);
}
//! Register the shape inference of the output: its shape is an identity of
//! the shape of input(0).
void MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto identity_desc = ShapeInferDesc::make_identity(input(0));
    owner_graph()->static_infer_manager().register_shape_infer(output(0),
                                                               identity_desc);
}
} // namespace
/* ================== ProfilerImpl =================*/
/*!
 * \brief profiler that measures operators and layout transforms by running
 *      them in freshly built computing graphs
 */
class ProfilerImpl final : public ProfilerBase {
public:
//! \param runs how many times each compiled function is executed
ProfilerImpl(int runs = 10) : m_runs{runs} {};
~ProfilerImpl() = default;
/*!
 * \brief profile all non-const operators and var nodes of the problem
 *
 * \param problem the problem to be profiled
 * \return the operator records and var node records
 */
ProfilingResult profile(const Problem& problem) const override;
private:
//! cost constant; in this file it is only referenced from a disabled
//! (#if 0) block of the format aware profile_operator
static constexpr float PROFILE_TIME_OUT = 1e7;
/*!
* \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_format the original tensor format of the operator node.
* \param available_tensor_formats the available tensor formats
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats) const;
/*!
 * \brief measure an opr format agnostic operator for one tensor format
 *
 * \param opr pointer to the operator node to be profiled
 * \param base_format the original tensor format of the operator node
 * \param tensor_format the tensor format the inputs are aligned to
 * \return the value reported by GraphPartitionProfiler::duration_in_usec
 */
float profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format) const;
/*!
* \brief profile opr format aware operators (like conv, deconv, conv_bias, etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_config the tensor formats configuration of base opr format
* \param config all the available configuration
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const SmallVector<OprTensorFormatsConfiguration>& available_configs)
const;
/*!
 * \brief measure an opr format aware operator for one configuration
 *
 * \param opr pointer to the operator node to be profiled
 * \param base_config the tensor formats configuration of base opr format
 * \param config the configuration to be measured
 * \return the value reported by GraphPartitionProfiler::duration_in_usec
 */
float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor formats in which the var node is stored
* \param available_tensor_formats the available tensor formats
* \param extra_attribute the extra attributes (options) of the problem
* \return the var node record
*/
VarNodeRecord profile_var_node(
const VarNode* var, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatKey::Attribute extra_attribute =
ReformatKey::Attribute::DEFAULT) const;
/*!
 * \brief measure one layout transform of the var node
 *
 * \param var pointer to the var node to be profiled
 * \param base_format the original tensor formats in which the var node is stored
 * \param key the reformat to be measured
 * \return the value reported by GraphPartitionProfiler::duration_in_usec
 */
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const;
int m_runs; /// number of executions of each profiled function
};
/*!
 * \brief build the cost record of an opr format agnostic operator
 *
 * Every available tensor format is measured once; the cost is stored under
 * the opr format that corresponds to the tensor format.
 */
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats) const {
    OperatorNodeRecord result;
    result.opr = opr;
    for (auto&& candidate : available_tensor_formats) {
        auto cost_key = tensor_formats_to_opr_format(candidate);
        result.costs[cost_key] = profile_operator(opr, base_format, candidate);
    }
    return result;
}
float ProfilerImpl::profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format) const {
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
graph->options().var_sanity_check_first_run = false;
VarNodeArray new_inps(opr->input().size());
for (size_t i = 0; i < opr->input().size(); ++i) {
auto&& var = opr->input(i);
auto&& cn = var->comp_node();
auto&& dtype = var->dtype();
auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
auto aligned_tensor_shape =
make_aligned_tensor_shape(var, base_format, tensor_format);
dval->resize(aligned_tensor_shape);
auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
new_inps[i] = aligned_var.node();
}
auto new_opr = serialization::copy_opr_shallow(
*opr, new_inps, opr->config(), {graph.get()});
auto y = new_opr->output(0);
auto mark = MarkInputContiguous::make(SymbolVar(y));
auto func = graph->compile({{mark, {}}});
auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(),
std::move(filter));
for (int i = 0; i < m_runs; ++i)
func->execute();
return profiler->duration_in_usec();
}
/*!
 * \brief build the cost record of an opr format aware operator
 *
 * Every available configuration is measured once; the cost is stored under
 * the opr format of the configuration.
 */
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const SmallVector<OprTensorFormatsConfiguration>& available_configs)
        const {
    OperatorNodeRecord result;
    result.opr = opr;
    for (auto&& candidate : available_configs) {
        result.costs[candidate.opr_format] =
                profile_operator(opr, base_config, candidate);
    }
    return result;
}
/*!
 * \brief measure an opr format aware operator for one configuration
 *
 * The leading inputs described by \p config are replaced by device tensors
 * with aligned shapes (weights and features are aligned differently). The
 * remaining inputs are evaluated in their own graph and fed in as immutable
 * tensors. The operator is then rebuilt with the opr format of \p config,
 * executed m_runs times, and only the rebuilt operator is recorded by the
 * profiler.
 */
float ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const OprTensorFormatsConfiguration& config) const {
    auto graph = ComputingGraph::make();
    auto&& options = graph->options();
    options.graph_opt_level = 0;
    options.var_sanity_check_first_run = false;

    const size_t nr_inputs = opr->input().size();
    const size_t nr_input_tensor =
            std::min(config.input_tensor_formats.size(), nr_inputs);
    VarNodeArray new_inps(nr_inputs);

    // inputs whose layout is described by the configuration
    size_t i = 0;
    while (i < nr_input_tensor) {
        auto&& src = opr->input(i);
        auto storage = std::make_shared<DeviceTensorND>(src->comp_node(),
                                                        src->dtype());
        TensorShape aligned_shape;
        if (config.input_tensor_types[i] != TensorType::WEIGHT) {
            mgb_assert(base_config.input_tensor_types[i] ==
                       config.input_tensor_types[i]);
            mgb_assert(base_config.input_tensor_types[i] ==
                       TensorType::FEATURE);
            aligned_shape = make_aligned_tensor_shape(
                    src, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i]);
        } else {
            mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT);
            aligned_shape = make_aligned_weight_shape(
                    src, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i],
                    config.output_tensor_formats[0]);
        }
        storage->resize(aligned_shape);
        new_inps[i] =
                opr::VolatileSharedDeviceTensor::make(*graph, storage).node();
        ++i;
    }

    // remaining inputs: compute their value in the original graph and pass
    // it to the new graph as an immutable tensor
    for (; i < nr_inputs; ++i) {
        auto&& src = opr->input(i);
        auto host_value =
                std::make_shared<HostTensorND>(src->comp_node(), src->dtype());
        host_value->resize(src->shape());
        auto fetch = [&](DeviceTensorND& d) {
            host_value->copy_from(d).sync();
        };
        src->owner_graph()->compile({{src, fetch}})->execute();
        new_inps[i] = opr::ImmutableTensor::make(*graph, *host_value).node();
    }

    VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps,
                                                    opr);
#if 0
    static const ThinHashSet<Typeinfo*> multi_algo_oprs = {
            opr::Convolution::typeinfo(),
            opr::ConvBiasForward::typeinfo(),
            opr::ConvolutionBackwardData::typeinfo(),
            opr::PoolingForward::typeinfo(),
    };
    if (multi_algo_oprs.count(opr->dyn_typeinfo()) &&
        !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()))
        return PROFILE_TIME_OUT;
#endif
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});

    // only the rebuilt operator itself is timed
    auto new_opr = y->owner_opr();
    auto filter = [new_opr](OperatorNodeBase* candidate) {
        return candidate == new_opr;
    };
    auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(),
                                                             std::move(filter));
    for (int run = 0; run < m_runs; ++run)
        func->execute();
    return profiler->duration_in_usec();
}
/*!
 * \brief build the cost record of the layout transforms of a var node
 *
 * Every ordered pair of two different available tensor formats is measured;
 * the cost is stored under the (input format, output format) pair. The dtype
 * of \p var is used as both input and output dtype of the reformat key.
 */
ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node(
        const VarNode* var, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats,
        ReformatKey::Attribute attribute) const {
    VarNodeRecord result;
    result.var = var;
    for (auto&& src_fmt : available_tensor_formats) {
        for (auto&& dst_fmt : available_tensor_formats) {
            if (src_fmt != dst_fmt) {
                ReformatKey key{src_fmt, dst_fmt, attribute,
                                var->dtype().enumv(), var->dtype().enumv()};
                result.costs[{src_fmt, dst_fmt}] =
                        profile_var_node(var, base_format, key);
            }
        }
    }
    return result;
}
/*!
 * \brief measure one layout transform of the var node
 *
 * A new graph is built that holds a device tensor with the shape of \p var
 * aligned from \p base_format to key.input_format, followed by the operators
 * emitted by the auto-aligned reformat builder for \p key. The graph is
 * executed m_runs times and the operators of the reformat are recorded by the
 * profiler.
 *
 * \param var pointer to the var node to be profiled
 * \param base_format the original tensor format in which the var is stored
 * \param key the reformat to be measured
 * \return the value reported by GraphPartitionProfiler::duration_in_usec
 */
float ProfilerImpl::profile_var_node(const VarNode* var,
                                     TensorFormats base_format,
                                     const ReformatKey& key) const {
    auto&& cn = var->comp_node();
    auto&& dtype = var->dtype();
    auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
    auto aligned_tensor_shape =
            make_aligned_tensor_shape(var, base_format, key.input_format);
    dval->resize(aligned_tensor_shape);
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            var, base_format, key);
    auto y = builder({aligned_var.node()});
    // Collect the operators that make up the reformat. The placeholder input
    // has to be marked as visited BEFORE the traversal is started, otherwise
    // it is already collected (and later timed) by the time it is marked.
    // This is the same order ProfilerImpl::profile uses for the inputs of the
    // graph partition.
    ThinHashSet<OperatorNodeBase*> set;
    DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); });
    iter.set_visited(aligned_var.node()->owner_opr());
    iter.add(y->owner_opr());
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(),
                                                             std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
/*!
 * \brief profile the operators and var nodes of the graph partition of
 *      \p problem
 *
 * The function works in three steps:
 *   1. propagate constness through all operators reachable from the outputs
 *      of the graph partition;
 *   2. collect the non-const operators, together with their non-const input
 *      vars and their first output var;
 *   3. profile every collected var (layout transform costs) and every
 *      collected operator (per-format costs).
 *
 * \param problem the problem to be profiled
 * \return the operator records and var node records
 */
ProfilerImpl::ProfilingResult ProfilerImpl::profile(
const Problem& problem) const {
// step 1: feed every operator reachable from the outputs to cvprop
ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM};
{
auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); };
DepOprIter iter{cb};
for (auto&& o : problem.graph_partition().output()) {
iter.add(o->owner_opr());
}
}
// for the operator types listed here only the first _arity inputs are
// considered when collecting vars; all inputs are considered for any other
// operator type
static const ThinHashMap<Typeinfo*, size_t> format_aware_input_tensors = {
#define cb(_Opr, _arity) {_Opr::typeinfo(), _arity}
cb(Convolution, 2),
cb(ConvBiasForward, 4),
cb(ConvolutionBackwardData, 2),
cb(PoolingForward, 1),
cb(WarpPerspective, 1),
cb(Resize, 1),
#undef cb
};
// step 2: collect the operators and vars to be profiled
ThinHashSet<VarNode*> vars;
ThinHashSet<OperatorNodeBase*> oprs;
{
auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) {
// const operators are not profiled
if (cvprop.is_const(opr))
return;
oprs.insert(opr);
auto find = format_aware_input_tensors.find(opr->dyn_typeinfo());
if (find == format_aware_input_tensors.end()) {
for (auto&& i : opr->input()) {
if (!cvprop.is_const(i)) {
vars.insert(i);
}
}
} else {
// the operator may have fewer inputs than the listed arity
size_t nr_input_tensor =
std::min(find->second, opr->input().size());
for (size_t i = 0; i < nr_input_tensor; ++i) {
if (!cvprop.is_const(opr->input(i))) {
vars.insert(opr->input(i));
}
}
}
vars.insert(opr->output(0));
};
DepOprIter iter{cb};
// mark the owner operators of the partition inputs as visited before the
// traversal starts, so that they are not passed to the callback
for (auto&& i : problem.graph_partition().input()) {
iter.set_visited(i->owner_opr());
}
for (auto&& o : problem.graph_partition().output()) {
iter.add(o->owner_opr());
}
}
// step 3: profile the collected vars and operators
auto base_format = problem.base_format();
auto&& available_tensor_formats = problem.available_tensor_formats();
ProfilingResult profiling_result;
auto& opr_record = profiling_result.opr_record;
auto& var_record = profiling_result.var_record;
for (auto&& var : vars) {
var_record[var] =
profile_var_node(var, base_format, available_tensor_formats);
}
for (auto&& opr : oprs) {
auto&& opr_configs = problem.opr_configs();
auto find = opr_configs.find(opr->dyn_typeinfo());
if (find == opr_configs.end()) {
// no configuration registered for this operator type: profile it over
// the available tensor formats
opr_record[opr] = profile_operator(opr, base_format,
available_tensor_formats);
} else {
// keep only the configurations for which the dispatcher returns a
// valid result
auto&& dispatchers = find->second;
SmallVector<OprTensorFormatsConfiguration> configs;
for (const auto& item : dispatchers) {
auto config = (*item.second)(opr);
if (config.valid()) {
configs.emplace_back(config.val());
}
}
auto base_config = problem.base_config(opr);
opr_record[opr] = profile_operator(opr, base_config, configs);
}
}
// dump all records to the debug log
for (auto&& rpair : opr_record) {
mgb_log_debug("%s", rpair.second.to_string().c_str());
}
for (auto&& rpair : var_record) {
mgb_log_debug("%s", rpair.second.to_string().c_str());
}
return profiling_result;
}
/* ================== ProfilerBase =================*/
/*!
 * \brief human readable dump of the record
 *
 * The dump lists the type and the name of the operator, the name and the
 * shape of every input var and of the first output var, and finally one line
 * per profiled opr format with its cost.
 */
std::string ProfilerBase::OperatorNodeRecord::to_string() const {
    auto str = ssprintf("\nopr type: %s\nopr name: %s\ninputs:\n",
                        opr->dyn_typeinfo()->name, opr->cname());
    for (auto&& i : opr->input()) {
        str += ssprintf("\tvar: %s\n\tshape: %s\n", i->cname(),
                        i->shape().to_string().c_str());
    }
    str += ssprintf("outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n",
                    opr->output(0)->cname(),
                    opr->output(0)->shape().to_string().c_str());
    for (auto&& cpair : costs) {
        // terminate every entry with a newline; without it all the costs are
        // glued together on a single line
        str += ssprintf("\tformat: %s; cost:%f\n",
                        opr_format_to_string(cpair.first), cpair.second);
    }
    return str;
}
/*!
 * \brief human readable dump of the record
 *
 * The dump starts with the name of the var, followed by one line per
 * profiled (input format, output format) pair with its cost.
 */
std::string ProfilerBase::VarNodeRecord::to_string() const {
    auto text = ssprintf("\nvar: %s\ncosts:", var->cname());
    for (auto&& entry : costs) {
        auto&& formats = entry.first;
        auto in_desc =
                tensor_formats_to_named_tensor_shape(formats.first).to_string();
        auto out_desc =
                tensor_formats_to_named_tensor_shape(formats.second).to_string();
        text += ssprintf("\n\tformat: (i:%s;o:%s); cost:%f", in_desc.c_str(),
                         out_desc.c_str(), entry.second);
    }
    return text;
}
//! Create a ProfilerImpl with its default constructor arguments and return
//! it through the ProfilerBase interface.
std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
return std::make_unique<ProfilerImpl>();
}
// vim: syntax=cpp.doxygen
...@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const { ...@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
/* ============== PaddingEmitter ================= */ /* ============== PaddingEmitter ================= */
PaddingEmitter::EmitResult PaddingEmitter::emit() const { PaddingEmitter::EmitResult PaddingEmitter::emit() const {
auto&& padshp = m_padshp;
auto&& const_extent = m_const_extent; auto&& const_extent = m_const_extent;
auto&& axis = m_axis; auto&& axis = m_axis;
auto builder = [const_extent, axis](const VarNodeArray& vars) { auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) {
auto i = vars[0]; auto i = vars[0];
auto padding_shp_var = vars[1]; auto padding_shp_var = vars[1];
TensorShape shape; TensorShape shape;
shape.ndim = i->shape().ndim; shape.ndim = i->shape().ndim;
for (size_t ax = 0; ax < shape.ndim; ++ax) for (size_t ax = 0; ax < shape.ndim; ++ax)
shape[ax] = 1; shape[ax] = 1;
// avoid making a scalar lowbit tensor
if (!i->dtype().is_low_bit() || const_extent != 1)
shape[axis] = const_extent; shape[axis] = const_extent;
else {
size_t const_axis = 0;
size_t new_const_extent = const_extent;
for (size_t i = 0; i < padshp.ndim; ++i) {
const auto& dim = padshp[i];
if (dim.extent() != Dimension::UNDETERMINED_EXTENT &&
dim.extent() != 1) {
new_const_extent = dim.extent();
const_axis = i;
break;
}
}
mgb_assert(new_const_extent != 1,
"cannot make an scalar lowbit tensor(got:%s)",
i->dtype().name());
shape[const_axis] = new_const_extent;
}
auto host_val = auto host_val =
std::make_shared<HostTensorND>(i->comp_node(), i->dtype()); std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
host_val->resize(shape); host_val->resize(shape);
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_manager.h" #include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h" #include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h" #include "megbrain/utils/arith_helper.h"
#include "./utils.h"
using namespace mgb; using namespace mgb;
using namespace gopt; using namespace gopt;
...@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) { ...@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) {
} }
return x; return x;
} }
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
switch (format) {
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
}; // namespace }; // namespace
// =================== ReformatManager::ReformatKey ====================*/ // =================== ReformatManager::ReformatKey ====================*/
...@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( ...@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
tensor_formats_to_named_tensor_shape(key.input_format); tensor_formats_to_named_tensor_shape(key.input_format);
NamedTensorShape output_shape = NamedTensorShape output_shape =
tensor_formats_to_named_tensor_shape(key.output_format); tensor_formats_to_named_tensor_shape(key.output_format);
size_t input_alignment, output_alignment; size_t input_alignment = 0;
size_t input_channel_idx, output_channel_idx; size_t output_alignment = 0;
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) { for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C && if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
...@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( ...@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
break; break;
} }
} }
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
mgb_assert(input_alignment > 0 && output_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
input_alignment, output_alignment,
input_shape.to_string().c_str());
NamedTensorShape orig_shape = NamedTensorShape orig_shape =
tensor_formats_to_named_tensor_shape(orig_format); tensor_formats_to_named_tensor_shape(orig_format);
size_t orig_channel = 0; size_t orig_channel = 0;
...@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( ...@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
auto make_shape = std::get<0>( auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit()); MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x}); auto padding_shp_var = make_shape({x});
auto padding = std::get<0>( auto padding = std::get<0>(PaddingEmitter{
PaddingEmitter{const_extent, input_channel_idx}.emit()); padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var}); cur = padding({cur, padding_shp_var});
} }
cur = ReformatManager::instance().get(key)({cur}); cur = ReformatManager::instance().get(key)({cur});
...@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
const VarNode* orig_var, const ReformatKey& key, const VarNode* orig_var, const ReformatKey& key,
const AlignmentDesc& extra_alignment) const { const AlignmentDesc& extra_alignment) const {
size_t in_channels = 0, out_channels = 0; size_t in_channels = 0, out_channels = 0;
size_t input_channel_idx, output_channel_idx; Dimension::Name out_channel_name = Dimension::Name::C;
Dimension::Name out_channel_name;
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) { for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C && if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
...@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
input_shape.to_string().c_str()); input_shape.to_string().c_str());
} }
} }
size_t in_channel_alignment, out_channel_alignment = 1; mgb_assert(out_channel_name == Dimension::Name::K ||
out_channel_name == Dimension::Name::N,
"invalid out channel(shp:%s)", input_shape.to_string().c_str());
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
size_t in_channel_alignment = 0, out_channel_alignment = 0;
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
for (size_t i = 0; i < output_shape.ndim; ++i) { for (size_t i = 0; i < output_shape.ndim; ++i) {
if (output_shape[i].name() == Dimension::Name::C && if (output_shape[i].name() == Dimension::Name::C &&
...@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
out_channel_alignment = output_shape[i].stride(); out_channel_alignment = output_shape[i].stride();
} }
} }
mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
in_channel_alignment, out_channel_alignment,
output_shape.to_string().c_str());
size_t aligned_in_channel = size_t aligned_in_channel =
divup(in_channels, in_channel_alignment) * in_channel_alignment; divup(in_channels, in_channel_alignment) * in_channel_alignment;
if (extra_alignment.name == out_channel_name) { if (extra_alignment.name == out_channel_name) {
...@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>( auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit()); MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x}); auto padding_shp_var = make_shape({x});
auto padding = std::get<0>( auto padding = std::get<0>(PaddingEmitter{
PaddingEmitter{const_extent, input_channel_idx}.emit()); padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var}); cur = padding({cur, padding_shp_var});
} }
if (aligned_out_channel > out_channels) { if (aligned_out_channel > out_channels) {
...@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( ...@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>( auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit()); MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({cur}); auto padding_shp_var = make_shape({cur});
auto padding = std::get<0>( auto padding = std::get<0>(PaddingEmitter{
PaddingEmitter{const_extent, output_channel_idx}.emit()); padding_shape, const_extent, output_channel_idx}
.emit());
cur = padding({cur, padding_shp_var}); cur = padding({cur, padding_shp_var});
} }
cur = ReformatManager::instance().get(key)({cur}); cur = ReformatManager::instance().get(key)({cur});
...@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() { ...@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() {
static ReformatManager inst; static ReformatManager inst;
return inst; return inst;
} }
/*!
 * \brief compute the shape a var would have in another tensor format
 *
 * \param var the var whose shape is converted
 * \param orig_formats the tensor format the shape of \p var is stored in
 * \param target_formats the tensor format to convert to
 * \return the shape in \p target_formats; a scalar shape is returned as is.
 *      Axes with a fixed extent in the target format take that extent. The
 *      other axes are derived from the axis of the same name in the original
 *      shape: multiplied by the ratio of the two dimensions, or divided by
 *      it with rounding up.
 */
TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var,
                                                 TensorFormats orig_formats,
                                                 TensorFormats target_formats) {
    using Dimension = megdnn::Dimension;
    static constexpr uint32_t UNDETERMINED_EXTENT =
            Dimension::UNDETERMINED_EXTENT;
    auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats);
    auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);
    TensorShape oshp = var->shape();
    mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim,
               "orig shape of var node is not compatible with tensor "
               "formats(var:%s;shp:%s;fmt:%s)",
               var->cname(), oshp.to_string().c_str(),
               orig_shape.to_string().c_str());
    if (oshp.is_scalar())
        return oshp;

    // axis index of every dimension name with an undetermined extent in the
    // original format; each name may occur only once
    ThinHashMap<Dimension::Name, int> name2dominant;
    for (size_t axis = 0; axis < orig_shape.ndim; ++axis) {
        if (orig_shape[axis].extent() != UNDETERMINED_EXTENT)
            continue;
        auto insert = name2dominant.insert(
                std::make_pair(orig_shape[axis].name(), axis));
        mgb_assert(insert.second);
    }

    TensorShape result;
    result.ndim = target_shape.ndim;
    for (size_t axis = 0; axis < target_shape.ndim; ++axis) {
        if (target_shape[axis].extent() != UNDETERMINED_EXTENT) {
            result[axis] = target_shape[axis].extent();
            continue;
        }
        int idx = name2dominant.at(target_shape[axis].name());
        if (orig_shape[idx] < target_shape[axis]) {
            size_t factor = (target_shape[axis] / orig_shape[idx]).extent();
            result[axis] = oshp[idx] * factor;
        } else {
            size_t factor = (orig_shape[idx] / target_shape[axis]).extent();
            result[axis] = divup(oshp[idx], factor);
        }
    }
    return result;
}
/*!
 * \brief compute the shape a weight var would have in another tensor format
 *
 * The shape is first converted with make_aligned_tensor_shape. Then every
 * K or N axis with an undetermined extent in the target format is rounded up
 * so that the number of output channels is a multiple of the stride of the
 * undetermined C axis of \p extra_formats (1 when there is no such axis).
 *
 * \param var the weight var whose shape is converted
 * \param orig_formats the tensor format the shape of \p var is stored in
 * \param target_formats the tensor format to convert to
 * \param extra_formats the tensor format providing the extra alignment
 * \return the aligned weight shape
 */
TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var,
                                                 TensorFormats orig_formats,
                                                 TensorFormats target_formats,
                                                 TensorFormats extra_formats) {
    using Dimension = megdnn::Dimension;
    static constexpr uint32_t UNDETERMINED_EXTENT =
            Dimension::UNDETERMINED_EXTENT;
    auto result = make_aligned_tensor_shape(var, orig_formats, target_formats);

    // alignment requested by the extra format
    auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats);
    size_t out_channel_alignment = 1;
    for (size_t axis = 0; axis < extra_shape.ndim; ++axis) {
        if (extra_shape[axis].name() == Dimension::Name::C &&
            extra_shape[axis].extent() == UNDETERMINED_EXTENT) {
            out_channel_alignment = extra_shape[axis].stride();
        }
    }

    auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);
    for (size_t axis = 0; axis < target_shape.ndim; ++axis) {
        auto name = target_shape[axis].name();
        bool is_out_channel =
                name == Dimension::Name::K || name == Dimension::Name::N;
        if (!is_out_channel ||
            target_shape[axis].extent() != UNDETERMINED_EXTENT) {
            continue;
        }
        size_t out_channels = result[axis] * target_shape[axis].stride();
        result[axis] = divup(out_channels, out_channel_alignment) *
                       out_channel_alignment / target_shape[axis].stride();
    }
    return result;
}
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/utils.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/global_layout_transform.h"
namespace mgb {
namespace gopt {
/*!
 * \brief get the name of an opr format
 * \param opr_format the opr format to be named
 * \return the enumerator name as a string literal for NCHW, NHWC, NCHW4,
 *      NCHW32, NCHW64 and CHWN4; any other value fails an mgb_assert
 */
static inline const char* opr_format_to_string(
        OprTensorFormatsConfiguration::OprFormat opr_format) {
    using Format = OprTensorFormatsConfiguration::OprFormat;
    switch (opr_format) {
        case Format::NCHW:
            return "NCHW";
        case Format::NHWC:
            return "NHWC";
        case Format::NCHW4:
            return "NCHW4";
        case Format::NCHW32:
            return "NCHW32";
        case Format::NCHW64:
            return "NCHW64";
        case Format::CHWN4:
            return "CHWN4";
        default:
            mgb_assert(false, "Invalid opr format(got:%u)",
                       static_cast<uint32_t>(opr_format));
    }
}
/*!
 * \brief build the megdnn::NamedTensorShape that corresponds to a tensor
 *      format
 *
 * Every case returns a brace-initialized list with one dimension string per
 * axis of the format, in axis order.
 * NOTE(review): a string like "C//4" presumably denotes the part of C
 * obtained by dividing by 4 and "C%4" the remaining inner part of size 4;
 * the parsing happens in megdnn and is not visible here, confirm against
 * megdnn::Dimension.
 *
 * \param format the tensor format to be described
 * \return the named tensor shape of \p format
 * \throw AssertionError (via mgb_throw) if \p format is not handled below
 */
static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape(
TensorFormats format) {
switch (format) {
// formats built from the N/C/H/W dimension names
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
// formats built from the G/K/C/R/S dimension names
// NOTE(review): presumably the convolution filter layouts; confirm against
// the TensorFormats enum
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
// any format without a case above is rejected
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/include/megbrain/gopt/global_layout_transform.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/gopt/subgraph_extractor.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
/*!
* \brief A structure that describe the data types and tensor formats
* configuration of the opr format
*/
struct OprTensorFormatsConfiguration {
using OprFormat = opr::ConvBias::Param::Format;
// callable that takes an operator node and returns either a configuration
// for it or an empty Maybe
using OprTensorFormatsDispatcher =
thin_function<Maybe<OprTensorFormatsConfiguration>(
const cg::OperatorNodeBase*)>;
// type of the operator this configuration describes
Typeinfo* typeinfo;
// opr format this configuration describes
OprFormat opr_format;
// data type enums of the operator inputs
SmallVector<DTypeEnum> input_dtypes;
// data type enums of the operator outputs
SmallVector<DTypeEnum> output_dtypes;
// tensor formats of the operator inputs
SmallVector<TensorFormats> input_tensor_formats;
// tensor type (TensorType, e.g. FEATURE) of each operator input
SmallVector<TensorType> input_tensor_types;
// tensor formats of the operator outputs
SmallVector<TensorFormats> output_tensor_formats;
// NOTE(review): defined elsewhere; presumably returns the dispatcher
// registered for the given (operator type, opr format) pair -- the behavior
// for an unregistered pair is not visible here, confirm at the definition
static OprTensorFormatsDispatcher* find_dispatcher_by_type_format(
Typeinfo* type, OprFormat opr_format);
};
/*!
* \brief A structure that describes the global layout transform problem
*/
class Problem {
public:
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using OprTensorFormatsDispatcher =
            OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
    /// dispatchers grouped by operator type first and by opr format second
    using OprConfigTrait =
            ThinHashMap<Typeinfo*,
                        ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>;

    /*!
     * \brief extra attributes describing the network to be optimized
     */
    struct Attribute {
        /// the base opr format indicates that the network to be optimized
        /// is constructed in the base opr format, i.e. all the format aware
        /// operators (conv, conv_bias, deconv, pooling etc.) are built in
        /// this format.
        OprFormat base_opr_format;
        /// the base tensor format indicates that all the format agnostic
        /// operators (like elemwise, elemwise multi type, typecvt etc.) are
        /// built in the base tensor format.
        TensorFormats base_tensor_formats;
    };

    /*!
     * \note \p graph_partition, \p available_tensor_formats and
     * \p opr_config are stored as references, so they have to outlive the
     * constructed object; \p attribute is copied.
     */
    Problem(const GraphPartition& graph_partition,
            const SmallVector<TensorFormats>& available_tensor_formats,
            const OprConfigTrait& opr_config, const Attribute& attribute)
            : m_graph_partition{graph_partition},
              m_available_tensor_formats{available_tensor_formats},
              m_opr_configs{opr_config},
              m_attribute{attribute} {}
    ~Problem() noexcept = default;

    const GraphPartition& graph_partition() const { return m_graph_partition; }
    const OprConfigTrait& opr_configs() const { return m_opr_configs; }
    const SmallVector<TensorFormats>& available_tensor_formats() const {
        return m_available_tensor_formats;
    }
    TensorFormats base_format() const {
        return m_attribute.base_tensor_formats;
    }

    /*!
     * \brief get the configuration of an operator in the base opr format
     *
     * The dispatcher found for the operator type and the base opr format is
     * asked first. If it yields no valid configuration, a generic one is
     * built: every input is a FEATURE tensor in the base tensor format with
     * its own dtype, and a single output entry is taken from output(0).
     */
    OprTensorFormatsConfiguration base_config(
            const cg::OperatorNodeBase* opr) const {
        auto dispatcher =
                OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
                        opr->dyn_typeinfo(), m_attribute.base_opr_format);
        auto dispatched = (*dispatcher)(opr);
        if (dispatched.valid())
            return dispatched.val();

        OprTensorFormatsConfiguration generic;
        generic.typeinfo = opr->dyn_typeinfo();
        generic.opr_format = m_attribute.base_opr_format;
        for (const auto& inp : opr->input()) {
            generic.input_dtypes.emplace_back(inp->dtype().enumv());
            generic.input_tensor_formats.emplace_back(
                    m_attribute.base_tensor_formats);
            generic.input_tensor_types.emplace_back(TensorType::FEATURE);
        }
        generic.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        generic.output_tensor_formats.emplace_back(
                m_attribute.base_tensor_formats);
        return generic;
    }

private:
    /// the graph partition
    const GraphPartition& m_graph_partition;
    /// the available tensor formats, used for format agnostic operators
    /// (like elemwise, elemwise multi type, typecvt, etc.)
    const SmallVector<TensorFormats>& m_available_tensor_formats;
    /// the available opr format configurations, used for format aware
    /// operators (like conv, deconv, conv_bias, etc.)
    const OprConfigTrait& m_opr_configs;
    /// the extra attributes to describe the problem
    Attribute m_attribute;
};
/*!
* \brief A profiler that collects all the performance data to describe the
* global layout transform problem.
*/
class ProfilerBase {
public:
    using OprFormat = Problem::OprFormat;

    /*!
     * \brief profiling data of one operator node
     */
    struct OperatorNodeRecord {
        /// pointer to operator node
        const cg::OperatorNodeBase* opr;
        /// costs of operator node, i.e. the elapsed device time of the
        /// operator node on different opr format (layout configuration).
        ThinHashMap<OprFormat, float> costs;
        std::string to_string() const;
    };

    /*!
     * \brief profiling data of one var node
     */
    struct VarNodeRecord {
        /// hash functor for a (source format, target format) pair
        struct KeyHash {
            size_t operator()(
                    const std::pair<TensorFormats, TensorFormats>& val) const {
                std::hash<uint32_t> hasher;
                size_t src_hash = hasher(static_cast<uint32_t>(val.first));
                size_t dst_hash = hasher(static_cast<uint32_t>(val.second));
                return mgb::hash_pair_combine(src_hash, dst_hash);
            }
        };
        /// pointer to var node
        const VarNode* var;
        /// costs of var node, i.e. the elapsed device time of the layout
        /// transform. Key of the hashmap indicates the source tensor format
        /// and the target tensor format.
        std::unordered_map<std::pair<TensorFormats, TensorFormats>, float,
                           KeyHash>
                costs;
        std::string to_string() const;
    };

    /*!
     * \note the profiler assumes all the input and output var node are stored
     * in contiguous layout in memory
     */
    struct ProfilingResult {
        /// A hashmap, that maps the operator node to the costs (device
        /// elapsed time) of different layouts configuration
        ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record;
        /// A hashmap, that maps the var node to the costs of layout transform
        ThinHashMap<VarNode*, VarNodeRecord> var_record;
    };

    ProfilerBase() = default;
    virtual ~ProfilerBase() = default;

    /// collect the profiling data of the given problem
    virtual ProfilingResult profile(const Problem& problem) const = 0;
    /// create a profiler instance (defined outside this header)
    static std::unique_ptr<ProfilerBase> make_profiler();
};
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -80,11 +80,13 @@ private: ...@@ -80,11 +80,13 @@ private:
class PaddingEmitter final : public Emitter { class PaddingEmitter final : public Emitter {
public: public:
PaddingEmitter(size_t const_extent, size_t axis) PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent,
: m_const_extent{const_extent}, m_axis{axis} {} size_t axis)
: m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {}
EmitResult emit() const override; EmitResult emit() const override;
private: private:
megdnn::NamedTensorShape m_padshp;
size_t m_const_extent, m_axis; size_t m_const_extent, m_axis;
}; };
......
...@@ -17,6 +17,11 @@ ...@@ -17,6 +17,11 @@
namespace mgb { namespace mgb {
namespace gopt { namespace gopt {
enum class TensorType : uint32_t {
FEATURE = 0,
WEIGHT = 1,
};
enum class TensorFormats : uint32_t { enum class TensorFormats : uint32_t {
// input tensor formats // input tensor formats
NCHW = 0, ///< [N, C, H, W] NCHW = 0, ///< [N, C, H, W]
...@@ -116,6 +121,15 @@ public: ...@@ -116,6 +121,15 @@ public:
private: private:
ReformatCache m_cache; ReformatCache m_cache;
}; };
TensorShape make_aligned_tensor_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats);
TensorShape make_aligned_weight_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats,
TensorFormats extra_formats);
} // namespace gopt } // namespace gopt
} // namespace mgb } // namespace mgb
......
...@@ -20,6 +20,7 @@ class GraphPartition { ...@@ -20,6 +20,7 @@ class GraphPartition {
public: public:
using VarNodeSet = ThinHashSet<VarNode*>; using VarNodeSet = ThinHashSet<VarNode*>;
using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>;
class InputPlaceholder; class InputPlaceholder;
GraphPartition() = default; GraphPartition() = default;
...@@ -45,13 +46,13 @@ private: ...@@ -45,13 +46,13 @@ private:
class SubGraphExtractor { class SubGraphExtractor {
public: public:
using OprList = ThinHashSet<Typeinfo*>; using OprList = ThinHashSet<Typeinfo*>;
SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {}; SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {};
std::vector<GraphPartition> extract( std::vector<GraphPartition> extract(
const SymbolVarArray& endpoint_vars) const; const SymbolVarArray& endpoint_vars) const;
private: private:
class Impl; class Impl;
OprList m_opr_list; const OprList& m_opr_list;
}; };
} // namespace gopt } // namespace gopt
......
/**
* \file src/gopt/test/profiler.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"
using namespace mgb;
using namespace gopt;
using namespace serialization;
namespace {
class LayoutTransformContext : public NonCopyableObj {
public:
using OprList = SubGraphExtractor::OprList;
using OprFormat = Problem::OprFormat;
using OprConfigTrait = Problem::OprConfigTrait;
LayoutTransformContext() = delete;
LayoutTransformContext(OprList opr_list,
SmallVector<TensorFormats> available_tensor_formats,
OprConfigTrait opr_configs)
: m_opr_list{std::move(opr_list)},
m_available_tensor_formats{std::move(available_tensor_formats)},
m_opr_configs{std::move(opr_configs)} {}
const OprList& opr_list() const { return m_opr_list; }
const SmallVector<TensorFormats>& available_tensor_formats() const {
return m_available_tensor_formats;
}
const OprConfigTrait& opr_configs() const { return m_opr_configs; }
static std::unique_ptr<LayoutTransformContext> make() {
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ConvolutionForward::typeinfo(),
opr::ConvolutionBackwardData::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
opr::PoolingForward::typeinfo(),
opr::WarpPerspectiveForward::typeinfo(),
};
OprConfigTrait opr_configs;
{
auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvBias::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionBackwardData::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionBackwardData::typeinfo(), \
OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::PoolingForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::WarpPerspectiveForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
cb(NHWC);
cb(NCHW4);
cb(NCHW64);
#undef cb
}
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NHWC, TensorFormats::NCHWc4,
TensorFormats::NCHWc32, TensorFormats::NCHWc64};
return std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
std::move(opr_configs));
}
private:
OprList m_opr_list;
SmallVector<TensorFormats> m_available_tensor_formats;
OprConfigTrait m_opr_configs;
};
}; // namespace
#if MGB_CUDA
#if CUDA_VERSION >= 10020
// Profile two chained NCHW ConvBias oprs (4-bit, then 8-bit quantized) and
// check which operator / var nodes show up in the profiling result.
TEST(TestProfiler, Conv) {
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using Attribute = Problem::Attribute;
    using Strategy = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    using ConvParam = opr::ConvBias::Param;

    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // host-to-device input followed by a type conversion
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };
    // shared device tensor followed by a type conversion
    auto make_const = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto sdt = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(sdt, dt);
    };

    ConvParam param;
    auto data =
            make_input("x", {64, 48, 14, 14},
                       dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto w1 = make_const("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b1 = make_const("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
    param.format = ConvParam::Format::NCHW;
    param.nonlineMode = ConvParam::NonlineMode::IDENTITY;
    param.stride_w = 1;
    param.stride_h = 1;
    param.pad_w = 1;
    param.pad_h = 1;
    auto c1 = opr::ConvBias::make(
            data, w1, b1, param, {},
            OperatorNodeConfig(dtype::Quantized4Asymm(
                    12.345f, static_cast<uint8_t>(5))));
    auto c1_s8 = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
    auto w2 = make_const("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = make_const("b2", {1, 48, 1, 1},
                         dtype::QuantizedS32(12.345f * 2.5f));
    auto c2 = opr::ConvBias::make(c1_s8, w2, b2, param, {},
                                  OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    Strategy strategy = Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);

    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_rst = result.opr_record;
    const auto& var_rst = result.var_record;

    // both convolutions and the typecvt between them have a record
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c1_s8.node()->owner_opr()) > 0);
    // weights and biases have no var record
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(b1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
    EXPECT_TRUE(var_rst.count(b2.node()) == 0);
}
#endif
// Profile two chained NCHW ConvolutionBackwardData oprs and check which
// operator / var nodes show up in the profiling result.
TEST(TestProfiler, Deconv) {
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using Attribute = Problem::Attribute;
    using Strategy = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    using Param = opr::ConvolutionBackwardData::Param;

    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // host-to-device input followed by a type conversion
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };
    // shared device tensor followed by a type conversion
    auto make_const = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto sdt = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(sdt, dt);
    };

    auto x = make_input("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
    auto w1 = make_const("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    Param param;
    param.format = Param::Format::NCHW;
    param.stride_w = 2;
    param.stride_h = 2;
    param.pad_w = 0;
    param.pad_h = 0;
    auto c1 = opr::ConvolutionBackwardData::make(
            w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    auto w2 = make_const("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    auto c2 = opr::ConvolutionBackwardData::make(
            w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    Strategy strategy = Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);

    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_rst = result.opr_record;
    const auto& var_rst = result.var_record;

    // both deconvolutions and the typecvt producing x have a record
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    // weights have no var record
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
}
// Profile a WarpPerspectiveForward opr on a 4-bit quantized input and check
// which operator / var nodes show up in the profiling result.
TEST(TestProfiler, Warp) {
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using Attribute = Problem::Attribute;
    using Strategy = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;

    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    constexpr size_t INP_H = 10, INP_W = 10, N = 16;
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // host-to-device input followed by a type conversion
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };
    auto x = make_input("x", {N, 48, INP_H, INP_W},
                        dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));

    float value1 = M_PI, value2 = 0.6;
    // write the same 3x3 matrix into each of the N batch entries
    auto fill_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i, ptr += 9) {
            auto rot = value1, scale = value2, sheer = value1;
            const float cos_term = cos(rot) * scale;
            const float sin_term = sin(rot) * scale;
            ptr[0] = cos_term;
            ptr[1] = -sin_term;
            ptr[2] = value2;
            ptr[3] = sin_term * sheer;
            ptr[4] = cos_term * sheer;
            ptr[5] = value2;
            ptr[6] = value2;
            ptr[7] = value2;
            ptr[8] = value2;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    fill_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");

    TensorShape out_shp{20, 20};
    auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);

    Strategy strategy = Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({w1}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({w1});
    ASSERT_EQ(partitions.size(), 1u);

    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_rst = result.opr_record;
    const auto& var_rst = result.var_record;

    EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
    // only input(0) of the warp opr has a var record
    EXPECT_TRUE(var_rst.count(mat.node()) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
}
// Profile two NCHW Pooling oprs joined by a TypeCvt and check that all three
// operator nodes show up in the profiling result.
TEST(TestProfiler, Pooling) {
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using Attribute = Problem::Attribute;
    using Strategy = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    using Param = opr::Pooling::Param;

    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // host-to-device input followed by a type conversion
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };

    auto data =
            make_input("x", {64, 64, 55, 55},
                       dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    Param param;
    param.format = Param::Format::NCHW;
    auto p1 = opr::Pooling::make(data, param);
    auto p1_s8 = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
    auto p2 = opr::Pooling::make(p1_s8, param);

    Strategy strategy = Strategy::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({p2}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({p2});
    ASSERT_EQ(partitions.size(), 1u);

    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_rst = result.opr_record;

    EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(p1_s8.node()->owner_opr()) > 0);
}
// Profile a graph of Elemwise / ElemwiseMultiType / TypeCvt oprs and check
// which operator / var nodes show up in the profiling result.
TEST(TestProfiler, Elemwise) {
    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    using Attribute = Problem::Attribute;

    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    // host-to-device input followed by a type conversion
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto h2d = opr::Host2DeviceCopy::make(*graph, gen(shp, cn))
                           .rename(name);
        return opr::TypeCvt::make(h2d, dt);
    };

    // float branch: c = FUSE_ADD_RELU(a, b), converted to 4-bit
    auto a = make_input("a", {64, 48, 14, 14}, dtype::Float32());
    auto b = make_input("b", {1, 48, 1, 1}, dtype::Float32());
    auto c = opr::Elemwise::make({a, b},
                                 {opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
    auto q4c = opr::TypeCvt::make(
            c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));

    // 8-bit branch: q8d = QFUSE_ADD_RELU(q8a, q8b), converted to 4-bit
    auto q8a = make_input("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
    auto q8b = make_input("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
    auto q8d = opr::ElemwiseMultiType::make(
            {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
            OperatorNodeConfig(dtype::QuantizedS8(12.f)));
    auto q4d = opr::TypeCvt::make(
            q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));

    // join of the two branches
    auto q4e = opr::ElemwiseMultiType::make(
            {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig(
                    dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({q4e});
    ASSERT_EQ(partitions.size(), 1u);

    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto result = profiler->profile(problem);
    const auto& opr_rst = result.opr_record;
    const auto& var_rst = result.var_record;

    EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(a.node()) > 0);
    EXPECT_TRUE(var_rst.count(b.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) { ...@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
for (size_t i = 0; i < RUNS; ++i) for (size_t i = 0; i < RUNS; ++i)
func->execute(); func->execute();
double time_profiler = profiler->duration() * 1e6; double time_profiler = profiler->duration() * 1e6;
printf("%f, %f\n", time_profiler, time_cuda_evt);
ASSERT_EQ(time_cuda_evt, time_profiler);
MGB_CUDA_CHECK(cudaEventDestroy(evt0)); MGB_CUDA_CHECK(cudaEventDestroy(evt0));
MGB_CUDA_CHECK(cudaEventDestroy(evt1)); MGB_CUDA_CHECK(cudaEventDestroy(evt1));
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册