Commit 9c0a17d0 authored by Megvii Engine Team

feat(mgb/gopt): add auto aligned reformat impls

GitOrigin-RevId: fd0814fdb3e9f3418df81f6e9295d3cb44f3a67d
Parent 2ed76b16
@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_emitter.h"
#include <numeric>
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/io.h"
using namespace mgb;
using namespace gopt;
@@ -243,4 +244,63 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
}
return builders;
}
/* ============== PaddingEmitter ================= */
PaddingEmitter::EmitResult PaddingEmitter::emit() const {
auto&& const_extent = m_const_extent;
auto&& axis = m_axis;
auto builder = [const_extent, axis](const VarNodeArray& vars) {
auto i = vars[0];
auto padding_shp_var = vars[1];
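        // vars[1] carries the full shape of the padding block: build a zero
        // tensor whose shape is 1 on every axis except const_extent on the
        // padding axis, broadcast it to that shape and concatenate it along
        // the axis.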
TensorShape shape;
shape.ndim = i->shape().ndim;
for (size_t ax = 0; ax < shape.ndim; ++ax)
shape[ax] = 1;
shape[axis] = const_extent;
auto host_val =
std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
host_val->resize(shape);
auto ptr = host_val->raw_ptr();
size_t size_bytes = TensorLayout{shape, i->dtype()}.span().dist_byte();
std::memset(ptr, 0, size_bytes);
auto padding =
opr::ImmutableTensor::make(*i->owner_graph(), *host_val);
padding = opr::Broadcast::make(padding, padding_shp_var);
auto o = opr::Concat::make({i, padding}, axis);
return o.node();
};
auto checker = [axis](const VarNodeArray& vars) {
mgb_assert(vars.size() == 2);
return vars[0]->shape().ndim > axis;
};
return std::make_tuple(builder, checker);
}
/* ============== SubtensorEmitter ================= */
SubtensorEmitter::EmitResult SubtensorEmitter::emit() const {
auto&& const_extent = m_const_extent;
auto&& axis = m_axis;
auto builder = [const_extent, axis](const VarNodeArray& vars) {
auto i = vars[0];
auto x = SymbolVar(i);
auto cv = [&x](int v) { return x.make_scalar(v); };
using AIdx = opr::Subtensor::AxisIndexer;
std::vector<AIdx> index(i->shape().ndim);
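        // Slice [0, const_extent) along the target axis and keep the full
        // extent (step 1) on every other axis.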
for (size_t ax = 0; ax < index.size(); ++ax) {
if (ax == axis)
index[ax] =
AIdx::make_interval(ax, None, cv(const_extent), None);
else
index[ax] = AIdx::make_interval(ax, None, None, cv(1));
}
auto o = opr::Subtensor::make(x, index);
return o.node();
};
auto checker = [axis](const VarNodeArray& vars) {
mgb_assert(vars.size() == 2);
return vars[0]->shape().ndim > axis;
};
return std::make_tuple(builder, checker);
}
// vim: syntax=cpp.doxygen
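
Both emitters added here follow the Emitter convention of returning an EmitResult, i.e. a (builder, checker) pair: the checker validates the input vars (here, that the tensor rank covers the padding axis) and the builder constructs the replacement var. A minimal standalone sketch of that contract, with plain ints standing in for VarNode* purely as an illustrative assumption:

#include <cassert>
#include <cstdio>
#include <functional>
#include <tuple>
#include <vector>

using Vars = std::vector<int>;                    // stand-in for VarNodeArray
using Builder = std::function<int(const Vars&)>;  // stand-in for the var builder
using Checker = std::function<bool(const Vars&)>;
using EmitResult = std::tuple<Builder, Checker>;

// Hypothetical emitter: adds a fixed bias to the first "var".
EmitResult make_bias_emitter(int bias) {
    Builder builder = [bias](const Vars& vars) { return vars[0] + bias; };
    Checker checker = [](const Vars& vars) { return vars.size() == 1; };
    return std::make_tuple(builder, checker);
}

int main() {
    auto result = make_bias_emitter(3);
    Vars vars{4};
    assert(std::get<1>(result)(vars));               // run the checker first
    std::printf("%d\n", std::get<0>(result)(vars));  // then the builder -> 7
    return 0;
}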
@@ -12,12 +12,27 @@
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h"
using namespace mgb;
using namespace gopt;
using NamedTensorShape = megdnn::NamedTensorShape;
using Dimension = megdnn::Dimension;
namespace {
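// Euclidean algorithm: greatest common divisor of p and q (p is assumed non-zero).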
int gcd(const int& p, const int& q) {
int x = p, y = q;
while (y != 0) {
if (x < y) {
y = (y % x);
} else {
x = (x % y);
std::swap(x, y);
}
}
return x;
}
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
switch (format) {
case TensorFormats::NCHW:
@@ -371,6 +386,170 @@ ReformatManager::ReformatImpl ReformatManager::get(
})
}
ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
const VarNode* orig_var, TensorFormats orig_format,
const ReformatKey& key) const {
NamedTensorShape input_shape =
tensor_formats_to_named_tensor_shape(key.input_format);
NamedTensorShape output_shape =
tensor_formats_to_named_tensor_shape(key.output_format);
size_t input_alignment, output_alignment;
size_t input_channel_idx, output_channel_idx;
for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
input_channel_idx = i;
input_alignment = input_shape[i].stride();
break;
}
}
for (size_t i = 0; i < output_shape.ndim; ++i) {
if (output_shape[i].name() == Dimension::Name::C &&
output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
output_channel_idx = i;
output_alignment = output_shape[i].stride();
break;
}
}
NamedTensorShape orig_shape =
tensor_formats_to_named_tensor_shape(orig_format);
size_t orig_channel = 0;
for (size_t i = 0; i < orig_shape.ndim; ++i) {
if (orig_shape[i].name() == Dimension::Name::C &&
orig_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
orig_channel = orig_var->shape()[i] * orig_shape[i].stride();
break;
}
}
mgb_assert(orig_channel > 0,
"incompatible NamedTensorShape for feature(got:%s)",
orig_shape.to_string().c_str());
size_t aligned_in_channel =
divup(orig_channel, input_alignment) * input_alignment;
size_t aligned_out_channel =
divup(orig_channel, output_alignment) * output_alignment;
size_t common_alignment = input_alignment * output_alignment /
gcd(input_alignment, output_alignment);
size_t aligned_channel =
divup(orig_channel, common_alignment) * common_alignment;
auto builder = [key, aligned_channel, aligned_in_channel,
aligned_out_channel, input_shape, input_channel_idx,
output_shape,
output_channel_idx](const VarNodeArray& vars) {
VarNode *x, *cur;
x = cur = vars[0];
if (aligned_channel > aligned_in_channel) {
auto padding_shape = input_shape;
auto&& dim = padding_shape[input_channel_idx];
size_t const_extent =
(aligned_channel - aligned_in_channel) / dim.stride();
padding_shape[input_channel_idx] =
Dimension(dim.name(), dim.stride(), const_extent);
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x});
auto padding = std::get<0>(
PaddingEmitter{const_extent, input_channel_idx}.emit());
cur = padding({cur, padding_shp_var});
}
cur = ReformatManager::instance().get(key)({cur});
if (aligned_channel > aligned_out_channel) {
auto&& dim = output_shape[output_channel_idx];
size_t const_extent = aligned_out_channel / dim.stride();
auto sub = std::get<0>(
SubtensorEmitter{const_extent, output_channel_idx}.emit());
cur = sub({cur});
}
return cur;
};
return builder;
}
ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
const VarNode* orig_var, const ReformatKey& key,
const AlignmentDesc& extra_alignment) const {
size_t in_channels = 0, out_channels = 0;
size_t input_channel_idx, output_channel_idx;
Dimension::Name out_channel_name;
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
in_channels = orig_var->shape()[i];
input_channel_idx = i;
mgb_assert(input_shape[i].stride() == 1,
"unsupport weight format(got:%s)",
input_shape.to_string().c_str());
} else if ((input_shape[i].name() == Dimension::Name::K ||
input_shape[i].name() == Dimension::Name::N) &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
out_channels = orig_var->shape()[i];
out_channel_name = input_shape[i].name();
output_channel_idx = i;
mgb_assert(input_shape[i].stride() == 1,
"unsupport weight format(got:%s)",
input_shape.to_string().c_str());
}
}
size_t in_channel_alignment, out_channel_alignment = 1;
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
for (size_t i = 0; i < output_shape.ndim; ++i) {
if (output_shape[i].name() == Dimension::Name::C &&
output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
in_channel_alignment = output_shape[i].stride();
} else if (output_shape[i].name() == out_channel_name &&
output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
out_channel_alignment = output_shape[i].stride();
}
}
size_t aligned_in_channel =
divup(in_channels, in_channel_alignment) * in_channel_alignment;
if (extra_alignment.name == out_channel_name) {
out_channel_alignment =
extra_alignment.alignment * out_channel_alignment /
gcd(extra_alignment.alignment, out_channel_alignment);
}
size_t aligned_out_channel =
divup(out_channels, out_channel_alignment) * out_channel_alignment;
auto builder = [key, input_shape, in_channels, input_channel_idx,
aligned_in_channel, out_channels, output_channel_idx,
aligned_out_channel](const VarNodeArray& vars) {
VarNode *x, *cur;
x = cur = vars[0];
if (aligned_in_channel > in_channels) {
auto padding_shape = input_shape;
auto&& dim = padding_shape[input_channel_idx];
size_t const_extent =
(aligned_in_channel - in_channels) / dim.stride();
padding_shape[input_channel_idx] =
Dimension(dim.name(), dim.stride(), const_extent);
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x});
auto padding = std::get<0>(
PaddingEmitter{const_extent, input_channel_idx}.emit());
cur = padding({cur, padding_shp_var});
}
if (aligned_out_channel > out_channels) {
auto padding_shape = input_shape;
auto&& dim = padding_shape[output_channel_idx];
size_t const_extent =
(aligned_out_channel - out_channels) / dim.stride();
padding_shape[output_channel_idx] =
Dimension(dim.name(), dim.stride(), const_extent);
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({cur});
auto padding = std::get<0>(
PaddingEmitter{const_extent, output_channel_idx}.emit());
cur = padding({cur, padding_shp_var});
}
cur = ReformatManager::instance().get(key)({cur});
return cur;
};
return builder;
}
const ReformatManager& ReformatManager::instance() {
static ReformatManager inst;
return inst;
......
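
The feature path rounds the channel count up to the least common multiple of the input- and output-format channel alignments, pads the channel dimension when that exceeds the input-aligned count, applies the plain cached reformat, and slices the channels back down when the output alignment is smaller. A standalone sketch of that arithmetic, using the numbers of the AutoAlignedFeature test further down (C = 22, NCHWc4 to NCHWc32) with divup/gcd re-implemented locally and no MegBrain APIs:

#include <cstdio>

static size_t gcd(size_t x, size_t y) { return y == 0 ? x : gcd(y, x % y); }
static size_t divup(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t round_up(size_t v, size_t align) { return divup(v, align) * align; }

int main() {
    size_t orig_channel = 22;
    size_t in_align = 4;    // channel stride of NCHWc4 (the c4 block)
    size_t out_align = 32;  // channel stride of NCHWc32

    size_t common = in_align * out_align / gcd(in_align, out_align);  // lcm = 32
    size_t aligned = round_up(orig_channel, common);                  // 32
    size_t aligned_in = round_up(orig_channel, in_align);             // 24
    size_t aligned_out = round_up(orig_channel, out_align);           // 32

    if (aligned > aligned_in)   // pad 24 -> 32 channels before the reformat
        std::printf("pad channels %zu -> %zu\n", aligned_in, aligned);
    // ... plain NCHWc4 -> NCHWc32 reformat runs here ...
    if (aligned > aligned_out)  // nothing to slice in this example
        std::printf("slice channels back to %zu\n", aligned_out);
    return 0;
}

With these numbers only the padding branch fires, which matches the {.., (C + 31) / 32, H, W, 32} shape asserted by the AutoAlignedFeature test.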
@@ -77,6 +77,26 @@ private:
};
UnderlyingBuilders analyze() const;
};
class PaddingEmitter final : public Emitter {
public:
PaddingEmitter(size_t const_extent, size_t axis)
: m_const_extent{const_extent}, m_axis{axis} {}
EmitResult emit() const override;
private:
size_t m_const_extent, m_axis;
};
class SubtensorEmitter final : public Emitter {
public:
SubtensorEmitter(size_t const_extent, size_t axis)
: m_const_extent{const_extent}, m_axis{axis} {}
EmitResult emit() const override;
private:
size_t m_const_extent, m_axis;
};
} // namespace gopt
} // namespace mgb
......
@@ -101,12 +101,21 @@ public:
ReformatKey::Equal>;
ReformatImpl get(const ReformatKey& key) const;
ReformatImpl get(ReformatKey&& key) const { return get(key); }
ReformatImpl auto_aligned_reformat_featrue(const VarNode* orig_var,
TensorFormats orig_format,
const ReformatKey& key) const;
struct AlignmentDesc {
megdnn::Dimension::Name name;
size_t alignment;
};
ReformatImpl auto_aligned_reformat_weight(
const VarNode* orig_var, const ReformatKey& key,
const AlignmentDesc& extra_alignment = {}) const;
static const ReformatManager& instance();
private:
ReformatCache m_cache;
};
} // namespace gopt
} // namespace mgb
......
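
AlignmentDesc lets a caller impose an additional alignment on the weight's output-channel dimension; auto_aligned_reformat_weight folds it into the alignment taken from the output format via their least common multiple, then pads both channel dimensions before reformatting (nothing is sliced afterwards). A sketch with the AutoAlignedWeight test numbers (K = C = 32, NCHW to NCHWc64, extra alignment 64 on N); the per-format strides here are assumptions for the example only:

#include <cstdio>

static size_t gcd(size_t x, size_t y) { return y == 0 ? x : gcd(y, x % y); }
static size_t round_up(size_t v, size_t a) { return (v + a - 1) / a * a; }

int main() {
    size_t in_channels = 32, out_channels = 32;
    size_t in_channel_alignment = 64;   // channel stride of the NCHWc64 output format
    size_t out_channel_alignment = 1;   // stride of N in the output format (assumed)
    size_t extra_alignment = 64;        // AlignmentDesc{N, 64}

    out_channel_alignment = extra_alignment * out_channel_alignment /
                            gcd(extra_alignment, out_channel_alignment);  // lcm = 64

    std::printf("pad C %zu -> %zu, pad K %zu -> %zu, then reformat\n",
                in_channels, round_up(in_channels, in_channel_alignment),      // 32 -> 64
                out_channels, round_up(out_channels, out_channel_alignment));  // 32 -> 64
    return 0;
}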
@@ -13,7 +13,10 @@
#include "./helper.h"
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/plugin/base.h"
#include "megbrain/plugin/profiler.h"
using namespace mgb;
using namespace gopt;
@@ -168,4 +171,287 @@ TEST(TestReformatManager, InputChannelSmall) {
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestReformatManager, AutoAlignedFeature) {
constexpr size_t N = 16, C = 22, H = 55, W = 55;
HostTensorGenerator<> gen;
using ReformatKey = ReformatManager::ReformatKey;
auto src_format = TensorFormats::NCHWc4,
dst_format = TensorFormats::NCHWc32;
ReformatKey key{src_format, dst_format};
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
auto mkvar = [&](const char* name,
const std::shared_ptr<HostTensorND>& host_val) {
return opr::Host2DeviceCopy::make(*graph, host_val).rename(name);
};
auto orig_x = mkvar("orig_x", host_orig_x);
auto x = mkvar("x", host_x);
auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
orig_x.node(), TensorFormats::NCHW, key);
auto y = builder({x.node()});
HostTensorND t;
auto func = graph->compile({make_callback_copy(y, t)});
func->execute();
*host_x = *gen({(N + 5), (C + 3) / 4, H, W, 4});
func->execute();
*host_x = *gen({(N - 5), (C + 3) / 4, H, W, 4});
func->execute();
auto shp = TensorShape{(N - 5), (C + 31) / 32, H, W, 32};
ASSERT_TRUE(shp.eq_shape(t.shape()));
}
TEST(TestReformatManager, AutoAlignedFeatureB4) {
constexpr size_t N = 16, C = 94, H = 55, W = 55;
HostTensorGenerator<> gen;
using ReformatKey = ReformatManager::ReformatKey;
auto src_format = TensorFormats::NCHWc4,
dst_format = TensorFormats::NCHWc64;
ReformatKey key{src_format, dst_format};
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
auto mkvar = [&](const char* name,
const std::shared_ptr<HostTensorND>& host_val,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, host_val).rename(name),
dtype);
};
auto orig_x = mkvar("orig_x", host_orig_x,
dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(8)));
auto x = mkvar("x", host_x,
dtype::Quantized4Asymm(25.f, static_cast<uint8_t>(4)));
auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
orig_x.node(), TensorFormats::NCHW, key);
auto y = builder({x.node()});
HostTensorND t;
auto func = graph->compile({make_callback_copy(y, t)});
func->execute();
}
TEST(TestReformatManager, AutoAlignedWeight) {
constexpr size_t K = 32, C = 32, R = 3, S = 3;
HostTensorGenerator<> gen;
using ReformatKey = ReformatManager::ReformatKey;
auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc64;
ReformatKey key{src_format, dst_format};
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp) {
return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
};
auto w = mkvar("w", {K, C, R, S});
auto builder = ReformatManager::instance().auto_aligned_reformat_weight(
w.node(), key,
ReformatManager::AlignmentDesc{megdnn::Dimension::Name::N, 64});
auto y = builder({w.node()});
HostTensorND t;
auto func = graph->compile({make_callback_copy(y, t)});
func->execute();
}
#if MGB_CUDA
#include "megbrain/comp_node_env.h"
namespace {
class ReformatProfiler : public PluginBase {
using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;
public:
class MarkInputContiguous;
ReformatProfiler(cg::ComputingGraph* graph, cg::OperatorNodeBase* opr_start,
cg::OperatorNodeBase* opr_end);
~ReformatProfiler() noexcept;
double duration() const;
private:
CompNodeEventPtr m_start, m_end;
cg::OperatorNodeBase *m_opr_start, *m_opr_end;
};
ReformatProfiler::ReformatProfiler(cg::ComputingGraph* graph,
cg::OperatorNodeBase* opr_start,
cg::OperatorNodeBase* opr_end)
: PluginBase(graph), m_opr_start(opr_start), m_opr_end(opr_end) {
using namespace cg::event;
auto on_reformat_start = [this](BeforeKernel const& event) {
auto opr = event.opr;
if (opr != m_opr_start)
return;
if (m_start == nullptr) {
m_start = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
}
m_start->record();
};
auto on_reformat_end = [this](AfterKernel const& event) {
auto opr = event.opr;
if (opr != m_opr_end)
return;
if (m_end == nullptr) {
m_end = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
}
m_end->record();
};
auto&& ev = graph->event();
add_event_handler(ev.register_receiver<BeforeKernel>(on_reformat_start));
add_event_handler(ev.register_receiver<AfterKernel>(on_reformat_end));
}
ReformatProfiler::~ReformatProfiler() noexcept {
if (m_start)
m_start->host_wait();
if (m_end)
m_end->host_wait();
}
double ReformatProfiler::duration() const {
mgb_assert(m_end);
m_end->host_wait();
return m_start->elapsed_time_until(*m_end) -
m_start->elapsed_time_until(*m_start);
}
MGB_DEFINE_OPR_CLASS(ReformatProfiler::MarkInputContiguous,
cg::SingleCNOperatorNodeBase) // {
void scn_do_execute() override{};
void init_output_static_infer_desc() override;
void add_input_layout_constraint() override;
public:
MarkInputContiguous(VarNode* node, const OperatorNodeConfig& config);
static SymbolVar make(SymbolVar node, const OperatorNodeConfig& config = {});
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(ReformatProfiler::MarkInputContiguous);
ReformatProfiler::MarkInputContiguous::MarkInputContiguous(
VarNode* node, const OperatorNodeConfig& config)
: Super(node->owner_graph(), config, "mark_contiguous", {node}) {
add_input({node});
add_output(None);
}
SymbolVar ReformatProfiler::MarkInputContiguous::make(
SymbolVar node, const OperatorNodeConfig& config) {
return node.insert_single_output_opr<MarkInputContiguous>(node.node(),
config);
}
void ReformatProfiler::MarkInputContiguous::init_output_static_infer_desc() {
using namespace cg::static_infer;
auto&& mgr = owner_graph()->static_infer_manager();
mgr.register_shape_infer(output(0),
ShapeInferDesc::make_identity(input(0)));
}
void ReformatProfiler::MarkInputContiguous::add_input_layout_constraint() {
input(0)->add_layout_constraint_contiguous();
}
class CUTimer {
public:
CUTimer(cudaStream_t& stream, cudaEvent_t& evt0, cudaEvent_t& evt1)
: m_stream{stream}, m_evt0{evt0}, m_evt1{evt1} {
reset();
}
void reset() {
m_started = false;
m_stopped = false;
}
void start() {
mgb_assert(!m_started);
mgb_assert(!m_stopped);
m_started = true;
cudaEventRecord(m_evt0, m_stream);
}
void stop() {
mgb_assert(m_started);
mgb_assert(!m_stopped);
m_stopped = true;
cudaEventRecord(m_evt1, m_stream);
}
size_t get_time_in_us() const {
cudaStreamSynchronize(m_stream);
float t = -1;
cudaEventElapsedTime(&t, m_evt0, m_evt1);
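        // cudaEventElapsedTime reports milliseconds; scale to microseconds below.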
return static_cast<size_t>(t * 1e3);
}
private:
bool m_started, m_stopped;
size_t m_start_point, m_stop_point;
cudaStream_t& m_stream;
cudaEvent_t &m_evt0, &m_evt1;
};
} // namespace
TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpux");
using ReformatKey = ReformatManager::ReformatKey;
auto dtype = dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(4));
HostTensorND hval(cn, dtype);
constexpr size_t N = 16, C = 18, H = 55, W = 55;
hval.resize({N, (C + 63) / 64, H, W, 64});
std::shared_ptr<DeviceTensorND> dval =
std::make_shared<DeviceTensorND>(cn, dtype);
dval->copy_from(hval).sync();
std::shared_ptr<DeviceTensorND> dprime =
std::make_shared<DeviceTensorND>(cn, dtype);
dprime->resize({N, C, H, W});
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
graph->options().var_sanity_check_first_run = false;
auto x = opr::VolatileSharedDeviceTensor::make(*graph, dval);
auto xprime = opr::VolatileSharedDeviceTensor::make(*graph, dprime);
ReformatKey key{TensorFormats::NCHWc64, TensorFormats::NCHW};
auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
xprime.node(), TensorFormats::NCHW, key);
auto y = builder({x.node()});
auto mark = ReformatProfiler::MarkInputContiguous::make(SymbolVar(y));
auto cb = [](DeviceTensorND& d) { MGB_MARK_USED_VAR(d); };
auto output_spec = std::make_pair(mark, cb);
auto func = graph->compile({output_spec});
static constexpr size_t RUNS = 100;
cn.activate();
auto stream = CompNodeEnv::from_comp_node(cn).cuda_env().stream;
cudaEvent_t evt0;
cudaEvent_t evt1;
MGB_CUDA_CHECK(cudaEventCreate(&evt0));
MGB_CUDA_CHECK(cudaEventCreate(&evt1));
CUTimer timer(stream, evt0, evt1);
timer.start();
for (size_t i = 0; i < RUNS; ++i)
func->execute();
timer.stop();
double time_cuda_evt = timer.get_time_in_us() / static_cast<double>(RUNS);
OperatorNodeBase* start = x.node()->owner_opr();
OperatorNodeBase* end = y->owner_opr();
std::unique_ptr<ReformatProfiler> profiler =
std::make_unique<ReformatProfiler>(graph.get(), start, end);
ASSERT_TRUE(y->shape().eq_shape(TensorShape{N, C, H, W}));
for (size_t i = 0; i < RUNS; ++i)
func->execute();
double time_profiler = profiler->duration() * 1e6;
printf("%f, %f\n", time_profiler, time_cuda_evt);
ASSERT_EQ(time_cuda_evt, time_profiler);
MGB_CUDA_CHECK(cudaEventDestroy(evt0));
MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}