/**
 * \file src/core/test/sublinear_memory.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/graph.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megbrain/serialization/sereg.h"
#include "megbrain/test/helper.h"

using namespace mgb;

#if MGB_ENABLE_SUBLINEAR
namespace mgb {
namespace cg {

// Local declarations that let the BadOpr test below call
// seq_modifier_for_sublinear_memory().prev_min_bottleneck() on a graph.
// NOTE(review): presumably these mirror the real class definitions in the
// graph implementation; confirm they stay compatible when those change.
class SeqModifierForSublinearMemory {
public:
    const CompNode::UnorderedMap<size_t>& prev_min_bottleneck();
};

class ComputingGraphImpl : public ComputingGraph {
public:
    SeqModifierForSublinearMemory& seq_modifier_for_sublinear_memory();
};

};  // namespace cg
};  // namespace mgb

namespace {

/*!
 * \brief test-only operator whose output is a 1-dim tensor with
 *      (number of input elements) * scale elements
 *
 * If \p bad is true the node is flagged NO_AUTOMATIC_DUP. The operator must
 * never be executed (scn_do_execute() asserts); the test that uses it only
 * compiles the graph and inspects the resulting operator sequence.
 */
MGB_DEFINE_OPR_CLASS(SublinearBadOpr, cg::SingleCNOperatorNodeBase) // {
    bool m_flag;
    size_t m_scale;

    void scn_do_execute() override { mgb_assert(0); }

    NodeProp* do_make_node_prop() const override {
        auto prop = Super::do_make_node_prop();
        if (m_flag) {
            prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
        }
        return prop;
    }

    void init_output_static_infer_desc() override {
        using namespace cg::static_infer;
        auto &&mgr = owner_graph()->static_infer_manager();
        // output shape depends only on the input shape: {nr_elems * scale}
        auto infer_shape = [this](TensorShape& dst, const InpVal &inp) {
            size_t n = inp.val.at(0).shape().total_nr_elems();
            dst = TensorShape{n * m_scale};
            return true;
        };
        mgr.register_shape_infer(output(0),
                {SourceType::DEP,
                 {{input(0), DepType::SHAPE}},
                 infer_shape});
    }

public:
    SublinearBadOpr(VarNode* inp, bool bad, size_t scale,
            OperatorNodeConfig config = {}):
        Super{inp->owner_graph(), config, "subliner_bad_op", {inp}},
        m_flag{bad}, m_scale{scale}
    {
        add_input({inp});
        add_output(None);
    }

    //! insert a SublinearBadOpr into the owner graph of \p inp
    static SymbolVar make(SymbolVar inp, bool bad, size_t scale,
            OperatorNodeConfig config = {}) {
        return inp.node()->owner_graph()->insert_opr(
                std::make_unique<SublinearBadOpr>(
                    inp.node(), bad, scale, config))
            ->output(0);
    }

    bool flag() const { return m_flag; }
    size_t scale() const { return m_scale; }
};

MGB_DYN_TYPE_OBJ_FINAL_IMPL(SublinearBadOpr);

//! shallow-copy handler: re-create the operator on the new (single) input
//! with the same flag and scale
cg::OperatorNodeBase* bad_opr_shallow_copy(
        const serialization::OprShallowCopyContext &ctx,
        const cg::OperatorNodeBase &opr_, const VarNodeArray &inputs,
        const OperatorNodeConfig& config) {
    mgb_assert(inputs.size() == 1);
    auto &&opr = opr_.cast_final_safe<SublinearBadOpr>();
    return SublinearBadOpr::make(
            inputs[0], opr.flag(), opr.scale(), config).node()->owner_opr();
}

MGB_REG_OPR_SHALLOW_COPY(SublinearBadOpr, bad_opr_shallow_copy);

};  // anonymous namespace

#if MGB_CUDA
//! require one GPU and skip (return from) the test when "gpu0" reports no
//! more than 5GiB in get_mem_status_bytes().second
#define CHECK_REQ                                                         \
    do {                                                                  \
        /* force use gpu because on CPU it is too slow */                 \
        REQUIRE_GPU(1);                                                   \
        if (CompNode::load("gpu0").get_mem_status_bytes().second <=       \
            5ull * 1024 * 1024 * 1024) {                                  \
            mgb_log_warn(                                                 \
                    "test skipped due to "                                \
                    "insufficient available gpu memory");                 \
            return;                                                       \
        }                                                                 \
    } while (0)

/*!
 * Chain of 10 relu(conv) layers (5 output channels, 3x3 kernels). Parameter
 * gradients computed with sublinear memory optimization enabled must be
 * within 1e-3 of those computed with it disabled.
 */
TEST(TestSublinearMemory, FullConv) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = gen({N, 1, H, W});

    auto graph = ComputingGraph::make();
    SymbolVarArray params;

    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        gen_.std(sqrt(2.0 / (out_chl * h * w)));
        auto host_kern = gen({oc, out_chl, h, w});
        auto dev_kern = std::make_shared<DeviceTensorND>();
        dev_kern->copy_from(*host_kern);
        params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
        out = opr::relu(opr::Convolution::make(
                out,
                params.back().rename(ssprintf("param%zu", layer_count)),
                {}));
        out.rename(ssprintf("out%zu", layer_count));
        ++layer_count;
        out_chl = oc;
    };

    for (int i = 0; i < 10; ++i)
        add_layer(5, 3, 3);

    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < params.size(); ++i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }

    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());

    // first run (sublinear disabled) produces the reference gradients; the
    // second run overwrites grad_params_get with the sublinear results
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }

    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i],
                               1e-3);
}

/*!
 * Same gradient comparison as FullConv, but every layer splits its input
 * into 2 parts along axis 1, applies relu(conv) to each part and
 * concatenates the results along axis 1.
 */
TEST(TestSublinearMemory, ConcatSplit) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = gen({N, 2, H, W});

    auto graph = ComputingGraph::make();
    SymbolVarArray params;

    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        auto prev =
                opr::Split::make(out, opr::Split::Options::make_average(1, 2));
        SymbolVarArray cur_out(2);
        size_t cur_in_chl[] = {out_chl / 2, out_chl - out_chl / 2};
        size_t cur_out_chl[] = {oc / 2, oc - oc / 2};
        for (int i = 0; i < 2; ++i) {
            gen_.std(sqrt(2.0 / (cur_in_chl[i] * h * w)));
            auto host_kern = gen({cur_out_chl[i], cur_in_chl[i], h, w});
            auto dev_kern = std::make_shared<DeviceTensorND>();
            dev_kern->copy_from(*host_kern);
            params.emplace_back(
                    opr::SharedDeviceTensor::make(*graph, dev_kern));
            cur_out[i] =
                    opr::relu(opr::Convolution::make(
                                      prev[i],
                                      params.back().rename(ssprintf(
                                              "param%zu:%d", layer_count, i)),
                                      {}))
                            .rename(ssprintf("out%zu:%d", layer_count, i));
        }
        ++layer_count;
        out_chl = oc;
        out = opr::Concat::make(cur_out, 1);
    };

    for (int i = 0; i < 10; ++i)
        add_layer(6, 3, 3);

    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < params.size(); ++i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }

    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());

    // reference gradients come from the run with sublinear disabled
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }

    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i],
                               1e-3);
}

/*!
 * A single layer that splits its input into 3 parts along axis 1 and
 * convolves each part; the third branch is additionally followed by 10
 * chained relu oprs. Gradients with and without sublinear memory
 * optimization must agree within 1e-3.
 */
TEST(TestSublinearMemory, MultiOutputOpr) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = gen({N, 3, H, W});

    auto graph = ComputingGraph::make();
    SymbolVarArray params;

    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        auto prev =
                opr::Split::make(out, opr::Split::Options::make_average(1, 3));
        SymbolVarArray cur_out(3);
        size_t cur_in_chl[] = {out_chl / 3, out_chl / 3,
                               out_chl - out_chl / 3 * 2};
        size_t cur_out_chl[] = {oc / 3, oc / 3, oc - oc / 3 * 2};
        for (int i = 0; i < 3; ++i) {
            gen_.std(sqrt(2.0 / (cur_in_chl[i] * h * w)));
            auto host_kern = gen({cur_out_chl[i], cur_in_chl[i], h, w});
            auto dev_kern = std::make_shared<DeviceTensorND>();
            dev_kern->copy_from(*host_kern);
            params.emplace_back(
                    opr::SharedDeviceTensor::make(*graph, dev_kern));
            auto f = opr::Convolution::make(
                    prev[i],
                    params.back().rename(
                            ssprintf("param%zu:%d", layer_count, i)),
                    {});
            if(i == 2)
                for(size_t j = 0; j < 10; ++ j)
                    f = opr::relu(f);
            cur_out[i] = f;
        }
        ++layer_count;
        out_chl = oc;
        out = opr::Concat::make(cur_out, 1);
    };

    add_layer(6, 3, 3);

    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < params.size(); ++i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }

    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());

    // reference gradients come from the run with sublinear disabled
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }

    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i],
                               1e-3);
}

/*!
 * Chain of 100 padded 3x3 relu(conv) layers whose output channel counts
 * cycle through OC[]. Gradients are requested in reverse parameter order,
 * the compiled function is dumped to json for both settings, and results
 * with/without sublinear memory optimization must agree within 1e-4.
 */
TEST(TestSublinearMemory, LongChain) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 32, C = 3, H = 224, W = 224;
    auto host_data = gen({N, C, H, W});

    auto graph = ComputingGraph::make();
    SymbolVarArray params;

    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    opr::Convolution::Param conv_param;
    conv_param.pad_h = 1;
    conv_param.pad_w = 1;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        gen_.std(sqrt(2.0 / (out_chl * h * w)));
        auto host_kern = gen({oc, out_chl, h, w});
        auto dev_kern = std::make_shared<DeviceTensorND>();
        dev_kern->copy_from(*host_kern);
        params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
        out = opr::relu(opr::Convolution::make(
                out,
                params.back().rename(ssprintf("param%zu", layer_count)),
                conv_param));
        out.rename(ssprintf("out%zu", layer_count));
        ++layer_count;
        out_chl = oc;
    };

    int OC[] = {1, 1, 1, 12, 1, 1, 1, 1, 15, 1};
    for (int i = 1; i <= 10; ++i) {
        for (int j = 0; j < 10; j++)
            add_layer(OC[j], 3, 3);
    }

    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    // output spec is filled from the last parameter to the first
    for (int i = params.size() - 1; i >= 0; --i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }

    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());

    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        func->to_json()->writeto_fpath(output_file(
                ssprintf("TestSublinearMemory.LongChain%d.json", sublinear)));
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }

    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i],
                               1e-4);
}
#endif  // MGB_CUDA

/*!
 * x0 (N floats) is read by three CallbackInjector oprs that are separated,
 * via explicit priorities, by copies/checks of two 2N-float tensors. With
 * sublinear memory optimization enabled, the size reported by the
 * StaticMemAlloc event must be non-zero and below 2.5 * NS bytes.
 */
TEST(TestSublinearMemory, MultiReuse) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
    auto host_x = gen({N}), host_y0 = gen({N * 2}), host_y1 = gen({N * 2}),
         host_z = gen({N});

    // wrap val in a CallbackInjector that asserts the device value equals
    // expected (captured by value)
    auto call_check = [&](SymbolVar val, const HostTensorND& expected) {
        auto cb = [expected](const DeviceTensorND& val) {
            HostTensorND get;
            get.copy_from(val).sync();
            MGB_ASSERT_TENSOR_EQ(expected, get);
        };
        return opr::CallbackInjector::make(val, {true, cb});
    };

    // x0 should be discarded after x2 finishes
    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
         z0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_z),
         z1 = call_check(z0, *host_z),
         x1 = call_check(x0, *host_x),
         x2 = call_check(x0, *host_x),
         y0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y0),
         y01 = call_check(y0, *host_y0),
         y1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y1),
         y11 = call_check(y1, *host_y1),
         x3 = call_check(x0, *host_x);

    SymbolVar vars[] = {x0, z0, z1, x1, x2, y0, y01, y1, y11, x3};
    ComputingGraph::OutputSpec out_spec;
    // priority == position in vars[], so the listed order is the intended
    // execution order
    for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
        set_priority(vars[i], i);
        out_spec.push_back({vars[i], {}});
    }

    size_t alloc_size = 0;
    auto alloc_size_hdl =
            graph->event().register_receiver<cg::event::StaticMemAlloc>(
                    [&](const cg::event::StaticMemAlloc& s) {
                        if (s.comp_node.valid()) {
                            alloc_size = s.alloc_size;
                        }
                    });

    graph->options().enable_sublinear_memory_opt = true;
    auto func = graph->compile(out_spec);
    func->execute();

    ASSERT_GT(alloc_size, 0u);
    ASSERT_LT(alloc_size, NS * 2 + (NS / 2));
}

/*!
 * Graph that contains a var marked dynamic (MarkDynamicVar) whose shape
 * feeds a reshape. With sublinear memory optimization enabled, checks the
 * reported static allocation size (non-zero, below 2.5 * NS bytes) and the
 * computed values of y1 and t1.
 */
TEST(TestSublinearMemory, DynamicShape) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
    auto host_x = gen({N}), host_p = gen({N}), host_t = gen({N / 2 + 1, 2});

    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x"),
         y0 = (x + 1.f).rename("y0"),
         y1 = (y0 + .4f).rename("y1"),
         p = opr::Host2DeviceCopy::make_no_fwd(*graph, host_p).rename("p"),
         po0 = (p + .5f).rename("po0"),
         po1 = (p + .4f).rename("po1"),
         po = (po0 + po1).rename("po"),
         xt = (x + .5f).rename("xt"),
         xdyn = opr::MarkDynamicVar::make(xt),
         t1_shp = (opr::GetVarShape::make(xdyn) + 2).rename("t0"),
         t0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_t),
         t1 = t0.reshape(t1_shp);

    set_priority(y0, 1);
    set_priority(y1, 1);
    set_priority(p, 2);
    set_priority(po, 2);
    set_priority(xt, 3);
    set_priority(xdyn, 4);
    set_priority(t0, 5);

    HostTensorND host_y1, host_t1;

    size_t alloc_size = 0;
    auto alloc_size_hdl =
            graph->event().register_receiver<cg::event::StaticMemAlloc>(
                    [&](const cg::event::StaticMemAlloc& s) {
                        if (s.comp_node.valid()) {
                            alloc_size = s.alloc_size;
                        }
                    });

    graph->options().graph_opt_level = 0;
    graph->options().enable_sublinear_memory_opt = true;
    auto func = graph->compile({make_callback_copy(y1, host_y1),
                                {po, {}},
                                make_callback_copy(t1, host_t1)});
    func->execute().to_json()->writeto_fpath(
            output_file("TestSublinearMemory.DynamicShape.json"));

    ASSERT_GT(alloc_size, 0u);
    ASSERT_LT(alloc_size, NS * 2 + NS / 2);

    auto px = host_x->ptr<float>(), py = host_y1.ptr<float>();
    for (size_t i = 0; i < N; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + 1.4f, py[i]);
    }
    // t1 is t0 reshaped to {N + 2}; resize the host tensor to that shape
    // before comparing
    host_t->resize({N + 2});
    MGB_ASSERT_TENSOR_EQ(*host_t, host_t1);
}

//! a graph consisting of a single SharedDeviceTensor must compile and
//! execute with sublinear memory optimization enabled
TEST(TestSublinearMemory, EmptyGraph) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().enable_sublinear_memory_opt = true;
    auto x = opr::SharedDeviceTensor::make(*graph, *gen({1}));
    auto func = graph->compile({{x, {}}});
    func->execute();
}

/*!
 * After compiling, with and without sublinear memory optimization, the
 * dep_map of the AddUpdate opr (y4) must contain exactly one entry for y0,
 * the first reader of the updated var x4.
 */
TEST(TestSublinearMemory, DepsInTopoSort) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024;
    auto host_x0 = gen({N}), host_x1 = gen({N}), host_x2 = gen({N}),
         host_x3 = gen({N});

    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x0),
         x1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x1),
         x2 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x2),
         x3 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x3),
         x4 = opr::SharedDeviceTensor::make(*graph, *host_x0),
         y0 = x3 + x4,
         y1 = y0 + x2,
         y2 = y1 + x1,
         y3 = y2 + x0,
         y4 = opr::AddUpdate::make(x4, y3);

    SymbolVar vars[] = {x0, x1, x2, x3, x4, y0, y1, y2, y3, y4};
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
        set_priority(vars[i], i);
        out_spec.push_back({vars[i], {}});
    }

    graph->options().graph_opt_level = 0;
    for (bool enable_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = enable_sublinear;
        auto func = graph->compile(out_spec);
        ASSERT_EQ(1u, y4.node()->owner_opr()->node_prop().dep_map().count(
                              y0.node()));
    }
}

/*!
 * Compiles a graph containing SublinearBadOpr with and without the
 * NO_AUTOMATIC_DUP flag, then checks the bottleneck reported by
 * prev_min_bottleneck() and the number of SublinearBadOpr instances in the
 * compiled operator sequence. The function is never executed.
 */
TEST(TestSublinearMemory, BadOpr) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("xpu0");
    constexpr size_t N = 1024, Scale = 2;
    auto host_x = gen({N}, cn);

    for (bool bad : {false, true}) {
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
             bad_var = SublinearBadOpr::make(x, bad, Scale),
             y0 = opr::reduce_sum(bad_var, x.make_scalar_dt(1)),
             y1 = SublinearBadOpr::make(y0, false, N * Scale),
             y = y1 + 1,
             z = opr::reduce_max(bad_var, x.make_scalar_dt(1));

        set_priority(y0, 0);
        set_priority(y1, 1);
        set_priority(y, 2);
        set_priority(z, 3);

        graph->options().graph_opt_level = 0;
        graph->options().enable_sublinear_memory_opt = 1;
        graph->options().sublinear_mem_cofig.genetic_nr_iter = 50;
        auto func = graph->compile({{y, {}}, {z, {}}});

        auto&& results = static_cast<cg::ComputingGraphImpl*>(graph.get())
                ->seq_modifier_for_sublinear_memory().prev_min_bottleneck();
        // bottleneck:
        //  if bad : y = y1 + 1, bad_var should be saved to calculate
        //      z later, total memory usage is
        //      N * scale * 2(bad_var and y1) + 1 (immutable tensor 1)
        //  else : bad_var = BadOpr(x), total memory usage is
        //      N(x) + N * scale(bad_var), bad_var would be recomputed
        //      when calculate z = reduce(bad_var)
        size_t expect = bad ? N * Scale * 2 + 1 : N * Scale + N;
        ASSERT_EQ(results.at(cn), expect * host_x->dtype().size());

        // count SublinearBadOpr instances in the compiled sequence: 2 when
        // the flagged opr is kept, 3 when bad_var is duplicated
        size_t nr_bad_opr = 0;
        auto count_up = [&nr_bad_opr](cg::OperatorNodeBase* op) {
            if (op->dyn_typeinfo() == SublinearBadOpr::typeinfo()) {
                ++ nr_bad_opr;
            }
            return true;
        };
        func->iter_opr_seq(count_up);
        ASSERT_EQ(nr_bad_opr, bad ? 2 : 3);
    }
}

#else
#pragma message "tests are disabled as Sublinear is not enabled."
#endif  // MGB_ENABLE_SUBLINEAR

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}