diff --git a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc index 60ae2cd6cc7c8b4e443e413be54020464a7b42fa..394c5ce2818a03d357f410aeaec0cd4863609c40 100644 --- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc +++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc @@ -21,7 +21,7 @@ #include "pre_activate/ascend/ir_fission/bn_grad_split.h" #include "pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h" #include "pre_activate/ascend/ir_fission/layer_norm_grad_split.h" -#include "pre_activate/common/ir_fusion/allreduce_fusion.h" +#include "pre_activate/pass/allreduce_fusion.h" #include "pre_activate/ascend/ir_fusion/square_sum_fusion.h" #include "pre_activate/ascend/ir_fusion/clip_by_norm_no_div_square_sum_fusion.h" #include "pre_activate/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.h" @@ -58,8 +58,10 @@ #include "pre_activate/ascend/ir_fission/add_memcpy_async.h" #include "pre_activate/ascend/format_type/insert_cast_for_runop.h" #include "pre_activate/ascend/format_type/insert_transdata_for_runop.h" +#include "pre_activate/ascend/enhancer/getnext_memcpy_elimination.h" #include "pre_activate/ascend/ir_fission/addn_fission.h" #include "utils/context/ms_context.h" +#include "utils/config_manager.h" #include "debug/anf_ir_dump.h" #include "debug/anf_ir_utils.h" @@ -244,6 +246,9 @@ void AscendBackendOptimization(const std::shared_ptr &kern other_pm->AddPass(std::make_shared()); other_pm->AddPass(std::make_shared()); other_pm->AddPass(std::make_shared()); + if (context_ptr->enable_task_sink() && context_ptr->loop_sink_flag() && ConfigManager::GetInstance().iter_num() > 1) { + other_pm->AddPass(std::make_shared()); + } other_pm->AddPass(std::make_shared()); optimizer->AddPassManager(other_pm); (void)optimizer->Optimize(kernel_graph); diff --git a/mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc 
b/mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a39918eceee1460c71c27b62ced2877102af50fb
--- /dev/null
+++ b/mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc
@@ -0,0 +1,85 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pre_activate/ascend/enhancer/getnext_memcpy_elimination.h"
+#include
+#include "session/anf_runtime_algorithm.h"
+#include "optimizer/opt.h"
+
+namespace mindspore::opt {
+
+// Pattern: the primitive named kMemCpyAsyncOpName applied to the pattern variable x.
+const BaseRef GetnextMemcpyElimination::DefinePattern() const {
+  auto prim_memcpy = std::make_shared(kMemCpyAsyncOpName);
+  VarPtr x = std::make_shared();
+  VectorRef memcpy_async({prim_memcpy, x});
+  return memcpy_async;
+}
+
+// Removes a matched memcpy node when all of the following hold:
+//   1. the memcpy carries the attr kAttrLabelForInsertStreamActive;
+//   2. the memcpy's output has at most one user;
+//   3. that user is a cnode whose inputs() has exactly 2 entries.
+// On success the attr is set on the user and the memcpy's input(1) is returned as the
+// replacement; nullptr is returned whenever the node must be left untouched.
+const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
+                                                   const EquivPtr &equiv) const {
+  if (graph == nullptr || node == nullptr || equiv == nullptr) {
+    return nullptr;
+  }
+  auto memcpy_cnode = node->cast();
+  if (memcpy_cnode == nullptr) {
+    return nullptr;
+  }
+
+  // 1. memcpy has attr kAttrLabelForInsertStreamActive
+  if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, node)) {
+    MS_LOG(DEBUG) << "node has no label_for_insert_stream_active attr";
+    return nullptr;
+  }
+
+  // 2. memcpy's output has only one user next_node
+  auto manager = graph->manager();
+  MS_EXCEPTION_IF_NULL(manager);
+  if (manager->node_users().find(memcpy_cnode) == manager->node_users().end()) {
+    MS_LOG(EXCEPTION) << "memcpy has no output in manager";
+  }
+  // Bind by reference to avoid copying the user set; it is only read in this function.
+  const auto &next_nodes = manager->node_users()[memcpy_cnode];
+  if (next_nodes.size() > 1) {
+    MS_LOG(DEBUG) << "node's output has more than one users";
+    return nullptr;
+  }
+
+  // 3. next_node has only one input which is memcpy's output
+  for (auto &item : next_nodes) {
+    auto next_node = item.first->cast();
+    // A failed cast yields nullptr; leave the graph untouched instead of dereferencing it.
+    if (next_node == nullptr) {
+      MS_LOG(DEBUG) << "next node is not a cnode";
+      return nullptr;
+    }
+    if (next_node->inputs().size() != 2) {
+      MS_LOG(DEBUG) << "next node has more than one input";
+      return nullptr;
+    }
+    // add attr label_for_insert_stream_active for next_node
+    AnfAlgo::SetNodeAttr(kAttrLabelForInsertStreamActive, MakeValue(true), next_node);
+  }
+
+  return memcpy_cnode->input(1);
+}
+}  // namespace mindspore::opt
diff --git a/mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.h b/mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.h
new file mode 100644
index 0000000000000000000000000000000000000000..523fc87a383d22fdac61460976c142f1c6afe5e9
--- /dev/null
+++ b/mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H_
+#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H_
+
+#include "pre_activate/common/optimizer.h"
+
+namespace mindspore {
+namespace opt {
+class GetnextMemcpyElimination : public PatternProcessPass {  // registered as "getnext_memcpy_elimination"
+ public:
+  explicit GetnextMemcpyElimination(bool multigraph = true)
+      : PatternProcessPass("getnext_memcpy_elimination", multigraph) {}
+  ~GetnextMemcpyElimination() override = default;
+  const BaseRef DefinePattern() const override;  // pattern to match; defined in the .cc file
+  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
+};
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H_
diff --git a/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.cc b/mindspore/ccsrc/pre_activate/pass/allreduce_fusion.cc
similarity index 97%
rename from mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.cc
rename to mindspore/ccsrc/pre_activate/pass/allreduce_fusion.cc
index 55efcf905860f6c6585c931c0f9a065ddebcac9a..70a8974ecaf986b581d136aef75a712d09c9bf57 100644
--- a/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.cc
+++ b/mindspore/ccsrc/pre_activate/pass/allreduce_fusion.cc
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
*/ -#include "pre_activate/common/ir_fusion/allreduce_fusion.h" +#include "pre_activate/pass/allreduce_fusion.h" #include #include diff --git a/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.h b/mindspore/ccsrc/pre_activate/pass/allreduce_fusion.h similarity index 86% rename from mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.h rename to mindspore/ccsrc/pre_activate/pass/allreduce_fusion.h index b49b8373c6965b378a75fe235b6b5363682ee78b..e443767e43db5d32bc06fcd25cf38836b372dd47 100644 --- a/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.h +++ b/mindspore/ccsrc/pre_activate/pass/allreduce_fusion.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_IR_FUSION_ALLREDUCE_FUSION_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_IR_FUSION_ALLREDUCE_FUSION_H_ +#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ALLREDUCE_FUSION_H_ +#define MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ALLREDUCE_FUSION_H_ #include #include "pre_activate/common/pass.h" @@ -46,4 +46,4 @@ class AllReduceFusion : public Pass { }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_IR_FUSION_ALLREDUCE_FUSION_H_ +#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_ALLREDUCE_FUSION_H_ diff --git a/mindspore/ccsrc/session/gpu_session.cc b/mindspore/ccsrc/session/gpu_session.cc index c0b2323e0498b0a2607bcd787e7f0456fffb2aae..c6ab1e46646eead9226b6fc7fc0789df3ca50599 100644 --- a/mindspore/ccsrc/session/gpu_session.cc +++ b/mindspore/ccsrc/session/gpu_session.cc @@ -20,7 +20,7 @@ #include "device/gpu/gpu_stream_assign.h" #include "pre_activate/common/optimizer.h" #include "pre_activate/common/pass_manager.h" -#include "pre_activate/common/ir_fusion/allreduce_fusion.h" +#include "pre_activate/pass/allreduce_fusion.h" #include "device/kernel_runtime_manager.h" #include "predict/predict.h" #include "common/utils.h" diff --git 
a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h index 08a98a3129024cd6caaf71a31b9f5898f3f18db2..2859b5613fdd2996167e48afff709ed6af0bc799 100644 --- a/mindspore/ccsrc/utils/utils.h +++ b/mindspore/ccsrc/utils/utils.h @@ -147,6 +147,7 @@ constexpr auto kAttrSrcFormat = "src_format"; constexpr auto kAttrOutputUsedNum = "output_used_num"; constexpr auto kAttrHasBias = "has_bias"; constexpr auto kAttrN = "N"; +constexpr auto kAttrLabelForInsertStreamActive = "label_for_insert_stream_active"; // attr value constexpr auto kValueTargetSwitch = "target_switch"; diff --git a/tests/ut/cpp/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc b/tests/ut/cpp/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc new file mode 100644 index 0000000000000000000000000000000000000000..93885a4b3ac988139756c12d1cb23566725774b1 --- /dev/null +++ b/tests/ut/cpp/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc @@ -0,0 +1,98 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "common/backend_common_test.h"
+#include "common/py_func_graph_fetcher.h"
+#include "session/anf_runtime_algorithm.h"
+#include "operator/ops.h"
+#include "ir/meta_tensor.h"
+#include "debug/anf_ir_dump.h"
+#include "utils/utils.h"
+#include "kernel/kernel_build_info.h"
+#include "pre_activate/common/optimizer.h"
+#include "mindspore/ccsrc/pre_activate/ascend/enhancer/getnext_memcpy_elimination.h"
+
+namespace mindspore {
+namespace opt {
+class TestGetNextMemcpyElimination : public BackendCommon {  // fixture: graphs come from the Python input module
+ public:
+  TestGetNextMemcpyElimination() : get_py_fun_("gtest_input.pre_activate.getnext_memcpy_elimination_test", true) {}
+
+ public:
+  UT::PyFuncGraphFetcher get_py_fun_;  // fetches the "before"/"after" graph for a named Python builder
+};
+
+TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "before");
+  ASSERT_TRUE(g_before != nullptr);
+
+  auto optimizer = std::make_shared();
+  auto pm = std::make_shared();
+  auto pass = std::make_shared();
+  pm->AddPass(pass);
+  optimizer->AddPassManager(pm);
+  auto new_graph = optimizer->Optimize(g_before);
+
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "after");
+  EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));  // expected graph has the memcpy_async removed
+}
+
+TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_no_attr) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "before");
+  ASSERT_TRUE(g_before != nullptr);
+
+  auto optimizer = std::make_shared();
+  auto pm = std::make_shared();
+  auto pass = std::make_shared();
+  pm->AddPass(pass);
+  optimizer->AddPassManager(pm);
+  auto new_graph = optimizer->Optimize(g_before);
+
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "after");
+  EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));  // expected graph is unchanged: memcpy has no attr
+}
+
+TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_memcpy_multi_users) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "before");
+  ASSERT_TRUE(g_before != nullptr);
+
+  auto optimizer = std::make_shared();
+  auto pm = std::make_shared();
+  auto pass = std::make_shared();
+  pm->AddPass(pass);
+  optimizer->AddPassManager(pm);
+  auto new_graph = optimizer->Optimize(g_before);
+
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "after");
+  EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));  // expected graph is unchanged: memcpy has two users
+}
+
+TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_next_multi_inputs) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "before");
+  ASSERT_TRUE(g_before != nullptr);
+
+  auto optimizer = std::make_shared();
+  auto pm = std::make_shared();
+  auto pass = std::make_shared();
+  pm->AddPass(pass);
+  optimizer->AddPassManager(pm);
+  auto new_graph = optimizer->Optimize(g_before);
+
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "after");
+  EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));  // expected graph is unchanged: next node has two inputs
+}
+
+}  // namespace opt
+}  // namespace mindspore
diff --git a/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc b/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
index 79a1cf1a8a88def3d7c2ecc49ca325bdc1c9d2ae..d5f2fa636dd8f6b4e9fd5e84bf28c10b16042447 100644
--- a/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
+++ b/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
@@ -20,7 +20,7 @@
 #include "ir/manager.h"
 #include "debug/anf_ir_dump.h"
 #include "session/anf_runtime_algorithm.h"
-#include "pre_activate/common/ir_fusion/allreduce_fusion.h"
+#include "pre_activate/pass/allreduce_fusion.h"
 #include "pre_activate/common/optimizer.h"
 #include "device/kernel_info.h"
 #include "pre_activate/common/pass_manager.h"
diff --git
a/tests/ut/cpp/python_input/gtest_input/pre_activate/getnext_memcpy_elimination_test.py b/tests/ut/cpp/python_input/gtest_input/pre_activate/getnext_memcpy_elimination_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..39b60d72d655799a3cb81ac0b4b7ac5aa1faddfd
--- /dev/null
+++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/getnext_memcpy_elimination_test.py
@@ -0,0 +1,126 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+from mindspore.ops import operations as P
+from mindspore.ops import Primitive
+import mindspore as ms
+
+get_next = P.GetNext([ms.float32], [[1, 64, 112, 112]], 1, "")
+# memcpy_async primitive that carries the label_for_insert_stream_active attribute
+memcpy_async_attr = Primitive('memcpy_async')
+memcpy_async_attr.add_prim_attr("label_for_insert_stream_active", True)
+# memcpy_async primitive without that attribute
+memcpy_async = Primitive('memcpy_async')
+cast = P.Cast()
+add = P.TensorAdd()
+
+
+class FnDict:
+    """Decorator-style registry that stores each decorated function under its __name__."""
+
+    def __init__(self):
+        self.fnDict = {}
+
+    def __call__(self, fn):
+        # Nothing is returned, so functions are retrieved through fns[name], not by their own name.
+        self.fnDict[fn.__name__] = fn
+
+    def __getitem__(self, name):
+        return self.fnDict[name]
+
+
+def test_getnext_memcpy_elimination(tag):
+    """GetNext -> memcpy_async (with attr) -> Cast; the 'after' graph has no memcpy_async."""
+    fns = FnDict()
+
+    @fns
+    def before(x):
+        res = get_next()
+        res = memcpy_async_attr(res)
+        res = cast(res)
+        return res
+
+    @fns
+    def after(x):
+        res = get_next()
+        res = cast(res)
+        return res
+
+    return fns[tag]
+
+
+def test_getnext_memcpy_elimination_no_attr(tag):
+    """memcpy_async lacks the attr; 'before' and 'after' build the same graph."""
+    fns = FnDict()
+
+    @fns
+    def before(x):
+        res = get_next()
+        res = memcpy_async(res)
+        res = cast(res)
+        return res
+
+    @fns
+    def after(x):
+        res = get_next()
+        res = memcpy_async(res)
+        res = cast(res)
+        return res
+
+    return fns[tag]
+
+
+def test_getnext_memcpy_elimination_memcpy_multi_users(tag):
+    """memcpy_async output feeds both Cast and TensorAdd; 'before' and 'after' build the same graph."""
+    fns = FnDict()
+
+    @fns
+    def before(x):
+        res = get_next()
+        memcpy_out = memcpy_async_attr(res)
+        res = cast(memcpy_out)
+        res = add(memcpy_out, res)
+        return res
+
+    @fns
+    def after(x):
+        res = get_next()
+        memcpy_out = memcpy_async_attr(res)
+        res = cast(memcpy_out)
+        res = add(memcpy_out, res)
+        return res
+
+    return fns[tag]
+
+
+def test_getnext_memcpy_elimination_next_multi_inputs(tag):
+    """memcpy_async output feeds a TensorAdd called with two arguments; 'before' and 'after' build the same graph."""
+    fns = FnDict()
+
+    @fns
+    def before(x):
+        res = get_next()
+        memcpy_out = memcpy_async_attr(res)
+        res = add(memcpy_out, res)
+        return res
+
+    @fns
+    def after(x):
+        res = get_next()
+        memcpy_out = memcpy_async_attr(res)
+        res = add(memcpy_out, res)
+        return res
+
+    return fns[tag]